CleanHTML.java.html

package Torello.JavaDoc.SyntaxHiLite;

// Many classes from both of these packages are used in this file
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import Torello.Java.StrPrint;
import Torello.Java.StrCmpr;

import static Torello.Java.C.BBLACK;
import static Torello.Java.C.BBLUE_BKGND;
import static Torello.Java.C.BRED_BKGND;
import static Torello.Java.C.BCYAN_BKGND;
import static Torello.Java.C.RESET;

import java.util.Vector;
import java.util.regex.Pattern;
import java.util.stream.IntStream;

class CleanHTML 
{
    // Compile-time debug switch.  When true, clean(...) prints an abbreviated dump of the HTML
    // before and after processing, and removeDuplicateColorSpans(...) prints how many nodes it
    // is about to remove.
    private static final boolean DEBUGGING = false;


    // ********************************************************************************************
    // ********************************************************************************************
    // This Class Main Method
    // ********************************************************************************************
    // ********************************************************************************************


    // Runs every clean-up pass, in order, over the HTML (modified in place):
    //
    //  1. Removes the <DIV> elements other than the first one
    //  2. Normalizes the leading <PRE> / <TABLE> / <TR> / <TD> tags
    //  3. (Line-Numbers only) Converts the line-number <SPAN>'s into <A ID=Lxxx> anchors, and
    //     normalizes the tags at the start of the second table column
    //  4. Normalizes the remaining <SPAN> tags & merges consecutive same-class <SPAN>'s
    //
    // html:            the vectorized HTML to clean
    // hasLineNumbers:  when true, the line-number specific passes are also run

    static void clean(final Vector<HTMLNode> html, final boolean hasLineNumbers)
    {
        if (DEBUGGING) CleanHTML.PRINT_PAGE(html, true);

        // FIX ME / CHECK ME - should this only be done 'if (hasLineNumbers)' ?
        CleanHTML.removeAllDIVS_ExceptFirst(html);

        CleanHTML.fixThePRE_AndPossibeTableColumnTags(html, 0);

        // Vector-Index where the "main" (source-code) <SPAN>'s processing begins
        final int mainSpansStart;

        if (! hasLineNumbers) mainSpansStart = 0;

        else
        {
            // The returned index is where the second <TD> is expected
            mainSpansStart = CleanHTML.lineNumberSPANS_ToAnchorsWithIDs(html);

            CleanHTML.fixThePRE_AndPossibeTableColumnTags(html, mainSpansStart);
        }

        CleanHTML.fixTheMainSPANS(html, mainSpansStart);
        CleanHTML.removeDuplicateColorSpans(html, mainSpansStart);

        if (DEBUGGING) CleanHTML.PRINT_PAGE(html, false);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Print the Page
    // ********************************************************************************************
    // ********************************************************************************************


    // Banner captions for the debug dump.  Each one starts with a background-color constant
    // imported from Torello.Java.C
    private static final String V1 = BRED_BKGND     + " Input HTML: ";
    private static final String V2 = BBLUE_BKGND    + " Output HTML: ";

    // Debug helper: prints a banner followed by an abbreviated rendering of the page.
    //
    // html:        the vectorized HTML to print
    // startEnd:    true  ==> print the "Input HTML" banner
    //              false ==> print the "Output HTML" banner

    private static void PRINT_PAGE(final Vector<HTMLNode> html, boolean startEnd)
    {
        final String page   = Util.pageToString(html);
        final String banner = startEnd ? V1 : V2;

        // The numeric arguments (110, 10) are passed straight to StrPrint.widthHeightAbbrev
        // NOTE(review): presumably max-width & max-height of the printed text - confirm
        final String abbrev = StrPrint.widthHeightAbbrev(page, " ... ", 110, 10, true);

        System.out.println(banner + RESET + '\n' + abbrev);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Remove Extra <DIV> Tags
    // ********************************************************************************************
    // ********************************************************************************************


    // Finds every <DIV> ... </DIV> located in the range passed to the search (which starts at
    // index 1, thereby skipping whatever is at index 0), and removes nodes at the positions
    // computed from those matches.
    //
    // html: the vectorized HTML, modified in place

    private static void removeAllDIVS_ExceptFirst(Vector<HTMLNode> html)
    {
        // An extra <DIV> ... </DIV> pair is present, and must be removed.
        // It only shows up when HiLiting "Complete Pages" (those having Line-Numbers).
        // Snippets have neither Line-Numbers, nor a second <DIV>.

        final Vector<DotPair> innerDivs =
            TagNodeFindInclusive.all(html, 1, html.size()-1, "div");

        // Nothing found, nothing to remove
        if (innerDivs.isEmpty()) return;

        // NOTE(review): presumably 'endPointsToPosArray' yields only the start & end index of
        // each match, so that the <DIV> tags are removed but their contents are kept - confirm
        Util.Remove.nodesOPT(html, DPUtil.endPointsToPosArray(innerDivs, true));
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Convert <SPAN CLASS=LineNum> xxx</SPAN>  ==>  <A ID=Lxxx> xxx</A>
    // ********************************************************************************************
    // ********************************************************************************************


    // Shared closing </A> tag, used to replace each line-number's closing </SPAN>
    private static final TagNode CLOSING_ANCHOR = TagNode.getInstance("A", TC.ClosingTags);

    // Walks the inside of the first <PRE> element and rewrites each three-node sequence that
    // begins with a "span" TagNode:
    //
    //      node i      ==>  <A ID=Ln>          ('n' counts upward from 1)
    //      node i+1    ==>  a TextNode holding the trimmed text of the original node
    //      node i+2    ==>  </A>
    //
    // The Vector's size never changes, since nodes are only replaced.
    //
    // Returns: (index of the end of the first <PRE>) + 2
    // Throws:  PygmentizeError if no <PRE> is found, or if it spans fewer than 3 nodes

    private static int lineNumberSPANS_ToAnchorsWithIDs(Vector<HTMLNode> html)
    {
        final DotPair pre = TagNodeFindInclusive.first(html, "pre");

        if (pre == null) throw new PygmentizeError
            ("A <PRE> Element was not found in the returned HTML from pygmentize");

        if (pre.size() < 3) throw new PygmentizeError
            ("An empty <PRE> Element was found in the returned HTML from pygmentize");

        int lineNum = 1;

        for (int i = pre.start + 1; i < pre.end; i++)
        {
            final TagNode tn = html.elementAt(i).ifTagNode();

            // Skip anything that is not a "span" tag
            if ((tn == null) || (! tn.tok.equals("span"))) continue;

            html.setElementAt(new TagNode("<A ID=L" + lineNum + '>'), i);
            lineNum++;

            final String lineNumText = html.elementAt(i + 1).str.trim();
            html.setElementAt(new TextNode(lineNumText), i + 1);

            html.setElementAt(CLOSING_ANCHOR, i + 2);

            // Step over the two nodes that were just replaced.  The loop-increment then moves
            // on to whatever follows the new </A>
            i += 2;
        }

        // Returns Location of next <td>  ==>  </pre></td><td>
        return pre.end + 2;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Remove Quotations & toUpperCase for <SPAN>'s:  <span class="H1-w">  ==>  <SPAN CLASS=H1-w>
    // ********************************************************************************************
    // ********************************************************************************************


    // Replaces every "span" tag (opening & closing) found at or after 'startPos' with a
    // normalized copy.  Opening tags have their attribute-value quotes removed first; both
    // kinds are then passed through 'toUpperCase(false)'.
    //
    // Per this file's section banner: <span class="H1-w">  ==>  <SPAN CLASS=H1-w>
    //
    // html:        the vectorized HTML, modified in place
    // startPos:    index at which the search for "span" tags begins

    private static void fixTheMainSPANS(Vector<HTMLNode> html, int startPos)
    {
        for (final TagNodeIndex tni : TagNodePeek.all(html, startPos, -1, TC.Both, "span"))
        {
            TagNode span = tni.n;

            // Closing tags are left out of the quote-removal step
            if (! span.isClosing) span = span.removeAllAVQuotes();

            html.setElementAt(span.toUpperCase(false), tni.index);
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // <PRE>, <TABLE>, <TR>, <TD>
    // ********************************************************************************************
    // ********************************************************************************************


    // Canonical replacement tags.  Only PRE carries an attribute (an inline STYLE).
    private static final TagNode PRE    = new TagNode("<PRE STYLE='margin: 0; line-height: 125%'>");
    private static final TagNode TABLE  = TagNode.getInstance("TABLE", TC.OpeningTags);
    private static final TagNode TR     = TagNode.getInstance("TR", TC.OpeningTags);
    private static final TagNode TD     = TagNode.getInstance("TD", TC.OpeningTags);

    // Replaces any opening <pre>, <table>, <tr> or <td> tag found inside a small window of
    // nodes with the matching canonical constant above.
    //
    // The window is the (at most) 10 nodes beginning at 'startPos', clipped to the bounds of
    // the Vector.  Only nodes inside that window are inspected; nodes before 'startPos' are
    // not visited.
    //
    // html:        the vectorized HTML, modified in place
    // startPos:    index of the first node to inspect (a negative value is treated as 0)

    private static void fixThePRE_AndPossibeTableColumnTags(Vector<HTMLNode> html, int startPos)
    {
        final int begin = Math.max(startPos, 0);
        final int end   = Math.min(startPos + 10, html.size());

        for (int i = begin; i < end; i++)
        {
            // 'openTag()' result is null for anything that should not be considered here
            final TagNode tn = html.elementAt(i).openTag();

            if (tn == null) continue;

            if      (tn.tok.equals("pre"))      { html.setElementAt(PRE, i);   }
            else if (tn.tok.equals("table"))    { html.setElementAt(TABLE, i); }
            else if (tn.tok.equals("tr"))       { html.setElementAt(TR, i);    }
            else if (tn.tok.equals("td"))       { html.setElementAt(TD, i);    }
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Remove Consecutive Duplicates: <SPAN CLASS=SomeClass> </SPAN> <SPAN CLASS=SameExactClass>
    // ********************************************************************************************
    // ********************************************************************************************


    // The pygmentize output has many places where a <SPAN>...</SPAN> of code is redundant, and
    // should be removed.  It usually looks like the following (PAY CLOSE ATTENTION!):
    //
    // <SPAN CLASS=H1-x>removeArr</SPAN> <SPAN CLASS=H1-x>=</SPAN>
    //
    // This method converts that to:
    //
    // <SPAN CLASS=H1-x>removeArr =</SPAN>
    //
    // Two consecutive matches are merged only when ALL of the following hold:
    //
    //  * the previous match ends before the current one starts
    //  * every node strictly between them is a non-TagNode whose text is blank after trimming
    //  * both opening tags have equal CLASS attribute values
    //
    // Merging is done by removing the previous match's last node and the current match's
    // first node (the closing </SPAN> and the opening <SPAN>, respectively).
    //
    // v:       the vectorized HTML, modified in place
    // sPos:    first argument passed to the iterator's 'restrictCursor'
    //
    // Returns: the number of nodes removed from 'v'

    // Used to select which <SPAN CLASS=...> elements the iterator visits
    private static final Pattern P = Pattern.compile("H(\\d{1,3})-([\\w\\d]+{1,3})");

    private static int removeDuplicateColorSpans(final Vector<HTMLNode> v, int sPos)
    {
        final HNLIInclusive     iter    = InnerTagInclusiveIterator.get(v, "span", "class", P);
        final IntStream.Builder b       = IntStream.builder();

        // 'null' until the first match has been seen.  The first match has nothing before it
        // to merge with.
        DotPair prev        = null;
        String  prevClass   = null;

        iter.restrictCursor(sPos, v.size() - 1);

        while (iter.hasNext())
        {
            final DotPair   cur         = iter.nextDotPair();
            final TagNode   tn          = (TagNode) v.elementAt(cur.start);
            final String    curClass    = tn.AV("class");

            boolean isClear = (prev != null) && (prev.end < cur.start);

            // Inspect EVERY node strictly between the two matches: (prev.end, cur.start).
            // The upper bound must be 'cur.start' (not 'cur.start - 1'), otherwise the node
            // directly preceding the current <SPAN> would be skipped.
            if (isClear)
                for (int i = prev.end + 1; i < cur.start; i++)
                {
                    final HTMLNode n = v.elementAt(i);

                    if ((n instanceof TagNode) || (n.str.trim().length() > 0))
                    {
                        isClear = false;
                        break;
                    }
                }

            if (isClear && curClass.equals(prevClass))
            {
                b.accept(prev.end);
                b.accept(cur.start);
            }

            prev        = cur;
            prevClass   = curClass;
        }

        final int[] removeArr = b.build().toArray();

        if (DEBUGGING) System.out.println("REM:" + removeArr.length);

        Util.Remove.nodesOPT(v, removeArr);

        return removeArr.length;
    }
}