1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244 | package Torello.JavaDoc.SyntaxHiLite;
// Many classes from both of these packages are used below (hence the wildcard imports)
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.Java.StrPrint;
import Torello.Java.StrCmpr;
import static Torello.Java.C.BBLACK;
import static Torello.Java.C.BBLUE_BKGND;
import static Torello.Java.C.BRED_BKGND;
import static Torello.Java.C.BCYAN_BKGND;
import static Torello.Java.C.RESET;
import java.util.Vector;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
class CleanHTML
{
// Compile-time switch for the diagnostic console output in this class: the before / after
// page dumps printed by clean(...) and the "REM:" count printed by removeDuplicateColorSpans(...)
private static final boolean DEBUGGING = false;
// ********************************************************************************************
// ********************************************************************************************
// This Class Main Method
// ********************************************************************************************
// ********************************************************************************************
// Runs every clean-up pass of this class, in order, over the hilited HTML.  The vector is
// modified in place.
//
// html:            the vectorized HTML to clean
// hasLineNumbers:  when true, the line-number column is converted to anchors first, and the
//                  <SPAN> passes begin at the position returned by that conversion
static void clean(final Vector<HTMLNode> html, final boolean hasLineNumbers)
{
    if (DEBUGGING) PRINT_PAGE(html, true);

    // FIX ME / CHECK ME - should this only run if (hasLineNumbers) ?
    removeAllDIVS_ExceptFirst(html);

    fixThePRE_AndPossibeTableColumnTags(html, 0);

    // Position from which the two <SPAN> passes below begin their work
    final int spansStart;

    if (!hasLineNumbers)
        spansStart = 0;
    else
    {
        spansStart = lineNumberSPANS_ToAnchorsWithIDs(html);
        fixThePRE_AndPossibeTableColumnTags(html, spansStart);
    }

    fixTheMainSPANS(html, spansStart);
    removeDuplicateColorSpans(html, spansStart);

    if (DEBUGGING) PRINT_PAGE(html, false);
}
// ********************************************************************************************
// ********************************************************************************************
// Print the Page
// ********************************************************************************************
// ********************************************************************************************
// Banner labels printed ahead of the page dump (each one is followed by RESET when printed)
private static final String V1 = BRED_BKGND + " Input HTML: ";
private static final String V2 = BBLUE_BKGND + " Output HTML: ";

// Debug helper: prints a banner line, and then an abbreviated rendering of the page.
//
// html:      the page to print
// startEnd:  true selects the "Input HTML" banner, false selects the "Output HTML" banner
private static void PRINT_PAGE(final Vector<HTMLNode> html, boolean startEnd)
{
    final String banner = (startEnd ? V1 : V2) + RESET + '\n';

    System.out.println(
        banner +
        StrPrint.widthHeightAbbrev(Util.pageToString(html), " ... ", 110, 10, true)
    );
}
// ********************************************************************************************
// ********************************************************************************************
// Remove Extra <DIV> Tags
// ********************************************************************************************
// ********************************************************************************************
// Strips the surplus <DIV> ... </DIV> pairs out of the page.
//
// Such an extra pair only shows up when hiliting "Complete Pages" (the ones having
// Line-Numbers).  Snippets have no line numbers, and no second <DIV> either - for those, the
// search below comes back empty and nothing is removed.
private static void removeAllDIVS_ExceptFirst(Vector<HTMLNode> html)
{
    // The search range begins at index 1, rather than 0
    // NOTE(review): presumably so that a leading <DIV> at index 0 is never matched - confirm
    final Vector<DotPair> extraDivs =
        TagNodeFindInclusive.all(html, 1, html.size()-1, "div");

    if (extraDivs.isEmpty()) return;

    // NOTE(review): presumably only the end-point tags of each match are removed, and the
    // content between them is kept - confirm against DPUtil.endPointsToPosArray
    Util.Remove.nodesOPT(html, DPUtil.endPointsToPosArray(extraDivs, true));
}
// ********************************************************************************************
// ********************************************************************************************
// Convert <SPAN CLASS=LineNum> xxx</SPAN> ==> <A ID=Lxxx> xxx</A>
// ********************************************************************************************
// ********************************************************************************************
// The one closing </A> instance that is written after every converted line number
private static final TagNode CLOSING_ANCHOR = TagNode.getInstance("A", TC.ClosingTags);

// Rewrites each "span" triple found inside the first <PRE> element as
// <A ID=Lnnn>, trimmed-text, </A> - where 'nnn' counts upward from 1.
//
// Throws PygmentizeError if there is no <PRE> element, or if its DotPair size is below 3.
//
// Returns 'pre.end + 2', the expected location of the next <td> ==> </pre></td><td>
private static int lineNumberSPANS_ToAnchorsWithIDs(Vector<HTMLNode> html)
{
    final DotPair pre = TagNodeFindInclusive.first(html, "pre");

    if (pre == null) throw new PygmentizeError
        ("A <PRE> Element was not found in the returned HTML from pygmentize");

    if (pre.size() < 3) throw new PygmentizeError
        ("An empty <PRE> Element was found in the returned HTML from pygmentize");

    int nextLineNum = 1;
    int pos         = pre.start + 1;

    while (pos < pre.end)
    {
        final TagNode tag = html.elementAt(pos).ifTagNode();

        // Anything that isn't a "span" tag is stepped over, untouched
        if ((tag == null) || (! tag.tok.equals("span"))) { pos++; continue; }

        // Three consecutive slots are overwritten: the tag, its text, and the node after it.
        // NOTE(review): this assumes a span is always followed by exactly one text node and
        // then its closing tag - confirm against the pygmentize output
        html.setElementAt(new TagNode("<A ID=L" + nextLineNum + '>'), pos);
        html.setElementAt(new TextNode(html.elementAt(pos + 1).str.trim()), pos + 1);
        html.setElementAt(CLOSING_ANCHOR, pos + 2);

        nextLineNum++;
        pos += 3;
    }

    return pre.end + 2;
}
// ********************************************************************************************
// ********************************************************************************************
// Remove Quotations & toUpperCase for <SPAN>'s: <span class="H1-w"> ==> <SPAN CLASS=H1-w>
// ********************************************************************************************
// ********************************************************************************************
// Replaces every "span" tag (opening and closing) found from 'startPos' onward with an
// upper-cased copy.  Opening tags additionally have their attribute-value quotes removed:
//
//      <span class="H1-w"> ==> <SPAN CLASS=H1-w>
private static void fixTheMainSPANS(Vector<HTMLNode> html, int startPos)
{
    for (final TagNodeIndex found : TagNodePeek.all(html, startPos, -1, TC.Both, "span"))
    {
        TagNode span = found.n;

        // Quote removal is skipped for closing tags
        if (! span.isClosing) span = span.removeAllAVQuotes();

        html.setElementAt(span.toUpperCase(false), found.index);
    }
}
// ********************************************************************************************
// ********************************************************************************************
// <PRE>, <TABLE>, <TR>, <TD>
// ********************************************************************************************
// ********************************************************************************************
// Canonical replacement tags.  Each one is a single shared instance that gets written into the
// page wherever the matching opening tag is found.
private static final TagNode PRE = new TagNode("<PRE STYLE='margin: 0; line-height: 125%'>");
private static final TagNode TABLE = TagNode.getInstance("TABLE", TC.OpeningTags);
private static final TagNode TR = TagNode.getInstance("TR", TC.OpeningTags);
private static final TagNode TD = TagNode.getInstance("TD", TC.OpeningTags);

// Swaps the opening <pre>, <table>, <tr> and <td> tags that occur near 'startPos' for the
// canonical instances declared above.
//
// html:      the page, modified in place
// startPos:  first index to inspect.  Only the window [startPos, startPos + 10) is scanned,
//            clipped to the bounds of the vector.
private static void fixThePRE_AndPossibeTableColumnTags(Vector<HTMLNode> html, int startPos)
{
    // The scan starts AT 'startPos' - everything ahead of it is outside of the window
    final int BEGIN = Math.max(startPos, 0);
    final int END   = Math.min(startPos + 10, html.size());

    for (int i=BEGIN; i < END; i++)
    {
        final TagNode tn = html.elementAt(i).openTag();

        // openTag() gave back null: this node is not one that could be replaced
        if (tn == null) continue;

        switch (tn.tok)
        {
            case "pre":     html.setElementAt(PRE, i);      break;
            case "table":   html.setElementAt(TABLE, i);    break;
            case "tr":      html.setElementAt(TR, i);       break;
            case "td":      html.setElementAt(TD, i);       break;
            default:        break;
        }
    }
}
// ********************************************************************************************
// ********************************************************************************************
// Remove Consecutive Duplicates: <SPAN CLASS=SomeClass> </SPAN> <SPAN CLASS=SameExactClass>
// ********************************************************************************************
// ********************************************************************************************
// The pygmentize output has many places where a <SPAN>...</SPAN> of code is redundant, and
// should be removed. It usually looks like the following (PAY CLOSE ATTENTION!):
//
// <span class=H1-x>removeArr</span> <span class=H1-x>=</span>
//
// This method converts that to:
//
// <SPAN CLASS=H1-x>removeArr =</SPAN>
// Selects the CLASS attribute values of the <SPAN> elements that are candidates for merging.
// NOTE(review): "+{1,3}" stacks two quantifiers; "{1,3}" by itself was probably intended.
// It is left as-is, since how InnerTagInclusiveIterator applies the Pattern isn't visible here.
private static final Pattern P = Pattern.compile("H(\\d{1,3})-([\\w\\d]+{1,3})");

// Merges consecutive <SPAN> elements that carry the very same CLASS value, and that are
// separated by nothing other than blank text.  A merge is done by removing the closing tag of
// the earlier span along with the opening tag of the later one.
//
// v:     the page, modified in place
// sPos:  start of the range to which the span-iterator is restricted
//
// Returns the number of nodes that were removed from 'v'
private static int removeDuplicateColorSpans(final Vector<HTMLNode> v, int sPos)
{
    final HNLIInclusive     iter    = InnerTagInclusiveIterator.get(v, "span", "class", P);
    final IntStream.Builder b       = IntStream.builder();

    // Non-sense initializations.  Because 'prev.end' is the last index of the vector, the
    // very first span visited can never be seen as following a previous one.
    DotPair prev        = new DotPair(0, v.size() - 1);
    String  prevClass   = "x";

    iter.restrictCursor(sPos, v.size() - 1);

    while (iter.hasNext())
    {
        final DotPair   cur         = iter.nextDotPair();
        final TagNode   tn          = (TagNode) v.elementAt(cur.start);
        final String    curClass    = tn.AV("class");

        // The previous span has to finish before this one starts
        boolean isClear = prev.end < cur.start;

        // EVERY node that sits strictly between the two spans must be blank text.  The upper
        // bound is 'cur.start' itself, so that the node directly ahead of the current span
        // is inspected too.
        for (int i=(prev.end+1); isClear && (i < cur.start); i++)
        {
            final HTMLNode n = v.elementAt(i);

            if ((n instanceof TagNode) || (n.str.trim().length() > 0)) isClear = false;
        }

        if (isClear && curClass.equals(prevClass))
            { b.accept(prev.end); b.accept(cur.start); }

        prev        = cur;
        prevClass   = curClass;
    }

    final int[] removeArr = b.build().toArray();

    if (DEBUGGING) System.out.println("REM:" + removeArr.length);

    Util.Remove.nodesOPT(v, removeArr);

    return removeArr.length;
}
}
|