1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 | package Torello.HTML;
import Torello.Java.FileRW;
import java.util.Vector;
import java.io.IOException;
import java.util.regex.*;
/**
 * Top-level tokenizer entry point: splits an HTML page around its comment nodes, and hands
 * every non-comment stretch of text to {@code ParserREInternal.getTokens} for parsing.
 */
final class ParserRE
{
    /**
     * Converts {@code html} into a {@code Vector} of {@code HTMLNode}'s.
     *
     * <p>Comments are located first (using {@code HTMLRegEx.P2}).  The text lying between two
     * consecutive comments, and the text following the last comment, is passed along with the
     * output vector to {@code ParserREInternal.getTokens}.  Each comment is appended to the
     * vector only after the text that precedes it has been handed off, so the comment is
     * placed after whatever nodes that call appended.</p>
     *
     * @param html              The HTML to parse.
     * @param eliminateHTMLTags Passed, unchanged, to {@code ParserREInternal.getTokens}.
     * @param rawHTMLFile       When non-null, {@code html} is written to this file.
     * @param matchesFile       When non-null, an empty string is first written to this file
     *                          (presumably to reset it), and afterwards one {@code "COMMENT:"}
     *                          line is appended per comment found.  Also passed to
     *                          {@code ParserREInternal.getTokens}.
     * @param justTextFile      When non-null, an empty string is first written to this file.
     *                          Also passed to {@code ParserREInternal.getTokens}.
     * @return The vector holding every node produced, in the order they were appended.
     * @throws IOException Declared because of the file-writing helper calls.
     */
    static Vector<HTMLNode> parsePageTokens(
            CharSequence html,
            boolean eliminateHTMLTags,
            String rawHTMLFile,
            String matchesFile,
            String justTextFile
        )
        throws IOException
    {
        // Optional debugging output-files: dump the input, and start the two logs off empty.
        if (rawHTMLFile != null)
        {
            FileRW.writeFile(html, rawHTMLFile);
        }

        if (matchesFile != null)
        {
            FileRW.writeFile("", matchesFile);
        }

        if (justTextFile != null)
        {
            FileRW.writeFile("", justTextFile);
        }

        final Vector<HTMLNode> nodes = new Vector<>();
        final String source = html.toString();

        // Index into 'source' of the first character that has not yet been consumed.
        int cursor = 0;

        // P2 locates the comment nodes; the page is carved up around them.
        for (Matcher commentMatcher = HTMLRegEx.P2.matcher(source); commentMatcher.find(); )
        {
            final int matchStart = commentMatcher.start();
            final CommentNode comment = new CommentNode(commentMatcher.group());

            // Anything sitting between the previous comment and this one must go through the
            // core parser *before* this comment is appended, to preserve document order.
            if (cursor < matchStart)
            {
                ParserREInternal.getTokens(
                    nodes, source, cursor, matchStart,
                    eliminateHTMLTags, matchesFile, justTextFile
                );
            }

            // Only advance once the old cursor value has been used by the call above.
            cursor = commentMatcher.end();

            nodes.add(comment);

            // Legacy debugging aid: keep a running record of every comment matched.
            if (matchesFile != null)
            {
                FileRW.appendToFile("COMMENT:\t[" + comment.str + "]\n", matchesFile);
            }
        }

        // Whatever follows the final comment (or the entire page, when no comment was found)
        // still needs to be parsed.
        final int length = source.length();

        if (length > cursor)
        {
            ParserREInternal.getTokens(
                nodes, source, cursor, length,
                eliminateHTMLTags, matchesFile, justTextFile
            );
        }

        return nodes;
    }
}
|