1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82 | package Torello.HTML.Tools.NewsSite;
import Torello.HTML.HTMLNode;
import Torello.HTML.NodeSearch.InnerTagGetInclusive;
import Torello.HTML.NodeSearch.TextComparitor;
import Torello.HTML.NodeSearch.TCCompareStrException;
import java.util.Vector;
import java.net.URL;
/**
 * Package-private factory that builds an {@link ArticleGet} which locates an article body by
 * searching a downloaded page for a {@code <DIV CLASS=...>} element, where the {@code class}
 * attribute is tested using a caller-supplied {@link TextComparitor} and compare-strings.
 */
class Usual_tc
{
    /**
     * Builds an {@link ArticleGet} lambda that delegates to
     * {@code InnerTagGetInclusive.first(page, "div", "class", tc, cssClassCompareStrings)}.
     *
     * <p>All argument validation is performed eagerly, at the time this method is called,
     * rather than each time the returned lambda is invoked.  The compare-string array is
     * copied before it is validated, so later modifications by the caller to an explicitly
     * passed array cannot affect the returned {@code ArticleGet}.
     *
     * <p>The returned lambda throws {@code ArticleGetException} when:
     * <ul>
     * <li>{@code ArticleGetException.check(url, page)} rejects its inputs,</li>
     * <li>the underlying search throws any {@code Exception} (which is wrapped as the
     *     cause),</li>
     * <li>the search returns {@code null}, or</li>
     * <li>the search returns an empty {@code Vector}.</li>
     * </ul>
     *
     * @param tc the comparitor applied to the {@code class} attribute; must not be null
     * @param cssClassCompareStrings the strings handed to {@code tc}; they must be accepted
     *        by {@code TCCompareStrException.check}
     * @return an {@code ArticleGet} which returns the non-null, non-empty {@code Vector}
     *         produced by the search
     * @throws NullPointerException if {@code tc} is null
     */
    static ArticleGet generate(
            final TextComparitor tc,
            final String... cssClassCompareStrings
        )
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // FAIL-FAST: Check user-input for possible errors BEFORE building the Lambda-Function.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Defensive snapshot.  When a caller passes an explicit String[] to a var-args
        // parameter, that very array instance is received here.  Capturing it directly inside
        // the lambda would let later caller-side writes bypass the validation below, and would
        // also make the diagnostic string disagree with the search actually performed.
        // A null array is deliberately passed through unchanged, leaving the decision about
        // how to treat it with TCCompareStrException.check.
        final String[] compareStrs = (cssClassCompareStrings == null)
            ? null
            : cssClassCompareStrings.clone();

        // Check for valid compareStrings (the private copy is what gets validated and used).
        TCCompareStrException.check(compareStrs);

        if (tc == null)
        {
            throw new NullPointerException(
                "Null has been passed to TextComparitor Parameter 'tc', but this is not allowed here."
            );
        }

        // This 'final' String is merely used for proper error reporting in any potential
        // exception-messages, nothing else.
        final String functionNameStr =
            "InnerTagGetInclusive.first(page, \"div\", \"class\", " +
            Helper.STR_FORMAT_TC_PARAMS(tc, compareStrs) + ")";

        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build the instance, using a lambda-expression
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return (URL url, Vector<HTMLNode> page) ->
        {
            // This check runs on every invocation of the lambda.  ArticleGetException is a
            // checked exception; the intent is to halt News-Site scrape progress when the
            // inputs handed to this lambda are rejected by the check.
            ArticleGetException.check(url, page);

            final Vector<HTMLNode> articleBody;

            try
            {
                articleBody = InnerTagGetInclusive.first
                    (page, "div", "class", tc, compareStrs);
            }
            catch (Exception e)
            {
                // Any failure inside the search is reported as a failed "Article Get", with
                // the original exception preserved as the cause.
                throw new ArticleGetException
                    (ArticleGetException.GOT_EXCEPTION, functionNameStr, e);
            }

            // A null or empty result means that the user-specified means of retrieving the
            // article body (the <DIV CLASS=...> look-up) did not produce any content.
            if (articleBody == null)
            {
                throw new ArticleGetException
                    (ArticleGetException.RET_NULL, functionNameStr);
            }

            if (articleBody.isEmpty())
            {
                throw new ArticleGetException
                    (ArticleGetException.RET_EMPTY_VECTOR, functionNameStr);
            }

            return articleBody;
        };
    }
}
|