1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92 | package Torello.HTML.Tools.NewsSite;
import static Torello.Java.C.BRED;
import static Torello.Java.C.YELLOW;
import static Torello.Java.C.RESET;
import Torello.HTML.HTMLNode;
import Torello.Java.EXCC;
import java.util.Vector;
import java.net.URL;
import java.io.IOException;
/**
 * Scraper step that hands a downloaded page to the configured article-getter and
 * then sanity-checks whatever the getter produced.
 *
 * <p>The article-getter is the function-pointer (functional interface) expected to
 * locate and extract the article's HTML from the surrounding web-page.  Pages are
 * usually loaded with advertisements and "See This Also" links, while the article
 * itself tends to sit inside one easily-found wrapper tag such as {@code <MAIN>},
 * {@code <ARTICLE>}, {@code <SECTION role='article'>} or {@code <DIV CLASS='main'>}.
 * The wrapper differs from site to site, but a given site uses the same one for all
 * of its articles, so the page's HTML has to be inspected first.
 */
class ExtractArticleAndVerify
{
    /**
     * Applies {@code r.articleGetter} to {@code r.url} / {@code r.page}, stores the
     * result in {@code r.article}, and verifies that it is usable.
     *
     * <p>On each of the three failure outcomes below a message is appended to
     * {@code r.log}, the matching {@code DownloadResult} constant is added to the
     * element of {@code r.ret} at index {@code r.outerCounter()}, and {@code false}
     * is returned:
     * <ul>
     *   <li>the getter threw {@code ArticleGetException}
     *       - {@code ARTICLE_GET_EXCEPTION}</li>
     *   <li>the getter returned {@code null}
     *       - {@code ARTICLE_GET_RETURNED_NULL}</li>
     *   <li>the getter returned something whose {@code size()} is zero
     *       - {@code ARTICLE_GET_RETURNED_EMPTY_VECTOR}</li>
     * </ul>
     *
     * @param r the record carrying the url, page, getter, log and result table
     * @return {@code true} when a non-empty article was extracted (its node count is
     *         written to the log), {@code false} on any of the failures listed above
     * @throws IOException declared by the original signature; nothing in this method
     *         catches it, so it propagates from whichever callee raises it
     */
    static boolean run(final RECORD r) throws IOException
    {
        // Step 1: run the getter.  Only ArticleGetException is intercepted here;
        // anything else is left to propagate to the caller.
        try
        {
            r.article = r.articleGetter.apply(r.url, r.page);
        }
        catch (ArticleGetException e)
        {
            final String failure =
                BRED + "\tArticleGet.apply(...) failed: " + e.getMessage() +
                RESET + "\nException Cause Chain:\n" + EXCC.toString(e) + '\n';

            return reject(r, failure, DownloadResult.ARTICLE_GET_EXCEPTION);
        }

        // Step 2: verify the result.  The null and the empty outcomes share one log
        // message and differ only in the DownloadResult constant that is recorded.
        final String notFound =
            BRED + "\tContent-body not found by ArticleGet.apply(...)\n" +
            RESET;

        if (r.article == null)
        {
            return reject(r, notFound, DownloadResult.ARTICLE_GET_RETURNED_NULL);
        }

        if (r.article.size() == 0)
        {
            return reject(r, notFound, DownloadResult.ARTICLE_GET_RETURNED_EMPTY_VECTOR);
        }

        // Step 3: success - report how many nodes the extracted body holds.
        r.log.append(
            "\tArticle body contains (" + YELLOW + r.article.size() + RESET +
            ") HTMLNodes.\n"
        );

        return true;
    }

    /**
     * Shared failure path: writes {@code message} to the log first, then files
     * {@code outcome} under the current outer-counter slot of the result table.
     *
     * @return always {@code false}, so callers can {@code return reject(...)} directly
     */
    private static boolean reject
        (final RECORD r, final String message, final DownloadResult outcome)
        throws IOException
    {
        r.log.append(message);
        r.ret.elementAt(r.outerCounter()).add(outcome);
        return false;
    }
}
|