1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package Torello.HTML.Tools.NewsSite;

import static Torello.Java.C.BRED;
import static Torello.Java.C.YELLOW;
import static Torello.Java.C.RESET;

import Torello.HTML.HTMLNode;

import Torello.Java.EXCC;

import java.util.Vector;
import java.net.URL;

import java.io.IOException;

/**
 * Applies the article-getter held by a {@code RECORD} to that record's page, and then checks
 * that the getter actually produced some content.
 */
class ExtractArticleAndVerify 
{
    /**
     * Invokes {@code r.articleGetter} on {@code r.url} and {@code r.page}, stores whatever it
     * returns in {@code r.article}, and verifies that the returned value is usable.
     *
     * <p>On each failure path a message is appended to {@code r.log}, and a
     * {@code DownloadResult} constant is added to the element of {@code r.ret} located at
     * index {@code r.outerCounter()}:
     *
     * <ul>
     * <li>{@code ARTICLE_GET_EXCEPTION} - the getter threw {@code ArticleGetException}</li>
     * <li>{@code ARTICLE_GET_RETURNED_NULL} - the getter returned {@code null}</li>
     * <li>{@code ARTICLE_GET_RETURNED_EMPTY_VECTOR} - the getter returned a result whose
     *     {@code size()} is zero</li>
     * </ul>
     *
     * <p>On success only a line reporting the node-count is logged; nothing is added to
     * {@code r.ret} by this method.
     *
     * @param r the record that supplies the getter, the url, the page, the log and the result
     * lists.  Its {@code article} field is assigned by this method.
     *
     * @return {@code true} if the extracted article is non-null and non-empty, and
     * {@code false} in each of the three failure cases listed above.
     *
     * @throws IOException NOTE(review): presumably propagated from {@code r.log.append(...)};
     * the declared type of {@code log} is not visible from this file - confirm.
     */
    static boolean run(final RECORD r) throws IOException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 1: Run the Article-Getter, and intercept its dedicated Exception.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // 
        // 'articleGetter' is a function-pointer (Functional-Interface) whose job is to
        // locate the Article's HTML inside the surrounding web-page and extract it,
        // leaving the advertisements and "See This Also" links behind.
        //
        // News web-sites typically wrap the article itself in an easy-to-find tag such
        // as <MAIN>, <ARTICLE>, <SECTION role='article'> or <DIV CLASS='main'>.
        // Although the chosen tag differs from site-to-site, any one site tends to use
        // the same tag for all of its articles.
        //
        // (The HTML has to be inspected first in order to know which tag that is.)

        try
            { r.article = r.articleGetter.apply(r.url, r.page); }

        catch (ArticleGetException getterFailure)
        {
            return reject(
                r,
                BRED + "\tArticleGet.apply(...) failed: " + getterFailure.getMessage() +
                RESET + "\nException Cause Chain:\n" + EXCC.toString(getterFailure) + '\n',
                DownloadResult.ARTICLE_GET_EXCEPTION
            );
        }


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 2: Check what came back, and pick the matching 'DownloadResult'
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // 
        // A null result and an empty result are logged with the very same text, but
        // each one is recorded under its own Enumerated-Constant.

        if (r.article == null) return reject(
            r,
            BRED + "\tContent-body not found by ArticleGet.apply(...)\n" + RESET,
            DownloadResult.ARTICLE_GET_RETURNED_NULL
        );

        final int nodeCount = r.article.size();

        if (nodeCount == 0) return reject(
            r,
            BRED + "\tContent-body not found by ArticleGet.apply(...)\n" + RESET,
            DownloadResult.ARTICLE_GET_RETURNED_EMPTY_VECTOR
        );


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 3: Success - report how many nodes were extracted
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        r.log.append
            ("\tArticle body contains (" + YELLOW + nodeCount + RESET + ") HTMLNodes.\n");

        return true;
    }

    // Shared failure path: writes 'logMessage' to the log FIRST, and only afterwards adds
    // 'outcome' to the result list at index r.outerCounter().  Always returns false, which
    // lets the caller write "return reject(...)".
    private static boolean reject
        (final RECORD r, final String logMessage, final DownloadResult outcome)
        throws IOException
    {
        r.log.append(logMessage);
        r.ret.elementAt(r.outerCounter()).add(outcome);
        return false;
    }
}