1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLPageMWT;
import Torello.Java.StrFilter;

import Torello.Java.Additional.Ret4;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.GREEN;

import java.util.Vector;
import java.io.IOException;

import java.net.URL;

/**
 * This class runs the primary iteration-loop for downloading news-articles using a list of
 * article-{@code URL's}.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_ARTICLES>
 */
@Torello.JavaDoc.StaticFunctional
public class ScrapeArticles
{
    // Static-only class: no instances are ever created.
    private ScrapeArticles() { }

    // One row of asterisks (with trailing newline), used to draw banners in the log output.
    private static final String STARS =
        "*********************************************" +
        "********************************************\n";

    /**
     * This is used to do the downloading of newspaper articles.
     * 
     * <BR /><BR />Before this method exits - whether it returns normally, or an exception
     * escapes the download-loop - {@code HTMLPageMWT.shutdownMWTThreads()} is invoked so that
     * the "Max Wait Time" threads are not left running.
     * 
     * @param articleReceiver           <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_REC>
     * @param articleURLs               <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_URLS>
     * @param articleGetter             <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_GETTER>
     * @param skipArticlesWithoutPhotos <EMBED CLASS='external-html' DATA-FILE-ID=SA_SKIP_NO_PHOTOS>
     * @param bannerAndAdFinder         <EMBED CLASS='external-html' DATA-FILE-ID=SA_BANNER_ADS>
     * @param keepOriginalPageHTML      <EMBED CLASS='external-html' DATA-FILE-ID=SA_KEEP_HTML_BOOL>
     * 
     * @param pause If there are many / numerous articles to download, pass an instance of
     * {@code class Pause}, and intermediate progress can be saved, and reloaded at a later time.
     * This parameter may be null, in which case no state is loaded or saved.
     * 
     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
     * As articles are downloaded, notices shall be posted to this {@code 'log'} by this method.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     * 
     * @return                  <EMBED CLASS='external-html' DATA-FILE-ID=SA_RETURNS>
     * @throws PauseException   If there is an error when attempting to save the download state.
     * @throws ReceiveException <EMBED CLASS='external-html' DATA-FILE-ID=SA_RECEIVE_EX>
     * @throws IOException      <EMBED CLASS='external-html' DATA-FILE-ID=SA_IO_EX>
     */
    public static Vector<Vector<DownloadResult>> download(   
        final ScrapedArticleReceiver    articleReceiver,
        final Vector<Vector<String>>    articleURLs,
        final ArticleGet                articleGetter,
        final boolean                   skipArticlesWithoutPhotos,
        final StrFilter                 bannerAndAdFinder,   
        final boolean                   keepOriginalPageHTML,
        final Pause                     pause,
        final Appendable                log
    )
        throws PauseException, ReceiveException, IOException
    {
        // Bundles every input, the loop counters and the result Vector into one record that
        // is handed to each processing step.
        final RECORD r = new RECORD(
            articleReceiver,
            articleURLs,
            articleGetter,
            skipArticlesWithoutPhotos,
            bannerAndAdFinder,
            keepOriginalPageHTML,
            pause,
            log
        );

        log.append(
            "\n" + BRED + STARS + STARS +
            RESET + " Downloading Articles" + BRED + "\n" +
            STARS + STARS + RESET + '\n'
        );

        // If the user has passed an instance of 'pause', the previously saved results and
        // loop-counters are loaded from it, and copied into the record.
        if (pause != null)
        {
            final Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> r4 =
                pause.loadState();

            r.ret.addAll(r4.a);
            r.setCounters(r4);
        }


        // If the user did not provide a "Pause" mechanism, **OR** the "Pause Mechanism" asserts
        // that the download process is starting from the beginning of the article-URL Vector,
        // THEN a *new vector* should be built.
        // 
        // Initializes the capacity (sizes) of the two-dimensional "Return Vector."
        // 
        // NOTE: The return Vector is exactly parallel to the input "articleURLs"
        //       two-dimensional input Vector.

        if ((pause == null) || r.countersAllZero())
            for (int i=0; i < articleURLs.size(); i++) 
                r.ret.add(new Vector<DownloadResult>(articleURLs.elementAt(i).size()));


        try
        {
            for (; r.outerLoopTest(); r.incOuterCounter())
            {
                for (r.initInnerCounter(); r.innerLoopTest(); r.incInnerCounter())
                {
                    try
                        { loopBody(r); }

                    catch (ReceiveException re)
                    { HandleExceptions.receiverException(r, re); }

                    catch (IOException ioe)
                    { HandleExceptions.ioException(r, ioe); }

                    catch (Exception e)
                    { HandleExceptions.exception(r, e); }

                    finally
                    {
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        // Write the current "READ STATE" information: the results collected
                        // so far, plus the outer, inner and success counters.
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        // 
                        // This makes sure that the download-progress is not lost when large
                        // numbers of articles are being processed.  Restart the download, and
                        // the loop variables will automatically be initialized to where they
                        // were before the JVM exited.
                        // 
                        // This runs after every article, including the ones that failed.

                        if (pause != null) pause.saveState
                            (r.ret, r.outerCounter(), r.innerCounter(), r.successCounter());
                    }
                }
            }

            log.append(
                BRED + STARS + RESET +
                "Traversing Site Completed.\n" +
                "Loaded a total of (" + r.successCounter() + ") articles.\n"
            );
        }

        finally
        {
            // Make sure to stop the "Max Wait Time Threads" - even when an exception (for
            // instance a PauseException thrown while saving state, or one thrown by the
            // exception-handlers above) is about to leave this method.  Previously this call
            // was only reached on normal completion, leaving the threads running on failure.

            HTMLPageMWT.shutdownMWTThreads();
        }


        // Returns the two-dimensional "Download Result" Vector
        return r.ret;
    }


    /**
     * Processes the single article-{@code URL} located at the record's current outer / inner
     * counter position.  The processing steps are run in order, and this method returns early
     * as soon as one of the verifying steps reports {@code false}.
     * 
     * <BR /><BR />When every step succeeds, an {@code Article} is built, passed to the
     * {@code ScrapedArticleReceiver}, {@code DownloadResult.SUCCESS} is appended to the current
     * row of the result {@code Vector}, and the success-counter is incremented.
     * 
     * <BR /><BR />NOTE(review): on an early return nothing is added to the result
     * {@code Vector} by this method; presumably the step that failed records its own
     * {@code DownloadResult} - confirm against the step classes.
     * 
     * @param r The record holding the inputs, counters, intermediate values and results.
     * @throws IOException Declared for the processing steps and the {@code log.append} call.
     * @throws ReceiveException Declared for the call to the {@code ScrapedArticleReceiver}.
     */
    private static void loopBody(final RECORD r)
        throws IOException, ReceiveException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Download one article
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        r.urlStr = r.articleURLs
            .elementAt(r.outerCounter())
            .elementAt(r.innerCounter());

        // Generate the URL-Instance, call the Garbage-Collector, if needed
        if (! URL_GC.run(r)) return;

        // Scrape the Web-Page, Verify that the page has non-Empty HTML
        if (! ScrapePageAndVerify.run(r)) return;

        // Retrieve the <TITLE> element (as a String) from the page - if it has one.
        // Unlike the other steps, the outcome of this one never aborts the article.
        ScrapeTitle.run(r);

        // Extract the Article and Verify non-Empty Results
        if (! ExtractArticleAndVerify.run(r)) return;

        // Extract the Image-Locations, Verify Non-Empty Results
        if (! GetImageLocations.run(r)) return;

        // Skip "Banner Images"
        if (! SkipBannerImages.run(r)) return;


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Output the Results
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // The original page-HTML is only retained when the caller has requested it.
        final Article articleResult = new Article(
            r.url, r.title, (r.keepOriginalPageHTML ? r.page : null), r.article,
            r.imageURLs, r.imagePosArr
        );


        // The article was successfully downloaded and parsed.  Send it to the
        // "Receiver" and add DownloadResult to the return vector.

        r.log.append(
            GREEN + "ARTICLE LOADED." + RESET +
            "  Sending to ScrapedArticleReceiver.\n"
        );

        r.articleReceiver.receive(articleResult, r.outerCounter(), r.innerCounter());

        r.ret
            .elementAt(r.outerCounter())
            .add(DownloadResult.SUCCESS);

        r.incSuccessCounter();
    }

}