package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLPageMWT;
import Torello.Java.StrFilter;

import Torello.Java.Additional.Ret4;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.GREEN;

import java.util.Objects;
import java.util.Vector;
import java.io.IOException;

import java.net.URL;

/**
 * This class runs the primary iteration-loop for downloading news-articles using a list of
 * article-{@code URL's}.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_ARTICLES>
 */
@Torello.JavaDoc.StaticFunctional
public class ScrapeArticles
{
    // Utility class: never instantiated.
    private ScrapeArticles() { }

    // Horizontal-rule of asterisks used to frame the banner messages written to 'log'.
    private static final String STARS =
        "*********************************************" +
        "********************************************\n";

    /**
     * This is used to do the downloading of newspaper articles.
     *
     * @param articleReceiver <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_REC>
     * @param articleURLs <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_URLS>
     * @param articleGetter <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_GETTER>
     * @param skipArticlesWithoutPhotos <EMBED CLASS='external-html' DATA-FILE-ID=SA_SKIP_NO_PHOTOS>
     * @param bannerAndAdFinder <EMBED CLASS='external-html' DATA-FILE-ID=SA_BANNER_ADS>
     * @param keepOriginalPageHTML <EMBED CLASS='external-html' DATA-FILE-ID=SA_KEEP_HTML_BOOL>
     *
     * @param pause If there are many / numerous articles to download, pass an instance of
     * {@code class Pause}, and intermediate progress can be saved, and reloaded at a later time.
     *
     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
     * As articles are downloaded, notices shall be posted to this {@code 'log'} by this method.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SA_RETURNS>
     * @throws PauseException If there is an error when attempting to save the download state.
     * @throws ReceiveException <EMBED CLASS='external-html' DATA-FILE-ID=SA_RECEIVE_EX>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=SA_IO_EX>
     */
    public static Vector<Vector<DownloadResult>> download(
        final ScrapedArticleReceiver articleReceiver,
        final Vector<Vector<String>> articleURLs,
        final ArticleGet articleGetter,
        final boolean skipArticlesWithoutPhotos,
        final StrFilter bannerAndAdFinder,
        final boolean keepOriginalPageHTML,
        final Pause pause,
        final Appendable log
    )
        throws PauseException, ReceiveException, IOException
    {
        // Fail-fast on the documented contract ("This parameter may not be null"), rather
        // than relying on the implicit NPE from the first 'log.append' call below.
        Objects.requireNonNull(log, "Parameter 'log' may not be null.");

        // Bundle all loop-state (counters, return vector, parameters) into one record that
        // is shared with 'loopBody' and the 'HandleExceptions' helpers.
        final RECORD r = new RECORD(
            articleReceiver,
            articleURLs,
            articleGetter,
            skipArticlesWithoutPhotos,
            bannerAndAdFinder,
            keepOriginalPageHTML,
            pause,
            log
        );

        log.append(
            "\n" + BRED + STARS + STARS +
            RESET + " Downloading Articles" + BRED + "\n" +
            STARS + STARS + RESET + '\n'
        );

        // If the user has passed an instance of 'pause' then it should be loaded from disk.
        if (pause != null)
        {
            final Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> r4 =
                pause.loadState();

            r.ret.addAll(r4.a);
            r.setCounters(r4);
        }


        // If the user did not provide a "Pause" mechanism, **OR** the "Pause Mechanism" asserts
        // that the download process is starting from the beginning of the article-URL Vector,
        // THEN a *new vector* should be built.
        //
        // Initializes the capacity (sizes) of the two-dimensional "Return Vector."
        //
        // NOTE: The return Vector is exactly parallel to the input "articleURLs"
        //       two-dimensional input Vector.

        if ((pause == null) || r.countersAllZero())
            for (int i=0; i < articleURLs.size(); i++)
                r.ret.add(new Vector<DownloadResult>(articleURLs.elementAt(i).size()));


        try
        {
            for (; r.outerLoopTest(); r.incOuterCounter())

                for (r.initInnerCounter(); r.innerLoopTest(); r.incInnerCounter())

                    try
                        { loopBody(r); }

                    catch (ReceiveException re)
                        { HandleExceptions.receiverException(r, re); }

                    catch (IOException ioe)
                        { HandleExceptions.ioException(r, ioe); }

                    catch (Exception e)
                        { HandleExceptions.exception(r, e); }

                    finally
                    {
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        // Write the current "READ STATE" information (two integers)
                        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
                        //
                        // This makes sure that the download-progress is not lost when large
                        // numbers of articles are being processed.  Restart the download, and
                        // the loop variables will automatically be initialized to where they
                        // were before the JVM exited.  (Pretty Useful)

                        if (pause != null) pause.saveState
                            (r.ret, r.outerCounter(), r.innerCounter(), r.successCounter());
                    }


            log.append(
                BRED + STARS + RESET +
                "Traversing Site Completed.\n" +
                "Loaded a total of (" + r.successCounter() + ") articles.\n"
            );
        }

        finally
        {
            // Make sure to stop the "Max Wait Time Threads."
            //
            // FIX: This shutdown previously ran only on the fully-successful path.  If a
            // PauseException threw from 'pause.saveState', or one of the 'HandleExceptions'
            // helpers re-threw, the MWT threads were never stopped and could keep the JVM
            // alive.  Running it in a 'finally' guarantees cleanup on every exit path.

            HTMLPageMWT.shutdownMWTThreads();
        }

        // Returns the two-dimensional "Download Result" Vector
        return r.ret;
    }


    /**
     * Downloads and processes exactly one article — the one addressed by the current
     * outer/inner loop-counters inside {@code 'r'}.  Each processing stage is delegated to a
     * package helper; a {@code false} return from any stage means the article was skipped
     * (and the helper has already recorded the appropriate {@code DownloadResult}).
     */
    private static void loopBody(final RECORD r)
        throws IOException, ReceiveException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Download one article
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        r.urlStr = r.articleURLs
            .elementAt(r.outerCounter())
            .elementAt(r.innerCounter());

        // Generate the URL-Instance, call the Garbage-Collector, if needed
        boolean success = URL_GC.run(r);
        if (! success) return;

        // Scrape the Web-Page, Verify that the page has non-Empty HTML
        success = ScrapePageAndVerify.run(r);
        if (! success) return;

        // Retrieve the <TITLE> element (as a String) from the page - if it has one.
        ScrapeTitle.run(r);

        // Extract the Article and Verify non-Empty Results
        success = ExtractArticleAndVerify.run(r);
        if (! success) return;

        // Extract the Image-Locations, Verify Non-Empty Results
        success = GetImageLocations.run(r);
        if (! success) return;

        // Skip "Banner Images"
        success = SkipBannerImages.run(r);
        if (! success) return;


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Output the Results
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // The original page-HTML is retained only when the caller asked for it.
        final Article articleResult = new Article(
            r.url, r.title, (r.keepOriginalPageHTML ? r.page : null), r.article,
            r.imageURLs, r.imagePosArr
        );


        // The article was successfully downloaded and parsed.  Send it to the
        // "Receiver" and add DownloadResult to the return vector.

        r.log.append(
            GREEN + "ARTICLE LOADED." + RESET +
            " Sending to ScrapedArticleReceiver.\n"
        );

        r.articleReceiver.receive(articleResult, r.outerCounter(), r.innerCounter());

        r.ret
            .elementAt(r.outerCounter())
            .add(DownloadResult.SUCCESS);

        r.incSuccessCounter();
    }

}