// ScrapeArticles.java.html

package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLPageMWT;
import Torello.Java.StrFilter;

import Torello.Java.Additional.Ret4;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.GREEN;

import java.util.Vector;
import java.io.IOException;

import java.net.URL;

/**
 * Drives the main iteration-loop that downloads news-articles from a two-dimensional list of
 * article-{@code URL's}.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_ARTICLES>
 */
@Torello.JavaDoc.StaticFunctional
public class ScrapeArticles
{
    // Static-only class: the private constructor prevents instantiation.
    private ScrapeArticles() { }

    // A single, newline-terminated row of asterisks used to frame the log banners.
    private static final String STARS =
        "*********************************************" +
        "********************************************\n";

    /**
     * Downloads every article named in {@code 'articleURLs'}, one {@code URL} at a time, and
     * hands each successfully parsed article to {@code 'articleReceiver'}.
     * 
     * @param articleReceiver           <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_REC>
     * @param articleURLs               <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_URLS>
     * @param articleGetter             <EMBED CLASS='external-html' DATA-FILE-ID=SA_ARTICLE_GETTER>
     * @param skipArticlesWithoutPhotos <EMBED CLASS='external-html' DATA-FILE-ID=SA_SKIP_NO_PHOTOS>
     * @param bannerAndAdFinder         <EMBED CLASS='external-html' DATA-FILE-ID=SA_BANNER_ADS>
     * @param keepOriginalPageHTML      <EMBED CLASS='external-html' DATA-FILE-ID=SA_KEEP_HTML_BOOL>
     * 
     * @param pause May be null.  When an instance of {@code class Pause} is provided, previously
     * saved progress is loaded from it before the loop starts, and the current progress is saved
     * to it after each article has been attempted - so that a large download may be resumed at a
     * later time.
     * 
     * @param log This parameter may not be null, or a {@code NullPointerException} shall throw.
     * Progress notices are written to this {@code 'log'} while the articles are downloaded.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     * 
     * @return                  <EMBED CLASS='external-html' DATA-FILE-ID=SA_RETURNS>
     * @throws PauseException   If there is an error when attempting to save the download state.
     * @throws ReceiveException <EMBED CLASS='external-html' DATA-FILE-ID=SA_RECEIVE_EX>
     * @throws IOException      <EMBED CLASS='external-html' DATA-FILE-ID=SA_IO_EX>
     */
    public static Vector<Vector<DownloadResult>> download(   
        final ScrapedArticleReceiver    articleReceiver,
        final Vector<Vector<String>>    articleURLs,
        final ArticleGet                articleGetter,
        final boolean                   skipArticlesWithoutPhotos,
        final StrFilter                 bannerAndAdFinder,   
        final boolean                   keepOriginalPageHTML,
        final Pause                     pause,
        final Appendable                log
    )
        throws PauseException, ReceiveException, IOException
    {
        // All loop-state (inputs, counters, per-article scratch fields and the return Vector)
        // is carried around inside this one record.
        final RECORD state = new RECORD(
            articleReceiver, articleURLs, articleGetter, skipArticlesWithoutPhotos,
            bannerAndAdFinder, keepOriginalPageHTML, pause, log
        );

        final String openingBanner =
            "\n" + BRED + STARS + STARS +
            RESET + " Downloading Articles" + BRED + "\n" +
            STARS + STARS + RESET + '\n';

        log.append(openingBanner);

        final boolean pauseProvided = (pause != null);

        // A 'pause' instance means earlier progress (results + counters) is read back first.
        if (pauseProvided)
        {
            final Ret4<Vector<Vector<DownloadResult>>, Integer, Integer, Integer> saved =
                pause.loadState();

            state.ret.addAll(saved.a);
            state.setCounters(saved);
        }

        // Without a "Pause" mechanism, **OR** when the loaded counters say that the download
        // is starting at the very beginning, the rows of the return Vector are created here.
        // The return Vector is exactly parallel to the two-dimensional 'articleURLs' input:
        // one row per inner URL-Vector, pre-sized to that Vector's length.
        // 
        // NOTE(review): if loadState() can hand back a non-empty Vector together with all-zero
        // counters, these rows are appended after the loaded ones - confirm against Pause.

        final boolean startingFromBeginning = (! pauseProvided) || state.countersAllZero();

        if (startingFromBeginning)
        {
            for (final Vector<String> section : articleURLs)
            {
                state.ret.add(new Vector<DownloadResult>(section.size()));
            }
        }

        // Outer loop walks the rows of 'articleURLs', inner loop walks the URL's of one row.
        // The counters themselves live inside 'state'.
        while (state.outerLoopTest())
        {
            state.initInnerCounter();

            while (state.innerLoopTest())
            {
                try
                {
                    loopBody(state);
                }
                catch (ReceiveException re)
                {
                    HandleExceptions.receiverException(state, re);
                }
                catch (IOException ioe)
                {
                    HandleExceptions.ioException(state, ioe);
                }
                catch (Exception e)
                {
                    HandleExceptions.exception(state, e);
                }
                finally
                {
                    // Persist the current "READ STATE" after every attempted article - whether
                    // it succeeded or failed - so that progress is not lost when large numbers
                    // of articles are processed.  Note that this runs *before* the inner
                    // counter is incremented.

                    if (pauseProvided)
                    {
                        pause.saveState(
                            state.ret, state.outerCounter(), state.innerCounter(),
                            state.successCounter()
                        );
                    }
                }

                state.incInnerCounter();
            }

            state.incOuterCounter();
        }

        log.append(
            BRED + STARS + RESET +
            "Traversing Site Completed.\n" +
            "Loaded a total of (" + state.successCounter() + ") articles.\n"
        );

        // The "Max Wait Time Threads" have to be stopped before returning.
        // 
        // NOTE(review): this call is only reached on normal completion; when an exception
        // leaves the loop above, the MWT threads are not shut down here.

        HTMLPageMWT.shutdownMWTThreads();

        // The two-dimensional "Download Result" Vector
        return state.ret;
    }


    /**
     * Processes the single article that the outer and inner counters of {@code 'rec'} currently
     * point at.  The steps are run in order; as soon as one of the verifying steps reports
     * {@code FALSE}, this method returns without sending anything to the receiver.
     * 
     * @param rec The shared loop-state.  Its per-article fields are overwritten by this method
     * and by the step-classes it invokes.
     * 
     * @throws IOException      Propagated from the steps invoked here.
     * @throws ReceiveException Propagated from the article-receiver.
     */
    private static void loopBody(final RECORD rec)
        throws IOException, ReceiveException
    {
        // ---------------------------------------------------------------------------------
        // Download one article
        // ---------------------------------------------------------------------------------

        rec.urlStr = rec.articleURLs
            .elementAt(rec.outerCounter())
            .elementAt(rec.innerCounter());

        // Build the URL-Instance (and invoke the Garbage-Collector, if needed)
        if (! URL_GC.run(rec)) return;

        // Scrape the Web-Page, and verify that its HTML is non-Empty
        if (! ScrapePageAndVerify.run(rec)) return;

        // Pull the <TITLE> element (as a String) out of the page - if there is one.
        // This step never aborts the article.
        ScrapeTitle.run(rec);

        // Extract the Article, and verify that the result is non-Empty
        if (! ExtractArticleAndVerify.run(rec)) return;

        // Extract the Image-Locations, and verify the results
        if (! GetImageLocations.run(rec)) return;

        // Skip "Banner Images"
        if (! SkipBannerImages.run(rec)) return;


        // ---------------------------------------------------------------------------------
        // Output the Results
        // ---------------------------------------------------------------------------------

        // The original page-HTML is only retained when the user has asked for it.
        final Article downloaded = new Article(
            rec.url, rec.title, (rec.keepOriginalPageHTML ? rec.page : null), rec.article,
            rec.imageURLs, rec.imagePosArr
        );

        // Download and parse both succeeded: report it, pass the article to the "Receiver",
        // and only then record SUCCESS in the row of the return Vector and bump the counter.

        rec.log.append(
            GREEN + "ARTICLE LOADED." + RESET +
            "  Sending to ScrapedArticleReceiver.\n"
        );

        rec.articleReceiver.receive(downloaded, rec.outerCounter(), rec.innerCounter());

        rec.ret
            .elementAt(rec.outerCounter())
            .add(DownloadResult.SUCCESS);

        rec.incSuccessCounter();
    }

}