package Torello.HTML.Tools.NewsSite;

import Torello.Java.FileRW;
import Torello.Java.StorageWriter;
import Torello.Java.C;

import java.util.Vector;
import java.io.File;
import java.io.IOException;
class RunExample
{
    public static void run() throws IOException
    {
        StorageWriter log = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.
        final String dataFilesDir = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String htmlFilesDir = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all of its
        // contents).  The following call is equivalent to the UNIX shell commands:
        //
        //     rm -r cnb/articleData/
        //     mkdir cnb/articleData
        FileRW.delTree(dataFilesDir, true, log);

        // The following call is equivalent to the UNIX shell commands:
        //
        //     rm -r cnb/articleHTML/
        //     mkdir cnb/articleHTML
        FileRW.delTree(htmlFilesDir, true, log);
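
        // For readers who do not have the Torello.Java.FileRW class handy, the following is a
        // rough, standard-library sketch of the same "rm -r ...; mkdir ..." effect described
        // above.  It is only an illustration of the behavior the comments describe - it is NOT
        // what FileRW.delTree actually executes internally.
        //
        //     import java.nio.file.*;
        //     import java.util.Comparator;
        //
        //     Path dir = Paths.get(dataFilesDir);
        //     if (Files.exists(dir))
        //         Files.walk(dir)                          // visit every file & sub-directory
        //             .sorted(Comparator.reverseOrder())   // delete children before parents
        //             .forEach(p -> p.toFile().delete());  // remove each entry
        //     Files.createDirectories(dir);                // re-create the (now empty) directory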

        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Previous Download Data Erased (if any).  Start today's News-Site Scrape.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles
        // on the JavaScript "Article Carousel."  Again, the "Article Carousel" is just the
        // little widget at the top of the page that rotates (usually) five highlighted /
        // emphasized news-article links for today.
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);
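
        // Illustrative only (not part of the original example): judging from the declared
        // return type, each inner Vector presumably holds the article URL's found on one
        // section / sub-page of the news site - that grouping is an assumption here.  A quick,
        // standard-Java way to inspect what was actually scraped would be:
        //
        //     for (int section = 0; section < articleURLs.size(); section++)
        //         for (String url : articleURLs.elementAt(section))
        //             System.out.println("Section " + section + ": " + url);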

        // A "Pause" is usually not very important if only a small number of articles are being
        // scraped.  When downloading hundreds of articles, being able to pause (and restart)
        // after a web-site IOError is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the
        // file-system for saving the "Download State" while downloading...
        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();
// The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
// Using the File-System to save these articles is the default-factory means for
// saving article-data. Writing a customized "ScapedArticleReceiver" to do anything
// from saving article-data to a Data-Base up to and including e-mailing article data
// is possible using a self-written "ScrapedArticleReceiver"
ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // This will download each of the articles from its web-page URL.  The article
        // web-page URL's were retrieved by "ScrapeURLs".  The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step).  These news
        // articles are saved as ".dat" files, since they are serialized Java objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true:  [skipArticlesWithoutPhotos] Skips Mandarin Chinese newspaper articles that
        //        do not include at least one photo.  Photos usually help when reading foreign
        //        news articles.
        // null:  [bannerAndAdFinder] Some sites include images for Facebook links or
        //        advertising.  Gov.CN usually doesn't have these, but occasionally there are
        //        extraneous links.  For the purposes of this example, this parameter is
        //        ignored and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - the content present before the
        //        Article Body is extracted from the Article Web-Page - is not saved.  Keeping
        //        it can occasionally be useful if the HTML <HEAD>...</HEAD> has JSON or
        //        React-JS data to extract.
        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and it will
        // also download the pictures / images included in each article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML removal operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class
        //       "ToHTML".
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - such as advertising or links to other articles - is usually
        //       necessary.  Anywhere between 1 and 10 lines of NodeSearch removal operations
        //       will get rid of the unnecessary HTML.  For the purposes of this example, no
        //       such cleaning operation is done here - although the final articles do include
        //       some "links to other articles" that are not "CLEANED" as they should be.
        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed in the
        //       JavaDoc Comments at the top of this method.  If this method is run in an
        //       MS-DOS or Windows environment, there will be no screen colors available
        //       to view.
        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
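
    // A trivial entry-point such as the one below is not part of the original example; it is
    // added here only to show one way this class might be invoked from the command line.
    public static void main(String[] argv) throws IOException
    { run(); }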
}