package Torello.HTML.Tools.NewsSite;

import Torello.Java.FileRW;
import Torello.Java.StorageWriter;
import Torello.Java.C;

import java.util.Vector;
import java.io.File;
import java.io.IOException;


class RunExample
{
    public static void run() throws IOException
    {
        StorageWriter log = new StorageWriter();


        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.

        final String dataFilesDir = "cnb" + File.separator + "articleData" + File.separator;


        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.

        final String htmlFilesDir = "cnb" + File.separator + "articleHTML" + File.separator;


        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all its contents)
        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleData/
        // mkdir cnb/articleData

        FileRW.delTree(dataFilesDir, true, log);


        // The following code is the same as the UNIX Shell Command:
        // rm -r cnb/articleHTML/
        // mkdir cnb/articleHTML

        FileRW.delTree(htmlFilesDir, true, log);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Previous Download Data Erased (if any), Start today's News-Site Scrape
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // 
        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite

        NewsSite ns = NewsSites.GovCNCarousel;


        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles on
        // the Java-Script "Article Carousel".  Again, the "Article Carousel" is just the little
        // widget at the top of the page that rotates (usually) five highlighted / emphasized
        // news-article links for today.

        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);
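
        // The outer Vector appears to hold one inner Vector for each section / carousel page
        // that was scraped, with each inner Vector containing that page's article URLs (this
        // reading of the structure is an assumption; check the "ScrapeURLs" JavaDoc).  The loop
        // below is only an illustrative sanity-check, using nothing but plain Java.

        int sectionNum = 0, totalURLs = 0;

        for (Vector<String> sectionURLs : articleURLs)
        {
            System.out.println("Section " + (++sectionNum) + ": " + sectionURLs.size() + " URLs");
            totalURLs += sectionURLs.size();
        }

        System.out.println("Total Article URLs Retrieved: " + totalURLs);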


        // The "Pause" mechanism is usually not very important if only a small number of articles
        // are being scraped.  When downloading hundreds of articles, being able to pause after a
        // web-site IOError (and restart later) is very important.
        //
        // The standard factory method "getFSInstance" creates a small file on the file-system
        // for saving the "Download State" while downloading...
    
        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();
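
        // The expectation here - stated as an assumption, to be confirmed against the "Pause"
        // class JavaDoc - is that after a crash, re-running this method with the same
        // "state.dat" file (and skipping the "initialize()" call) lets the download resume
        // where it left off.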


        // The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
        // Using the File-System to save these articles is the default-factory means for
        // saving article-data.  Writing a customized "ScapedArticleReceiver" to do anything
        // from saving article-data to a Data-Base up to and including e-mailing article data
        // is possible using a self-written "ScrapedArticleReceiver"

        ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);
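
        // A self-written receiver is just another implementation of the same interface.  The
        // commented-out sketch below is hypothetical - the exact method name, parameter order
        // and "throws" clause should be verified against the "ScrapedArticleReceiver" JavaDoc
        // ("myDataBase" is likewise a hypothetical helper, not a library class):
        //
        // ScrapedArticleReceiver dbReceiver = (sectionNum, articleNum, article) ->
        //     myDataBase.insert(sectionNum, articleNum, article);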


        // This will download each of the articles from its web-page URL.  The article URLs
        // were retrieved by "ScrapeURLs" above.  The saved HTML (as HTML Vectors) is sent to
        // the "Article Receiver" (defined in the previous step).  These news articles are
        // saved as ".dat" files, since they are serialized Java objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [skipArticlesWithoutPhotos] Skips Mandarin Chinese Newspaper Articles that do not
        //       include at least one photo.  Photos usually help when reading foreign news articles.
        // null: [bannerAndAdFinder] Some sites include images for Facebook links or advertising.
        //       Gov.CN usually doesn't have these, but occasionally there are extraneous links.
        //       For the purposes of this example, this parameter is ignored, and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" (the page content as it was before
        //        the Article Body was extracted from the Article Web-Page) is not saved.  Keeping
        //        it can occasionally be useful if the HTML <HEAD>...</HEAD> contains JSON or
        //        React-JS data to extract.

        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);


        // Now this will convert each of the ".dat" files to an ".html" file - and it will also
        // download the pictures / images included in each article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML removal operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class "ToHTML".
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - advertising, or links to other articles - is usually necessary.
        //       Anywhere between 1 and 10 lines of NodeSearch Removal Operations will get rid of
        //       the unnecessary HTML.  For the purposes of this example, no such cleaning
        //       operation is done here (a hypothetical sketch follows this comment), although
        //       the final articles do include some "links to other articles" that are not
        //       "CLEANED" out as they should be.

        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);


        // NOTE: The log generated by running this command on Debian UNIX / LINUX may be viewed
        // in the JavaDoc Comments at the top of this method.  If this method is run in an MS-DOS
        // or Windows Environment, there will be no screen colors available to view.
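        //
        // The "C.toHTML" call below converts the UNIX shell color escape-sequences stored in
        // the log String into colorized HTML.  (The reading that the three boolean parameters
        // fine-tune that conversion is an assumption - consult the "C" class JavaDoc.)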

        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
}