package Torello.HTML.Tools.NewsSite;

import Torello.Java.FileRW;
import Torello.Java.StorageWriter;
import Torello.Java.C;

import java.util.Vector;
import java.io.File;
import java.io.IOException;
class RunExample
{
    public static void run() throws IOException
    {
        StorageWriter log = new StorageWriter();

        // This directory will contain ".dat" files that are simply "Serialized" HTML Vectors.
        // Each ".dat" file will contain precisely one HTML page.
        final String dataFilesDir = "cnb" + File.separator + "articleData" + File.separator;

        // This directory will contain sub-directories with ".html" files (and image-files)
        // for each news-article that is saved / downloaded.
        final String htmlFilesDir = "cnb" + File.separator + "articleHTML" + File.separator;

        // This CLEARS WHATEVER DATA IS CURRENTLY IN THE DIRECTORY (by deleting all of its
        // contents).  The following call is equivalent to the UNIX shell commands:
        //
        //     rm -r cnb/articleData/
        //     mkdir cnb/articleData
        FileRW.delTree(dataFilesDir, true, log);

        // The following call is equivalent to the UNIX shell commands:
        //
        //     rm -r cnb/articleHTML/
        //     mkdir cnb/articleHTML
        FileRW.delTree(htmlFilesDir, true, log);
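
        // For readers who do not have the Torello.Java.FileRW class handy, the following is a
        // rough, standard-library sketch of the same "rm -r ...; mkdir ..." effect described
        // above.  It is only an illustration of the behavior the comments describe - it is NOT
        // what FileRW.delTree actually executes internally.
        //
        //     import java.nio.file.*;
        //     import java.util.Comparator;
        //
        //     Path dir = Paths.get(dataFilesDir);
        //     if (Files.exists(dir))
        //         Files.walk(dir)                          // visit every file & sub-directory
        //             .sorted(Comparator.reverseOrder())   // delete children before parents
        //             .forEach(p -> p.toFile().delete());  // remove each entry
        //     Files.createDirectories(dir);                // re-create the (now empty) directory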

        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Previous Download Data Erased (if any).  Start today's News-Site Scrape.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        // Use the "GovCNCarousel" instance that is created in this class as a NewsSite
        NewsSite ns = NewsSites.GovCNCarousel;

        // Call the "ScrapeURLs" class to retrieve all of the available newspaper articles
        // on the JavaScript "Article Carousel."  Again, the "Article Carousel" is just the
        // little widget at the top of the page that rotates (usually) five highlighted /
        // emphasized news-article links for today.
        Vector<Vector<String>> articleURLs = ScrapeURLs.get(ns, log);
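
        // Illustrative only (not part of the original example): judging from the declared
        // return type, each inner Vector presumably holds the article URL's found on one
        // section / sub-page of the news site - that grouping is an assumption here.  A quick,
        // standard-Java way to inspect what was actually scraped would be:
        //
        //     for (int section = 0; section < articleURLs.size(); section++)
        //         for (String url : articleURLs.elementAt(section))
        //             System.out.println("Section " + section + ": " + url);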

        // A "Pause" is usually not very important if only a small number of articles are being
        // scraped.  When downloading hundreds of articles, being able to pause (and restart)
        // after a web-site IOError is very important.
        //
        // The standard factory-generated "getFSInstance" creates a small file on the
        // file-system for saving the "Download State" while downloading...
        Pause pause = Pause.getFSInstance("cnb" + File.separator + "state.dat");
        pause.initialize();
// The "Scraped Articles" will be sent to the directory named by "dataFilesDir"
// Using the File-System to save these articles is the default-factory means for
// saving article-data. Writing a customized "ScapedArticleReceiver" to do anything
// from saving article-data to a Data-Base up to and including e-mailing article data
// is possible using a self-written "ScrapedArticleReceiver"
ScrapedArticleReceiver receiver = ScrapedArticleReceiver.saveToFS(dataFilesDir);

        // This will download each of the articles from its web-page URL.  The article
        // web-page URL's were retrieved by "ScrapeURLs".  The saved HTML (as HTML Vectors)
        // is sent to the "Article Receiver" (defined in the previous step).  These news
        // articles are saved as ".dat" files, since they are serialized Java objects.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true:  [skipArticlesWithoutPhotos] Skips Mandarin Chinese newspaper articles that
        //        do not include at least one photo.  Photos usually help when reading foreign
        //        news articles.
        // null:  [bannerAndAdFinder] Some sites include images for Facebook links or
        //        advertising.  Gov.CN usually doesn't have these, but occasionally there are
        //        extraneous links.  For the purposes of this example, this parameter is
        //        ignored and passed null.
        // false: [keepOriginalPageHTML] The "Complete Page" - the content present before the
        //        Article Body is extracted from the Article Web-Page - is not saved.  Keeping
        //        it can occasionally be useful if the HTML <HEAD>...</HEAD> has JSON or
        //        React-JS data to extract.
        ScrapeArticles.download
            (receiver, articleURLs, ns.articleGetter, true, null, false, pause, log);

        // Now this will convert each of the ".dat" files to an ".html" file - and it will
        // also download the pictures / images included in each article.
        //
        // Explaining some "unnamed parameters" passed to the method invocation below:
        //
        // true: [cleanIt] This runs some basic HTML removal operations.  The best way to see
        //       what the parameter "cleanIt" asks to have removed is to view the class
        //       "ToHTML".
        // null: [HTMLModifier] Cleaning up other extraneous links and content in a newspaper
        //       article body - such as advertising or links to other articles - is usually
        //       necessary.  Anywhere between 1 and 10 lines of NodeSearch removal operations
        //       will get rid of the unnecessary HTML.  For the purposes of this example, no
        //       such cleaning operation is done here - although the final articles do include
        //       some "links to other articles" that are not "CLEANED" as they should be.
        ToHTML.convert(dataFilesDir, htmlFilesDir, true, null, log);

        // NOTE: The log of running this command on Debian UNIX / LINUX may be viewed in the
        //       JavaDoc Comments at the top of this method.  If this method is run in an
        //       MS-DOS or Windows environment, there will be no screen colors available
        //       to view.
        FileRW.writeFile(
            C.toHTML(log.getString(), true, true, true),
            "cnb" + File.separator + "Gov.CN.log.html"
        );
    }
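
    // A trivial entry-point such as the one below is not part of the original example; it is
    // added here only to show one way this class might be invoked from the command line.
    public static void main(String[] argv) throws IOException
    { run(); }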
}