1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182 | package Torello.HTML.Tools.NewsSite;
import Torello.HTML.*;
import Torello.Java.*;
import Torello.HTML.Tools.Images.ImageScraper;
import Torello.HTML.Tools.Images.Request;
import static Torello.Java.C.*;
import java.util.*;
import java.io.*;
import java.util.regex.*;
import java.net.URL;
import java.util.concurrent.TimeUnit;
/**
* Converts Serialized Object Files of HTML-Vectors into <CODE>'.html'</CODE> Files, and can
* also be used to do any user-defined, customized post-processing (using a function-pointer) on
* news-articles (after downloading them).
*
* <EMBED CLASS='external-html' DATA-FILE-ID=TO_HTML>
*/
@Torello.JavaDoc.StaticFunctional
public class ToHTML
{
    // Static-only utility class: no instances are ever created.
    private ToHTML() { }

    // Captures two runs of three-or-more digits that are separated by exactly one character.
    // Group 1 is used as the section-number, group 2 as the article-number.
    //
    // NOTE(review): the separator is an unescaped regex '.', so it matches ANY single character
    // (not only a literal dot), and the pattern is applied to 'fn.toString()' rather than to
    // 'fn.name'.  Left unchanged to preserve the numbers currently handed to the modifier;
    // confirm that this is the intended behavior.
    private static final Pattern P1 = Pattern.compile(".*?(\\d{3,}).(\\d{3,}).*");

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=TH_DESCRIPTION>
     *
     * @param inputDir This parameter should contain the name of the directory that was used with
     * method {@code download(...)} from {@code ScrapeArticle's}.  This directory must exist and it
     * must contain the files that were saved.
     *
     * @param outputDir This parameter should contain the name of the directory where the expanded
     * and de-serialized {@code '.html'} files will be stored, along with their downloaded images.
     * A trailing file-separator is appended if it is missing.
     *
     * @param cleanIt <EMBED CLASS='external-html' DATA-FILE-ID=TH_CLEAN_IT>
     * @param modifyOrRetrieve <EMBED CLASS='external-html' DATA-FILE-ID=TH_MOD_OR_RETRIEVE>
     *
     * @param log Output text is sent to this log.  This parameter may be null, and if it is, it
     * shall be ignored.  If this program is running on UNIX, color-codes will be included in the
     * log data.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @throws IOException If there are any I/O Exceptions when reading the data-files, creating
     * the output sub-directories, or writing image and {@code '.html'} files to the file-system,
     * then this exception will throw.  Any other exception that is raised while a data-file is
     * being processed is wrapped into an {@code IOException}, and is available through
     * {@code getCause()}.
     */
    @SuppressWarnings("unchecked")
    public static void convert(
            final String        inputDir,
            String              outputDir,
            final boolean       cleanIt,
            final HTMLModifier  modifyOrRetrieve,
            final Appendable    log
        )
        throws IOException
    {
        if (log != null) log.append(
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + " Converting Vector<HTMLNode> to '.html' files, and downloading Pictures." + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" +
            RESET + '\n'
        );

        // Every output path below is built by plain concatenation, so the directory-name must
        // end with a file-separator.
        if (! outputDir.endsWith(File.separator)) outputDir = outputDir + File.separator;

        // Uses the FileNode class to build an iterator of all '.dat' files that are found in the
        // 'inputDir' directory-parameter.
        final Iterator<FileNode> iter = FileNode
            .createRoot(inputDir)
            .loadTree()
            .getDirContentsFiles
                (RTC.ITERATOR(), (FileNode fn) -> fn.name.endsWith(".dat"));

        // NOTE: The "ImageScraper" spawns a (very) small "monitor thread" that ensures that
        // downloading does not "hang" the system by aborting image-downloads that take longer
        // than 10 seconds.  It is necessary to shut-down these threads on system exit, because
        // if they are not shutdown, when a java program terminates, the operating system that
        // the program is using (the terminal window) will appear to "hang" or "freeze" until
        // the extra-thread is shut-down by the JVM.  This delay can be upwards of 30 seconds.
        //
        // The shut-down is placed in a 'finally' so that it runs exactly once on EVERY exit path:
        // normal completion, an IOException, a wrapped exception, or an Error.
        try
        {
            // Iterate through each of the data-files.
            while (iter.hasNext())

                try
                {
                    // Retrieve next article, using the iterator
                    final FileNode fn = iter.next();

                    // Load the instance of 'Article' into memory, using Object De-Serialization
                    final Article page =
                        FileRW.readObjectFromFileNOCNFE(fn.toString(), Article.class, true);

                    // If there are customized modifications to the page (or retrieval
                    // operations) that were requested, they are done here.
                    if (modifyOrRetrieve != null)
                    {
                        // Retrieves the section-number and article-number from file-name
                        final Matcher m = P1.matcher(fn.toString());

                        // These will be set to -1, and if the directoryName/fileName did not use
                        // the standard "factory-generated" file-save, then these will STILL BE -1
                        // when passed to the modifier lambda.
                        int sectionNum = -1;
                        int articleNum = -1;

                        if (m.find())
                        {
                            sectionNum = Integer.parseInt(m.group(1));
                            articleNum = Integer.parseInt(m.group(2));
                        }

                        // Pass the articleBody (and its URL, section-number and article-number)
                        // to the customized HTML Modifier provided by the user who called this
                        // method.
                        modifyOrRetrieve.modifyOrRetrieve
                            (page.articleBody, page.url, sectionNum, articleNum);
                    }

                    // We need to build a "Sub-Directory" name for the HTML page where the
                    // downloaded images will be stored.  The name is the data-file's name with
                    // its final extension removed.
                    //
                    // NOTE(review): String.replace performs a LITERAL (non-regex) replacement,
                    // so this call only affects a backslash that is immediately followed by a
                    // dot - for ordinary file-names it changes nothing.  It is kept as-is to
                    // preserve the current output-directory layout; confirm whether
                    // replace(".", "/") was the intent.
                    final int       dotPos      = fn.name.lastIndexOf(".");
                    final String    outDirName  =
                        outputDir + fn.name.substring(0, dotPos).replace("\\.", "/") + '/';

                    // Make sure the subdirectory exists.  'mkdirs' returns false both when the
                    // directory could not be created AND when it already existed, therefore the
                    // second check is needed to tell those two cases apart.
                    final File outDir = new File(outDirName);

                    if ((! outDir.mkdirs()) && (! outDir.isDirectory())) throw new IOException
                        ("Unable to create output directory [" + outDirName + "]");

                    // This process may be skipped, but it makes the output HTML much cleaner and
                    // more readable for most Internet News Web-Sites.  <SCRIPT> blocks, <STYLE>
                    // blocks and <!-- --> comments are removed.  Also, any "class" or "id"
                    // attributes are eliminated.
                    if (cleanIt)
                    {
                        Util.Remove.scriptNodeBlocks(page.articleBody);
                        Util.Remove.styleNodeBlocks(page.articleBody);
                        Util.Remove.allCommentNodes(page.articleBody);
                        Attributes.remove(page.articleBody, "class", "id");
                    }

                    if (log != null)
                        log.append("Writing Page: " + BGREEN + fn.name + RESET + '\n');

                    // 'Localize' any images available.  'localizing' an HTML web-page means
                    // downloading the image data, and saving it to disk.
                    ImageScraper.localizeImages(page.articleBody, page.url, log, outDirName);

                    // If there were any images available, they were downloaded and localized.
                    // Now write the (updated) HTML to an '.html' text-file.
                    FileRW.writeFile
                        (Util.pageToString(page.articleBody), outDirName + "index.html");
                }

                // An IOException is already the declared exception type: pass it through as-is,
                // so that it is not wrapped by the more general handler below.
                catch (IOException ioe) { throw ioe; }

                // Anything else is wrapped, keeping the original as the cause.
                catch (Exception e)
                {
                    throw new IOException(
                        "There was a problem converting the html pages.  See exception.getCause() " +
                        "for more details.",
                        e
                    );
                }
        }

        // Exit the method.  Again, shutdown the Time-Out "monitor" thread.
        finally { ImageScraper.shutdownTOThreads(); }
    }
}
|