1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125 | package Torello.HTML.Tools.NewsSite;
import Torello.HTML.HTMLNode;
import Torello.HTML.PageStats;
import java.util.Vector;
import java.io.Serializable;
import java.net.URL;
/**
 * Immutable, serializable record of one downloaded news article.  An instance carries the
 * article's {@code URL}, the text of its {@code <TITLE>} element, the vectorized HTML of both the
 * complete page and the extracted article body, the images found in that body, and a
 * {@code PageStats} summary for each of the two HTML vectors.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
 */
public class Article implements Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
    protected static final long serialVersionUID = 1L;

    /**
     * Flags a failed download.  When {@code TRUE}, every other field of the instance should be
     * regarded as meaningless.
     *
     * <BR /><BR />The public constructor declared in this class always assigns {@code FALSE}.
     */
    public final boolean wasErrorDownload;

    /** The address, on the news web-site, from which this article was retrieved. */
    public final URL url;

    /**
     * The article's title: the text found between {@code <TITLE>} and {@code </TITLE>} on the
     * article's HTML page.
     */
    public final String titleElement;

    /**
     * The full, unaltered, vectorized-HTML of the page exactly as it was downloaded.
     *
     * <BR /><BR />The constructor accepts {@code null} here; in that case
     * {@link #originalPageStats} is {@code null} as well.
     */
    public final Vector<HTMLNode> originalPage;

    /**
     * The trimmed-down body of the article, i.e. the result produced by
     * {@code class ArticleGet}.
     */
    public final Vector<HTMLNode> articleBody;

    /**
     * The {@code URL}'s of the images located inside the news-article.  Conceptually, this is
     * what the statements below would yield if run on the article-body right after download:
     *
     * <BR /><BR /><DIV CLASS=SNIP>{@code
     * Vector<TagNode> imageNodes = TagNodeGet.all(article, TC.OpeningTags, "img");
     * Vector<URL> imageURLs = Links.resolveSRCs(imageNodes, articleURL);
     *
     * // The results of the above call are stored in this field / Vector<URL>.
     * }</DIV>
     */
    public final Vector<URL> imageURLs;

    /**
     * The "Image Positions" - the indices, within the vectorized-article, of every image found
     * in the article.  Conceptually, this is what the statement below would yield if run on the
     * article-body right after download:
     *
     * <BR /><BR /><DIV CLASS=SNIP>{@code
     * int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     * }</DIV>
     */
    public final int[] imagePosArr;

    /**
     * A {@code PageStats} instance computed from the original Newspaper Article Page, or
     * {@code null} when no original page was supplied to the constructor.
     *
     * <DIV CLASS=LOC>{@code
     * this.originalPageStats = new PageStats(originalPage);
     * }</DIV>
     */
    public final PageStats originalPageStats;

    /**
     * A {@code PageStats} instance computed from the post-processed Newspaper Article
     * (the {@link #articleBody}).
     *
     * <DIV CLASS=LOC>{@code
     * this.processedArticleStats = new PageStats(articleBody);
     * }</DIV>
     */
    public final PageStats processedArticleStats;

    /**
     * Creates an {@code Article} for a download that is not marked as an error
     * ({@link #wasErrorDownload} is set to {@code FALSE}).
     *
     * <BR /><BR />All references are stored as passed; no copies are made.
     *
     * @param url The web-address from which this news-article was downloaded.
     * @param titleElement The text inside the page's HTML {@code <TITLE>} element.
     * @param originalPage The complete article web-page as Vectorized-HTML.  May be
     * {@code null}, in which case {@code originalPageStats} is also {@code null}.
     * @param articleBody The article's body as Vectorized-HTML, as extracted by the
     * {@code ArticleGet} function-pointer.  NOTE(review): unlike {@code 'originalPage'}, this
     * value is handed to the {@code PageStats} constructor without a {@code null} check - confirm
     * that callers never pass {@code null}.
     * @param imageURLs The {@code URL}'s of the HTML {@code <IMG>} elements found inside the
     * {@code 'articleBody'}
     * @param imagePosArr The {@code Vector}-indices at which the images (if any) were found in
     * the article.
     */
    public Article(
            final URL url,
            final String titleElement,
            final Vector<HTMLNode> originalPage,
            final Vector<HTMLNode> articleBody,
            final Vector<URL> imageURLs,
            final int[] imagePosArr
        )
    {
        // Identification
        this.url          = url;
        this.titleElement = titleElement;

        // Vectorized HTML
        this.originalPage = originalPage;
        this.articleBody  = articleBody;

        // Image information
        this.imagePosArr  = imagePosArr;
        this.imageURLs    = imageURLs;

        // This constructor never describes a failed download.
        this.wasErrorDownload = false;

        // Statistics for the original page are only available when the page itself is.
        if (originalPage != null)
            this.originalPageStats = new PageStats(originalPage);
        else
            this.originalPageStats = null;

        this.processedArticleStats = new PageStats(articleBody);
    }
}
|