Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.HTML.HTMLNode;
004import Torello.HTML.PageStats;
005import java.util.Vector;
006import java.io.Serializable;
007import java.net.URL;
008
009/**
010 * When a news article is downloaded from a {@code URL}, its contents are parsed, and the
011 * information-HTML is stored in this class.
012 * 
013 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE>
014 */
015public class Article implements Serializable
016{
017    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID>  */
018    protected static final long serialVersionUID = 1;
019
020    /**
021     * This should inform the user that an error occurred when downloading an article. If this
022     * field,  after instantiation is {@code TRUE}, all other fields in this class should be thought
023     * of as "irrelevant."
024     */
025    public final boolean wasErrorDownload;
026
027    /** This is the article's URL from the news website. */
028    public final URL url;
029
030    /**
031     * This is the title that was scraped from the main page.  The title is the content of the
032     * {@code <TITLE>...</TITLE>} element on the article HTML page.
033     */
034    public final String titleElement;
035
036    /**
037     * This is the original, and complete, HTML vectorized-page download.  It contains the
038     * original, un-modified, article download.
039     */
040    public final Vector<HTMLNode> originalPage;
041
042    /**
043     * This is the pared down article-body.  It is what is retrieved from {@code class ArticleGet}
044     */
045    public final Vector<HTMLNode> articleBody;
046
047    /**
048     * The image-URL's that were found in the news-article.  The easiest way to think about this
049     * field is that the following instructions were called on the article-body after downloading
050     * the article:
051     * 
052     * <BR /><BR /><DIV CLASS=SNIP>{@code
053     * Vector<TagNode> imageNodes  = TagNodeGet.all(article, TC.OpeningTags, "img");
054     * Vector<URL>     imageURLs   = Links.resolveSRCs(imageNodes, articleURL);
055     * 
056     * // The results of the above call are stored in this field / Vector<URL>.
057     * }</DIV>
058     */
059    public final Vector<URL> imageURLs;
060
061    /**
062     * This list contains the "Image Positions" inside the vectorized-article for each image that
063     * was found inside the article.  The easiest way to think about this field is that the
064     * following instructions were called on the article-body after downloading that article:
065     * 
066     * <BR /><BR /><DIV CLASS=SNIP>{@code
067     *  int[] imagePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
068     * }</DIV>
069     */
070    public final int[] imagePosArr;
071
072    /**
073     * This contains an instance of {@code class PageStats} that has been generated out of an
074     * original Newspaper Article Page.
075     * 
076     * <DIV CLASS=LOC>{@code 
077     * this.originalPageStats = new PageStats(originalPage);
078     * }</DIV>
079     */
080    public final PageStats originalPageStats;
081
082    /**
083     * This contains an instance of {@code class PageStats} that has been generated from the
084     *  post-processed Newspaper Article.
085     * 
086     * <DIV CLASS=LOC>{@code 
087     * this.processedArticleStats = new PageStats(articleBody);
088     * }</DIV>
089     */
090    public final PageStats processedArticleStats;
091
092
093    /**
094     * Builds an instance of this class.
095     * 
096     * @param url The web-address from whence this news-article was downloaded / retrieved.
097     * @param titleElement The contents of the HTML {@code <TITLE>} tag, as a {@code String}.
098     * @param originalPage Vectorized-HTML of the original article web-page, in its entirety.
099     * @param articleBody Vectorized-HTML of the body of the article's page, as extracted by the
100     * {@code ArticleGet} function-pointer.
101     * @param imageURLs A list of all HTML {@code <IMG>} elements found inside the
102     * {@code 'articleBody'}
103     * @param imagePosArr The {@code Vector}-indices where the images (if any) were found in the
104     * article.
105     */
106    public Article(
107            final URL               url,
108            final String            titleElement,
109            final Vector<HTMLNode>  originalPage,
110            final Vector<HTMLNode>  articleBody,
111            final Vector<URL>       imageURLs,
112            final int[]             imagePosArr
113        )
114    {
115        this.wasErrorDownload       = false;
116        this.url                    = url;
117        this.titleElement           = titleElement;
118        this.originalPage           = originalPage;
119        this.articleBody            = articleBody;
120        this.imageURLs              = imageURLs;
121        this.imagePosArr            = imagePosArr;
122        this.originalPageStats      = (originalPage == null) ? null : new PageStats(originalPage);
123        this.processedArticleStats  = new PageStats(articleBody);
124    }
125}