package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;

import static Torello.Java.C.*;

/**
 * A simple Foreign News Article Scraper.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    private ForeignNewsArticle() { }

    /**
     * This is the HTML page header that is written to the top of the output page.
     */
    public static final String HEADER =
        "<HTML>\n" +
        HTMLHeader.metaTag + "\n" +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n" + "</SCRIPT>" + "\n" +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>" + "\n" +
        "<BODY>" + "\n" + HTMLHeader.popUpDIV + "\n" +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.  All that
     * you need to do is provide the main "Article-Body" of the article, and some information -
     * <B><I>and the calls to the Google Cloud Server Translate API</I></B> will be handled by
     * the code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes
     * calls to the GCSTAPI, which is an acronym meaning the Google Cloud Server Translate API.
     * This server expects you to pay Google for the services that it provides.  The
     * translations are not free - but they are not too expensive either.  <B><I>You must be
     * sure to set the {@code class GCSTAPI -> String key} field</I></B> in order for the
     * GCSTAPI queries to succeed.
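     *
     * <BR /><BR />Below is a minimal sketch of that assignment.  The key-string shown is a
     * placeholder only - substitute the API-Key issued by your own Google Cloud account:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // This assignment must be made before any translation-queries are attempted.
     * GCSTAPI.key = "AIza...your-google-cloud-api-key...";
     * }</DIV>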
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     *
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article Photos, stored by number as they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
     *
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page.  Read more about cleaning an HTML news article in the class {@code ArticleGet}.
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally, retrieving the "Article Body" from a news-article web-page is a 'sort-of'
     * // simple, two-step process.
     * //
     * // Step 1: Look at the web-page in your browser, and use your browser's "View Source"
     * //         feature.  Identify the HTML Divider Element that looks something like:
     * //         <DIV CLASS='article_body'> ... or maybe ... <DIV CLASS='page_content'>
     * //         You will have to find the relevant divider, or article element, once - and
     * //         only once - per website.
     * //
     * // Step 2: Grab that content with a simple call to the Inclusive-Get methods in
     * //         NodeSearch.
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *     TextComparitor.C, "page-content");
     *     // Use whatever tag you have found via your browser's "View Source" feature.
     *     // You only need to find this tag once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually.
     * // Hopefully it is obvious that the 'title' could be stored in any number of ways,
     * // depending on which site is being viewed.  The title location is usually
     * // "consistently the same" - as long as you're on the same website.
     *
     * String title = "?";      // You must search the page to retrieve the title.
     * LC articleLC = LC.es;    // Select the (spoken) language used in the article.
     *                          // This could be LC.vi (Vietnamese), LC.es (Spanish), etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *     (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along
     * // with any photos that were found on the page, already downloaded so they may be
     * // locally included on the output page.
     * }</DIV>
     *
     * @param url The URL of the article to be scraped.  This is used only for including a
     * link to the article's original page in the output {@code index.html} file.
     *
     * @param title This is needed because obtaining the title can be done in myriad ways.
     * Keeping it as an "external option" provides more leeway to the coder/programmer.
     *
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     *
     * @param log This logs progress to terminal-out.  Null may be passed, in which case
     * output will not be displayed.  Any implementation of {@code java.lang.Appendable} will
     * suffice.  Note that the {@code Appendable} interface requires handling
     * {@code IOException's} for its {@code append(...)} methods.
     *
     * @param targetDirectory This is the directory where the image-files and
     * {@code 'index.html'} file will be stored.
     *
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     *
     * <BR /><BR /><UL CLASS=JDUL>
     *
     * <LI> {@code ret3.a (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is English.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of file-names, one for each image that was
     *      present on the original news or article page, and therefore downloaded.
     * </LI>
     *
     * </UL>
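     *
     * <BR /><BR />A minimal sketch of unpacking this returned tuple.  The variable names
     * used below are illustrative only:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * Ret3<Vector<String>, Vector<String>, String[]> r = ForeignNewsArticle.processArticle
     *     (articleBody, url, title, LC.es, null, "outdir/");
     *
     * Vector<String> originalSentences   = r.a;  // sentences in the source language
     * Vector<String> translatedSentences = r.b;  // English translations of those sentences
     * String[]       imageFileNames      = r.c;  // file-names of the downloaded images
     * }</DIV>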
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title,
            LC srcLang, Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
        if (! targetDirectory.endsWith(File.separator)) targetDirectory += File.separator;

        Vector<HTMLNode> article = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder    out     = new StringBuilder();
        String           urlStr  = URLs.urlToString(url);
        String           outFile = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title + '\n');

        // Start removing extraneous nodes.  First <STYLE>...</STYLE>
        // REASONS: 1) Clean Up  2) Cannot Use 'in isolation'  3) Makes Readable HTML

        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT>...</SCRIPT>
        // REASONS: 1) Clean Up  2) Cannot Use 'in isolation'  3) Makes Readable HTML

        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </SCRIPT> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

        // Trim the remaining text-nodes.
        Util.trimTextNodes(article, true);

        // Grab and save the images.  The downloaded image file-names are available afterwards
        // inside the returned 'Results' instance.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results

        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() +
                "\" TARGET=\"_blank\">\n" +
            urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary
        // Tables in both the source-language and English.

        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // Generate the data-DIV's for the JavaScript
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");

        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}
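
// What follows is a hypothetical, end-to-end usage sketch - it is not part of the library
// itself.  The URL and the CSS-Class "article-body" are placeholders that would have to be
// replaced with values discovered (via "View Source") for whatever news-site is actually
// being scraped, and GCSTAPI.key must be assigned a valid Google Cloud API-Key first.

class ForeignNewsArticleDemo
{
    public static void main(String[] argv) throws Exception
    {
        // Placeholder key - substitute your own Google Cloud API-Key
        GCSTAPI.key = "AIza...your-google-cloud-api-key...";

        URL url = new URL("https://some.foreign-news.site/some-article.html");

        // Download & tokenize the page, then clip out the Article-Body <DIV>
        Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
        Vector<HTMLNode> body = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.C, "article-body");

        // Translate, build 'index.html', and download the article's photos into "outdir/".
        // System.out implements java.lang.Appendable, so it may serve as the 'log' parameter.
        Ret3<Vector<String>, Vector<String>, String[]> r = ForeignNewsArticle.processArticle
            (body, url, "Article Title Goes Here", LC.es, System.out, "outdir/");

        System.out.println("Downloaded " + r.c.length + " image(s).");
    }
}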