package Torello.Languages;

import java.util.*;
import java.io.*;
import java.net.URL;

import Torello.Java.*;
import Torello.Java.Additional.*;
import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;
import Torello.HTML.Tools.Images.*;

import static Torello.Java.C.*;

/**
 * A simple Foreign News Article Scraper.
 *
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=FNA>
 * @see GCSTAPI#key
 * @see GCSTAPI#sentence(String, LC, LC)
 * @see GCSTAPI#wordByWord(Vector, LC, LC)
 */
@Torello.JavaDoc.StaticFunctional
public class ForeignNewsArticle
{
    private ForeignNewsArticle() { }

    /**
     * This is the HTML page header that is written to the top of the output page.
     */
    public static final String HEADER =
        "<HTML>\n" +
        HTMLHeader.metaTag + "\n" +
        "<TITLE>Translated, Foreign Language Article</TITLE>\n" +
        "<SCRIPT type=\"text/javascript\">\n" + HTMLHeader.javaScript + "\n" + "</SCRIPT>" + "\n" +
        "<STYLE>\n" + HTMLHeader.css + "</STYLE>" + "\n" +
        "<BODY>" + "\n" + HTMLHeader.popUpDIV + "\n" +
        HTMLHeader.text2SpeechNote;

    /**
     * This will download and translate a news article from a foreign news website.  All that
     * you need to do is provide the main "Article-Body" of the article, and some information -
     * <B><I>and the calls to the Google Cloud Server Translate API</I></B> will be handled by
     * the code.
     *
     * <BR /><BR /><B><SPAN STYLE="color: red;">IMPORTANT NOTE:</SPAN></B> This class makes
     * calls to the GCSTAPI, which is an acronym meaning the Google Cloud Server Translate API.
     * This server expects you to pay Google for the services that it provides.  The
     * translations are not free - but they are not too expensive either.  <B><I>You must be
     * sure to set the {@code class GCSTAPI -> String key} field</I></B> in order for the
     * GCSTAPI queries to succeed.
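     *
     * <BR /><BR />Below is a minimal sketch of that assignment.  The key-string shown is a
     * placeholder only - substitute the API-Key issued by your own Google Cloud account:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // This assignment must be made before any translation-queries are attempted.
     * GCSTAPI.key = "AIza...your-google-cloud-api-key...";
     * }</DIV>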
     *
     * <BR /><BR /><B>Your Directory Will Contain:</B>
     *
     * <BR /><BR /><OL CLASS=JDUL>
     * <LI>Article Photos, stored by number as they appear in the article</LI>
     * <LI>{@code index.html} - Article Body with Translations</LI>
     * </OL>
     *
     * @param articleBody This should have the content of the article from the vectorized HTML
     * page.  Read more about cleaning an HTML news article in the class {@code ArticleGet}.
     *
     * <DIV CLASS="EXAMPLE-SCROLL">{@code
     * // Generally, retrieving the "Article Body" from a news-article web-page is a 'sort-of'
     * // simple, two-step process.
     * //
     * // Step 1: Look at the web-page in your browser, and use your browser's "View Source"
     * //         feature.  Identify the HTML Divider Element that looks something like:
     * //         <DIV CLASS='article_body'> ... or maybe ... <DIV CLASS='page_content'>
     * //         You will have to find the relevant divider, or article element, once - and
     * //         only once - per website.
     * //
     * // Step 2: Grab that content with a simple call to the Inclusive-Get methods in
     * //         NodeSearch.
     *
     * URL url = new URL("https://some.foreign-news.site/some-article.html");
     * Vector<HTMLNode> articlePage = HTMLPage.getPageTokens(url, false);
     * Vector<HTMLNode> articleBody = InnerTagGetInclusive.first(articlePage, "div", "class",
     *     TextComparitor.C, "page-content");
     *     // Use whatever tag you have found via your browser's "View Source" feature.
     *     // You only need to find this tag once per website!
     *
     * // Now pass the 'articleBody' to this 'processArticle' method.
     * // You will also have to retrieve the "Article Title" manually.
     * // Hopefully it is obvious that the 'title' could be stored in any number of ways,
     * // depending on which site is being viewed.  The title location is usually
     * // "consistently the same" - as long as you're on the same website.
     *
     * String title = "?";      // You must search the page to retrieve the title.
     * LC articleLC = LC.es;    // Select the (spoken) language used in the article.
     *                          // This could be LC.vi (Vietnamese), LC.es (Spanish), etc...
     *
     * Ret3<Vector<String>, Vector<String>, String[]> response = processArticle
     *     (articleBody, url, title, articleLC, new StorageWriter(), "outdir/");
     *
     * // The returned String-Vectors will have the translated sentences and words readily
     * // available for use - if you wish to further process the article-content.
     * // The output directory 'outdir/' will have a readable 'index.html' file, along
     * // with any photos that were found on the page, already downloaded so they may be
     * // locally included on the output page.
     * }</DIV>
     *
     * @param url The URL of the article to be scraped.  This is used only for including a
     * link to the article's original page in the output {@code index.html} file.
     *
     * @param title This is needed because obtaining the title can be done in myriad ways.
     * Keeping it as an "external option" provides more leeway to the coder/programmer.
     *
     * @param srcLang This is just the "two character" language code that Google Cloud Server
     * expects to see.
     *
     * @param log This logs progress to terminal-out.  Null may be passed, in which case
     * output will not be displayed.  Any implementation of {@code java.lang.Appendable} will
     * suffice.  Note that the {@code Appendable} interface requires handling
     * {@code IOException's} for its {@code append(...)} methods.
     *
     * @param targetDirectory This is the directory where the image-files and
     * {@code 'index.html'} file will be stored.
     *
     * @return This will return an instance of:
     * {@code Ret3<Vector<String>, Vector<String>, String[]>}
     *
     * <BR /><BR /><UL CLASS=JDUL>
     *
     * <LI> {@code ret3.a (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the original
     *      language of the news or article.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.b (Vector<String>)}
     *      <BR /><BR />
     *      This vector contains a list of sentences, or sentence-fragments, in the target
     *      language, which is English.
     *      <BR /><BR />
     * </LI>
     *
     * <LI> {@code ret3.c (String[])}
     *      <BR /><BR />
     *      This array of strings contains a list of file-names, one for each image that was
     *      present on the original news or article page, and therefore downloaded.
     * </LI>
     *
     * </UL>
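     *
     * <BR /><BR />A minimal sketch of unpacking this returned tuple.  The variable names
     * used below are illustrative only:
     *
     * <DIV CLASS="EXAMPLE">{@code
     * Ret3<Vector<String>, Vector<String>, String[]> r = ForeignNewsArticle.processArticle
     *     (articleBody, url, title, LC.es, null, "outdir/");
     *
     * Vector<String> originalSentences   = r.a;  // sentences in the source language
     * Vector<String> translatedSentences = r.b;  // English translations of those sentences
     * String[]       imageFileNames      = r.c;  // file-names of the downloaded images
     * }</DIV>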
     */
    @SuppressWarnings("unchecked")
    public static Ret3<Vector<String>, Vector<String>, String[]> processArticle(
            Vector<HTMLNode> articleBody, URL url, String title,
            LC srcLang, Appendable log, String targetDirectory
        )
        throws IOException, ImageScraperException
    {
        if (! targetDirectory.endsWith(File.separator)) targetDirectory += File.separator;

        Vector<HTMLNode> article = (Vector<HTMLNode>) articleBody.clone();
        StringBuilder    out     = new StringBuilder();
        String           urlStr  = URLs.urlToString(url);
        String           outFile = targetDirectory + "index.html";

        // Announce the beginning of the Parse & Translation
        if (log != null) log.append("FOUND ARTICLE TITLE: " + title + '\n');

        // Start removing extraneous nodes.  First <STYLE>...</STYLE>
        // REASONS: 1) Clean Up  2) Cannot Use 'in isolation'  3) Makes Readable HTML

        int removeCount = Util.Remove.styleNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <STYLE ...> ... </STYLE> Node-Blocks\n");

        // Remove <SCRIPT>...</SCRIPT>
        // REASONS: 1) Clean Up  2) Cannot Use 'in isolation'  3) Makes Readable HTML

        removeCount = Util.Remove.scriptNodeBlocks(article);

        if (log != null) log.append
            ("Removed " + removeCount + " <SCRIPT ...> ... </SCRIPT> Node-Blocks\n");

        // Remove all other tags.  Throws away all formatting in the news-article.
        removeCount = TagNodeRemove.allExcept(article, TC.Both, "img", "a");

        if (log != null) log.append
            ("Removed " + removeCount + " remaining HTML elements that were not: <IMG> or <A>.\n");

        // Trim the remaining text-nodes.
        Util.trimTextNodes(article, true);

        // Grab and save the images.  The downloaded image file-names are available afterwards
        // inside the returned 'Results' instance.
        if (log != null) log.append(C.BRED + "Downloading Images First" + C.RESET + '\n');

        // Call in the ImageScraper
        // Ret2.a ==> Vector-Indices of the downloaded Images
        // Ret2.b ==> Torello.HTML.Tools.Images.Results

        Ret2<int[], Results> r = ImageScraper.localizeImages(article, url, log, targetDirectory);

        // Start building the output HTML page.  Here is the <HEAD> and top of <BODY> stuff.
        out.append(
            HEADER +
            "<H2>" + title + "</H2>\n" +
            "<H2>" + GCSTAPI.sentence(title, srcLang, LC.EN) + "</H2>\n" +
            "Original Article Link: " +
            "<A HREF=\"" + new URL(URLs.toProperURLV4(urlStr)).toString() +
                "\" TARGET=\"_blank\">\n" +
            urlStr + "</A>\n<BR /><BR />\n\n"
        );

        // Write this header stuff to a file, and clear the output buffer.
        if (log != null) log.append("Writing to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.writeFile(out, outFile);
        out = new StringBuilder();

        // Generate the Article Body - with Foreign-Language Translations, and Vocabulary
        // Tables in both the source-language and English.

        Ret2<Vector<String>, Vector<String>> r2 =
            HTMLPageTablesGenerator.getArticleHTML(article, srcLang, out, log);

        if (log != null) log.append
            ("\nAppending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile(out, outFile);

        // Generate the data-DIV's for the JavaScript
        HTMLDataDIVGenerator.generateHTMLDataDIVs(r2.a, true, srcLang, true, true, outFile, log);

        // Write the rest of this to a file.
        if (log != null) log.append("Appending to file: " + C.BCYAN + outFile + C.RESET + '\n');

        FileRW.appendToFile("</BODY>\n</HTML>\n", outFile);

        if (log != null) log.append("Done.\n");

        return new Ret3<Vector<String>, Vector<String>, String[]>(r2.a, r2.b, r.b.fileNames);
    }
}
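
// What follows is a hypothetical, end-to-end usage sketch - it is not part of the library
// itself.  The URL and the CSS-Class "article-body" are placeholders that would have to be
// replaced with values discovered (via "View Source") for whatever news-site is actually
// being scraped, and GCSTAPI.key must be assigned a valid Google Cloud API-Key first.

class ForeignNewsArticleDemo
{
    public static void main(String[] argv) throws Exception
    {
        // Placeholder key - substitute your own Google Cloud API-Key
        GCSTAPI.key = "AIza...your-google-cloud-api-key...";

        URL url = new URL("https://some.foreign-news.site/some-article.html");

        // Download & tokenize the page, then clip out the Article-Body <DIV>
        Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
        Vector<HTMLNode> body = InnerTagGetInclusive.first
            (page, "div", "class", TextComparitor.C, "article-body");

        // Translate, build 'index.html', and download the article's photos into "outdir/".
        // System.out implements java.lang.Appendable, so it may serve as the 'log' parameter.
        Ret3<Vector<String>, Vector<String>, String[]> r = ForeignNewsArticle.processArticle
            (body, url, "Article Title Goes Here", LC.es, System.out, "outdir/");

        System.out.println("Downloaded " + r.c.length + " image(s).");
    }
}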