001package Torello.HTML.Tools.NewsSite; 002 003import Torello.JavaDoc.LinkJavaSource; 004 005import Torello.HTML.TagNode; 006import Torello.HTML.HTMLNode; 007import Torello.HTML.DotPair; 008import Torello.HTML.TC; 009import Torello.HTML.URLFilter; 010 011import Torello.HTML.NodeSearch.TagNodeFindL1Inclusive; 012import Torello.HTML.NodeSearch.InnerTagFindInclusive; 013import Torello.HTML.NodeSearch.TagNodeGet; 014import Torello.HTML.NodeSearch.InnerTagGet; 015import Torello.HTML.NodeSearch.InnerTagGetInclusive; 016import Torello.HTML.NodeSearch.TextComparitor; 017 018import Torello.Java.StrFilter; 019import Torello.Java.FileRW; 020import Torello.Java.LFEC; 021import Torello.Java.Country; 022import Torello.Java.C; 023 024import Torello.Languages.LC; 025 026import java.util.Vector; 027import java.util.Hashtable; 028 029import java.util.regex.Pattern; 030 031import java.net.URL; 032 033import java.io.IOException; 034 035/** 036 * This class is nothing more than an 'Example Class' that contains some foreign-language 037 * based news web-pages, from both overseas and from Latin America. 038 * 039 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES> 040 */ 041public class NewsSites 042{ 043 private NewsSites() { } 044 045 @SuppressWarnings("unchecked") 046 private static final Hashtable<String, Vector<URL>> newsPaperSections = 047 (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR 048 (NewsSite.class, "data-files/SectionURLs.htdat", true, Hashtable.class); 049 050 051 /** 052 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES_RUN_EX> 053 * @throws IOException This throws for IO errors that may occur when reading the web-server, 054 * or when saving the web-pages or images to the file-system. 055 * 056 * @see FileRW#delTree(String, boolean, Appendable) 057 * @see NewsSite 058 * @see FileRW#writeFile(CharSequence, String) 059 * @see C#toHTML(String, boolean, boolean, boolean) 060 */ 061 @LinkJavaSource(handle="RunExample") 062 public static void runExample() throws IOException 063 { 064 // Click on the @LinkJavaSource Curved-Arrow to view the example code in full screen 065 RunExample.run(); 066 } 067 068 /** 069 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES_MAIN> 070 * @param argv These are the command line arguments passed by the JRE to this method. 071 * 072 * @throws IOException If there are any problems while attempting to save the output to the 073 * the output file (if one was named / requested). 074 */ 075 public static void main(String[] argv) throws IOException 076 { 077 // Uncomment this line to run the example code (instead of section-data print) 078 // runExample(); System.exit(0); 079 080 // The data-file is loaded into private field "newsPaperSections" 081 // This private field is a Hashtable<String, Vector<URL>>. Convert each of 082 // these sections so that they may be printed to terminal and maybe to a text 083 // file. 084 085 StringBuilder sb = new StringBuilder(); 086 087 for (String newspaper : newsPaperSections.keySet()) 088 { 089 sb.append(newspaper + '\n'); 090 for (URL section : newsPaperSections.get(newspaper)) 091 sb.append(section.toString() + '\n'); 092 sb.append("\n\n***************************************************\n\n"); 093 } 094 095 String s = sb.toString(); 096 System.out.println(s); 097 098 // If there is a command-line parameter, it shall be interpreted a file-name. 099 // The contents of the "sections data-file" (as text) will be written a file on the 100 // file-system using the String-value of "argv[0]" as the name of the output-filename. 101 102 if (argv.length == 1) FileRW.writeFile(s, argv[0]); 103 } 104 105 106 // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$"))); 107 // ArticleGet.usual(TextComparitor.CN_CI, "article-content")); 108 109 /** 110 * <EMBED CLASS='external-html' DATA-FILE-ID=ABC_ES_LG> 111 * @see TagNodeFindL1Inclusive#all(Vector, String) 112 * @see TagNodeGet#first(Vector, int, int, TC, String[]) 113 * @see TagNode#AV(String) 114 */ 115 public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page) 116 { 117 final Vector<String> ret = new Vector<>(); 118 119 TagNode tn; 120 String urlStr; 121 122 // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page. 123 for (DotPair article : TagNodeFindL1Inclusive.all(page, "article")) 124 125 // Now find the <A HREF=...> ... </A> 126 if ((tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a")) 127 != null) 128 129 if ((urlStr = tn.AV("href")) != null) 130 ret.add(urlStr); 131 132 return ret; 133 } 134 135 /** 136 * <EMBED CLASS='external-html' DATA-FILE-ID=ABC_ES> 137 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE> 138 */ 139 public static final NewsSite ABCES = new NewsSite 140 ( 141 "ABC España", 142 Country.Spain, 143 "https://www.abc.es/", 144 LC.ES, 145 "ABC is a Spanish national daily newspaper. It is the third largest general-interest " + 146 "newspaper in Spain, and the oldest newspaper still operating in Madrid.", 147 newsPaperSections.get("ABCES"), 148 StrFilter.comparitor(TextComparitor.EW_CI, ".html"), 149 NewsSites::ABC_LINKS_GETTER, 150 ArticleGet.usual("main"), 151 null // bannerAndAdFinder 152 ); 153 154 /** <EMBED CLASS='external-html' DATA-FILE-ID=PULSO> */ 155 public static final NewsSite Pulso = new NewsSite 156 ( 157 "El Pulso, México", 158 Country.Mexico, 159 "https://elpulso.mx", 160 LC.ES, 161 "El Pulso newspaper is Spanish language newspaper in Mexico. It is showing breaking " + 162 "news, " + 163 "headlines, kids news, tourism news, entertainment news, study news, industrial news, " + 164 "economical news, health & beauty news, crime news, career news, Travel news, " + 165 "diet & fitness news, Top stories, special news, celebrity news.", 166 newsPaperSections.get("PULSO"), 167 StrFilter.regExKEEP(Pattern.compile( 168 "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$" 169 ), false), 170 null, // LinksGet 171 ArticleGet.usual(TextComparitor.C, "entry-content"), 172 null // bannerAndAddFinder 173 ); 174 175 /** 176 * <EMBED CLASS='external-html' DATA-FILE-ID=EL_NACIONAL_LG> 177 * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[]) 178 * @see TagNodeGet#first(Vector, int, int, TC, String[]) 179 * @see TagNode#AV(String) 180 */ 181 public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page) 182 { 183 Vector<String> ret = new Vector<>(); TagNode tn; String urlStr; 184 185 // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main / section page. 186 for (DotPair article : InnerTagFindInclusive.all 187 (page, "div", "class", TextComparitor.C, "td-module-thumb")) 188 189 // Now find the <A HREF=...> ... </A> 190 if ((tn = TagNodeGet.first 191 (page, article.start, article.end, TC.OpeningTags, "a")) != null) 192 193 if ((urlStr = tn.AV("href")) != null) 194 ret.add(urlStr); 195 196 return ret; 197 } 198 199 /** 200 * <EMBED CLASS='external-html' DATA-FILE-ID=EL_NACIONAL> 201 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE> 202 */ 203 public static final NewsSite ElNacional = new NewsSite 204 ( 205 "El Nacional", 206 Country.Venezuela, 207 "https://elnacional.com", 208 LC.ES, 209 "El Nacional is a Venezuelan publishing company under the name C.A. Editorial " + 210 "El Nacional, " + 211 "most widely known for its El Nacional newspaper and website. It, along with Últimas " + 212 "Noticias and El Universal, are the most widely read and circulated daily national " + 213 "newspapers in the country, and it has an average of more than 80,000 papers distributed " + 214 "daily and 170,000 copies on weekends.", 215 newsPaperSections.get("ElNacional"), 216 (URLFilter) null, /* The LinksGetter will only return valid Anchor's */ 217 NewsSites::EL_NACIONAL_LINKS_GETTER, 218 ArticleGet.usual("article"), 219 null /* bannerAndAdFinder */ 220 ); 221 222 /** 223 * <EMBED CLASS='external-html' DATA-FILE-ID=EL_ESPECTADOR_LG> 224 * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[]) 225 * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[]) 226 * @see TagNode#AV(String) 227 */ 228 public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page) 229 { 230 Vector<String> ret = new Vector<>(); 231 232 TagNode tn; 233 String urlStr; 234 235 // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page. 236 for (DotPair article : InnerTagFindInclusive.all 237 (page, "div", "class", TextComparitor.C, "Card")) 238 239 // Now find the <A CLASS="card-link" HREF=...> ... </A> 240 if ((tn = InnerTagGet.first 241 (page, article.start, article.end, "a", "class", TextComparitor.C, "card-link")) 242 != null) 243 244 if ((urlStr = tn.AV("href")) != null) 245 ret.add(urlStr); 246 247 return ret; 248 } 249 250 /** 251 * <EMBED CLASS='external-html' DATA-FILE-ID=EL_ESPECTADOR> 252 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE> 253 */ 254 public static final NewsSite ElEspectador = new NewsSite 255 ( 256 "El Espectador, Columbia", 257 Country.Colombia, 258 "https://elespectador.com", 259 LC.ES, 260 "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation within "+ 261 "Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and published " + 262 "since 1915 in Bogotá. It changed from a daily to a weekly edition in 2001, following a " + 263 "financial crisis, and became a daily again on 11 May 2008, a comeback which had been " + 264 "long rumoured, in tabloid format (28 x 39.5 cm). From 1997 to 2011 its main shareholder " + 265 "was Julio Mario Santo Domingo.", 266 newsPaperSections.get("ElEspectador"), 267 StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"), 268 NewsSites::EL_ESPECTADOR_LINKS_GETTER, 269 ArticleGet.usual("article"), 270 null /* bannerAndAdFinder */ 271 ); 272 273 /** 274 * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN_CAROUSEL_LG> 275 * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[]) 276 * @see TagNodeGet#all(Vector, TC, String[]) 277 * @see TagNode#AV(String) 278 */ 279 public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page) 280 { 281 Vector<String> ret = new Vector<>(); 282 String urlStr; 283 284 // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section 285 Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first 286 (page, "div", "class", TextComparitor.CN_CI, "slider-carousel"); 287 288 289 // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the 290 // Divider. 291 292 for (TagNode tn: TagNodeGet.all(carouselDIV, TC.OpeningTags, "a")) 293 if ((urlStr = tn.AV("href")) != null) 294 ret.add(urlStr); 295 296 return ret; 297 }; 298 299 /** 300 * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN_CAROUSEL> 301 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE> 302 */ 303 public static final NewsSite GovCNCarousel = new NewsSite 304 ( 305 "Chinese Government Web Portal", 306 Country.China, 307 "https://gov.cn/", 308 LC.ZH_CN, 309 "The Chinese Government Sponsored Web-Site", 310 newsPaperSections.get("GovCNCarousel"), 311 StrFilter.regExKEEP(Pattern.compile( 312 "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" + 313 "content_\\d+.htm(?:l)?(#\\d+)?" 314 ), false), 315 NewsSites::GOVCN_CAROUSEL_LINKS_GETTER, 316 ArticleGet.usual(TextComparitor.C, "article"), 317 null /* bannerAndAddFinder */ 318 ); 319 320 /** 321 * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN> 322 * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE> 323 */ 324 public static final NewsSite GovCN = new NewsSite 325 ( 326 "Chinese Government Web Portal", 327 Country.China, 328 "https://gov.cn/", 329 LC.ZH_CN, 330 "The Chinese Government Sponsored Web-Site", 331 newsPaperSections.get("GovCN"), 332 StrFilter.regExKEEP(Pattern.compile( 333 "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" + 334 "content_\\d+.htm(?:l)?(#\\d+)?" 335 ), false), 336 null, 337 ArticleGet.usual(TextComparitor.C, "article"), 338 null /* bannerAndAddFinder */ 339 ); 340}