Source code

001package Torello.HTML.Tools.NewsSite;
002
003import Torello.JavaDoc.LinkJavaSource;
004
005import Torello.HTML.TagNode;
006import Torello.HTML.HTMLNode;
007import Torello.HTML.DotPair;
008import Torello.HTML.TC;
009import Torello.HTML.URLFilter;
010
011import Torello.HTML.NodeSearch.TagNodeFindL1Inclusive;
012import Torello.HTML.NodeSearch.InnerTagFindInclusive;
013import Torello.HTML.NodeSearch.TagNodeGet;
014import Torello.HTML.NodeSearch.InnerTagGet;
015import Torello.HTML.NodeSearch.InnerTagGetInclusive;
016import Torello.HTML.NodeSearch.TextComparitor;
017
018import Torello.Java.StrFilter;
019import Torello.Java.FileRW;
020import Torello.Java.LFEC;
021import Torello.Java.Country;
022import Torello.Java.C;
023
024import Torello.Languages.LC;
025
026import java.util.Vector;
027import java.util.Hashtable;
028
029import java.util.regex.Pattern;
030
031import java.net.URL;
032
033import java.io.IOException;
034
035/**
036 * This class is nothing more than an 'Example Class' that contains some foreign-language
037 * based news web-pages, from both overseas and from Latin America.
038 * 
039 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES>
040 */
041public class NewsSites
042{
043    private NewsSites() { }
044
045    @SuppressWarnings("unchecked")
046    private static final Hashtable<String, Vector<URL>> newsPaperSections = 
047        (Hashtable<String, Vector<URL>>) LFEC.readObjectFromFile_JAR
048        (NewsSite.class, "data-files/SectionURLs.htdat", true, Hashtable.class);
049
050
051    /**
052     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES_RUN_EX>
053     * @throws IOException This throws for IO errors that may occur when reading the web-server,
054     * or when saving the web-pages or images to the file-system.
055     * 
056     * @see FileRW#delTree(String, boolean, Appendable)
057     * @see NewsSite
058     * @see FileRW#writeFile(CharSequence, String)
059     * @see C#toHTML(String, boolean, boolean, boolean)
060     */
061    @LinkJavaSource(handle="RunExample")
062    public static void runExample() throws IOException
063    {
064        // Click on the @LinkJavaSource Curved-Arrow to view the example code in full screen
065        RunExample.run();
066    }
067
068    /**
069     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_SITES_MAIN>
070     * @param argv These are the command line arguments passed by the JRE to this method.
071     * 
072     * @throws IOException If there are any problems while attempting to save the output to the
073     * the output file (if one was named / requested).
074     */
075    public static void main(String[] argv) throws IOException
076    {
077        // Uncomment this line to run the example code (instead of section-data print)
078        // runExample(); System.exit(0);
079
080        // The data-file is loaded into private field "newsPaperSections"
081        // This private field is a Hashtable<String, Vector<URL>>.  Convert each of
082        // these sections so that they may be printed to terminal and maybe to a text
083        // file.
084
085        StringBuilder sb = new StringBuilder();
086
087        for (String newspaper : newsPaperSections.keySet())
088        {
089            sb.append(newspaper + '\n');
090            for (URL section : newsPaperSections.get(newspaper))
091                sb.append(section.toString() + '\n');
092            sb.append("\n\n***************************************************\n\n");
093        }
094        
095        String s = sb.toString();
096        System.out.println(s);
097        
098        // If there is a command-line parameter, it shall be interpreted a file-name.
099        // The contents of the "sections data-file" (as text) will be written a file on the
100        // file-system using the String-value of "argv[0]" as the name of the output-filename.
101
102        if (argv.length == 1) FileRW.writeFile(s, argv[0]);
103    }
104
105
106    // URLFilter.regexKEEP(Pattern.compile("^http.+baidu\\.com\\/s\\?id=\\d+$")));
107    // ArticleGet.usual(TextComparitor.CN_CI, "article-content"));
108
109    /**
110     * <EMBED CLASS='external-html' DATA-FILE-ID=ABC_ES_LG>
111     * @see TagNodeFindL1Inclusive#all(Vector, String)
112     * @see TagNodeGet#first(Vector, int, int, TC, String[])
113     * @see TagNode#AV(String)
114     */
115    public static Vector<String> ABC_LINKS_GETTER(URL url, Vector<HTMLNode> page)
116    {
117        final Vector<String> ret = new Vector<>();
118
119        TagNode tn;
120        String urlStr;
121
122        // Links are kept inside <ARTICLE> ... </ARTICLE> on the main / section page.
123        for (DotPair article : TagNodeFindL1Inclusive.all(page, "article"))
124
125            // Now find the <A HREF=...> ... </A>
126            if ((tn = TagNodeGet.first(page, article.start, article.end, TC.OpeningTags, "a"))
127                != null)
128
129                if ((urlStr = tn.AV("href")) != null)
130                    ret.add(urlStr);
131
132        return ret;
133    }
134
135    /**
136     * <EMBED CLASS='external-html' DATA-FILE-ID=ABC_ES>
137     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
138     */
139    public static final NewsSite ABCES = new NewsSite
140    (
141        "ABC España",
142        Country.Spain,
143        "https://www.abc.es/",
144        LC.ES,
145        "ABC is a Spanish national daily newspaper.  It is the third largest general-interest " +
146        "newspaper in Spain, and the oldest newspaper still operating in Madrid.",
147        newsPaperSections.get("ABCES"),
148        StrFilter.comparitor(TextComparitor.EW_CI, ".html"),
149        NewsSites::ABC_LINKS_GETTER,
150        ArticleGet.usual("main"),
151        null // bannerAndAdFinder
152    );
153
154    /** <EMBED CLASS='external-html' DATA-FILE-ID=PULSO> */
155    public static final NewsSite Pulso = new NewsSite
156    (
157        "El Pulso, México",
158        Country.Mexico,
159        "https://elpulso.mx",
160        LC.ES,
161        "El Pulso newspaper is Spanish language newspaper in Mexico. It is showing breaking " +
162            "news, " +
163        "headlines, kids news, tourism news, entertainment news, study news, industrial news, " +
164        "economical news, health & beauty news, crime news, career news, Travel news, " +
165        "diet & fitness news, Top stories, special news, celebrity news.",
166        newsPaperSections.get("PULSO"),
167        StrFilter.regExKEEP(Pattern.compile(
168            "^https?:\\/{2}.*?\\/\\d{4}\\/\\d{2}\\/\\d{2}\\/[\\w-]{10,}\\/$"
169        ), false),
170        null, // LinksGet
171        ArticleGet.usual(TextComparitor.C, "entry-content"),
172        null // bannerAndAddFinder
173    );
174
175    /**
176     * <EMBED CLASS='external-html' DATA-FILE-ID=EL_NACIONAL_LG>
177     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
178     * @see TagNodeGet#first(Vector, int, int, TC, String[])
179     * @see TagNode#AV(String)
180     */
181    public static Vector<String> EL_NACIONAL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
182    {
183        Vector<String> ret = new Vector<>();       TagNode tn;     String urlStr;
184
185        // Links are kept inside <DIV CLASS=td-module-thumb> ... </DIV> on the main / section page.
186        for (DotPair article : InnerTagFindInclusive.all
187            (page, "div", "class", TextComparitor.C, "td-module-thumb"))
188
189            // Now find the <A HREF=...> ... </A>
190            if ((tn = TagNodeGet.first
191                (page, article.start, article.end, TC.OpeningTags, "a")) != null)
192
193                if ((urlStr = tn.AV("href")) != null)
194                    ret.add(urlStr);
195
196        return ret;
197    }
198
199    /**
200     * <EMBED CLASS='external-html' DATA-FILE-ID=EL_NACIONAL>
201     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
202     */
203    public static final NewsSite ElNacional = new NewsSite
204    (
205        "El Nacional",
206        Country.Venezuela,
207        "https://elnacional.com",
208        LC.ES,
209        "El Nacional is a Venezuelan publishing company under the name C.A. Editorial " +
210            "El Nacional, " +
211        "most widely known for its El Nacional newspaper and website. It, along with Últimas " +
212        "Noticias and El Universal, are the most widely read and circulated daily national " +
213        "newspapers in the country, and it has an average of more than 80,000 papers distributed " +
214        "daily and 170,000 copies on weekends.",
215        newsPaperSections.get("ElNacional"),
216        (URLFilter) null, /* The LinksGetter will only return valid Anchor's */
217        NewsSites::EL_NACIONAL_LINKS_GETTER,
218        ArticleGet.usual("article"),
219        null /* bannerAndAdFinder */
220    );
221
222    /**
223     * <EMBED CLASS='external-html' DATA-FILE-ID=EL_ESPECTADOR_LG>
224     * @see InnerTagFindInclusive#all(Vector, String, String, TextComparitor, String[])
225     * @see InnerTagGet#first(Vector, int, int, String, String, TextComparitor, String[])
226     * @see TagNode#AV(String)
227     */
228    public static Vector<String> EL_ESPECTADOR_LINKS_GETTER(URL url, Vector<HTMLNode> page)
229    {
230        Vector<String> ret = new Vector<>();
231
232        TagNode tn;
233        String  urlStr;
234
235        // Links are kept inside <DIV CLASS="Card ..."> ... </DIV> on the main / section page.
236        for (DotPair article : InnerTagFindInclusive.all
237            (page, "div", "class", TextComparitor.C, "Card"))
238
239            // Now find the <A CLASS="card-link" HREF=...> ... </A>
240            if ((tn = InnerTagGet.first
241                (page, article.start, article.end, "a", "class", TextComparitor.C, "card-link"))
242                    != null)
243
244                if ((urlStr = tn.AV("href")) != null)
245                    ret.add(urlStr);
246
247        return ret;
248    }
249
250    /**
251     * <EMBED CLASS='external-html' DATA-FILE-ID=EL_ESPECTADOR>
252     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
253     */
254    public static final NewsSite ElEspectador = new NewsSite
255    (
256        "El Espectador, Columbia",
257        Country.Colombia, 
258        "https://elespectador.com",
259        LC.ES,
260        "El Espectador (meaning \"The Spectator\") is a newspaper with national circulation within "+
261        "Colombia, founded by Fidel Cano Gutiérrez on 22 March 1887 in Medellín and published "     +
262        "since 1915 in Bogotá. It changed from a daily to a weekly edition in 2001, following a "   +
263        "financial crisis, and became a daily again on 11 May 2008, a comeback which had been "     +
264        "long rumoured, in tabloid format (28 x 39.5 cm). From 1997 to 2011 its main shareholder "  +
265        "was Julio Mario Santo Domingo.",
266        newsPaperSections.get("ElEspectador"),
267        StrFilter.comparitor(TextComparitor.ENDS_WITH, "/"),
268        NewsSites::EL_ESPECTADOR_LINKS_GETTER,
269        ArticleGet.usual("article"),
270        null /* bannerAndAdFinder */
271    );
272
273    /**
274     * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN_CAROUSEL_LG>
275     * @see InnerTagGetInclusive#first(Vector, String, String, TextComparitor, String[])
276     * @see TagNodeGet#all(Vector, TC, String[])
277     * @see TagNode#AV(String)
278     */
279    public static Vector<String> GOVCN_CAROUSEL_LINKS_GETTER(URL url, Vector<HTMLNode> page)
280    {
281        Vector<String>  ret     = new Vector<>();
282        String          urlStr;
283
284        // Find the first <DIV CLASS="slider-carousel"> ... </DIV> section
285        Vector<HTMLNode> carouselDIV = InnerTagGetInclusive.first
286            (page, "div", "class", TextComparitor.CN_CI, "slider-carousel");
287
288
289        // Retrieve any HTML Anchor <A HREF=...> ... </A> found within the contents of the
290        // Divider.
291
292        for (TagNode tn: TagNodeGet.all(carouselDIV, TC.OpeningTags, "a"))
293            if ((urlStr = tn.AV("href")) != null)
294                ret.add(urlStr);
295
296        return ret;
297    };
298
299    /**
300     * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN_CAROUSEL>
301     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
302     */
303    public static final NewsSite GovCNCarousel = new NewsSite
304    (
305        "Chinese Government Web Portal",
306        Country.China,
307        "https://gov.cn/",
308        LC.ZH_CN,
309        "The Chinese Government Sponsored Web-Site",
310        newsPaperSections.get("GovCNCarousel"),
311        StrFilter.regExKEEP(Pattern.compile(
312            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
313                "content_\\d+.htm(?:l)?(#\\d+)?"
314        ), false),
315        NewsSites::GOVCN_CAROUSEL_LINKS_GETTER,
316        ArticleGet.usual(TextComparitor.C, "article"),
317        null /* bannerAndAddFinder */
318    );
319
320    /**
321     * <EMBED CLASS='external-html' DATA-FILE-ID=GOV_CN>
322     * <EMBED CLASS='external-html' DATA-FILE-ID=NEWS_STE_CHANGE>
323     */
324    public static final NewsSite GovCN = new NewsSite
325    (
326        "Chinese Government Web Portal",
327        Country.China,
328        "https://gov.cn/",
329        LC.ZH_CN,
330        "The Chinese Government Sponsored Web-Site",
331        newsPaperSections.get("GovCN"),
332        StrFilter.regExKEEP(Pattern.compile(
333            "^http://www.gov.cn/(?:.+?/)?\\d{4}-\\d{2}/\\d{2}/(?:.+?/)?" +
334                "content_\\d+.htm(?:l)?(#\\d+)?"
335        ), false),
336        null,
337        ArticleGet.usual(TextComparitor.C, "article"),
338        null /* bannerAndAddFinder */
339    );
340}