001package Torello.HTML.Tools.NewsSite; 002 003import java.util.function.*; 004import java.util.*; 005import java.util.regex.*; 006 007import java.net.URL; 008 009import Torello.HTML.*; 010import Torello.HTML.NodeSearch.*; 011 012import Torello.JavaDoc.LinkJavaSource; 013 014import Torello.Java.ParallelArrayException; 015 016/** 017 * A function-pointer / lambda target for extracting an article's content from the web-page 018 * from whence it was downloaded; including several {@code static}-builder methods for the 019 * most common means of finding the HTML-Tags that wrap artilce-HTML on news-media websites. 020 * 021 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET> 022 */ 023@FunctionalInterface 024public interface ArticleGet extends java.io.Serializable 025{ 026 /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI> */ 027 public static final long serialVersionUID = 1; 028 029 030 // ******************************************************************************************** 031 // ******************************************************************************************** 032 // Standard Functional Interface Method 033 // ******************************************************************************************** 034 // ******************************************************************************************** 035 036 /** 037 * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH> 038 * <EMBED CLASS='external-html' DATA-FILE-ID=ART_GET_APPLY> 039 */ 040 public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException; 041 042 043 // ******************************************************************************************** 044 // ******************************************************************************************** 045 // Filter Factory / Filter-Generator static-methods 046 // ******************************************************************************************** 047 // ******************************************************************************************** 048 049 050 /** 051 * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG> 052 * 053 * @param htmlTag This should be the HTML element that is used to wrap the actual news-content 054 * article-body of an HTML news web-site page. 055 * 056 * @return This returns an "Article Getter" that just picks out the part of a news-website 057 * article that lies between the open and closed version of the specified htmlTag. 058 */ 059 @LinkJavaSource(handle="Usual_htmlTag") 060 public static ArticleGet usual(String htmlTag) 061 { return Usual_htmlTag.generate(htmlTag); } 062 063 /** 064 * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_TC> 065 * 066 * @param tc This should be any of the pre-instantiated {@code TextComparitor's}. Again, a 067 * TextComparitor is just a {@code String} compare function like: {@code equals, contains, 068 * StrCmpr.containsIgnoreCase(...)}, etc... 069 * 070 * @param cssClassCompareStrings These are the values to be used by the 071 * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"} 072 * from the list of {@code DIV} elements on the page. 073 * 074 * @return This returns an "Article Getter" that just picks out the part of a news-website 075 * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by 076 * the "CSS (Cascading Style Sheets) {@code 'class'} identifier, and the 077 * {@code TextComparitor} parameter that you have chosen. 078 */ 079 @LinkJavaSource(handle="Usual_tc") 080 public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings) 081 { return Usual_tc.generate(tc, cssClassCompareStrings); } 082 083 /** 084 * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG_TC> 085 * 086 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 087 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 088 * or {@code <FRAME>}, then you may. 089 * 090 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 091 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 092 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 093 * 094 * @param tc This should be any of the pre-instantiated {@code TextComparitor's}. Again, a 095 * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 096 * contains, StrCmpr.containsIgnoreCase(...)}. 097 * 098 * @param attributeValueCompareStrings These are the {@code String's} compared with using 099 * the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}. 100 * 101 * @return This returns an "Article Getter" that picks out the part of a news-website article 102 * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id, 103 * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified 104 * {@code inner-tag} can be matched by the {@code TextComparitor} and the 105 * compare-{@code String's}. 106 */ 107 @LinkJavaSource(handle="Usual_htmlTag_tc") 108 public static ArticleGet usual 109 (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings) 110 { return Usual_htmlTag_tc.generate(htmlTag, innerTag, tc, attributeValueCompareStrings); } 111 112 /** 113 * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_DESC> 114 * 115 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 116 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 117 * or {@code <FRAME>}, then you may. 118 * 119 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 120 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 121 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 122 * 123 * @param innerTagValuePattern <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_PARAM> 124 * 125 * @return This returns an "Article Getter" that picks out the part of a news-website article 126 * that lays between the HTML element which matches the htmlTag, innerTag and value-testing 127 * regex {@code Pattern "innerTagValuePattern"}. 128 */ 129 @LinkJavaSource(handle="Usual_innerTagValuePattern") 130 public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern) 131 { return Usual_innerTagValuePattern.generate(htmlTag, innerTag, innerTagValuePattern); } 132 133 /** 134 * <I>This is a static, factory method for building ArticleGet.</I> 135 * 136 * <BR /><BR />This gives more options for building your article getter. In almost 95% of the 137 * news-websites, the article or page-body is between and open and close HTML {@code 'DIV'} 138 * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute. 139 * <I><B>However,</B></I> This factory method allows a programmer to select article content 140 * that handles other cases than the {@code 95%}, where you specify the HTML-token, 141 * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the 142 * page-body. 143 * 144 * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify 145 * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} 146 * or {@code <FRAME>}, then you may. 147 * 148 * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use 149 * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the 150 * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice. 151 * 152 * @param p This java "lambda {@code Predicate}" will just receive the 153 * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer. 154 * 155 * @return This returns an "Article Getter" that matches an HTML element specified by 156 * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter 157 * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag. 158 */ 159 @LinkJavaSource(handle="Usual_p") 160 public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p) 161 { return Usual_p.generate(htmlTag, innerTag, p); } 162 163 /** 164 * <I>This is a static, factory method for building ArticleGet.</I> 165 * 166 * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article 167 * body-content based on a "start-tag" and an "end-tag." It is <B><I>very</I></B> to note, 168 * that the text can only match a single text-node, and not span multiple text-nodes, or be 169 * within {@code TagNode's} at all! This should be easy to find, print up the HTML page as a 170 * {@code Vector}, and inspect it! 171 * 172 * @param startTextTag This must be text from an HTML {@code TextNode} that is 173 * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page. 174 * 175 * @param endTextTag This must be text from an HTML {@code TextNode} that is also 176 * <B><I>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page. 177 * 178 * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in 179 * the article, specified by the text-tag parameters, and gets it. 180 */ 181 @LinkJavaSource(handle="Usual_textTag") 182 public static ArticleGet usual(String startTextTag, String endTextTag) 183 { return Usual_textTag.generate(startTextTag, endTextTag); } 184 185 /** 186 * <I>This is a static, factory method for building ArticleGet.</I> 187 * 188 * This factory method generates an "ArticleGet" that will retrieve news-article body-content 189 * based on starting and ending regular-expressions. The matches performed by the Regular 190 * Expression checker will be performed on {@code TextNode's}, not on the {@code TagNode's}, or 191 * the page itself. It is <B><I>very</I></B> to note, that the text can only match a single 192 * {@code TextNode}, and not span multiple {@code TextNode's}, or be within {@code TagNode's} 193 * at all! This should be easy to find, print up the HTML page as a {@code Vector}, and 194 * inspect it! 195 * 196 * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML 197 * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of 198 * the vectorized-HTML page. 199 * 200 * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML 201 * {@code TextNode} that is also <B><I>contained</B> in a single {@code TextNode}</I> of the 202 * vectorized-HTML page. 203 * 204 * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> 205 * in the article, specified by the regular-expression pattern-matching parameters, and gets it. 206 */ 207 @LinkJavaSource(handle="Usual_pattern") 208 public static ArticleGet usual(Pattern startPattern, Pattern endPattern) 209 { return Usual_pattern.generate(startPattern, endPattern); } 210 211 /** 212 * <EMBED CLASS='external-html' DATA-FILE-ID=AG_BRANCH> 213 * 214 * @param urlSelectors This is a list of {@code Predicate<URL>} elements. When one of these 215 * returns {@code TRUE} for a particular {@code URL}, then the index of that 216 * {@code URL}-selector in it's {@code array} will be used to call the appropriate getter from 217 * the parallel-{@code array} input-parameter {@code 'getters'}. 218 * 219 * @param getters This is a list of getter elements. These should be tailored to the 220 * particular news-website source that are chosen/selected by the {@code 'urlSelectors'} 221 * parallel {@code array}. 222 * 223 * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}." 224 * All it does is simply traverse the first {@code array} looking for a 225 * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the 226 * parallel {@code array}. 227 * 228 * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this 229 * <B><I>"dispatch"</B></I> or rather <B><I>"branch"</I></B> is called by {@code class 230 * NewsSiteScrape}, the function/getter that is returned will throw an 231 * {@code ArticleGetException}. It is important that the programmer only allow article 232 * {@code URL's} that he can capably handled to pass to {@code class NewsSiteScrape}. 233 * 234 * @throws IllegalArgumentException Will throw this exception if: 235 * 236 * <BR /><BR /><UL CLASS=JDUL> 237 * <LI>Either of these parameters are null</LI> 238 * <LI>If they are not parallel, with differing lengths.</LI> 239 * <LI>If either contain a null value.</LI> 240 * </UL> 241 */ 242 @LinkJavaSource(handle="Branch") 243 public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters) 244 { return Branch.generate(urlSelectors, getters); } 245 246 247 // ******************************************************************************************** 248 // ******************************************************************************************** 249 // Other Methods 250 // ******************************************************************************************** 251 // ******************************************************************************************** 252 253 254 /** 255 * This is the standard-java {@code Function 'andThen'} method. 256 * 257 * @param after This is the {@code ArticleGet} that will be (automatically) applied after 258 * {@code 'this'} function. 259 * 260 * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_AND_THEN_RET> 261 */ 262 default ArticleGet andThen(ArticleGet after) 263 { return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page)); } 264 265 /** 266 * This is the standard-java {@code Function 'compose'} method. 267 * 268 * @param before This is the {@code ArticleGet} that is performed first, whose results are 269 * sent to {@code 'this'} function. 270 * 271 * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_COMPOSE_RET> 272 */ 273 default ArticleGet compose(ArticleGet before) 274 { return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page)); } 275 276 /** 277 * The identity function will always return the same {@code Vector<HTMLNode>} as output that 278 * it receives as input. This is one of the {@code default} Java's lambda-methods. 279 * 280 * @return a new {@code ArticleGet} which (it should be obvious) is of type: 281 * {@code java.util.function.Function<Vector<HTMLNode>, Vector<HTMLNode>>} 282 * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to 283 * the input {@code Vector}.</I> 284 */ 285 static ArticleGet identity() 286 { 287 return (URL url, Vector<HTMLNode> page) -> 288 { 289 ArticleGetException.check(url, page); 290 return page; 291 }; 292 } 293 294}