001package Torello.HTML.Tools.NewsSite;
002
003import java.util.function.*;
004import java.util.*;
005import java.util.regex.*;
006
007import java.net.URL;
008
009import Torello.HTML.*;
010import Torello.HTML.NodeSearch.*;
011
012import Torello.JavaDoc.LinkJavaSource;
013
014import Torello.Java.ParallelArrayException;
015
016/**
017 * A function-pointer / lambda target for extracting an article's content from the web-page
018 * from whence it was downloaded; including several {@code static}-builder methods for the
019 * most common means of finding the HTML-Tags that wrap artilce-HTML on news-media websites.
020 * 
021 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
022 */
023@FunctionalInterface
024public interface ArticleGet extends java.io.Serializable
025{
026    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI>  */
027    public static final long serialVersionUID = 1;
028
029
030    // ********************************************************************************************
031    // ********************************************************************************************
032    // Standard Functional Interface Method
033    // ********************************************************************************************
034    // ********************************************************************************************
035
036    /**
037     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
038     * <EMBED CLASS='external-html' DATA-FILE-ID=ART_GET_APPLY>
039     */
040    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;
041
042
043    // ********************************************************************************************
044    // ********************************************************************************************
045    // Filter Factory / Filter-Generator  static-methods
046    // ********************************************************************************************
047    // ********************************************************************************************
048
049
050    /**
051     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG>
052     * 
053     * @param htmlTag This should be the HTML element that is used to wrap the actual news-content
054     * article-body of an HTML news web-site page.
055     * 
056     * @return This returns an "Article Getter" that just picks out the part of a news-website
057     * article that lies between the open and closed version of the specified htmlTag.
058     */
059    @LinkJavaSource(handle="Usual_htmlTag")
060    public static ArticleGet usual(String htmlTag)
061    { return Usual_htmlTag.generate(htmlTag); }
062
063    /**
064     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_TC>
065     * 
066     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
067     * TextComparitor is just a {@code String} compare function like: {@code equals, contains,
068     * StrCmpr.containsIgnoreCase(...)}, etc...
069     * 
070     * @param cssClassCompareStrings These are the values to be used by the 
071     * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"}
072     * from the list of {@code DIV} elements on the page.
073     * 
074     * @return This returns an "Article Getter" that just picks out the part of a news-website
075     * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by
076     * the "CSS (Cascading Style Sheets) {@code 'class'} identifier,  and the
077     * {@code TextComparitor} parameter that you have chosen.
078     */
079    @LinkJavaSource(handle="Usual_tc")
080    public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings)
081    { return Usual_tc.generate(tc, cssClassCompareStrings); }
082
083    /**
084     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG_TC>
085     * 
086     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
087     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
088     * or {@code <FRAME>}, then you may.
089     * 
090     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
091     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
092     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
093     * 
094     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
095     * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 
096     * contains, StrCmpr.containsIgnoreCase(...)}.
097     * 
098     * @param attributeValueCompareStrings These are the {@code String's} compared with using
099     * the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}.
100     * 
101     * @return This returns an "Article Getter" that picks out the part of a news-website article
102     * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id,
103     * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified
104     * {@code inner-tag} can be matched by the {@code TextComparitor} and the 
105     * compare-{@code String's}.
106     */
107    @LinkJavaSource(handle="Usual_htmlTag_tc")
108    public static ArticleGet usual
109        (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings)
110    { return Usual_htmlTag_tc.generate(htmlTag, innerTag, tc, attributeValueCompareStrings); }
111
112    /**
113     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_DESC>
114     * 
115     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
116     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
117     * or {@code <FRAME>}, then you may.
118     *
119     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
120     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
121     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
122     *
123     * @param innerTagValuePattern <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_PARAM>
124     * 
125     * @return This returns an "Article Getter" that picks out the part of a news-website article
126     * that lays between the HTML element which matches the htmlTag, innerTag and value-testing
127     * regex {@code Pattern "innerTagValuePattern"}.
128     */
129    @LinkJavaSource(handle="Usual_innerTagValuePattern")
130    public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern)
131    { return Usual_innerTagValuePattern.generate(htmlTag, innerTag, innerTagValuePattern); }
132
133    /**
134     * <I>This is a static, factory method for building ArticleGet.</I>
135     *
136     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
137     * news-websites, the article or page-body is between and open and close HTML {@code 'DIV'}
138     * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
139     * <I><B>However,</B></I> This factory method allows a programmer to select article content
140     * that handles other cases than the {@code 95%}, where you specify the HTML-token, 
141     * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the
142     * page-body.
143     *
144     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
145     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
146     * or {@code <FRAME>}, then you may.
147     *
148     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
149     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
150     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
151     *
152     * @param p This java "lambda {@code Predicate}" will just receive the 
153     * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer.
154     *
155     * @return This returns an "Article Getter" that matches an HTML element specified by
156     * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter
157     * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag.
158     */
159    @LinkJavaSource(handle="Usual_p")
160    public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p)
161    { return Usual_p.generate(htmlTag, innerTag, p); }
162
163    /**
164     * <I>This is a static, factory method for building ArticleGet.</I>
165     *
166     * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article
167     * body-content based on a "start-tag" and an "end-tag."  It is <B><I>very</I></B> to note,
168     * that the text can only match a single text-node, and not span multiple text-nodes, or be
169     * within {@code TagNode's} at all!  This should be easy to find, print up the HTML page as a
170     * {@code Vector}, and inspect it!
171     * 
172     * @param startTextTag This must be text from an HTML {@code TextNode} that is
173     * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page.
174     * 
175     * @param endTextTag This must be text from an HTML {@code TextNode} that is also
176     * <B><I>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page.
177     * 
178     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in
179     * the article, specified by the text-tag parameters, and gets it.
180     */
181    @LinkJavaSource(handle="Usual_textTag")
182    public static ArticleGet usual(String startTextTag, String endTextTag)
183    { return Usual_textTag.generate(startTextTag, endTextTag); }
184
185    /**
186     * <I>This is a static, factory method for building ArticleGet.</I>
187     *
188     * This factory method generates an "ArticleGet" that will retrieve news-article body-content
189     * based on starting and ending regular-expressions.  The matches performed by the Regular
190     * Expression checker will be performed on {@code TextNode's}, not on the {@code TagNode's}, or
191     * the page itself.  It is <B><I>very</I></B> to note, that the text can only match a single
192     * {@code TextNode}, and not span multiple {@code TextNode's}, or be within {@code TagNode's}
193     * at all!  This should be easy to find, print up the HTML page as a {@code Vector}, and
194     * inspect it!
195     * 
196     * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML
197     * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of
198     * the vectorized-HTML page.
199     * 
200     * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML
201     * {@code TextNode} that is also <B><I>contained</B> in a single  {@code TextNode}</I> of the
202     * vectorized-HTML page.
203     * 
204     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B>
205     * in the article, specified by the regular-expression pattern-matching parameters, and gets it.
206     */
207    @LinkJavaSource(handle="Usual_pattern")
208    public static ArticleGet usual(Pattern startPattern, Pattern endPattern)
209    { return Usual_pattern.generate(startPattern, endPattern); }
210
211    /**
212     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_BRANCH>
213     * 
214     * @param urlSelectors This is a list of {@code Predicate<URL>} elements.  When one of these
215     * returns {@code TRUE} for a particular {@code URL}, then the index of that
216     * {@code URL}-selector in it's {@code array} will be used to call the appropriate getter from
217     * the parallel-{@code array} input-parameter {@code 'getters'}.
218     * 
219     * @param getters This is a list of getter elements.  These should be tailored to the
220     * particular news-website source that are chosen/selected by the {@code 'urlSelectors'}
221     * parallel {@code array}.
222     * 
223     * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."
224     * All it does is simply traverse the first {@code array} looking for a
225     * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the
226     * parallel {@code array}.
227     * 
228     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
229     * <B><I>"dispatch"</B></I> or rather <B><I>"branch"</I></B> is called by {@code class 
230     * NewsSiteScrape}, the function/getter that is returned will throw an 
231     * {@code ArticleGetException}.  It is important that the programmer only allow article
232     * {@code URL's} that he can capably handled to pass to {@code class NewsSiteScrape}.
233     * 
234     * @throws IllegalArgumentException Will throw this exception if:
235     * 
236     * <BR /><BR /><UL CLASS=JDUL>
237     * <LI>Either of these parameters are null</LI>
238     * <LI>If they are not parallel, with differing lengths.</LI>
239     * <LI>If either contain a null value.</LI>
240     * </UL>
241     */
242    @LinkJavaSource(handle="Branch")
243    public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters)
244    { return Branch.generate(urlSelectors, getters); }
245
246
247    // ********************************************************************************************
248    // ********************************************************************************************
249    // Other Methods
250    // ********************************************************************************************
251    // ********************************************************************************************
252
253
254    /**
255     * This is the standard-java {@code Function 'andThen'} method.
256     *
257     * @param after This is the {@code ArticleGet} that will be (automatically) applied after
258     * {@code 'this'} function. 
259     *
260     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_AND_THEN_RET>
261     */
262    default ArticleGet andThen(ArticleGet after)
263    { return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page)); }
264
265    /**
266     * This is the standard-java {@code Function 'compose'} method.
267     * 
268     * @param before This is the {@code ArticleGet} that is performed first, whose results are
269     * sent to {@code 'this'} function.
270     * 
271     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_COMPOSE_RET>
272     */
273    default ArticleGet compose(ArticleGet before)
274    { return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page)); }
275
276    /**
277     * The identity function will always return the same {@code Vector<HTMLNode>} as output that
278     * it receives as input.  This is one of the {@code default} Java's lambda-methods.
279     * 
280     * @return a new {@code ArticleGet} which (it should be obvious) is of type:
281     * {@code java.util.function.Function<Vector<HTMLNode>, Vector<HTMLNode>>}
282     * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to
283     * the input {@code Vector}.</I>
284     */
285    static ArticleGet identity()
286    {
287        return (URL url, Vector<HTMLNode> page) ->
288        {
289            ArticleGetException.check(url, page);
290            return page;
291        };
292    }
293
294}