1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
package Torello.HTML.Tools.NewsSite;

import java.util.function.*;
import java.util.*;
import java.util.regex.*;

import java.net.URL;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import Torello.JavaDoc.LinkJavaSource;

import Torello.Java.ParallelArrayException;

/**
 * A function-pointer / lambda target for extracting an article's content from the web-page
 * from whence it was downloaded; including several {@code static}-builder methods for the
 * most common means of finding the HTML-Tags that wrap article-HTML on news-media websites.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
 */
@FunctionalInterface
public interface ArticleGet extends java.io.Serializable
{
    // Explicit serialization version-identifier; it is declared here because this interface
    // extends java.io.Serializable.
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI>  */
    public static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Standard Functional Interface Method
    // ********************************************************************************************
    // ********************************************************************************************

    // The single abstract method of this functional-interface.  It receives a URL together with
    // a vectorized-HTML page, and returns a Vector<HTMLNode>.  Per the interface description, the
    // returned Vector holds the article content extracted from that web-page.  Implementations
    // may throw ArticleGetException.
    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
     * <EMBED CLASS='external-html' DATA-FILE-ID=ART_GET_APPLY>
     */
    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;


    // ********************************************************************************************
    // ********************************************************************************************
    // Filter Factory / Filter-Generator  static-methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG>
     *
     * @param htmlTag The HTML element that wraps the actual news-content (the article-body) on a
     * page of an HTML news web-site.
     *
     * @return An "Article Getter" which simply picks out the portion of a news-website article
     * that lies between the opening and closing versions of the specified {@code 'htmlTag'}.
     */
    @LinkJavaSource(handle="Usual_htmlTag")
    public static ArticleGet usual(String htmlTag)
    {
        // All of the work is delegated to the package-helper 'Usual_htmlTag'
        return Usual_htmlTag.generate(htmlTag);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_TC>
     *
     * @param tc Any one of the pre-instantiated {@code TextComparitor's}.  A
     * {@code TextComparitor} is nothing more than a {@code String} compare function, such as:
     * {@code equals, contains, StrCmpr.containsIgnoreCase(...)}, etc...
     *
     * @param cssClassCompareStrings The values which the {@code TextComparitor} compares against
     * the value of the CSS-Selector {@code "Class"}, for the {@code DIV} elements on the page.
     *
     * @return An "Article Getter" which simply picks out the portion of a news-website article
     * that lies between the HTML-{@code DIV} Element nodes whose class is identified by the CSS
     * (Cascading Style Sheets) {@code 'class'} identifier, and the {@code TextComparitor}
     * parameter that you have chosen.
     */
    @LinkJavaSource(handle="Usual_tc")
    public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings)
    {
        // All of the work is delegated to the package-helper 'Usual_tc'
        return Usual_tc.generate(tc, cssClassCompareStrings);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG_TC>
     *
     * @param htmlTag Nearly always a {@code "DIV"} element.  Something else may be specified if
     * you wish - possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} or
     * {@code <FRAME>}.
     *
     * @param innerTag Nearly always the {@code "CLASS"} attribute.  If {@code "ID"}, or something
     * different altogether (possibly a site-specific tag) is needed, then pass the innerTag /
     * attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param tc Any one of the pre-instantiated {@code TextComparitor's}.  A
     * {@code TextComparitor} is nothing more than a {@code String} compare function, such as:
     * {@code equals, contains, StrCmpr.containsIgnoreCase(...)}.
     *
     * @param attributeValueCompareStrings The {@code String's} which are compared, using the
     * {@code TextComparitor}, against the innerTag <B STYLE='color: red;'>value</B>.
     *
     * @return An "Article Getter" which picks out the portion of a news-website article that
     * lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id, class,
     * or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> for the specified
     * {@code inner-tag} can be matched by the {@code TextComparitor} and the
     * compare-{@code String's}.
     */
    @LinkJavaSource(handle="Usual_htmlTag_tc")
    public static ArticleGet usual(
            String htmlTag, String innerTag, TextComparitor tc,
            String... attributeValueCompareStrings
        )
    {
        // All of the work is delegated to the package-helper 'Usual_htmlTag_tc'
        return Usual_htmlTag_tc.generate(htmlTag, innerTag, tc, attributeValueCompareStrings);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_DESC>
     *
     * @param htmlTag Nearly always a {@code "DIV"} element.  Something else may be specified if
     * you wish - possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} or
     * {@code <FRAME>}.
     *
     * @param innerTag Nearly always the {@code "CLASS"} attribute.  If {@code "ID"}, or something
     * different altogether (possibly a site-specific tag) is needed, then pass the innerTag /
     * attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param innerTagValuePattern <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_PARAM>
     *
     * @return An "Article Getter" which picks out the portion of a news-website article that
     * lies between the HTML element which matches the htmlTag, innerTag and value-testing
     * regex {@code Pattern "innerTagValuePattern"}.
     */
    @LinkJavaSource(handle="Usual_innerTagValuePattern")
    public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern)
    {
        // All of the work is delegated to the package-helper 'Usual_innerTagValuePattern'
        return Usual_innerTagValuePattern.generate(htmlTag, innerTag, innerTagValuePattern);
    }

    /**
     * <I>A static factory-method for building an {@code ArticleGet}.</I>
     *
     * <BR /><BR />This variant offers more options for building your article getter.  On almost
     * 95% of news-websites, the article or page-body sits between an open and close HTML
     * {@code 'DIV'} element, and that {@code <DIV CLASS="...">} can be found by the
     * {@code CSS 'class'} attribute.  <I><B>However,</B></I> this factory method lets a
     * programmer select article content in the cases outside of that {@code 95%}, by specifying
     * the HTML-token, the attribute-<B STYLE='color: red;'>name</B> and a
     * {@code Predicate<String>} for finding the page-body.
     *
     * @param htmlTag Nearly always a {@code "DIV"} element.  Something else may be specified if
     * you wish - possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>} or
     * {@code <FRAME>}.
     *
     * @param innerTag Nearly always the {@code "CLASS"} attribute.  If {@code "ID"}, or something
     * different altogether (possibly a site-specific tag) is needed, then pass the innerTag /
     * attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param p A java "lambda {@code Predicate}" which simply receives the
     * attribute-<B STYLE='color: red;'>value</B> of the "inner-tag", and provides a yes/no answer.
     *
     * @return An "Article Getter" which matches an HTML element specified by
     * {@code 'htmlTag', 'innerTag'} and by the result of the {@code String-Predicate} parameter
     * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag.
     */
    @LinkJavaSource(handle="Usual_p")
    public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p)
    {
        // All of the work is delegated to the package-helper 'Usual_p'
        return Usual_p.generate(htmlTag, innerTag, p);
    }

    /**
     * <I>A static factory-method for building an {@code ArticleGet}.</I>
     *
     * <BR /><BR />The "ArticleGet" generated by this factory retrieves news-article body-content
     * based on a "start-tag" and an "end-tag."  It is <B><I>very important</I></B> to note that
     * the text can only match a single text-node; it may not span multiple text-nodes, nor be
     * within {@code TagNode's} at all!  Suitable text should be easy to find: print up the HTML
     * page as a {@code Vector}, and inspect it!
     *
     * @param startTextTag Text from an HTML {@code TextNode}, which must be
     * <B><I>contained</I></B><I> within one (single) {@code TextNode}</I> of the vectorized-HTML
     * page.
     *
     * @param endTextTag Text from an HTML {@code TextNode}, which must also be
     * <B><I>contained</I></B><I> in a single {@code TextNode}</I> of the vectorized-HTML page.
     *
     * @return An "Article Getter" which looks for <B><I>non-HTML Text</I></B> in the article, as
     * specified by the text-tag parameters, and gets it.
     */
    @LinkJavaSource(handle="Usual_textTag")
    public static ArticleGet usual(String startTextTag, String endTextTag)
    {
        // All of the work is delegated to the package-helper 'Usual_textTag'
        return Usual_textTag.generate(startTextTag, endTextTag);
    }

    /**
     * <I>A static factory-method for building an {@code ArticleGet}.</I>
     *
     * <BR /><BR />The "ArticleGet" generated by this factory retrieves news-article body-content
     * based on starting and ending regular-expressions.  The Regular Expression matching is
     * performed on {@code TextNode's}, not on the {@code TagNode's}, nor on the page itself.
     * It is <B><I>very important</I></B> to note that the text can only match a single
     * {@code TextNode}; it may not span multiple {@code TextNode's}, nor be within
     * {@code TagNode's} at all!  Suitable text should be easy to find: print up the HTML page as
     * a {@code Vector}, and inspect it!
     *
     * @param startPattern A regular expression {@code Pattern} that must match an HTML
     * {@code TextNode}, which is <B><I>contained</I></B><I> within one (single)
     * {@code TextNode}</I> of the vectorized-HTML page.
     *
     * @param endPattern A regular expression {@code Pattern} that must match an HTML
     * {@code TextNode}, which is also <B><I>contained</I></B><I> in a single
     * {@code TextNode}</I> of the vectorized-HTML page.
     *
     * @return An "Article Getter" which looks for <B><I>non-HTML Text</I></B> in the article, as
     * specified by the regular-expression pattern-matching parameters, and gets it.
     */
    @LinkJavaSource(handle="Usual_pattern")
    public static ArticleGet usual(Pattern startPattern, Pattern endPattern)
    {
        // All of the work is delegated to the package-helper 'Usual_pattern'
        return Usual_pattern.generate(startPattern, endPattern);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_BRANCH>
     *
     * @param urlSelectors A list of {@code URL}-predicates.  Whenever one of these returns
     * {@code TRUE} for a particular {@code URL}, the index of that {@code URL}-selector in its
     * {@code array} is used to call the appropriate getter from the parallel-{@code array}
     * input-parameter {@code 'getters'}.
     *
     * @param getters A list of getter elements.  Each should be tailored to the particular
     * news-website source that is chosen/selected by the {@code 'urlSelectors'} parallel
     * {@code array}.
     *
     * @return A "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."  All it does is
     * traverse the first {@code array} looking for a {@code Predicate}-match from the
     * {@code 'urlSelectors'}, and then call the getter in the parallel {@code array}.
     *
     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
     * <B><I>"dispatch"</I></B> or rather <B><I>"branch"</I></B> is called by
     * {@code class NewsSiteScrape}, the function/getter that is returned will throw an
     * {@code ArticleGetException}.  It is important that the programmer only allow article
     * {@code URL's} that he can capably handle to pass to {@code class NewsSiteScrape}.
     *
     * @throws IllegalArgumentException Will throw this exception if:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>Either of these parameters is null</LI>
     * <LI>They are not parallel, having differing lengths.</LI>
     * <LI>Either of them contains a null value.</LI>
     * </UL>
     */
    @LinkJavaSource(handle="Branch")
    public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters)
    {
        // All of the work is delegated to the package-helper 'Branch'
        return Branch.generate(urlSelectors, getters);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Other Methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This is the standard-java {@code Function 'andThen'} method.  The returned getter first
     * applies {@code 'this'} getter to the page, and then hands the resulting {@code Vector}
     * (along with the same {@code URL}) to {@code 'after'}.
     *
     * @param after This is the {@code ArticleGet} that will be (automatically) applied after
     * {@code 'this'} function.  This parameter may not be null.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_AND_THEN_RET>
     *
     * @throws NullPointerException If {@code 'after'} is null.  This is thrown immediately, when
     * the composite is built, rather than later when the composite getter is applied.
     */
    default ArticleGet andThen(ArticleGet after)
    {
        // Fail fast on a null argument, as java.util.function.Function.andThen does.  Without
        // this, a null 'after' is only detected when the returned getter is eventually applied.
        java.util.Objects.requireNonNull(after, "after");

        return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page));
    }

    /**
     * This is the standard-java {@code Function 'compose'} method.  The returned getter first
     * applies {@code 'before'} to the page, and then hands the resulting {@code Vector} (along
     * with the same {@code URL}) to {@code 'this'} getter.
     *
     * @param before This is the {@code ArticleGet} that is performed first, whose results are
     * sent to {@code 'this'} function.  This parameter may not be null.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_COMPOSE_RET>
     *
     * @throws NullPointerException If {@code 'before'} is null.  This is thrown immediately,
     * when the composite is built, rather than later when the composite getter is applied.
     */
    default ArticleGet compose(ArticleGet before)
    {
        // Fail fast on a null argument, as java.util.function.Function.compose does.  Without
        // this, a null 'before' is only detected when the returned getter is eventually applied.
        java.util.Objects.requireNonNull(before, "before");

        return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page));
    }

    /**
     * The identity getter: the {@code Vector<HTMLNode>} that it returns as output is the very
     * same {@code Vector} that it received as input.  This is the {@code ArticleGet} analogue of
     * the {@code 'identity'} method found on Java's standard functional-interfaces.
     *
     * <BR /><BR /><B>NOTE:</B> Before the page is handed back, both inputs are passed to
     * {@code ArticleGetException.check(url, page)}.
     *
     * @return an {@code ArticleGet}
     * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical) to
     * the input {@code Vector}.</I>
     */
    static ArticleGet identity()
    {
        final ArticleGet passThrough = (url, page) ->
        {
            // Run the inputs through the exception-class' checker before returning the page
            ArticleGetException.check(url, page);

            return page;
        };

        return passThrough;
    }

}