// ArticleGet.java.html

package Torello.HTML.Tools.NewsSite;

import java.util.function.*;
import java.util.*;
import java.util.regex.*;

import java.net.URL;

import Torello.HTML.*;
import Torello.HTML.NodeSearch.*;

import Torello.JavaDoc.LinkJavaSource;

import Torello.Java.ParallelArrayException;

/**
 * A function-pointer / lambda target for extracting an article's content from the web-page
 * from whence it was downloaded; including several {@code static}-builder methods for the
 * most common means of finding the HTML-Tags that wrap article-HTML on news-media websites.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ARTICLE_GET>
 */
@FunctionalInterface
public interface ArticleGet extends java.io.Serializable
{
    /** <EMBED CLASS='external-html' DATA-FILE-ID=SVUIDFI>  */
    public static final long serialVersionUID = 1;


    // ********************************************************************************************
    // ********************************************************************************************
    // Standard Functional Interface Method
    // ********************************************************************************************
    // ********************************************************************************************

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=FUNC_INTER_METH>
     * <EMBED CLASS='external-html' DATA-FILE-ID=ART_GET_APPLY>
     */
    public Vector<HTMLNode> apply(URL url, Vector<HTMLNode> page) throws ArticleGetException;


    // ********************************************************************************************
    // ********************************************************************************************
    // Filter Factory / Filter-Generator  static-methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG>
     * 
     * @param htmlTag This should be the HTML element that is used to wrap the actual news-content
     * article-body of an HTML news web-site page.
     * 
     * @return This returns an "Article Getter" that just picks out the part of a news-website
     * article that lies between the open and closed version of the specified htmlTag.
     */
    @LinkJavaSource(handle="Usual_htmlTag")
    public static ArticleGet usual(String htmlTag)
    { return Usual_htmlTag.generate(htmlTag); }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_TC>
     * 
     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
     * TextComparitor is just a {@code String} compare function like: {@code equals, contains,
     * StrCmpr.containsIgnoreCase(...)}, etc...
     * 
     * @param cssClassCompareStrings These are the values to be used by the 
     * {@code TextComparitor} when comparing with the value of the CSS-Selector {@code "Class"}
     * from the list of {@code DIV} elements on the page.
     * 
     * @return This returns an "Article Getter" that just picks out the part of a news-website
     * article that lies between the HTML-{@code DIV} Element nodes whose class is identified by
     * the CSS (Cascading Style Sheets) {@code 'class'} identifier, and the
     * {@code TextComparitor} parameter that you have chosen.
     */
    @LinkJavaSource(handle="Usual_tc")
    public static ArticleGet usual(TextComparitor tc, String... cssClassCompareStrings)
    { return Usual_tc.generate(tc, cssClassCompareStrings); }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_HTMLTAG_TC>
     * 
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     * 
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     * 
     * @param tc This should be any of the pre-instantiated {@code TextComparitor's}.  Again, a
     * {@code TextComparitor} is just a {@code String} compare function like: {@code equals, 
     * contains, StrCmpr.containsIgnoreCase(...)}.
     * 
     * @param attributeValueCompareStrings These are the {@code String's} compared against
     * the innerTag <B STYLE='color: red;'>value</B> using the {@code TextComparitor}.
     * 
     * @return This returns an "Article Getter" that picks out the part of a news-website article
     * that lies between the HTML element which matches the {@code 'htmlTag', 'innerTag' (id,
     * class, or "other")}, and whose attribute-<B STYLE='color: red;'>value</B> of the specified
     * {@code inner-tag} can be matched by the {@code TextComparitor} and the 
     * compare-{@code String's}.
     */
    @LinkJavaSource(handle="Usual_htmlTag_tc")
    public static ArticleGet usual
        (String htmlTag, String innerTag, TextComparitor tc, String... attributeValueCompareStrings)
    { return Usual_htmlTag_tc.generate(htmlTag, innerTag, tc, attributeValueCompareStrings); }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_DESC>
     * 
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     *
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param innerTagValuePattern <EMBED CLASS='external-html' DATA-FILE-ID=AG_USUAL_ITVP_PARAM>
     * 
     * @return This returns an "Article Getter" that picks out the part of a news-website article
     * that lies between the HTML element which matches the htmlTag, innerTag and value-testing
     * regex {@code Pattern "innerTagValuePattern"}.
     */
    @LinkJavaSource(handle="Usual_innerTagValuePattern")
    public static ArticleGet usual(String htmlTag, String innerTag, Pattern innerTagValuePattern)
    { return Usual_innerTagValuePattern.generate(htmlTag, innerTag, innerTagValuePattern); }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This gives more options for building your article getter.  In almost 95% of the
     * news-websites, the article or page-body is between an open and close HTML {@code 'DIV'}
     * element, and the {@code <DIV CLASS="...">} can be found by the {@code CSS 'class'} attribute.
     * <I><B>However,</B></I> this factory method allows a programmer to select article content
     * in cases other than that {@code 95%}, where you specify the HTML-token, 
     * attribute-<B STYLE='color: red;'>name</B> and a {@code Predicate<String>} for finding the
     * page-body.
     *
     * @param htmlTag This is almost always a {@code "DIV"} element, but if you wish to specify
     * something else, possibly a paragraph element ({@code <P>}), or maybe an {@code <IFRAME>}
     * or {@code <FRAME>}, then you may.
     *
     * @param innerTag This is almost always a {@code "CLASS"} attribute, but if you need to use
     * {@code "ID"} or something different altogether - possibly a site-specific tag, then use the
     * innerTag / attribute-<B STYLE='color: red;'>name</B> of your choice.
     *
     * @param p This java "lambda {@code Predicate}" will just receive the 
     * attribute-<B STYLE='color: red;'>value</B> from the "inner-tag" and provide a yes/no answer.
     *
     * @return This returns an "Article Getter" that matches an HTML element specified by
     * {@code 'htmlTag', 'innerTag'} and the result of the {@code String-Predicate} parameter
     * {@code 'p'} on the <B STYLE='color: red;'>value</B> of that inner-tag.
     */
    @LinkJavaSource(handle="Usual_p")
    public static ArticleGet usual(String htmlTag, String innerTag, Predicate<String> p)
    { return Usual_p.generate(htmlTag, innerTag, p); }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article
     * body-content based on a "start-tag" and an "end-tag."  It is <B><I>very</I></B> important
     * to note that the text can only match a single text-node, and not span multiple text-nodes,
     * or be within {@code TagNode's} at all!  This should be easy to find: print up the HTML page
     * as a {@code Vector}, and inspect it!
     * 
     * @param startTextTag This must be text from an HTML {@code TextNode} that is
     * <I><B>contained</B> within one (single) {@code TextNode}</I> of the vectorized-HTML page.
     * 
     * @param endTextTag This must be text from an HTML {@code TextNode} that is also
     * <I><B>contained</B> in a single {@code TextNode}</I> of the vectorized-HTML page.
     * 
     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B> in
     * the article, specified by the text-tag parameters, and gets it.
     */
    @LinkJavaSource(handle="Usual_textTag")
    public static ArticleGet usual(String startTextTag, String endTextTag)
    { return Usual_textTag.generate(startTextTag, endTextTag); }

    /**
     * <I>This is a static, factory method for building ArticleGet.</I>
     *
     * <BR /><BR />This factory method generates an "ArticleGet" that will retrieve news-article
     * body-content based on starting and ending regular-expressions.  The matches performed by
     * the Regular Expression checker will be performed on {@code TextNode's}, not on the
     * {@code TagNode's}, or the page itself.  It is <B><I>very</I></B> important to note that the
     * text can only match a single {@code TextNode}, and not span multiple {@code TextNode's}, or
     * be within {@code TagNode's} at all!  This should be easy to find: print up the HTML page as
     * a {@code Vector}, and inspect it!
     * 
     * @param startPattern This must be a regular expression {@code Pattern} that matches an HTML
     * {@code TextNode} that is <I><B>contained</B> within one (single) {@code TextNode}</I> of
     * the vectorized-HTML page.
     * 
     * @param endPattern This must be a regular expression {@code Pattern} that matches an HTML
     * {@code TextNode} that is also <I><B>contained</B> in a single {@code TextNode}</I> of the
     * vectorized-HTML page.
     * 
     * @return This will return an "Article Getter" that looks for <B><I>non-HTML Text</I></B>
     * in the article, specified by the regular-expression pattern-matching parameters, and gets it.
     */
    @LinkJavaSource(handle="Usual_pattern")
    public static ArticleGet usual(Pattern startPattern, Pattern endPattern)
    { return Usual_pattern.generate(startPattern, endPattern); }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=AG_BRANCH>
     * 
     * @param urlSelectors This is a list of {@code Predicate<URL>} elements.  When one of these
     * returns {@code TRUE} for a particular {@code URL}, then the index of that
     * {@code URL}-selector in its {@code array} will be used to call the appropriate getter from
     * the parallel-{@code array} input-parameter {@code 'getters'}.
     * 
     * @param getters This is a list of getter elements.  These should be tailored to the
     * particular news-website sources that are chosen/selected by the {@code 'urlSelectors'}
     * parallel {@code array}.
     * 
     * @return This will be a "master {@code ArticleGet}" or a "dispatch {@code ArticleGet}."
     * All it does is simply traverse the first {@code array} looking for a
     * {@code Predicate}-match from the {@code 'urlSelectors'}, and then calls the getter in the
     * parallel {@code array}.
     * 
     * <BR /><BR /><B>NOTE:</B> If none of the {@code 'urlSelectors'} match when this
     * <B><I>"dispatch"</I></B> or rather <B><I>"branch"</I></B> is called by {@code class 
     * NewsSiteScrape}, the function/getter that is returned will throw an 
     * {@code ArticleGetException}.  It is important that the programmer only allow article
     * {@code URL's} that he can capably handle to pass to {@code class NewsSiteScrape}.
     * 
     * @throws IllegalArgumentException Will throw this exception if:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI>Either of these parameters are null</LI>
     * <LI>If they are not parallel, with differing lengths.</LI>
     * <LI>If either contain a null value.</LI>
     * </UL>
     */
    @LinkJavaSource(handle="Branch")
    public static ArticleGet branch(URLFilter[] urlSelectors, ArticleGet[] getters)
    {
        // NOTE(review): all validation happens inside Branch.generate, which is not visible from
        // this file.  The file imports ParallelArrayException, so the exception actually thrown
        // for mismatched array lengths may differ from the documented IllegalArgumentException
        // - TODO confirm against class Branch.
        return Branch.generate(urlSelectors, getters);
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Other Methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This is the analog of the standard-java {@code Function 'andThen'} method.  The returned
     * getter first invokes {@code 'this'} getter on the page, and then hands that result (along
     * with the very same {@code URL}) to {@code 'after'}.
     *
     * @param after This is the {@code ArticleGet} that will be (automatically) applied after
     * {@code 'this'} function.  May not be null.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_AND_THEN_RET>
     *
     * @throws NullPointerException If {@code 'after'} is null.  This is thrown immediately, when
     * the two getters are combined, rather than later when the combined getter is first used.
     */
    default ArticleGet andThen(ArticleGet after)
    {
        // Fail fast: without this check a null 'after' would only surface once the composed
        // getter is invoked - and only after 'this.apply' had already done its work.
        java.util.Objects.requireNonNull(after, "after");

        return (URL url, Vector<HTMLNode> page) -> after.apply(url, this.apply(url, page));
    }

    /**
     * This is the analog of the standard-java {@code Function 'compose'} method.  The returned
     * getter first invokes {@code 'before'} on the page, and then hands that result (along with
     * the very same {@code URL}) to {@code 'this'} getter.
     * 
     * @param before This is the {@code ArticleGet} that is performed first, whose results are
     * sent to {@code 'this'} function.  May not be null.
     * 
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=AG_COMPOSE_RET>
     *
     * @throws NullPointerException If {@code 'before'} is null.  This is thrown immediately, when
     * the two getters are combined, rather than later when the combined getter is first used.
     */
    default ArticleGet compose(ArticleGet before)
    {
        // Fail fast, mirroring 'andThen': reject a null argument at composition time.
        java.util.Objects.requireNonNull(before, "before");

        return (URL url, Vector<HTMLNode> page) -> this.apply(url, before.apply(url, page));
    }

    /**
     * The identity getter will always return the same {@code Vector<HTMLNode>} as output that
     * it receives as input.  This is the analog of Java's {@code static} method
     * {@code java.util.function.Function.identity()}.
     *
     * <BR /><BR /><B>NOTE:</B> Before the page is returned, both arguments are passed to
     * {@code ArticleGetException.check(url, page)}.  Anything thrown by that check will
     * propagate to the caller of the returned getter.
     * 
     * @return a new {@code ArticleGet} (a two-argument function that accepts a {@code URL} and a
     * {@code Vector<HTMLNode>})
     * <BR /><BR />...<I> where the returned {@code Vector} is always the same (identical)
     * reference as the input {@code Vector}.</I>
     */
    static ArticleGet identity()
    {
        return (URL url, Vector<HTMLNode> page) ->
        {
            ArticleGetException.check(url, page);
            return page;
        };
    }

}