// ScrapeURLs.java.html

package Torello.HTML.Tools.NewsSite;

import Torello.HTML.HTMLNode;
import Torello.HTML.HTMLPage;
import Torello.HTML.HTTPCodes;
import Torello.HTML.URLFilter;
import Torello.HTML.Links;
import Torello.HTML.TagNode;

import Torello.HTML.NodeSearch.InnerTagGet;

import Torello.Java.StrPrint;
import Torello.Java.StrCmpr;
import Torello.Java.StrIndent;

import static Torello.Java.C.RESET;
import static Torello.Java.C.BRED;
import static Torello.Java.C.BYELLOW;

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.URLs;

import Torello.JavaDoc.Excuse;
import Torello.JavaDoc.StaticFunctional;

import java.util.Vector;
import java.io.IOException;
import java.util.stream.Stream;
import java.util.stream.Collectors;

import java.net.URL;
import java.net.MalformedURLException;

/**
 * Collects all <B>news-article {@code URL's}</B> from a news oriented web-site's main web-page
 * and from the list 'sub-section' web-pages.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE_URLS>
 */
@StaticFunctional(Excused="SKIP_ON_SECTION_URL_EXCEPTION", Excuses=Excuse.CONFIGURATION)
public class ScrapeURLs
{
    private ScrapeURLs() { }

    /** <EMBED CLASS='external-html' DATA-FILE-ID=SU_SKIP_OSUE> */
    public static boolean SKIP_ON_SECTION_URL_EXCEPTION = true;

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #get(Vector, URLFilter, LinksGet, Appendable)}
     */
    public static Vector<Vector<String>> get(NewsSite ns, Appendable log) throws IOException
    { return get(ns.sectionURLsVec(), ns.filter, ns.linksGetter, log); }

    /**
     * This class is used to retrieve <B>all</B> of the available article {@code URL} links found
     * on <B>all sections</B> of a newspaper website.
     * 
     * @param sectionURLs       <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URLS>
     * @param articleURLFilter  <EMBED CLASS='external-html' DATA-FILE-ID=SU_ARTICLE_URLF>
     * @param linksGetter       <EMBED CLASS='external-html' DATA-FILE-ID=SU_LINKS_GET>
     * 
     * @param log This prints log information to the screen.  This parameter may not be null,
     * or a {@code NullPointerException} will throw.
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return                      <EMBED CLASS='external-html' DATA-FILE-ID=SU_RETURNS>
     * @throws SectionURLException  <EMBED CLASS='external-html' DATA-FILE-ID=SU_SEC_URL_EX>
     */
    public static Vector<Vector<String>> get(
            Vector<URL> sectionURLs, URLFilter articleURLFilter, LinksGet linksGetter,
            Appendable log
        )
    {
        LOG_WRITE(
            log,
            "\n" + BRED +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" + 
            RESET + " Finding Article URL's in Newspaper Sections" + BRED + "\n" +
            "*****************************************************************************************\n" +
            "*****************************************************************************************\n" + 
            RESET + '\n'
        );

        final Vector<Vector<String>> ret = new Vector<>();

        for (URL sectionURL : sectionURLs)
        {
            Stream<String> urlStream;

            // It helps to run this, because web-pages can use a lot of strings
            System.gc();

            // Starting Scraping the Section for URL's
            LOG_WRITE(log, "Visiting Section URL: " + sectionURL.toString() + '\n');

            try
            {
                // Download, Scrape & Parse the main-page or section URL.
                Vector<HTMLNode> sectionPage = HTMLPage.getPageTokens(sectionURL, false);


                // If the 'LinksGet' instances is null, then select all URL's on the main-page
                // section-pge, and pray for rain (hope for the best).  If no 'LinksGet' instance
                // was provided, there will likely be many spurious / irrelevant links to
                // non-article pages, and even advertisement pages that are also included in this
                // Stream<String>.
                // 
                // InnerTagGet returns a Vector<TagNode>.  Convert that to a Stream<String>, where
                // each 'String' in the 'Stream' is the HREF attribute of the <A HREF=...> tag.

                if (linksGetter == null)
                    urlStream = InnerTagGet.all(sectionPage, "a", "href")
                        .stream().map((TagNode tn) -> tn.AV("href"));

                else 
                    urlStream = linksGetter.apply(sectionURL, sectionPage).stream();
            }

            catch (Exception e)
            {
                LOG_WRITE(
                    log,
                    BRED + "Error loading this main-section page-URL\n" + RESET +
                    e.getMessage() + '\n'
                );

                if (SKIP_ON_SECTION_URL_EXCEPTION)
                {
                    LOG_WRITE(log, "Non-fatal Exception, continuing to next Section URL.\n\n");
                    continue;
                }

                else
                {
                    LOG_WRITE(
                        log,
                        BRED + "Fatal - Exiting.  Top-Level Section URL's must be valid URL's." +
                        RESET + "\n" + HTTPCodes.convertMessageVerbose(e, sectionURL, 0) + '\n'
                    );

                    throw new SectionURLException
                        ("Invalid Main Section URL: " + sectionURL.toString(), e);
                }
            }

            Vector<String> sectionArticleURLs = urlStream

                // If any TagNode's did not have HREF-Attributes, remove those null-values
                .filter ((String href)  -> (href != null))

                // Perform a Standard String.trim() operation.
                .map    ((String href) -> href.trim())

                // Any HREF's that are "just white-space" are now removed.
                .filter ((String href) -> href.length() > 0)


                // This removes any HREF Attribute values that begin with
                // "mailto:" "tel:" "javascript:" "magnet:" etc...

                .filter ((String href) -> StrCmpr.startsWithNAND_CI(href, Links.NON_URL_HREFS()))

                // Now, Resolve any "Partial URL References"
                .map ((String href) -> Links.resolve_KE(href, sectionURL))
                                                     

                // If there were any exceptions while performing the Partial-URL Resolve-Operation,
                // then print an error message.

                .peek ((Ret2<URL, MalformedURLException> r2) ->
                {
                    if (r2.b != null) LOG_WRITE(
                        log,
                        "Section URL was a malformed URL, and provided exception messsage:\n" +
                        r2.b.getMessage() + '\n'
                    );
                })

                // Convert the Ret2<URL, Exception> to just the URL, without any Exceptions
                .map ((Ret2<URL, MalformedURLException> r2) -> r2.a)

                // If there was an exception, the URL Ret.a field would be null (remove nulls)
                .filter ((URL url) -> url != null)


                // NOTE: When this evaluates to TRUE - it should be kept
                // Java Stream's 'filter' method states that when the predicate evaluates to TRUE,
                // the stream element is KEPT / RETAINED.
                // 
                // Class URLFilter mimics the filter behavior of Streams.filter(...)

                .filter ((URL url) -> (articleURLFilter == null) || articleURLFilter.test(url))


                // Convert these to "Standard Strings"
                //      Case-Insensitive parts are set to LowerCase
                //      Case Sensitive Parts are left alone.

                .map ((URL url) -> URLs.urlToString(url))


                // Filter any duplicates -> This is the reason for the above case-sensitive parts
                // being separated.

                .distinct()


                // Convert the URL's back to a String. There really should not be any exceptions,
                // This is just an "extra-careful" step.  It is not needed.

                .filter ((String url) ->
                    { try { new URL(url); return true; } catch (Exception e) { return false; } })

                // Convert the Stream to a Vector
                .collect(Collectors.toCollection(Vector::new));

            ret.add(sectionArticleURLs);

            LOG_WRITE(
                log,
                "Found [" + BYELLOW + sectionArticleURLs.size() + RESET + "] " +
                "Article Links.\n\n"
            );
        }


        // Provide a simple count to the log output on how many URL's have been uncovered.
        // NOTE: This does not heed whether different sections contain non-distinct URL's.
        //       (An identical URL found in two different sections will be counted twice!)

        int totalURLs = 0;

        // <?> Prevents the "Xlint:all" from generating warnings...
        for (Vector<?> section : ret) totalURLs += section.size();

        LOG_WRITE(
            log,
            "Complete Possible Article URL list has: " + 
            BYELLOW + StrPrint.zeroPad10e4(totalURLs) + RESET + ' ' +
            "url(s).\n\n"
        );

        return ret;
    }

    // This is necessary because Java's 'java.lang.Appendable' permits a IOException
    private static void LOG_WRITE(Appendable log, String s)
    {
        try
            { log.append(s); }

        catch (Exception e)
        {
            System.out.println(
                "While trying to write to the log, an exception occurred\n" +
                "the java.lang.Appendable you have provided threw an IOException:\n" +
                StrIndent.indent(e.getMessage(), 4) +
                "Unfortunaely, with a faulty Appendable-Log, the scraper cannot continue."
            );

            e.printStackTrace();

            System.out.println("Fatal Error, JVM Exiting...");

            System.exit(1);
        }
    }
}