HTMLPageMWT.java.html

package Torello.HTML;

import java.io.*;
import java.util.Vector;
import java.net.URL;
import Torello.HTML.HTMLPage.Parser;

import java.util.concurrent.*;
import java.util.concurrent.locks.*;

import Torello.JavaDoc.Excuse;

/**
 * A carbon-copy of class {@link HTMLPage}, augmented with a mechanism for setting <B>a timeout</B> 
 * so that when scraping web-pages and {@code URL's} from servers that might have a tendency to hang,
 * freeze, or delay - the Java Virtual Machine can skip and move-on when that timeout expires. 
 * 
 * <BR /><BR />
 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MWT>
 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE>
 * 
 * @see Scrape#getHTML(BufferedReader, int, int)
 * @see Scrape#getHTML(BufferedReader, String, String)
 * @see HTMLPage
 */
@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON)
@Torello.JavaDoc.JDHeaderBackgroundImg
public class HTMLPageMWT
{
    // Not instantiable: every member of this class is static.
    private HTMLPageMWT()
    { }

    /**
     * The parser that every {@code getPageTokens} method in this class invokes to convert HTML
     * text into a {@code Vector<HTMLNode>}.  It is initialized to
     * {@code ParserRE::parsePageTokens}.
     *
     * <BR /><BR />If needing to "swap a proprietary parser" comes up, this is possible.
     * The replacement just needs to accept the same parameters as the current parser, and
     * produce a {@code Vector<HTMLNode>}.  This is not an advised step to take, but if an
     * alternative parser has been tested and happens to be generating different results, it can
     * be easily 'swapped out' for the one used now.
     *
     * <BR /><BR /><B>NOTE:</B> This is a public, non-final, static field.  It is read at the
     * moment a download task actually runs, so assigning a new value affects every call made
     * afterwards, regardless of which thread makes it.
     *
     * @see HTMLPage.Parser
     * @see HTMLPage.Parser#parse(CharSequence, boolean, String, String, String)
     */
    public static Parser parser = ParserRE::parsePageTokens;


    // ********************************************************************************************
    // ********************************************************************************************
    // These 6 functions presume that the HTML source is from a URL
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit, URL url, boolean eliminateHTMLTags
        )
        throws IOException, InterruptedException
    {
        // Typed nulls make it explicit which of the nine-parameter overloads is being chosen.
        final String startTag = null, endTag = null;
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, url, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag
        )
        throws IOException, InterruptedException
    {
        // None of the three file-name parameters is supplied by this overload.
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, url, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, int, int,
     *      String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum
        )
        throws IOException, InterruptedException
    {
        // None of the three file-name parameters is supplied by this overload.
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, url, eliminateHTMLTags,
            startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code URL} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'rawHTMLFile'}, {@code 'matchesFile'} and
     * {@code 'justTextFile'}
     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, URL, boolean, String, String,
     *      String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // Neither tag parameter is supplied by this overload.
        final String startTag = null, endTag = null;

        return getPageTokens(
            timeout, unit, url, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // The next 6 functions presume that the input is from a BufferedReader
    // ********************************************************************************************
    // ********************************************************************************************

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />Passes null to parameters
     * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit, BufferedReader br, boolean eliminateHTMLTags
        )
        throws IOException, InterruptedException
    {
        // Typed nulls make it explicit which of the nine-parameter overloads is being chosen.
        final String startTag = null, endTag = null;
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, br, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            String startTag, String endTag
        )
        throws IOException, InterruptedException
    {
        // None of the three file-name parameters is supplied by this overload.
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, br, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'}
     * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      int, int, String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum
        )
        throws IOException, InterruptedException
    {
        // None of the three file-name parameters is supplied by this overload.
        final String rawHTMLFile = null, matchesFile = null, justTextFile = null;

        return getPageTokens(
            timeout, unit, br, eliminateHTMLTags,
            startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile
        );
    }

    /**
     * Convenience Method.
     * <BR />Accepts: {@code BufferedReader} and Time-Out Parameters {@code 'timeout' & 'unit'}
     * <BR />And-Accepts: {@code 'rawHTMLFile'}, {@code 'matchesFile'} and
     * {@code 'justTextFile'}
     * <BR />Passes null to {@code startTag} &amp; {@code endTag} parameters.
     * <BR />Invokes: {@link #getPageTokens(long, TimeUnit, BufferedReader, boolean,
     *      String, String, String, String, String)}
     * <BR />Returns: Whatever the invoked method returns, unchanged.
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // Neither tag parameter is supplied by this overload.
        final String startTag = null, endTag = null;

        return getPageTokens(
            timeout, unit, br, eliminateHTMLTags,
            startTag, endTag, rawHTMLFile, matchesFile, justTextFile
        );
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // * Receives a "pre-instantiated" BufferedReader for the HTML Source parameter
    // ********************************************************************************************
    // ********************************************************************************************


    // Thread pool, shared by all callers, to which every getPageTokens method in this class
    // submits its read-and-parse task.  It is shut down by shutdownMWTThreads().
    private static final ExecutorService    executor    = Executors.newCachedThreadPool();
    // Held around each call to executor.submit(...) made by the getPageTokens methods.
    // NOTE(review): ExecutorService.submit is presumably already thread-safe, which would make
    // this lock redundant - confirm before removing it.
    private static final Lock               lock        = new ReentrantLock();

    /**
     * Shuts down the thread pool that this class uses for its timed downloads, by calling
     * {@code ExecutorService.shutdownNow()} on it.
     *
     * <BR /><BR /><B>Max Wait Time</B> operates by running each download on a worker thread taken
     * from a cached thread pool.  Therefore, when a program you have written yourself reaches
     * the end of its code, <I><B>if you have performed any Internet-Downloads using
     * {@code class HTMLPageMWT}</B></I>, then your program <I>might not exit immediately,</I>
     * but rather sit at the command-prompt until those worker threads have died.  An idle
     * thread of a pool built by {@code Executors.newCachedThreadPool()} is only discarded after
     * sixty seconds without work; a thread that is still busy with a download lasts as long as
     * that download does.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Multi-Threaded:</B>
     *
     * <BR />Invoking this method asks every worker thread started by this class to terminate
     * immediately, so that the program may exit back to the O.S. without the delay.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>One-Way:</B>
     *
     * <BR />A pool that has been shut down accepts no further tasks.  Any {@code getPageTokens}
     * call made after this method will fail with a {@code RejectedExecutionException}.
     */
    public static void shutdownMWTThreads()
    {
        executor.shutdownNow();
    }

    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source, giving up if a
     * user-specified time-limit is exceeded.
     *
     * <BR /><BR />The read-and-parse work is submitted to this class' shared thread pool, and the
     * calling thread waits for its result for at most {@code 'timeout'}.  If the time-limit
     * expires first, the task is cancelled and this method returns null.  If the calling thread
     * is interrupted while waiting, the task is likewise cancelled before the
     * {@code InterruptedException} is propagated.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Best-Effort Cancel:</B>
     *
     * <BR />Cancellation uses {@code Future.cancel(true)}, which interrupts the worker thread.
     * A worker that is blocked in an I/O call that does not respond to interruption may keep
     * running until that call returns.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // The work that may hang: reading from the reader, and then parsing what was read.
        Callable<Vector<HTMLNode>> threadDownloader = () -> parser.parse(
            Scrape.getHTML(br, startTag, endTag),
            eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
        );

        Future<Vector<HTMLNode>> future;

        // submit(...) throws RejectedExecutionException once the pool has been shut down.  The
        // lock must be released on that path too, or every later caller would block forever.
        lock.lock();
        try     { future = executor.submit(threadDownloader); }
        finally { lock.unlock(); }

        try
            { return future.get(timeout, unit); }

        catch (TimeoutException e) { return null; }

        catch (ExecutionException e)
        {
            Throwable originalException = e.getCause();

            if (originalException == null) throw new RejectedExecutionException(
                "An Execution Exception was thrown, but it did not provide a cause throwable " +
                "(e.getCause() returned null).  See this exception's getCause() method to " +
                "view the ExecutionException that has occurred.",
                e
            );

            // Re-throw the task's own exception, unwrapped, whenever its type permits.
            if (originalException instanceof IOException)
                throw (IOException) originalException;

            if (originalException instanceof RuntimeException)
                throw (RuntimeException) originalException;

            throw new RejectedExecutionException(
                "An Execution Exception occurred, but it was neither a RuntimeException, " +
                "nor IOException.  See this exception's getCause() method to view the " +
                "underlying error that has occurred.", originalException
            );
        }

        // Has no effect once the task has completed.  After a timeout or an interrupt, it
        // stops (best-effort) the work whose result nobody is waiting for anymore.
        finally { future.cancel(true); }
    }

    /**
     * Parses and Vectorizes HTML from a {@code BufferedReader} source, giving up if a
     * user-specified time-limit is exceeded.
     *
     * <BR /><BR />The read-and-parse work is submitted to this class' shared thread pool, and the
     * calling thread waits for its result for at most {@code 'timeout'}.  If the time-limit
     * expires first, the task is cancelled and this method returns null.  If the calling thread
     * is interrupted while waiting, the task is likewise cancelled before the
     * {@code InterruptedException} is propagated.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Best-Effort Cancel:</B>
     *
     * <BR />Cancellation uses {@code Future.cancel(true)}, which interrupts the worker thread.
     * A worker that is blocked in an I/O call that does not respond to interruption may keep
     * running until that call returns.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            BufferedReader br, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // The work that may hang: reading from the reader, and then parsing what was read.
        Callable<Vector<HTMLNode>> threadDownloader = () -> parser.parse(
            Scrape.getHTML(br, startLineNum, endLineNum),
            eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
        );

        Future<Vector<HTMLNode>> future;

        // submit(...) throws RejectedExecutionException once the pool has been shut down.  The
        // lock must be released on that path too, or every later caller would block forever.
        lock.lock();
        try     { future = executor.submit(threadDownloader); }
        finally { lock.unlock(); }

        try
            { return future.get(timeout, unit); }

        catch (TimeoutException e) { return null; }

        catch (ExecutionException e)
        {
            Throwable originalException = e.getCause();

            if (originalException == null) throw new RejectedExecutionException(
                "An Execution Exception was thrown, but it did not provide a cause throwable " +
                "(e.getCause() returned null).  See this exception's getCause() method to " +
                "view the ExecutionException that has occurred.",
                e
            );

            // Re-throw the task's own exception, unwrapped, whenever its type permits.
            if (originalException instanceof IOException)
                throw (IOException) originalException;

            if (originalException instanceof RuntimeException)
                throw (RuntimeException) originalException;

            throw new RejectedExecutionException(
                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
                "IOException.  See this exception's getCause() method to view the underlying " +
                "error that has occurred.", originalException
            );
        }

        // Has no effect once the task has completed.  After a timeout or an interrupt, it
        // stops (best-effort) the work whose result nobody is waiting for anymore.
        finally { future.cancel(true); }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Receives a java.net.URL for the HTML Source parameter -- that could Timeout/Hang
    // ********************************************************************************************
    // ********************************************************************************************


    // The URL connection must be opened within the Multi-Threaded "Timeout" code.  This
    // therefore requires a second version of the two BufferedReader methods above - one in
    // which Scrape.openConn(url) is called *inside* the monitored downloading thread.
    
    /**
     * Parses and Vectorizes HTML from a URL source, giving up if a user-specified time-limit
     * is exceeded.
     *
     * <BR /><BR />Opening the connection, downloading and parsing are all submitted as one task
     * to this class' shared thread pool, and the calling thread waits for its result for at most
     * {@code 'timeout'}.  If the time-limit expires first, the task is cancelled and this method
     * returns null.  If the calling thread is interrupted while waiting, the task is likewise
     * cancelled before the {@code InterruptedException} is propagated.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Best-Effort Cancel:</B>
     *
     * <BR />Cancellation uses {@code Future.cancel(true)}, which interrupts the worker thread.
     * A worker that is blocked in an I/O call that does not respond to interruption may keep
     * running until that call returns.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param url <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_URL>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG>
     * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            String startTag, String endTag,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // The connection is opened *inside* the task, so that a server which hangs while
        // connecting is covered by the time-limit as well.
        Callable<Vector<HTMLNode>> threadDownloader = () -> parser.parse(
            Scrape.getHTML(Scrape.openConn(url), startTag, endTag),
            eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
        );

        Future<Vector<HTMLNode>> future;

        // submit(...) throws RejectedExecutionException once the pool has been shut down.  The
        // lock must be released on that path too, or every later caller would block forever.
        lock.lock();
        try     { future = executor.submit(threadDownloader); }
        finally { lock.unlock(); }

        try
            { return future.get(timeout, unit); }

        catch (TimeoutException e) { return null; }

        catch (ExecutionException e)
        {
            Throwable originalException = e.getCause();

            if (originalException == null) throw new RejectedExecutionException(
                "An Execution Exception was thrown, but it did not provide a cause throwable " +
                "(e.getCause() returned null).  See this exception's getCause() method to " +
                "view the ExecutionException that has occurred.", e
            );

            // Re-throw the task's own exception, unwrapped, whenever its type permits.
            if (originalException instanceof IOException)
                throw (IOException) originalException;

            if (originalException instanceof RuntimeException)
                throw (RuntimeException) originalException;

            throw new RejectedExecutionException(
                "An Execution Exception occurred, but it was neither a RuntimeException, " +
                "nor IOException.  See this exception's getCause() method to view the " +
                "underlying error that has occurred.", originalException
            );
        }

        // Has no effect once the task has completed.  After a timeout or an interrupt, it
        // stops (best-effort) the work whose result nobody is waiting for anymore.
        finally { future.cancel(true); }
    }

    
    /**
     * Parses and Vectorizes HTML from a URL source, giving up if a user-specified time-limit
     * is exceeded.
     *
     * <BR /><BR />Opening the connection, downloading and parsing are all submitted as one task
     * to this class' shared thread pool, and the calling thread waits for its result for at most
     * {@code 'timeout'}.  If the time-limit expires first, the task is cancelled and this method
     * returns null.  If the calling thread is interrupted while waiting, the task is likewise
     * cancelled before the {@code InterruptedException} is propagated.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Best-Effort Cancel:</B>
     *
     * <BR />Cancellation uses {@code Future.cancel(true)}, which interrupts the worker thread.
     * A worker that is blocked in an I/O call that does not respond to interruption may keep
     * running until that call returns.
     *
     * @param timeout <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_TIMEOUT>
     * @param unit <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_UNIT>
     * @param url <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_URL>
     * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT>
     * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN>
     * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN>
     * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML>
     * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F>
     * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT>
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN>
     * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX>
     * @throws ScrapeException  <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1>
     * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX>
     * @throws InterruptedException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IEX>
     * @throws RejectedExecutionException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_REEX>
     */
    public static Vector<HTMLNode> getPageTokens(
            long timeout, TimeUnit unit,
            URL url, boolean eliminateHTMLTags,
            int startLineNum, int endLineNum,
            String rawHTMLFile, String matchesFile, String justTextFile
        )
        throws IOException, InterruptedException
    {
        // The connection is opened *inside* the task, so that a server which hangs while
        // connecting is covered by the time-limit as well.
        Callable<Vector<HTMLNode>> threadDownloader = () -> parser.parse(
            Scrape.getHTML(Scrape.openConn(url), startLineNum, endLineNum),
            eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile
        );

        Future<Vector<HTMLNode>> future;

        // submit(...) throws RejectedExecutionException once the pool has been shut down.  The
        // lock must be released on that path too, or every later caller would block forever.
        lock.lock();
        try     { future = executor.submit(threadDownloader); }
        finally { lock.unlock(); }

        try
            { return future.get(timeout, unit); }

        catch (TimeoutException e) { return null; }

        catch (ExecutionException e)
        {
            Throwable originalException = e.getCause();

            if (originalException == null) throw new RejectedExecutionException(
                "An Execution Exception was thrown, but it did not provide a cause throwable " +
                "(e.getCause() returned null).  See this exception's getCause() method to " +
                "view the ExecutionException that has occurred.",
                e
            );

            // Re-throw the task's own exception, unwrapped, whenever its type permits.
            if (originalException instanceof IOException)
                throw (IOException) originalException;

            if (originalException instanceof RuntimeException)
                throw (RuntimeException) originalException;

            throw new RejectedExecutionException(
                "An Execution Exception occurred, but it was neither a RuntimeException, nor " +
                "IOException.  See this exception's getCause() method to view the underlying " +
                "error that has occurred.", originalException
            );
        }

        // Has no effect once the task has completed.  After a timeout or an interrupt, it
        // stops (best-effort) the work whose result nobody is waiting for anymore.
        finally { future.cancel(true); }
    }
}