// ImageScraper.java.html

package Torello.HTML.Tools.Images;


// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// My Imports
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

import Torello.HTML.*;
import Torello.Java.*;

import Torello.HTML.NodeSearch.TagNodeFind;
import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.AppendableLog;
import Torello.Java.Additional.AppendableSafe;


// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// JDK Imports.  These are all spelled-out at the bottom, because none of them are commonly used.
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

// ByteArrayOutputStream, File, IOException
import java.io.*;

import java.net.URL;
import java.util.Vector;

/**
 * A more advanced class for both downloading and saving a list of images, using URL's.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ISR>
 */
@Torello.JavaDoc.StaticFunctional
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS")
public class ImageScraper
{
    // This class is Static-Functional: every member is static, and it keeps no program state of
    // its own (other than the monitor Thread).  The private, empty constructor below exists only
    // so that the class cannot be instantiated from outside.

    private ImageScraper() { }


    // ********************************************************************************************
    // ********************************************************************************************
    // Thread-Related Stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Immediately shuts down the executor that is used for Time-Out limited Image-Downloads.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Before Exiting:</B>
     *
     * <BR />If your program has performed any time-dependent ("multi-threaded") Image-Downloads
     * using class {@code ImageScraper}, then it <I>might not exit immediately</I> once it reaches
     * the end of its code.  Instead, it may sit at the command-prompt for anywhere between 10 and
     * 30 seconds, until the Timeout-Thread dies, before returning to your O.S.
     *
     * <BR /><BR />Invoking this method avoids that wait: it asks the Timeout-Executor to shut
     * down right away, by calling {@code shutdownNow()} on it.
     */
    public static void shutdownTOThreads()
    {
        DownloadImage.executor.shutdownNow();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Primary User-API Methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Downloads the images referenced by an HTML Page, and updates the {@code SRC=...}
     * attribute of each downloaded image so that it points to the <I>local copy</I> of that
     * image.
     *
     * <BR /><BR />Every HTML {@code <IMG>} element found inside {@code 'page'} is passed to
     * {@link #download(Request, Appendable)}.  Afterwards, for each image that was not skipped
     * <I>and</I> for which a file-name was recorded, the {@code 'src'} attribute of the
     * corresponding element in {@code 'page'} is replaced by that local file-name (wrapped in
     * single-quotes).  Image elements that were skipped, or that have no recorded file-name, are
     * left exactly as they were.
     *
     * @param page Any vectorized-html page or subpage.  This page should have HTML
     * {@code <IMG ...>} elements in it, or else this method will exit without doing anything.
     *
     * @param pageURL If any of the HTML image elements have {@code src='...'} attributes that are
     * partially resolved or <I>relative {@code URL's}</I>, then this {@code URL} is used to
     * convert those partial or relative {@code URL's} into complete {@code URL's}.  The Image
     * Downloader simply cannot work with partially resolved {@code URL's}, and will skip them if
     * they are partially resolved.  This parameter may be null, but if it is and there are
     * incomplete-{@code URL's}, those images will simply not be downloaded.
     *
     * @param log This is the 'logger' for this method.  It may be null, and if it is - no output
     * will be sent to the terminal.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @param targetDirectory The File-System directory where the downloaded image-files shall be
     * stored.  It is assigned to the {@code 'targetDirectory'} field of the {@link Request} that
     * this method builds.
     *
     * @return An instance of {@code Ret2<int[], Results>}.  The two returned elements
     * of this class include:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     *
     * <LI> {@code Ret2.a (int[])}
     *      <BR /><BR />This shall contain an index-array for the indices of each HTML
     *      {@code '<IMG>'} element found on the page.  It is not guaranteed that each of these
     *      images will have been resolved or downloaded successfully; the array only identifies
     *      where the {@code 'IMG'} elements are located.  The second element of this return-type
     *      will contain information regarding which images downloaded successfully.
     *      <BR /><BR />
     *      </LI>
     *
     * <LI> {@code Ret2.b (Results)}
     *      <BR /><BR />The second element of the return-type shall be the instance of
     *      {@link Results} returned from the invocation of
     *      {@code ImageScraper.download(...)}.  This instance will provide details about each of
     *      the images that were downloaded; or, if the download failed, the reasons for the
     *      failure.
     *      <I>This return element shall be null if no images were found on the page.</I>
     *      </LI>
     *
     * </UL>
     *
     * <BR />These return {@code Object} references are not necessarily important - <I>and they
     * may be discarded if needed.</I>  They are provided as a matter of utility if further
     * verification or research into successful downloads is needed.
     *
     * @throws IOException I/O Problems that weren't avoided.
     * @throws ImageScraperException Thrown for any number of errors that went unsuppressed.
     */
    public static Ret2<int[], Results> localizeImages
        (Vector<HTMLNode> page, URL pageURL, Appendable log, String targetDirectory)
        throws IOException, ImageScraperException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Find all of the Image TagNode's on the Input Web-Page
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final int[] imgPosArr = TagNodeFind.all(page, TC.Both, "img");

        // No Images Found.  Exit before allocating anything else.
        if (imgPosArr.length == 0) return new Ret2<int[], Results>(imgPosArr, null);

        // One TagNode per index in 'imgPosArr', in the same order.
        final Vector<TagNode> vec = new Vector<>(imgPosArr.length);

        for (final int pos : imgPosArr) vec.addElement((TagNode) page.elementAt(pos));


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Build a Request and Download all of the Image's that were just found / identified
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final Request request = Request.buildFromTagNodeIter(vec, pageURL, true);
        request.targetDirectory = targetDirectory;

        // NOTE: This is NOT FINISHED:
        // SET ALL OF THE "Skip On Exception" booleans to TRUE!!!

        // Invoke the Main Image Downloader
        final Results r = ImageScraper.download(request, log);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Replace the <IMG SRC=...> TagNode URL's for images that were successfully downloaded.
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final ReplaceFunction replacer = (HTMLNode n, int arrPos, int count) ->
        {
            // A skipped image keeps its original node.
            if (r.skipped[count]) return (TagNode) n;

            // The 'Results' arrays may hold null for an image whose download failed.  Without a
            // local file-name there is nothing to point 'src' at, so the node is kept as-is
            // rather than passing null on to 'setAV'.
            final String fileName = r.fileNames[count];

            if (fileName == null) return (TagNode) n;

            return ((TagNode) page.elementAt(arrPos))
                    .setAV("src", fileName, SD.SingleQuotes);
        };

        ReplaceNodes.r(page, imgPosArr, replacer);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Report the Results
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return new Ret2<int[], Results>(imgPosArr, r);
    }

    /**
     * Iterates through each {@code URL} provided by a {@link Request}, and downloads it.  Note
     * that parameter {@code 'log'} may be null, and if so, it will be quietly ignored.
     *
     * @param request This parameter takes customization requests for batch image downloads.  To
     * read more information about how to configure a download, please review the documentation for
     * the class {@link Request}.
     *
     * <BR /><BR />Note that upon entering this method, this parameter is immediately cloned to
     * prevent the possibility of Thread Concurrency Problems from happening.  After cloning, the
     * cloned instance is used exclusively, and the original parameter is no longer consulted.
     * Further changes to the parameter-instance will not have any effect on the process.
     *
     * @param log This shall receive text / log information.  This parameter may receive null, and
     * if it does it will be ignored.  When ignored, logging information will not be printed.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return an instance of {@code class Results} for the download.  The {@link Results} class
     * contains several parallel arrays with information about images that have downloaded.  If an
     * image-download happens to fail due to an improperly formed {@code URL} (or an 'incorrect'
     * {@code URL}), then the information in the {@code Results} arrays will contain a 'null' value
     * for the index at those array-positions corresponding to the failed image.
     *
     * @throws ImageScraperException Thrown for any number of exceptions that may be thrown while
     * executing the download-loop.  If another exception is thrown, then it is wrapped by this
     * class' exception ({@link ImageScraperException}), and set as the {@code 'cause'} of that
     * exception.
     *
     * @throws AppendableError The interface {@code java.lang.Appendable} was designed to allow
     * for an implementation to throw the <B>checked</B> exception {@code IOException}.  This has
     * many blessings, but can occasionally be a pain since {@code IOException} both requires an
     * explicit {@code catch} (or {@code throws} clause), and is also very common (even
     * ubiquitous) inside of HTTP download code.
     *
     * <BR /><BR />If the user-provided {@code 'log'} parameter throws an {@code IOException} for
     * simply trying to write character-data to the log about the download-progress, then <I>an
     * {@code AppendableError} will be thrown</I>.  Note that this throwable does inherit
     * {@code java.lang.Error}, meaning that it won't be caught by standard Java {@code catch}
     * clauses <I>(unless {@code 'Error'} is explicitly mentioned!)</I>
     */
    public static Results download(Request request, Appendable log)
        throws ImageScraperException
    {
        // Work exclusively on a private copy of the Request.  If the caller modifies the original
        // instance while a download is in progress, the copy used here is unaffected.

        final Request clonedRequest = request.clone();

        // Validates the (cloned) request before any downloading begins
        clonedRequest.CHECK();

        // Wraps the user's 'log' together with the requested verbosity-level
        final AppendableLog logger = new AppendableLog(log, clonedRequest.verbosity);

        // The instance that is ultimately returned to the caller
        final Results downloadResults = new Results(clonedRequest.size);

        // Private, Internal Class: bundles the request, results and log into a single reference
        final RECORD rec = new RECORD(clonedRequest, downloadResults, logger);

        try
        {
            // Leading newline, printed ahead of the per-image log output
            if (rec.logLevelGTEQ1) rec.append("\n");

            // Main Download Loop: one iteration per URL supplied by the request
            for (final URL imageURL : rec.request.source())
            {
                rec.reset();
                rec.url = imageURL;
                MainLoopBody.loop(rec);
            }
        }

        catch (ImageScraperException e)
        {
            // When an exception halts the download, this extra text makes the log output look a
            // little nicer (sometimes).  The exception itself is re-thrown, unchanged.

            if (logger.hasLog) logger.append("\n\nThrowing ImageScraperException...\n");
            throw e;
        }

        return downloadResults;
    }
}