Source code

001package Torello.HTML.Tools.Images;
002
003
004// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
005// My Imports
006// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
007
008import Torello.HTML.*;
009import Torello.Java.*;
010
011import Torello.HTML.NodeSearch.TagNodeFind;
012import Torello.Java.Additional.Ret2;
013import Torello.Java.Additional.AppendableLog;
014import Torello.Java.Additional.AppendableSafe;
015
016
017// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
018// JDK Imports.  These are all spelled-out at the bottom, because none of them are commonly used.
019// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
020
021// ByteArrayOutputStream, File, IOException
022import java.io.*;
023
024import java.net.URL;
025import java.util.Vector;
026
027/**
028 * A more advanced class for both downloading and saving a list of images, using URL's.
029 * 
030 * <EMBED CLASS='external-html' DATA-FILE-ID=ISR>
031 */
032@Torello.JavaDoc.StaticFunctional
033@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS")
034public class ImageScraper
035{
036    // This Class is Static-Functional, and does not have any program state, other than the monitor
037    // Thread.  There is no need for a public-constructor, or any constructor for that matter.
038
039    private ImageScraper() { }
040
041
042    // ********************************************************************************************
043    // ********************************************************************************************
044    // Thread-Related Stuff
045    // ********************************************************************************************
046    // ********************************************************************************************
047
048
049    /**
050     * If this class has been used to make "multi-threaded" calls that use a Time-Out wait-period,
051     * you might see your Java-Program hang for a few seconds when you would expect it to exit back
052     * to your O.S. normally.
053     * 
054     * <BR /><BR /><B CLASS=JDDescLabel>Before Exiting:</B>
055     * 
056     * <BR />When a program you have written reaches the end of its code, if you have performed any
057     * time-dependent Image-Downloads using this class (class {@code ImageScraper}), then your
058     * program <I>might not exit immediately,</I> but rather sit at the command-prompt for anywhere
059     * between 10 and 30 seconds before this Timeout-Thread dies.
060     *
061     * <BR /><BR />Note that you may immediately terminate any additional threads that were started
062     * using this method.
063     */
064    public static void shutdownTOThreads() { DownloadImage.executor.shutdownNow(); }
065
066
067    // ********************************************************************************************
068    // ********************************************************************************************
069    // Primary User-API Methods
070    // ********************************************************************************************
071    // ********************************************************************************************
072
073
074    /**
075     * Downloads images located inside an HTML Page and updates the {@code SRC=...} {@code URL's}
076     * so that the links point to a <I>local copy</I> of <I>local images</I>.
077     *
078     * <BR /><BR />After completion of this method, an HTML page which contained any HTML image
079     * elements will have had those images downloaded to the local file-system, and also have had 
080     * the HTML attribute {@code 'src=...'} changed to reflect the local image name instead of the
081     * Internet URL name.
082     *
083     * @param page Any vectorized-html page or subpage.  This page should have HTML {@code <IMG ...>}
084     * elements in it, or else this method will exit without doing anything.
085     *
086     * @param pageURL If any of the HTML image elements have {@code src='...'} attributes that are
087     * partially resolved or <I>relative {@code URL's}</I> then this can be passed to the
088     * {@code ImageScraper} constructors in order to convert partial or relative {@code URL's}
089     * into complete {@code URL's.}  The Image Downloader simply cannot work with partially
090     * resolved {@code URL's}, and will skip them if they are partially resolved.  This parameter
091     * may be null, but if it is and there are incomplete-{@code URL's} those images will
092     * simply not be downloaded.
093     *
094     * @param log This is the 'logger' for this method.  It may be null, and if it is - no output
095     * will be sent to the terminal.
096     *
097     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
098     *
099     * @param targetDirectory This File-System directory where these files shall be stored.
100     *
101     * @return An instance of {@code Ret2<int[], Results>}.  The two returned elements
102     * of this class include:
103     *
104     * <BR /><BR /><UL CLASS=JDUL>
105     * 
106     * <LI> {@code Ret2.a (int[])}
107     *      <BR /><BR />This shall contain an index-array for the indices of each HTML
108     *      {@code '<IMG SRC=...>'} element found on the page.  It is not guaranteed that each of
109     *      images will have been resolved or downloaded successfully, but rather just that an HTML
110     *      {@code 'IMG'} element that had a {@code 'SRC'} attribute.  The second element of this
111     *      return-type will contain information regarding which images downloaded successfully.
112     *      <BR /><BR />
113     *      </LI>
114     * 
115     * <LI> {@code Ret2.b (Results)}
116     *      <BR /><BR />The second element of the return-type shall be the instance of
117     *      {@link Results} returned from the invocation of
118     *      {@code ImageScraper.download(...)}.  This method will provide details about each of the
119     *      images that were downloaded; or, if the download failed, the reasons for the failure.
120     *      <I>This return element shall be null if no images were found on the page.</I>
121     *      </LI>
122     * 
123     * </UL>
124     * 
125     * <BR />These return {@code Object} references are not necessarily important - <I>and they
126     * may be discarded if needed.</I>  They are provided as a matter of utility if further
127     * verification or research into successful downloads is needed.
128     * 
129     * @throws IOException I/O Problems that weren't avoided.
130     * @throws ImageScraperException Thrown for any number of errors that went unsuppressed.
131     */
132    public static Ret2<int[], Results> localizeImages
133        (Vector<HTMLNode> page, URL pageURL, Appendable log, String targetDirectory)
134        throws IOException, ImageScraperException
135    {
136        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
137        // Find all of the Image TagNode's on the Input Web-Page
138        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
139
140        final int[]             imgPosArr   = TagNodeFind.all(page, TC.Both, "img");
141        final Vector<TagNode>   vec         = new Vector<>();
142
143        // No Images Found.
144        if (imgPosArr.length == 0) return new Ret2<int[], Results>(imgPosArr, null);
145
146        for (final int pos : imgPosArr) vec.addElement((TagNode) page.elementAt(pos));
147
148
149        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
150        // Build a Request and Download all of the Image's that were just found / identified
151        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
152
153        final Request request = Request.buildFromTagNodeIter(vec, pageURL, true);
154        request.targetDirectory = targetDirectory;
155
156        // NOTE: This is NOT FINISHED:
157        // SET ALL OF THE "Skip On Exception" booleans to TRUE!!!
158
159        // Invoke the Main Image Downloader
160        final Results r = ImageScraper.download(request, log);
161
162
163        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
164        // Replace the <IMG SRC=...> TagNode URL's for images that were successfully downloaded.
165        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
166
167        // Now replace 
168        final ReplaceFunction replacer = (HTMLNode n, int arrPos, int count) ->
169        {
170            if (r.skipped[count] == false)
171
172                return ((TagNode) page.elementAt(arrPos))
173                        .setAV("src", r.fileNames[count], SD.SingleQuotes);
174
175            else return (TagNode) n;
176        };
177    
178        ReplaceNodes.r(page, imgPosArr, replacer);
179
180
181        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
182        // Report the Results
183        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
184
185        return new Ret2<int[], Results>(imgPosArr, r);
186    }
187
188    /**
189     * This will iterate through the {@code URL's} and download them.  Note that parameter
190     * {@code 'log'} may be null, and if so, it will be quietly ignored.
191     *
192     * @param request This parameter takes customization requests for batch image downloads.  To
193     * read more information about how to configure a download, please review the documentation for
194     * the class {@link Request}.
195     *
196     * <BR /><BR />Note that upon entering this method, this parameter is immediately cloned to
197     * prevent the possibility of Thread Concurrency Problems from happening.  After cloning, the
198     * the cloned instance is used exclusively, and the original parameter is discarded.  Further
199     * changes to the parameter-instance will not have any effect on the process.
200     * 
201     * @param log This shall receive text / log information.  This parameter may receive null, and
202     * if it does it will be ignored.  When ignored, logging information will not printed.
203     *
204     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
205     *
206     * @return an instance of {@code class Results} for the download.  The {@link Results} class
207     * contains several parallel arrays with information about images that have downloaded.  If an
208     * image-download happens to fail due to an improperly formed {@code URL} (or an 'incorrect' 
209     * {@code URL}), then the information in the {@code Results} arrays will contain a 'null' value
210     * for the index at those array-positions corresponding to the failed image.
211     *
212     * @throws ImageScraperException Thrown for any number of exceptions that may be thrown while
213     * executing the download-loop.  If another exception is thrown, then it is wrapped by this
214     * class' exception ({@link ImageScraperException}), and set as the {@code 'cause'} of that
215     * exception.
216     * 
217     * @throws AppendableError The interface {@code java.lang.Appendable} was designed to allow for
218     * an implementation to throw the (unchecked) exception {@code IOException}.  This has many 
219     * blessings, but can occasionally be a pain since, indeed, {@code IOException} is both an
220     * unchecked exception (and requires an explicity catch), and also very common
221     * (even ubiquitous) inside of HTTP download code.
222     * 
223     * <BR /><BR />If the user-provided {@code 'log'} parameter throws an {@code IOException} for
224     * simply trying to write character-data to the log about the download-progress, then <I>an
225     * {@code AppendableError} will be thrown</I>.  Note that this throwable does inherit 
226     * {@code java.lang.Error}, meaning that it won't be caught by standard Java {@code catch}
227     * clauses <I>(unless {@code 'Error'} is explicity mentioned!)</I>
228     */
229    public static Results download(Request request, Appendable log)
230        throws ImageScraperException
231    {
232        // Clone the Request, Similar to "SafeVarArgs" - Specifically, if the user starts playing
233        // with the contents of this class in the middle of a download, it will not have any effect
234        // on the 'request' object that is actually being used.
235
236        request = request.clone();        
237    
238        // Runs a few tests to make sure there are no problems using the request
239        request.CHECK();
240
241        // Makes log printing easier and easier.
242        final AppendableLog al = new AppendableLog(log, request.verbosity);
243
244        // Main Request-Configuration and Response Class Instances.
245        final Results results = new Results(request.size);
246
247        // Private, Internal Static-Class.  Makes passing variables even easier
248        final RECORD r = new RECORD(request, results, al);
249
250        // Now, this just gets rid of the surrounding try-catch block.  This is the only real
251        // reason for the internal/private method 'downloadWithoutTryCatch'.  This makes the
252        // indentation look a lot better.  Also, in this method, the 'log' is replaced with the
253        // AppendableSafe log
254
255        try 
256        {
257            // private static void mainDownloadLoop(RECORD r) throws ImageScraperException
258            // Helps prepare for the printing loop;
259
260            if (r.logLevelGTEQ1) r.append("\n");
261
262            for (URL url : r.request.source())
263            {
264                r.reset();
265                r.url = url;
266                MainLoopBody.loop(r);
267            }
268
269            return results;
270        }
271
272        catch (ImageScraperException e)
273        {
274            // If an exception causes the system to stop/halt, this extra '\n\n' makes the output
275            // text look a little nicer (sometimes... Sometimes it already looks fine).
276            // No more no less.
277
278            if (al.hasLog) al.append("\n\nThrowing ImageScraperException...\n");
279            throw e;
280        }
281    }
282}