1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
package Torello.HTML.Tools.Images;


// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// My Imports
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

import Torello.HTML.*;
import Torello.Java.*;

import Torello.HTML.NodeSearch.TagNodeFind;
import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.AppendableLog;
import Torello.Java.Additional.AppendableSafe;


// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// JDK Imports.  These are all spelled-out at the bottom, because none of them are commonly used.
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

// ByteArrayOutputStream, File, IOException
import java.io.*;

import java.net.URL;
import java.util.Vector;

/**
 * A more advanced utility for downloading a list of images, identified by their {@code URL's},
 * and saving them.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ISR>
 */
@Torello.JavaDoc.StaticFunctional
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS")
public class ImageScraper
{
    // Static-Functional Class: the only program state is the monitor Thread.  Nothing here is
    // ever instantiated, so the lone constructor is private and empty.

    private ImageScraper() { }


    // ********************************************************************************************
    // ********************************************************************************************
    // Thread-Related Stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Shuts down the executor that is used for Time-Out based ("multi-threaded") image downloads.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Before Exiting:</B>
     * 
     * <BR />If a program has performed any time-dependent Image-Downloads using this class
     * (class {@code ImageScraper}), then upon reaching the end of its code that program
     * <I>might not exit immediately</I>.  Instead, it may sit at the command-prompt for anywhere
     * between 10 and 30 seconds, until the Timeout-Thread dies.
     *
     * <BR /><BR />Invoking this method immediately terminates any such additional threads, which
     * allows the program to return to the O.S. without that delay.
     */
    public static void shutdownTOThreads()
    {
        DownloadImage.executor.shutdownNow();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Primary User-API Methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Downloads the images referenced by an HTML Page, and then rewrites the {@code SRC=...}
     * attributes of that page so that they point at the <I>local copies</I> of those images.
     *
     * <BR /><BR />When this method completes, the images of every HTML image element that was
     * not skipped will have been requested for download into the target directory, and the
     * {@code 'src=...'} attribute of each such element will hold the local image file-name,
     * rather than the Internet {@code URL}.  Elements whose download was skipped are left
     * unmodified.
     *
     * @param page Any vectorized-html page or subpage.  If this page does not contain any HTML
     * {@code <IMG ...>} elements, this method exits without doing anything.
     *
     * @param pageURL The {@code URL} of the page itself.  It is handed to the {@link Request}
     * that is built for the download, so that {@code src='...'} attributes holding partial or
     * <I>relative {@code URL's}</I> may be converted into complete {@code URL's}.  The Image
     * Downloader cannot work with partially resolved {@code URL's}.  This parameter may be null,
     * but if it is, and the page contains incomplete-{@code URL's}, those images will simply not
     * be downloaded.
     *
     * @param log This is the 'logger' for this method.  It may be null, and if it is - no output
     * will be sent to the terminal.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @param targetDirectory The File-System directory into which the image files are stored.
     *
     * @return An instance of {@code Ret2<int[], Results>}.  The two returned elements are:
     *
     * <BR /><BR /><UL CLASS=JDUL>
     * 
     * <LI> {@code Ret2.a (int[])}
     *      <BR /><BR />An index-array holding the {@code Vector}-index of each HTML
     *      {@code '<IMG SRC=...>'} element found on the page.  Presence in this array does not
     *      guarantee that the image was resolved or downloaded successfully; it only says that
     *      an HTML {@code 'IMG'} element was found at that location.  The second element of
     *      this return-type reports which images actually downloaded.
     *      <BR /><BR />
     *      </LI>
     * 
     * <LI> {@code Ret2.b (Results)}
     *      <BR /><BR />The instance of {@link Results} that was returned by the invocation of
     *      {@code ImageScraper.download(...)}.  It provides details about each of the images
     *      that were downloaded; or, if a download failed, the reasons for the failure.
     *      <I>This return element is null if no images were found on the page.</I>
     *      </LI>
     * 
     * </UL>
     * 
     * <BR />Neither returned reference is required for the localization itself - <I>both may be
     * discarded.</I>  They are provided as a convenience, should further verification of the
     * downloads be needed.
     * 
     * @throws IOException I/O Problems that weren't avoided.
     * @throws ImageScraperException Thrown for any number of errors that went unsuppressed.
     */
    public static Ret2<int[], Results> localizeImages
        (Vector<HTMLNode> page, URL pageURL, Appendable log, String targetDirectory)
        throws IOException, ImageScraperException
    {
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 1: Locate every Image TagNode on the Input Web-Page
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final int[] imgIndices = TagNodeFind.all(page, TC.Both, "img");

        // An image-free page: nothing to download, nothing to rewrite.
        if (imgIndices.length == 0) return new Ret2<int[], Results>(imgIndices, null);

        // Collect the actual nodes, in page-order, so that a Request can be built from them.
        final Vector<TagNode> imgTags = new Vector<>();

        for (int i = 0; i < imgIndices.length; i++)
            imgTags.addElement((TagNode) page.elementAt(imgIndices[i]));


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 2: Build a Request for those images, and run the downloader
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final Request request = Request.buildFromTagNodeIter(imgTags, pageURL, true);

        request.targetDirectory = targetDirectory;

        // TODO (carried over from the original author): This is NOT FINISHED.
        // All of the "Skip On Exception" booleans still need to be set to TRUE.

        final Results results = ImageScraper.download(request, log);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 3: Point the SRC attribute of each non-skipped <IMG> at its local file-name
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        final ReplaceFunction srcLocalizer = (HTMLNode node, int pagePos, int imgNum) ->
        {
            // A skipped image keeps its original node, untouched.
            if (results.skipped[imgNum]) return (TagNode) node;

            final TagNode imgTag = (TagNode) page.elementAt(pagePos);

            return imgTag.setAV("src", results.fileNames[imgNum], SD.SingleQuotes);
        };

        ReplaceNodes.r(page, imgIndices, srcLocalizer);


        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
        // Step 4: Report the Results
        // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***

        return new Ret2<int[], Results>(imgIndices, results);
    }

    /**
     * Iterates over the {@code URL's} of a {@link Request}, and downloads each of them.  Note
     * that parameter {@code 'log'} may be null, and if so, it is quietly ignored.
     *
     * @param request This parameter carries the customization settings for a batch image
     * download.  For more information about how to configure a download, please review the
     * documentation for the class {@link Request}.
     *
     * <BR /><BR />Upon entering this method, this parameter is immediately cloned in order to
     * prevent Thread Concurrency Problems.  After cloning, the cloned instance is used
     * exclusively, and the original parameter is no longer consulted.  Further changes made to
     * the parameter-instance will not have any effect on the download.
     * 
     * @param log This receives text / log information.  This parameter may receive null, and
     * if it does it is ignored.  When ignored, logging information will not be printed.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE>
     *
     * @return an instance of {@code class Results} for the download.  The {@link Results} class
     * contains several parallel arrays with information about the images that have downloaded.
     * If an image-download happens to fail due to an improperly formed {@code URL} (or an
     * 'incorrect' {@code URL}), then the {@code Results} arrays will contain a 'null' value at
     * the array-positions corresponding to the failed image.
     *
     * @throws ImageScraperException Thrown for any number of exceptions that may occur while
     * executing the download-loop.  If another exception is thrown, it is wrapped by this class'
     * exception ({@link ImageScraperException}), and set as the {@code 'cause'} of that
     * exception.
     * 
     * @throws AppendableError The interface {@code java.lang.Appendable} permits an
     * implementation to throw the <B>checked</B> exception {@code IOException} from its
     * {@code append} methods.  This has many blessings, but can occasionally be a pain since
     * {@code IOException} requires an explicit catch, and is also very common (even ubiquitous)
     * inside of HTTP download code.
     * 
     * <BR /><BR />If the user-provided {@code 'log'} parameter throws an {@code IOException}
     * while download-progress text is being written to it, then <I>an {@code AppendableError}
     * will be thrown</I>.  Note that this throwable inherits {@code java.lang.Error}, meaning
     * that it won't be caught by standard Java {@code catch} clauses <I>(unless {@code 'Error'}
     * is explicitly mentioned!)</I>
     */
    public static Results download(Request request, Appendable log)
        throws ImageScraperException
    {
        // Work on a private copy.  Should the caller modify their own instance in the middle of
        // a download, the copy that is actually being used remains unaffected.

        final Request safeRequest = request.clone();

        // Sanity-checks the request before any downloading begins.
        safeRequest.CHECK();

        // Log-wrapper, configured with the verbosity level asked for by the request.
        final AppendableLog al = new AppendableLog(log, safeRequest.verbosity);

        // The response instance that is handed back to the caller.
        final Results results = new Results(safeRequest.size);

        // Private, Internal Static-Class that bundles the request, results and log together, so
        // that they may be passed to the loop-body as a single reference.

        final RECORD record = new RECORD(safeRequest, results, al);

        try
        {
            // A leading newline, printed before the per-image output begins.
            if (record.logLevelGTEQ1) record.append("\n");

            for (final URL imageURL : record.request.source())
            {
                record.reset();
                record.url = imageURL;
                MainLoopBody.loop(record);
            }
        }

        catch (ImageScraperException e)
        {
            // When an exception halts the download, the extra '\n\n' makes the output text look
            // a little nicer (sometimes... sometimes it already looks fine).

            if (al.hasLog) al.append("\n\nThrowing ImageScraperException...\n");

            throw e;
        }

        return results;
    }
}