Source code

001package Torello.HTML.Tools.Images;
002
003import Torello.Java.StringParse;
004import Torello.Java.StrPrint;
005import Torello.Java.WritableDirectoryException;
006
007import java.io.*;
008
009import javax.imageio.ImageIO;
010import java.net.URL;
011import java.awt.Image;
012import java.awt.image.BufferedImage;
013import java.util.Vector;
014
015/**
016 * A simple class for scraping &amp; downloading images using a URL, or list of URL's.
017 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
018 */
019public class ImageScrape
020{
021    // No public constructor
022    private ImageScrape() { }
023
024    /** {@code String}-Array having the list of file-formats */
025    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };
026
027    /**
028     * This will extract the file-extension from an image {@code URL.}  Not all images on the
029     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
030     * the case that the {@code 'urStr'} is a pointer to a non-image-file, null will be returned.
031     * 
032     * @param urlStr Is the {@code url} of the image. 
033     * 
034     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
035     * file-extension will be returned, otherwise {@code null} will be returned.
036     */
037    public static String getImageTypeFromURL(String urlStr)
038    {
039        if (urlStr == null) return null;
040
041        String ext = StringParse.fromExtension(urlStr, false);
042
043        if (ext == null) return null;
044
045        ext = ext.toLowerCase();
046
047        for (int i=0; i < imageExts.length; i++)
048            if (imageExts[i].equals(ext))
049                return imageExts[i];
050
051        return null;
052    }
053
054    /**
055     * Convenience Method.
056     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
057     */
058    public static String downloadImageGuessType(String urlStr, String outputFileStr)
059        throws IOException
060    { 
061        // We need to check whether the file-name that was passed is just a filename; or if it
062        // has a directory component in its name.
063
064        final int sep = outputFileStr.lastIndexOf(File.separator) + 1;
065
066        if (sep == 0)
067            return downloadImageGuessType(urlStr, outputFileStr, "");
068
069        else if (sep == outputFileStr.length())
070            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);
071
072        else return downloadImageGuessType
073            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
074    }
075
076    /**
077     * This will download an image, and try to guess if it is one of the following types:
078     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
079     * extension as a filename, then that format will be used to save to a file.  If that fails,
080     * an exception of type {@code javax.imageio.IIOException} is thrown.
081     *
082     * <DIV CLASS="EXAMPLE">{@code
083     *  // Retrieve all images found on a random Yahoo! News web-page article
084     *  URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
085     *  
086     *  // Parse & Scrape the Web-Page, store it in a local html-vector
087     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
088     *  
089     *  // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
090     *  // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
091     *  page = TagNodeGetInclusive.first(page, "article");
092     *  
093     *  // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
094     *  // For the news-article used in this example, the first image was an icon thumbnail.
095     *  // The second image contained the "Main Article Photo"
096     *  TagNode firstPic    = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
097     *  String  urlStr      = Links.resolveSRC(firstPic, url).toString();
098     *  
099     *  // Run this method.  A file named 'img.jpg' is saved.
100     *  System.out.println("Image URL to Download:" + urlStr);
101     *  ImageScrape.downloadImageGuessType(urlStr, "img");
102     * }</DIV>
103     *
104     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
105     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
106     * does contain an image extension in the {@code 'String'}, then this method will attempt to
107     * save the image using the appropriate file-extension, and throw an {@code 'IIOException'} if
108     * it fails.
109     * 
110     * @param outputFileStr This is the target or destination name for the output image file.
111     *
112     * <BR /><BR /><DIV CLASS=JDHint>
113     * <B STYLE='color:red;'>File-Extension:</B> This file is not intended to have an extension.
114     * The extension will be generated by the code in this method, and it will match whatever
115     * image-file-encoding was successfully used to download the file.  If this is a
116     * {@code '.png'}, for instance, but it did not download until {@code '.bmp'} was used
117     * (mis-labeled), this output file will be saved as {@code 'outputFileStr'} + {@code '.bmp'}.
118     * </DIV>
119     * 
120     * <BR /><DIV CLASS=JDHintAlt>
121     * <B STYLE='color: red;'>URL vs. File Names:</B> This parameter, {@code 'outputFileStr'},
122     * <B>may NOT be null</B>.  It is important to realize, here, that file-names and {@code URL's}
123     * do not obey the same naming conventions.  Because it is often seen on the internet
124     * Image-{@code URL's} that have a plethora of file-system 'irreverent' characters in their
125     * name, this method simply cannot pick out the file-name of an image from its
126     * {@code URL}.
127     * </DIV>
128     * 
129     * <BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
130     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
131     * the file-name of the image being downloaded is included in the image's {@code URL}).  
132     * However, because many of the modern content-providers on the internet use many layers of
133     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
134     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
135     * the image file-name is "too difficult" to discern from it's {@code URL}.
136     *
137     * @param outputDirectory This is just "prepended" to the file-save name.  This
138     * {@code 'String'} is not included in the returned filename.  <B>Specifically</B> The returned
139     * file name <I>only includes</I> the file-name and the file-name-extension.  It does not
140     * include the whole "canonical" or "absolute" directory-path name for this image.
141     *
142     * @return It will return the name of the file as a result - including the extension type
143     * which did not throw a {@code javax.imageio.IIOException.}  This exception is thrown
144     * whenever an image, of - for instance {@code '.png'} format tries to save as a
145     * {@code '.jpg'}, or any other incorrect image-format.
146     *
147     * <BR /><BR /><DIV CLASS=JDHint>
148     * <B STYLE='color:red;'>Regarding 'null':</B> {@code 'null'} will be returned if the image
149     * has failed to save / write at all.
150     * </DIV>
151     *
152     * <BR /><DIV CLASS=JDHintAlt>
153     * <B STYLE='color:red;'>Exception Throw:</B> If the passed {@code 'urlStr'} does not save
154     * properly, {@code javax.imageio.IIOException} will also be thrown.
155     * </DIV>
156     *
157     * <BR /><B>It is important to return the filename, since the extension identifies in
158     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
159     *
160     * @throws WritableDirectoryException If the provided output directory must exist and be
161     * writable, or else this exception shall throw.  Java will attempt to write a small,
162     * temporary file to the directory-name provided.  It will be deleted immediately afterwards.
163     * 
164     * @see #imageExts
165     */
166    public static String downloadImageGuessType
167        (String urlStr, String outputFileStr, String outputDirectory)
168        throws IOException
169    {
170        // If the "file name" has directory components...  it is just "better" to flag this as
171        // an exception
172
173        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
174            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
175            "any directory-names attached.  If directory names need to be attached to ensure " +
176            "that the file is ultimately saved to the proper location in the file-system, " +
177            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
178            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
179            "character."
180        );
181
182        if (outputDirectory == null) outputDirectory = "";
183
184        // Make sure the directory exists on the file-system, and that it is writable.
185        WritableDirectoryException.check(outputDirectory);
186
187        // Unless writing the "current directory" - make sure the directory name ends with the
188        // Operating System file-separator character.
189
190        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
191            outputDirectory = outputDirectory + File.separator;
192
193        final BufferedImage image   = ImageIO.read(new URL(urlStr));
194        final String        ext     = getImageTypeFromURL(urlStr);
195
196        File f = null;
197
198        if (ext != null) 
199
200            try
201            {
202                final String fName = outputFileStr + '.' + ext;
203                f = new File(outputDirectory + fName);
204
205                ImageIO.write(image, ext, f);
206
207                return fName;
208            }
209
210            // NOTE: If saving the file using the named image-extension fails, try the other.
211            catch (javax.imageio.IIOException e) { f.delete(); }
212
213        for (int i=0; i < imageExts.length; i++)
214
215            try
216            {
217                f = new File(outputFileStr + '.' + imageExts[i]);
218
219                ImageIO.write(image, imageExts[i], f);
220
221                return outputFileStr + '.' + imageExts[i];
222            }
223
224            catch (javax.imageio.IIOException e) { f.delete(); continue; }
225
226        System.out.println
227            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");
228
229        return null;
230    }
231
232    /**
233     * Convenience Method.
234     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
235     */
236    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
237        throws IOException
238    { return downloadImagesGuessTypes(rootURL, urls, ""); }
239
240    /**
241     * Convenience Method.
242     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
243     */
244    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
245        throws IOException
246    { return downloadImagesGuessTypes("", urls, ""); }
247
248    /**
249     * Convenience Method.
250     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
251     */
252    public static Vector<String> downloadImagesGuessTypes
253        (Iterable<String> urls, String outputDirectory)
254        throws IOException
255    { return downloadImagesGuessTypes("", urls, outputDirectory); }
256
257    /**
258     * This will download an entire {@code Vector<String>} of {@code URL's}, and save the
259     * output fileNames which were used to save these images.  It will use a the 
260     * {@code StrPrint.zeroPad(int)} method to generate filenames - starting with 
261     * {@code 001.jpg} - or whatever extension was correct.  It will use the <B><I>guessed 
262     * file-name extension</I></B> that is appropriate for this image.
263     *
264     * <BR /><BR /><DIV CLASS=JDHint>
265     * <B STYLE='color:red;'>Output Printing:</B> As the images are downloaded, the fileName is
266     * printed via {@code System.out.println()}
267     * </DIV>
268     *
269     * <DIV CLASS=EXAMPLE>{@code
270     *  // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
271     *  URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
272     *  
273     *  // Parse & Scrape the Web-Page, store it in a local html-vector
274     *  Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
275     *  
276     *  // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
277     *  int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
278     *  
279     *  // Since there are many "relative" or "partial" URL's, make sure to resolve them
280     *  // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
281     *  // Vector<URL>, but that ImageScraper.downloadImagesGuessTypes requires a 
282     *  // Vector<String>, so make sure to convert the output url's to strings.
283     * 
284     *  Vector<String> urls = new Vector<String>(imgPosArr.length);
285     * 
286     *  Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
287     *  
288     *  // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
289     *  // working directory.
290     * 
291     *  ImageScrape.downloadImagesGuessTypes(urls);
292     * }</DIV>
293     *
294     * @param urls is a {@code Vector} of {@code String's} that are to contain image pointers
295     * 
296     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
297     * is pre-pended to each of the {@code String's} in the {@code 'urls' Vector}.  This parameter
298     * may contain the empty string ({@code ""}) (and if it is, it will be ignored)
299     * 
300     * @param outputDirectory The files that are downloaded are saved to this directory.
301     * 
302     * @return a {@code Vector} of {@code String's} which contains the output filenames of these
303     * files.
304     * 
305     * @throws WritableDirectoryException If the provided output directory must exist and be
306     * writable, or else this exception shall throw.  Java will attempt to write a small, temporary
307     * file to the directory-name provided.  It will be deleted immediately afterwards.
308     * 
309     * @see StrPrint#zeroPad(int)
310     * @see #downloadImageGuessType(String, String)
311     */
312    public static Vector<String> downloadImagesGuessTypes
313        (String rootURL, Iterable<String> urls, String outputDirectory)
314        throws IOException
315    {
316        if (outputDirectory == null) outputDirectory = "";
317
318        // Make sure the directory exists on the file-system, and that it is writable.
319        WritableDirectoryException.check(outputDirectory);
320
321        // Unless writing the "current directory" - make sure the directory name ends with the
322        // Operating System file-separator character.
323
324        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
325            outputDirectory = outputDirectory + File.separator;
326
327        if (rootURL == null) rootURL = "";
328
329        final Vector<String> ret = new Vector<String>();
330
331        int count = 0;
332
333        for (final String url : urls)
334        {
335            final String fileName = downloadImageGuessType
336                (rootURL + url, StrPrint.zeroPad(++count), outputDirectory);
337
338            System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));
339
340            ret.addElement(fileName);
341        }
342
343        return ret;
344    }
345
346    /**
347     * This downloads an image to a a file named {@code 'outputFileStr'}.  A valid image-extension
348     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.  The
349     * {@code 'extensionStr'} should be {@code String's} such as: {@code '.jpg'} or {@code '.png'}
350     * 
351     * @param urlStr The {@code URL} of the image which generated the exception
352     * @param outputFileStr The intended file-name root to which the image is supposed to save
353     * @param extensionStr The intended file-name extension to which this image was to be saved.
354     * @throws java.imageio.IIOException - if this file type / {@code 'extensionStr'} are incorrect
355    */
356    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
357        throws IOException
358    {
359        final File          f       = new File(outputFileStr);
360        final BufferedImage image   = ImageIO.read(new URL(urlStr));
361
362        ImageIO.write(image, extensionStr, f);
363    }
364
365    /**
366     * This method will read from a text-file, which must have a list of image-{@code URL's} from
367     * the internet - and download them, one by one, to a directory.  Messages will be printed as
368     * each file is downloaded via {@code System.out.print()}
369     * 
370     * @param f A file pointer to a text-file that contains a list of {@code String's}.  Each 
371     * {@code String} is intended to be a {@code URL} to an image on the internet.
372     * 
373     * @return a {@code Vector} containing the file-names of these images.
374     */
375    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
376    {
377        final BufferedReader br     = new BufferedReader(new FileReader(f));
378        final Vector<String> pics   = new Vector<String>();
379
380        String s;
381        while ((s = br.readLine()) != null) pics.addElement(s);
382
383        return downloadImagesGuessTypes(pics);
384    }
385}