package Torello.HTML.Tools.Images;

import Torello.Java.StringParse;
import Torello.Java.StrPrint;
import Torello.Java.WritableDirectoryException;

import java.io.*;

import javax.imageio.ImageIO;
import java.net.URL;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.util.Vector;

/**
 * A simple class for scraping and downloading images using a URL, or list of URL's.
 * <EMBED CLASS='external-html' DATA-FILE-ID=IMSC>
 */
public class ImageScrape
{
    // No public constructor
    private ImageScrape() { }

    /**
     * {@code String}-Array having the list of file-formats.  When the image-type cannot be
     * taken from a {@code URL}, the formats are attempted in the order listed here.
     */
    public static final String[] imageExts = { "jpg", "png", "gif", "bmp", "jpeg" };

    /**
     * This will extract the file-extension from an image {@code URL.}  Not all images on the
     * internet have {@code URL's} that end with the actual image-file-type.  In that case, or in
     * the case that the {@code 'urlStr'} is a pointer to a non-image-file, null will be returned.
     *
     * <BR /><BR />The comparison is case-insensitive, and the lower-casing is performed using
     * {@code Locale.ROOT} so that the result does not depend on the default locale of the JVM.
     *
     * @param urlStr Is the {@code url} of the image.  May be null, in which case null is
     * returned.
     *
     * @return If it has a file-extension that is listed in the {@code 'imageExts'} array - that
     * file-extension will be returned, otherwise {@code null} will be returned.
     */
    public static String getImageTypeFromURL(String urlStr)
    {
        if (urlStr == null) return null;

        final String rawExt = StringParse.fromExtension(urlStr, false);

        if (rawExt == null) return null;

        // Locale.ROOT: a locale-sensitive lower-case (e.g. Turkish dot-less 'i') would prevent
        // "GIF" from ever matching "gif".
        final String ext = rawExt.toLowerCase(java.util.Locale.ROOT);

        for (final String known : imageExts)
            if (known.equals(ext))
                return known;

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImageGuessType(String, String, String)}
     *
     * <BR /><BR />The {@code 'outputFileStr'} is split at the last occurrence of
     * {@code File.separator}.  If there is no separator, the empty-string is used as the output
     * directory.  If the {@code String} ends with the separator, the whole {@code String} is
     * used as the directory, and the file-name {@code "IMAGE"} is used.  Otherwise, the text
     * after the last separator is the file-name, and everything up to and including that
     * separator is the directory.
     *
     * @param urlStr Is the {@code url} of the image.
     *
     * @param outputFileStr The target file-name (without extension), optionally preceded by a
     * directory.  May not be null.
     *
     * @return The value returned by the three-parameter version of this method.
     */
    public static String downloadImageGuessType(String urlStr, String outputFileStr)
        throws IOException
    {
        java.util.Objects.requireNonNull(outputFileStr, "outputFileStr");

        // We need to check whether the file-name that was passed is just a filename; or if it
        // has a directory component in its name.

        final int sep = outputFileStr.lastIndexOf(File.separator) + 1;

        if (sep == 0)
            return downloadImageGuessType(urlStr, outputFileStr, "");

        if (sep == outputFileStr.length())
            return downloadImageGuessType(urlStr, "IMAGE", outputFileStr);

        return downloadImageGuessType
            (urlStr, outputFileStr.substring(sep), outputFileStr.substring(0, sep));
    }

    /**
     * This will download an image, and try to guess if it is one of the following types:
     * {@code .jpg, .png, .bmp, .gif or .jpeg}.  If the {@code 'urlStr'} has a valid image-type
     * extension as a filename, then that format will be the first one used to save to a file.
     * If that attempt fails, each of the remaining formats in {@link #imageExts} is attempted,
     * in order.  Every attempt writes into {@code 'outputDirectory'}.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve all images found on a random Yahoo! News web-page article
     * URL url = new URL("https://news.yahoo.com/former-fox-news-employees [actual URL hidden].html");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Skip ahead to the "article body."  The body is surrounded by an <ARTICLE>...</ARTICLE>
     * // HTML Element.  Retrieve (using 'Inclusive') - everything between the HTML "ARTICLE" Tags.
     * page = TagNodeGetInclusive.first(page, "article");
     *
     * // Get the SECOND picture (HTML <IMG SRC=...>) element found on the page.
     * // For the news-article used in this example, the first image was an icon thumbnail.
     * // The second image contained the "Main Article Photo"
     * TagNode mainPic = TagNodeGet.nth(page, 2, TC.OpeningTags, "img");
     * String urlStr = Links.resolveSRC(mainPic, url).toString();
     *
     * // Run this method.  A file named 'img.jpg' is saved.
     * System.out.println("Image URL to Download:" + urlStr);
     * ImageScrape.downloadImageGuessType(urlStr, "img");
     * }</DIV>
     *
     * @param urlStr Is the {@code url} of the image.  <B>Yahoo! Images</B>, for instance, have
     * really long {@code URL's} and don't have any extensions at the end.  If {@code 'urlStr'}
     * does contain an image extension in the {@code 'String'}, then this method will attempt to
     * save the image using that file-extension before trying any of the others.
     *
     * @param outputFileStr This is the target or destination name for the output image file.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * <B STYLE='color:red;'>File-Extension:</B> This file is not intended to have an extension.
     * The extension will be generated by the code in this method, and it will match whatever
     * image-file-encoding was successfully used to save the file.  If this is a
     * {@code '.png'}, for instance, but it did not save until {@code '.bmp'} was used
     * (mis-labeled), this output file will be saved as {@code 'outputFileStr'} + {@code '.bmp'}.
     * </DIV>
     *
     * <BR /><DIV CLASS=JDHintAlt>
     * <B STYLE='color: red;'>URL vs. File Names:</B> This parameter, {@code 'outputFileStr'},
     * <B>may NOT be null</B>.  It is important to realize, here, that file-names and {@code URL's}
     * do not obey the same naming conventions.  Because it is often seen on the internet
     * Image-{@code URL's} that have a plethora of file-system 'irreverent' characters in their
     * name, this method simply cannot pick out the file-name of an image from its
     * {@code URL}.
     * </DIV>
     *
     * <BR />It may seem counter-intuitive to expect a "filename" parameter be provided as
     * input here, <I>given that an image-{@code URL} is also required</I> (since in most cases
     * the file-name of the image being downloaded is included in the image's {@code URL}).
     * However, because many of the modern content-providers on the internet use many layers of
     * naming conventions for their image-{@code URL's}, the user must provide the file-name of
     * the image (as a {@code String}) to avoid crashing this method in situations / cases where
     * the image file-name is "too difficult" to discern from its {@code URL}.
     *
     * @param outputDirectory This is just "prepended" to the file-save name.  This
     * {@code 'String'} is not included in the returned filename.  <B>Specifically</B> The returned
     * file name <I>only includes</I> the file-name and the file-name-extension.  It does not
     * include the whole "canonical" or "absolute" directory-path name for this image.  Null is
     * treated as the empty-string.
     *
     * @return It will return the name of the file as a result - including the extension of the
     * first image-format that was written successfully.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * <B STYLE='color:red;'>Regarding 'null':</B> {@code 'null'} will be returned if the image
     * has failed to save / write at all, meaning every format either threw a
     * {@code javax.imageio.IIOException} or had no image-writer that could encode the image.
     * In that case a note is also printed using {@code System.out.println()}.
     * </DIV>
     *
     * <BR /><B>It is important to return the filename, since the extension identifies in
     * what format the image was saved - {@code .jpg, .gif, .png,} etc...</B>
     *
     * @throws IllegalArgumentException If {@code 'outputFileStr'} contains the
     * {@code File.separator} character.
     *
     * @throws WritableDirectoryException If the provided output directory does not exist, or
     * is not writable.  Java will attempt to write a small, temporary file to the
     * directory-name provided.  It will be deleted immediately afterwards.
     *
     * @throws javax.imageio.IIOException If the data retrieved from {@code 'urlStr'} could not
     * be decoded as an image.
     *
     * @see #imageExts
     */
    public static String downloadImageGuessType
        (String urlStr, String outputFileStr, String outputDirectory)
        throws IOException
    {
        java.util.Objects.requireNonNull(outputFileStr, "outputFileStr");

        // If the "file name" has directory components... it is just "better" to flag this as
        // an exception

        if (outputFileStr.indexOf(File.separator) != -1) throw new IllegalArgumentException(
            "This method expects parameter 'outputFileStr' to be a simple file-name, without " +
            "any directory-names attached.  If directory names need to be attached to ensure " +
            "that the file is ultimately saved to the proper location in the file-system, " +
            "pass the directory to the 'outputDirectory' parameter to this method.\n" +
            "You have passed: " + outputFileStr + "\nwhich contains the file-name separator " +
            "character."
        );

        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        final BufferedImage image = ImageIO.read(new URL(urlStr));

        // ImageIO.read returns null (rather than throwing) when no registered reader can
        // decode the stream.  Report that clearly, instead of letting ImageIO.write fail later
        // with an unrelated-looking "image == null!" message.

        if (image == null) throw new javax.imageio.IIOException
            ("The data at the following URL could not be decoded as an image: " + urlStr);

        final String urlExt = getImageTypeFromURL(urlStr);

        // First choice: the format named by the URL itself.
        if (urlExt != null)
        {
            final String saved = tryWrite(image, urlExt, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        // Otherwise, try every other known format, in the order listed in 'imageExts'.
        for (final String ext : imageExts)
        {
            // This format has already been attempted (and has failed) just above.
            if (ext.equals(urlExt)) continue;

            final String saved = tryWrite(image, ext, outputFileStr, outputDirectory);
            if (saved != null) return saved;
        }

        System.out.println
            ("NOTE: Image " + urlStr + "\nAttempted to save to:" + outputFileStr + "\nFAILED.");

        return null;
    }

    // Attempts one write of 'image' to [outputDirectory][outputFileStr].[ext].  Returns the
    // file-name (without the directory) on success, or null if this format did not work.
    private static String tryWrite
        (BufferedImage image, String ext, String outputFileStr, String outputDirectory)
        throws IOException
    {
        final String    fName   = outputFileStr + '.' + ext;
        final File      f       = new File(outputDirectory + fName);

        try
        {
            // ImageIO.write returns 'false' when no appropriate writer is found.  That is a
            // failed save, even though no exception is thrown.
            if (ImageIO.write(image, ext, f)) return fName;
        }

        // Saving with this format failed.  Remove whatever was left behind.
        catch (javax.imageio.IIOException e)
        {
            f.delete();
            return null;
        }

        // No writer was available.  Only an empty left-over file is removed here, so that a
        // pre-existing file which the failed attempt did not touch is left alone.
        if (f.length() == 0) f.delete();

        return null;
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(String rootURL, Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes(rootURL, urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes(Iterable<String> urls)
        throws IOException
    { return downloadImagesGuessTypes("", urls, ""); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #downloadImagesGuessTypes(String, Iterable, String)}
     */
    public static Vector<String> downloadImagesGuessTypes
        (Iterable<String> urls, String outputDirectory)
        throws IOException
    { return downloadImagesGuessTypes("", urls, outputDirectory); }

    /**
     * This will download an entire list of {@code URL's}, and save the output fileNames which
     * were used to save these images.  It will use the {@code StrPrint.zeroPad(int)} method to
     * generate filenames - starting with {@code 001.jpg} - or whatever extension was correct.
     * It will use the <B><I>guessed file-name extension</I></B> that is appropriate for this
     * image.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * <B STYLE='color:red;'>Output Printing:</B> As the images are downloaded, the fileName is
     * printed via {@code System.out.print()}
     * </DIV>
     *
     * <DIV CLASS=EXAMPLE>{@code
     * // Retrieve all images found on the Wikipedia (Encyclopedia) Page for Galileo
     * URL url = new URL("https://en.wikipedia.org/wiki/Galileo_Galilei");
     *
     * // Parse & Scrape the Web-Page, store it in a local html-vector
     * Vector<HTMLNode> page = HTMLPage.getPageTokens(url, false);
     *
     * // Get the "Vector Index Array" for every HTML <IMG> element found on the page.
     * int[] imgPosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *
     * // Since there are many "relative" or "partial" URL's, make sure to resolve them
     * // against the main Wikipedia page-url.  Also, note, that Links.resolve returns a
     * // Vector<URL>, but that ImageScrape.downloadImagesGuessTypes requires a
     * // Vector<String>, so make sure to convert the output url's to strings.
     *
     * Vector<String> urls = new Vector<String>(imgPosArr.length);
     *
     * Links.resolveSRCs(page, imgPosArr, url).forEach((URL u) -> urls.add(u.toString()));
     *
     * // Run this method.  A series of '.png' and '.jpg' files will be saved to the current
     * // working directory.
     *
     * ImageScrape.downloadImagesGuessTypes(urls);
     * }</DIV>
     *
     * @param urls is a list of {@code String's} that are to contain image pointers
     *
     * @param rootURL if these are "sub-urls", with a root {@code URL}, this root {@code URL}
     * is pre-pended to each of the {@code String's} in {@code 'urls'}.  This parameter may be
     * null or the empty string ({@code ""}) (and if it is, it will be ignored)
     *
     * @param outputDirectory The files that are downloaded are saved to this directory.  Null
     * is treated as the empty-string.
     *
     * @return a {@code Vector} of {@code String's} which contains the output filenames of the
     * files that were saved.  An image which could not be saved in any format (a {@code null}
     * result from {@link #downloadImageGuessType(String, String, String)}) has no entry in this
     * {@code Vector}, although its number in the file-name sequence is still consumed.
     *
     * @throws WritableDirectoryException If the provided output directory does not exist, or
     * is not writable.  Java will attempt to write a small, temporary file to the
     * directory-name provided.  It will be deleted immediately afterwards.
     *
     * @see StrPrint#zeroPad(int)
     * @see #downloadImageGuessType(String, String)
     */
    public static Vector<String> downloadImagesGuessTypes
        (String rootURL, Iterable<String> urls, String outputDirectory)
        throws IOException
    {
        if (outputDirectory == null) outputDirectory = "";

        // Make sure the directory exists on the file-system, and that it is writable.
        WritableDirectoryException.check(outputDirectory);

        // Unless writing the "current directory" - make sure the directory name ends with the
        // Operating System file-separator character.

        if ((outputDirectory.length() > 0) && (! outputDirectory.endsWith(File.separator)))
            outputDirectory = outputDirectory + File.separator;

        if (rootURL == null) rootURL = "";

        final Vector<String> ret = new Vector<String>();

        int count = 0;

        for (final String url : urls)
        {
            final String fileName = downloadImageGuessType
                (rootURL + url, StrPrint.zeroPad(++count), outputDirectory);

            // A null file-name means that the image could not be saved in any format.  The
            // single-image method has already printed a note about it, so just move on.
            if (fileName == null) continue;

            System.out.print(fileName + ((fileName.length() < 10) ? ' ' : '\n'));

            ret.addElement(fileName);
        }

        return ret;
    }

    /**
     * This downloads an image to a file named {@code 'outputFileStr'}.  A valid image-format
     * needs to be provided for the java {@code ImageIO.write(...)} method to work properly.
     *
     * @param urlStr The {@code URL} of the image to download.
     *
     * @param outputFileStr The complete name of the file to which the image is saved.  It is
     * used exactly as provided.  The {@code 'extensionStr'} is <B>not</B> appended to it.
     *
     * @param extensionStr The image-format with which the image is encoded, such as
     * {@code "jpg"} or {@code "png"}.  A single leading period is accepted, and removed, so
     * {@code ".jpg"} and {@code "jpg"} are equivalent.
     *
     * @throws javax.imageio.IIOException If the data at {@code 'urlStr'} cannot be decoded as
     * an image, or if the image cannot be written using the format named by
     * {@code 'extensionStr'}.
     */
    public static void getImage(String urlStr, String outputFileStr, String extensionStr)
        throws IOException
    {
        // ImageIO.write expects an informal format-name ("jpg"), not a file-extension (".jpg").
        final String format = ((extensionStr != null) && extensionStr.startsWith("."))
            ? extensionStr.substring(1)
            : extensionStr;

        final File          f       = new File(outputFileStr);
        final BufferedImage image   = ImageIO.read(new URL(urlStr));

        if (image == null) throw new javax.imageio.IIOException
            ("The data at the following URL could not be decoded as an image: " + urlStr);

        // A 'false' result means that nothing was written.  Do not let that pass silently.
        if (! ImageIO.write(image, format, f)) throw new javax.imageio.IIOException(
            "No image-writer is available that can save the image from [" + urlStr + "] " +
            "using the format [" + extensionStr + "]"
        );
    }

    /**
     * This method will read from a text-file, which must have a list of image-{@code URL's} from
     * the internet - and download them, one by one, to the current working directory.  Messages
     * will be printed as each file is downloaded via {@code System.out.print()}
     *
     * @param f A file pointer to a text-file that contains a list of {@code String's}, one per
     * line.  Each {@code String} is intended to be a {@code URL} to an image on the internet.
     * Leading and trailing white-space is removed from each line, and blank lines are skipped.
     *
     * @return a {@code Vector} containing the file-names of these images.
     */
    public static Vector<String> downloadImages(File f) throws IOException, FileNotFoundException
    {
        final Vector<String> pics = new Vector<String>();

        // try-with-resources: the reader is closed even if reading the file throws.
        try (BufferedReader br = new BufferedReader(new FileReader(f)))
        {
            String s;

            while ((s = br.readLine()) != null)
            {
                s = s.trim();
                if (s.length() > 0) pics.addElement(s);
            }
        }

        return downloadImagesGuessTypes(pics);
    }
}