001package Torello.HTML.Tools.Images; 002 003 004// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 005// My Imports 006// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 007 008import Torello.HTML.*; 009import Torello.Java.*; 010 011import Torello.HTML.NodeSearch.TagNodeFind; 012import Torello.Java.Additional.Ret2; 013import Torello.Java.Additional.AppendableLog; 014import Torello.Java.Additional.AppendableSafe; 015 016 017// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 018// JDK Imports. These are all spelled-out at the bottom, because none of them are commonly used. 019// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 020 021// ByteArrayOutputStream, File, IOException 022import java.io.*; 023 024import java.net.URL; 025import java.util.Vector; 026 027/** 028 * A more advanced class for both downloading and saving a list of images, using URL's. 029 * 030 * <EMBED CLASS='external-html' DATA-FILE-ID=ISR> 031 */ 032@Torello.JavaDoc.StaticFunctional 033@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS") 034public class ImageScraper 035{ 036 // This Class is Static-Functional, and does not have any program state, other than the monitor 037 // Thread. There is no need for a public-constructor, or any constructor for that matter. 038 039 private ImageScraper() { } 040 041 042 // ******************************************************************************************** 043 // ******************************************************************************************** 044 // Thread-Related Stuff 045 // ******************************************************************************************** 046 // ******************************************************************************************** 047 048 049 /** 050 * If this class has been used to make "multi-threaded" calls that use a Time-Out wait-period, 051 * you might see your Java-Program hang for a few seconds when you would expect it to exit back 052 * to your O.S. normally. 053 * 054 * <BR /><BR /><B CLASS=JDDescLabel>Before Exiting:</B> 055 * 056 * <BR />When a program you have written reaches the end of its code, if you have performed any 057 * time-dependent Image-Downloads using this class (class {@code ImageScraper}), then your 058 * program <I>might not exit immediately,</I> but rather sit at the command-prompt for anywhere 059 * between 10 and 30 seconds before this Timeout-Thread dies. 060 * 061 * <BR /><BR />Note that you may immediately terminate any additional threads that were started 062 * using this method. 063 */ 064 public static void shutdownTOThreads() { DownloadImage.executor.shutdownNow(); } 065 066 067 // ******************************************************************************************** 068 // ******************************************************************************************** 069 // Primary User-API Methods 070 // ******************************************************************************************** 071 // ******************************************************************************************** 072 073 074 /** 075 * Downloads images located inside an HTML Page and updates the {@code SRC=...} {@code URL's} 076 * so that the links point to a <I>local copy</I> of <I>local images</I>. 077 * 078 * <BR /><BR />After completion of this method, an HTML page which contained any HTML image 079 * elements will have had those images downloaded to the local file-system, and also have had 080 * the HTML attribute {@code 'src=...'} changed to reflect the local image name instead of the 081 * Internet URL name. 082 * 083 * @param page Any vectorized-html page or subpage. This page should have HTML {@code <IMG ...>} 084 * elements in it, or else this method will exit without doing anything. 085 * 086 * @param pageURL If any of the HTML image elements have {@code src='...'} attributes that are 087 * partially resolved or <I>relative {@code URL's}</I> then this can be passed to the 088 * {@code ImageScraper} constructors in order to convert partial or relative {@code URL's} 089 * into complete {@code URL's.} The Image Downloader simply cannot work with partially 090 * resolved {@code URL's}, and will skip them if they are partially resolved. This parameter 091 * may be null, but if it is and there are incomplete-{@code URL's} those images will 092 * simply not be downloaded. 093 * 094 * @param log This is the 'logger' for this method. It may be null, and if it is - no output 095 * will be sent to the terminal. 096 * 097 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 098 * 099 * @param targetDirectory This File-System directory where these files shall be stored. 100 * 101 * @return An instance of {@code Ret2<int[], Results>}. The two returned elements 102 * of this class include: 103 * 104 * <BR /><BR /><UL CLASS=JDUL> 105 * 106 * <LI> {@code Ret2.a (int[])} 107 * <BR /><BR />This shall contain an index-array for the indices of each HTML 108 * {@code '<IMG SRC=...>'} element found on the page. It is not guaranteed that each of 109 * images will have been resolved or downloaded successfully, but rather just that an HTML 110 * {@code 'IMG'} element that had a {@code 'SRC'} attribute. The second element of this 111 * return-type will contain information regarding which images downloaded successfully. 112 * <BR /><BR /> 113 * </LI> 114 * 115 * <LI> {@code Ret2.b (Results)} 116 * <BR /><BR />The second element of the return-type shall be the instance of 117 * {@link Results} returned from the invocation of 118 * {@code ImageScraper.download(...)}. This method will provide details about each of the 119 * images that were downloaded; or, if the download failed, the reasons for the failure. 120 * <I>This return element shall be null if no images were found on the page.</I> 121 * </LI> 122 * 123 * </UL> 124 * 125 * <BR />These return {@code Object} references are not necessarily important - <I>and they 126 * may be discarded if needed.</I> They are provided as a matter of utility if further 127 * verification or research into successful downloads is needed. 128 * 129 * @throws IOException I/O Problems that weren't avoided. 130 * @throws ImageScraperException Thrown for any number of errors that went unsuppressed. 131 */ 132 public static Ret2<int[], Results> localizeImages 133 (Vector<HTMLNode> page, URL pageURL, Appendable log, String targetDirectory) 134 throws IOException, ImageScraperException 135 { 136 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 137 // Find all of the Image TagNode's on the Input Web-Page 138 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 139 140 final int[] imgPosArr = TagNodeFind.all(page, TC.Both, "img"); 141 final Vector<TagNode> vec = new Vector<>(); 142 143 // No Images Found. 144 if (imgPosArr.length == 0) return new Ret2<int[], Results>(imgPosArr, null); 145 146 for (final int pos : imgPosArr) vec.addElement((TagNode) page.elementAt(pos)); 147 148 149 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 150 // Build a Request and Download all of the Image's that were just found / identified 151 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 152 153 final Request request = Request.buildFromTagNodeIter(vec, pageURL, true); 154 request.targetDirectory = targetDirectory; 155 156 // NOTE: This is NOT FINISHED: 157 // SET ALL OF THE "Skip On Exception" booleans to TRUE!!! 158 159 // Invoke the Main Image Downloader 160 final Results r = ImageScraper.download(request, log); 161 162 163 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 164 // Replace the <IMG SRC=...> TagNode URL's for images that were successfully downloaded. 165 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 166 167 // Now replace 168 final ReplaceFunction replacer = (HTMLNode n, int arrPos, int count) -> 169 { 170 if (r.skipped[count] == false) 171 172 return ((TagNode) page.elementAt(arrPos)) 173 .setAV("src", r.fileNames[count], SD.SingleQuotes); 174 175 else return (TagNode) n; 176 }; 177 178 ReplaceNodes.r(page, imgPosArr, replacer); 179 180 181 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 182 // Report the Results 183 // *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** 184 185 return new Ret2<int[], Results>(imgPosArr, r); 186 } 187 188 /** 189 * This will iterate through the {@code URL's} and download them. Note that parameter 190 * {@code 'log'} may be null, and if so, it will be quietly ignored. 191 * 192 * @param request This parameter takes customization requests for batch image downloads. To 193 * read more information about how to configure a download, please review the documentation for 194 * the class {@link Request}. 195 * 196 * <BR /><BR />Note that upon entering this method, this parameter is immediately cloned to 197 * prevent the possibility of Thread Concurrency Problems from happening. After cloning, the 198 * the cloned instance is used exclusively, and the original parameter is discarded. Further 199 * changes to the parameter-instance will not have any effect on the process. 200 * 201 * @param log This shall receive text / log information. This parameter may receive null, and 202 * if it does it will be ignored. When ignored, logging information will not printed. 203 * 204 * <EMBED CLASS='external-html' DATA-FILE-ID=APPENDABLE> 205 * 206 * @return an instance of {@code class Results} for the download. The {@link Results} class 207 * contains several parallel arrays with information about images that have downloaded. If an 208 * image-download happens to fail due to an improperly formed {@code URL} (or an 'incorrect' 209 * {@code URL}), then the information in the {@code Results} arrays will contain a 'null' value 210 * for the index at those array-positions corresponding to the failed image. 211 * 212 * @throws ImageScraperException Thrown for any number of exceptions that may be thrown while 213 * executing the download-loop. If another exception is thrown, then it is wrapped by this 214 * class' exception ({@link ImageScraperException}), and set as the {@code 'cause'} of that 215 * exception. 216 * 217 * @throws AppendableError The interface {@code java.lang.Appendable} was designed to allow for 218 * an implementation to throw the (unchecked) exception {@code IOException}. This has many 219 * blessings, but can occasionally be a pain since, indeed, {@code IOException} is both an 220 * unchecked exception (and requires an explicity catch), and also very common 221 * (even ubiquitous) inside of HTTP download code. 222 * 223 * <BR /><BR />If the user-provided {@code 'log'} parameter throws an {@code IOException} for 224 * simply trying to write character-data to the log about the download-progress, then <I>an 225 * {@code AppendableError} will be thrown</I>. Note that this throwable does inherit 226 * {@code java.lang.Error}, meaning that it won't be caught by standard Java {@code catch} 227 * clauses <I>(unless {@code 'Error'} is explicity mentioned!)</I> 228 */ 229 public static Results download(Request request, Appendable log) 230 throws ImageScraperException 231 { 232 // Clone the Request, Similar to "SafeVarArgs" - Specifically, if the user starts playing 233 // with the contents of this class in the middle of a download, it will not have any effect 234 // on the 'request' object that is actually being used. 235 236 request = request.clone(); 237 238 // Runs a few tests to make sure there are no problems using the request 239 request.CHECK(); 240 241 // Makes log printing easier and easier. 242 final AppendableLog al = new AppendableLog(log, request.verbosity); 243 244 // Main Request-Configuration and Response Class Instances. 245 final Results results = new Results(request.size); 246 247 // Private, Internal Static-Class. Makes passing variables even easier 248 final RECORD r = new RECORD(request, results, al); 249 250 // Now, this just gets rid of the surrounding try-catch block. This is the only real 251 // reason for the internal/private method 'downloadWithoutTryCatch'. This makes the 252 // indentation look a lot better. Also, in this method, the 'log' is replaced with the 253 // AppendableSafe log 254 255 try 256 { 257 // private static void mainDownloadLoop(RECORD r) throws ImageScraperException 258 // Helps prepare for the printing loop; 259 260 if (r.logLevelGTEQ1) r.append("\n"); 261 262 for (URL url : r.request.source()) 263 { 264 r.reset(); 265 r.url = url; 266 MainLoopBody.loop(r); 267 } 268 269 return results; 270 } 271 272 catch (ImageScraperException e) 273 { 274 // If an exception causes the system to stop/halt, this extra '\n\n' makes the output 275 // text look a little nicer (sometimes... Sometimes it already looks fine). 276 // No more no less. 277 278 if (al.hasLog) al.append("\n\nThrowing ImageScraperException...\n"); 279 throw e; 280 } 281 } 282}