1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
package Torello.HTML.Tools.Images;
import Torello.HTML.*;
import Torello.Java.*;
import Torello.Java.Additional.Ret2;
import Torello.JavaDoc.LinkJavaSource;
import java.util.function.*;
import java.io.Serializable;
import java.io.File;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.Vector;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
/**
* Holds all relevant configurations and parameters needed to run the primary download-loop of
* class {@link ImageScraper}
*
* <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST>
* <EMBED CLASS='external-html' DATA-FILE-ID=REQ_STR_BUILDER1_EX>
*/
@SuppressWarnings("overrides")
@Torello.JavaDoc.JDHeaderBackgroundImg(EmbedTagFileID="IMAGE_SCRAPER_CLASS")
public class Request implements Cloneable, Serializable
{
/** <EMBED CLASS='external-html' DATA-FILE-ID=SVUID> */
public static final long serialVersionUID = 1;
// ********************************************************************************************
// ********************************************************************************************
// MAIN CONSTRUCTORS
// ********************************************************************************************
// ********************************************************************************************
// There are 5 'static' builder-methods below. The only reason on earth that those are
// static-methods rather than constructors is that their parameter lists all use the same
// 'Iterable', but with a different Generic-Parameter. If you convert those to Constructors,
// you will get that they have the "Same Erasure", and that compiling cannot continue.
//
// Instead they are methods that have slightly different names, and the Java-Compiler, instead,
// shuts up, and stops complaining.
// Package-visible, full constructor.  Every argument is stored verbatim (no copies are made),
// and the zero-padding counter-printer is derived from 'size' via getPrinter(int).
Request(
    Vector<URL> source, int size, URL originalPageURL, Vector<String[]> b64Images,
    Vector<Exception> tagNodeSRCExceptions
)
{
    // Page-level inputs
    this.originalPageURL        = originalPageURL;
    this.source                 = source;
    this.size                   = size;

    // Secondary vectors
    this.tagNodeSRCExceptions   = tagNodeSRCExceptions;
    this.b64Images              = b64Images;

    // The printer depends on nothing but the number of images
    this.counterPrinter         = getPrinter(size);
}
// Package-visible convenience constructor for a request that carries neither Base-64 images
// nor saved SRC-exceptions: both secondary vectors ('b64Images' and 'tagNodeSRCExceptions')
// are left null.
Request(Vector<URL> source, int size, URL originalPageURL)
{ this(source, size, originalPageURL, null, null); }
// Copy-constructor, invoked by clone().  This is a shallow copy: each reference is assigned
// as-is, and the two vector-cursors ('b64Pos' and 'tnExPos') are carried over so that the copy
// continues from the same positions as the original.
private Request(final Request other)
{
    // Final, constructor-assigned state
    this.originalPageURL        = other.originalPageURL;
    this.source                 = other.source;
    this.size                   = other.size;
    this.counterPrinter         = other.counterPrinter;

    // Each secondary vector, followed by its read-cursor
    this.b64Images              = other.b64Images;
    this.b64Pos                 = other.b64Pos;
    this.tagNodeSRCExceptions   = other.tagNodeSRCExceptions;
    this.tnExPos                = other.tnExPos;
}
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// Small static constructor-helper
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
// Produces the printer used for the Image-Number.  All this does is select an appropriate
// zero-padding for the text-output, so that every counter value is printed at the same width:
// as many digits as 'size' itself has.
//
// NOTE(review): this assumes that StrPrint.zeroPad10e2 / zeroPad / zeroPad10e4 pad to 2 / 3 / 4
// digits, and that the second argument of StrPrint.zeroPad(int, int) is the output width in
// digits - confirm against class StrPrint.
private static IntFunction<String> getPrinter(int size)
{
    if (size < 10)      return (int i) -> "" + i;
    if (size < 100)     return StrPrint::zeroPad10e2;
    if (size < 1000)    return StrPrint::zeroPad;
    if (size < 10000)   return StrPrint::zeroPad10e4;

    // This case seems extremely unlikely and even largely preposterous (usually there are under
    // 100 photos on any one HTML Page), but it is handled generically so that it never needs to
    // be looked at again.
    //
    // The width is the number of decimal digits in 'size'.  The earlier computation,
    // floor(log10(size)), was one short of that: for any size between 10,000 and 99,999 it
    // produced 4, which is the very same width the 'size < 10000' branch above already uses,
    // and is too narrow for a five-digit counter.
    final int digits = String.valueOf(size).length();

    return (int i) -> StrPrint.zeroPad(i, digits);
}
// ********************************************************************************************
// ********************************************************************************************
// Static Constructor-Like Builder Methods (Cannot Use Constructors because of "Erasure")
// ********************************************************************************************
// ********************************************************************************************
/**
* Builds an instance of this class from a list of {@code URL's} as {@code String's}
*
* @param source <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ITER_STR>
*
* @return A {@code 'Request'} instance. This may be further configured by assigning values to
* any / all fields (which will still have their initialized / default-values)
*
* @throws NullPointerException If any of the {@code String's} in the {@code Iterable} are null
*
* @throws IllegalArgumentException If any of the {@code URL's} are {@code String's} which
* begin with neither {@code 'http://'} nor {@code 'https://'}. Since this method doesn't
* accept the parameter {@code 'originalPageURL'}, each and every {@code URL} in the
* {@code 'source'} iterable must be a full &amp; complete {@code URL}.
*
* <BR /><BR />This exception is also thrown if any {@code URL} in the {@code String}-List
* causes a {@code MalformedURLException} when constructing an instance of
* {@code java.net.URL} from the {@code String}. In these cases, the original
* {@code MalformedURLException} will be assigned to the {@code 'cause'}, and may be retrieved
* using the exception's {@code getCause()} method.
*/
@LinkJavaSource(handle="FromStringIterator", name="build", paramCount=1)
public static Request buildFromStrIter(Iterable<String> source)
{ return FromStringIterator.build(source); }
/**
* Builds an instance of this class from a list of {@code URL's} as {@code String's}
*
* @param source <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ITER_STR>
* @param originalPageURL <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ORIG_PG_URL>
*
* @param skipDontThrowIfBadStr If an exception is thrown when attempting to resolve a
* partial-{@code URL}, and this parameter is {@code TRUE}, then that exception is suppressed
* and logged, and the builder-loop continues to the next {@code URL}-as-a-{@code String}.
*
* <BR /><BR />When this parameter is passed {@code FALSE}, unresolvable {@code URL's} will
* generate an {@code IllegalArgumentException}-throw.
*
* <BR /><BR />Note that the presence of a null in the {@code Iterable 'source'} parameter
* will always force this method to throw {@code NullPointerException}.
*
* @return A {@code 'Request'} instance. This may be further configured by assigning values to
* any / all fields (which will still have their initialized / default-values)
*
* @throws NullPointerException If any of the {@code String's} in the {@code Iterable} are null
*
* @throws IllegalArgumentException If a {@code URL} cannot be resolved and
* {@code 'skipDontThrowIfBadStr'} is {@code FALSE}.
*
* <BR /><BR />This exception is also thrown if any {@code URL} in the {@code String}-List
* causes a {@code MalformedURLException} when constructing an instance of
* {@code java.net.URL} from the {@code String}. In these cases, the generated
* {@code MalformedURLException} will be assigned to the exception's {@code 'cause'}, and may
* therefore be retrieved using this exception's {@code getCause()} method.
*/
@LinkJavaSource(handle="FromStringIterator", name="build", paramCount=3)
public static Request buildFromStrIter
(Iterable<String> source, URL originalPageURL, boolean skipDontThrowIfBadStr)
{ return FromStringIterator.build(source, originalPageURL, skipDontThrowIfBadStr); }
/**
* Builds an instance of this class using the {@code SRC}-Attribute from a list of
* {@link TagNode}'s.
*
* @param source <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ITER_TGND>
*
* @param originalPageURL <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ORIG_PG_URL>
*
* @param skipDontThrowIfBadSRCAttr
* <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_SKIP_BOOL>
*
* @return A {@code 'Request'} instance. This may be further configured by assigning values to
* any / all fields (which will still have their initialized / default-values)
*
* @throws NullPointerException If any of the {@link TagNode}'s in the {@code Iterable} are
* null
*
* @throws SRCException If any of the {@link TagNode}'s in the list do not have a {@code 'SRC'}
* Attribute, and {@code 'skipDontThrowIfBadSRCAttr'} is {@code FALSE}.
*
* <BR /><BR />This exception is also thrown if any {@code URL} in the {@link TagNode}-List
* causes a {@code MalformedURLException} when constructing an instance of
* {@code java.net.URL} (from the {@code TagNode's SRC}-Attribute). In these cases, the
* generated {@code MalformedURLException} will be assigned to the exception's
* {@code 'cause'}, and may therefore be retrieved using the exception's {@code getCause()}
* method.
*
* <BR /><BR />If {@code 'skipDontThrowIfBadSRCAttr'} is {@code TRUE}, then this
* exception will not throw, and a null will be placed in the query-list.
*/
@LinkJavaSource(handle="FromTagNodeIterator", name="build", paramCount=3)
public static Request buildFromTagNodeIter
(Iterable<TagNode> source, URL originalPageURL, boolean skipDontThrowIfBadSRCAttr)
{ return FromTagNodeIterator.build(source, originalPageURL, skipDontThrowIfBadSRCAttr); }
/**
* Builds an instance of this class using the {@code SRC}-Attribute from a list of
* {@link TagNode}'s.
*
* @param source <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ITER_TGND>
*
* @param skipDontThrowIfBadSRCAttr
* <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_SKIP_BOOL>
*
* @return A {@code 'Request'} instance. This may be further configured by assigning values to
* any / all fields (which will still have their initialized / default-values)
*
* @throws NullPointerException If any of the {@link TagNode}'s in the {@code Iterable} are
* null
*
* @throws SRCException If any of the {@link TagNode}'s in the list do not have a
* {@code 'SRC'}-Attribute, and {@code 'skipDontThrowIfBadSRCAttr'} is {@code FALSE}.
*
* <BR /><BR />This exception is also thrown if any of the {@code URL's} assigned to a
* {@code 'SRC'}-Attribute are partial-{@code URL's} which do not begin with {@code 'http://'}
* (or {@code 'https://'}), and {@code 'skipDontThrowIfBadSRCAttr'} is {@code FALSE}.
*
* <BR /><BR />Finally, if any of the {@code URL's} inside a {@link TagNode}'s
* {@code 'SRC'}-Attribute cause a {@code MalformedURLException}, that exception will be
* assigned to the {@code cause} of a {@link SRCException}, and thrown (unless
* {@code 'skipDontThrowIfBadSRCAttr'} is {@code TRUE}).
*/
@LinkJavaSource(handle="FromTagNodeIterator", name="build", paramCount=2)
public static Request buildFromTagNodeIter
(Iterable<TagNode> source, boolean skipDontThrowIfBadSRCAttr)
{ return FromTagNodeIterator.build(source, skipDontThrowIfBadSRCAttr); }
/**
* Builds an instance of this class using a list of <I><B STYLE='color: red;'>already
* prepared</B></I> {@code URL's}.
*
* <BR /><BR />Unlike the {@code String} and {@link TagNode} builders of this class, this
* method accepts neither an {@code 'originalPageURL'} nor a skip-flag parameter.
*
* @param source <EMBED CLASS='external-html' DATA-FILE-ID=REQUEST_ITER_URL>
*
* @return A {@code 'Request'} instance. This may be further configured by assigning values to
* any / all fields (which will still have their initialized / default-values)
*
* @throws NullPointerException If any of the {@code URL's} in the {@code Iterable} are null
*/
@LinkJavaSource(handle="FromURLIterator")
public static Request buildFromURLIter(Iterable<URL> source)
{ return FromURLIterator.build(source); }
// ********************************************************************************************
// ********************************************************************************************
// Package-Visible Utility Methods for ImageScraper, Set by the Constructor.
// ********************************************************************************************
// ********************************************************************************************
// Package-Visibility: Used only in class ImageScraper (to retrieve the Iterable).  The very
// same reference that was handed to the constructor is returned; no copy is made.
Iterable<URL> source()
{
    return this.source;
}
// Read-cursor into 'b64Images': the index of the element that the next call to
// nextB64Image() will return.
private int b64Pos = 0;

// Package-Visibility: Used only in class ImageScraper (to retrieve a B64-Image String-Array)
String[] nextB64Image()
{
    // These two conditions are, effectively, "assert" statements.  The creation of this Vector
    // is completely controlled by the static-builders: whenever a null-URL is placed into the
    // source-vector, a reference has already been placed into one of the secondary vectors
    // (b64Images and/or tagNodeSRCExceptions).  There is no reason this method should ever be
    // called when the Vector is null or already used up.
    //
    // If, for some odd reason, that does happen, throwing 'UnreachableError' keeps a record
    // indicating that "this really shouldn't have happened", rather than letting a
    // NullPointerException surface.  Since "ImageScraper" is a heavy-user-interaction class,
    // the extra paranoia is deliberate - even if it looks superfluous.
    final boolean nothingLeft = (b64Images == null) || (b64Pos >= b64Images.size());

    if (nothingLeft) throw new UnreachableError();

    // Hand out the current element, and advance the cursor
    return b64Images.elementAt(b64Pos++);
}
// Read-cursor into 'tagNodeSRCExceptions': the index of the element that the next call to
// nextTNSRCException() will return.
private int tnExPos = 0;

// Package-Visibility: Used only by class ImageScraper
Exception nextTNSRCException()
{
    // These two conditions are, effectively, "assert" statements.  The creation of this Vector
    // is completely controlled by the static-builders: whenever a null-URL is placed into the
    // source-vector, a reference has already been placed into one of the secondary vectors
    // (b64Images and/or tagNodeSRCExceptions).  There is no reason this method should ever be
    // called when the Vector is null or already used up.
    //
    // If, for some odd reason, that does happen, throwing 'UnreachableError' keeps a record
    // indicating that "this really shouldn't have happened", rather than letting a
    // NullPointerException surface.  Since "ImageScraper" is a heavy-user-interaction class,
    // the extra paranoia is deliberate - even if it looks superfluous.
    final boolean nothingLeft =
        (tagNodeSRCExceptions == null) || (tnExPos >= tagNodeSRCExceptions.size());

    if (nothingLeft) throw new UnreachableError();

    // Hand out the current element, and advance the cursor
    return tagNodeSRCExceptions.elementAt(tnExPos++);
}
// ********************************************************************************************
// ********************************************************************************************
// Primary Request Fields
// ********************************************************************************************
// ********************************************************************************************
/** {@code URL} from which this page has been downloaded */
public final URL originalPageURL;
// The Source Iterable.  It is exposed, package-wide, through method source()
private final Iterable<URL> source;
/** The number of Image-{@code URL's} identified inside the {@code 'source'} Iterable. */
public final int size;
// This is just a zero-padding printer. It adjusts for the number of elements in the original
// input Iterable. If there are, for example, under 100 elements, then the first 10 elements
// will be padded with a zero.  It is produced by getPrinter(size) inside the constructors.
//
// NOTE(review): this class implements Serializable, yet getPrinter(int) returns plain lambdas
// and method-references, which are not serializable unless their target type is - confirm
// that serializing a 'Request' instance is actually supported.
final IntFunction<String> counterPrinter;
// Any & all Base-64 Images. This is usually empty.  The three-argument constructor leaves
// this reference null.
private final Vector<String[]> b64Images;
// If the user has built from an Iterable<TagNode>, and requested to suppress-exceptions, then
// this vector will save those exceptions so that they are ready for the return/result object.
// The three-argument constructor leaves this reference null.
private final Vector<Exception> tagNodeSRCExceptions;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 01: Verbosity & URL-PreProcessor
// ********************************************************************************************
// ********************************************************************************************
// Initial value: Verbosity.Normal
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_verbosity> */
public Verbosity verbosity = Verbosity.Normal;
// Initial value: null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_urlPreProcessor> */
public Function<URL, URL> urlPreProcessor = null;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 02: Location-Decisions for Saving an Image File, or sending to 'imageReceiver'
// ********************************************************************************************
// ********************************************************************************************
// All three of the location-fields below start out as null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_targetDirectoryRetriever> */
public Function<ImageInfo, File> targetDirectoryRetriever = null;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_imageReceiver> */
public Consumer<ImageInfo> imageReceiver = null;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_targetDirectory> */
public String targetDirectory = null;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 03: File-Name given to an Image File
// ********************************************************************************************
// ********************************************************************************************
// Initial value: null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_fileNamePrefix> */
public String fileNamePrefix = null;
// Initial value: true
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_useDefaultCounterForImageFileNames> */
public boolean useDefaultCounterForImageFileNames = true;
// Initial value: null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_getImageFileSaveName> */
public Function<ImageInfo, String> getImageFileSaveName = null;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 04: BOOLEAN'S: Continuing or Throwing on Failure & Exception
// ********************************************************************************************
// ********************************************************************************************
// All six 'skipOn' flags below start out as false.  Method skipOnAllExceptions() sets every
// one of them to true in a single call.
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnDownloadException> */
public boolean skipOnDownloadException = false;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnB64DecodeException> */
public boolean skipOnB64DecodeException = false;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnTimeOutException> */
public boolean skipOnTimeOutException = false;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnNullImageException> */
public boolean skipOnNullImageException = false;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnImageWritingFail> */
public boolean skipOnImageWritingFail = false;
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipOnUserLambdaException> */
public boolean skipOnUserLambdaException = false;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 05: USER-PREDICATE'S & BOOLEAN'S: Which Image Files to Save, and Which to Skip
// ********************************************************************************************
// ********************************************************************************************
// Initial value: null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipURL> */
public Predicate<URL> skipURL = null;
// Initial value: false
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_skipBase64EncodedImages> */
public boolean skipBase64EncodedImages = false;
// Initial value: null
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_keeperPredicate> */
public Predicate<ImageInfo> keeperPredicate = null;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 06: Avoiding Hangs and Locks with a TimeOut
// ********************************************************************************************
// ********************************************************************************************
// Constant used as the initial value of field 'maxDownloadWaitTime'
/**
* <EMBED CLASS='external-html' DATA-FILE-ID=REQ_MAX_WAIT_TIME>
* @see #MAX_WAIT_TIME_UNIT
* @see #maxDownloadWaitTime
*/
public static final long MAX_WAIT_TIME = 10;
// Constant used as the initial value of field 'waitTimeUnits'
/**
* <EMBED CLASS='external-html' DATA-FILE-ID=REQ_MAX_WAIT_TIME_UNIT>
* @see #MAX_WAIT_TIME
* @see #waitTimeUnits
*/
public static final TimeUnit MAX_WAIT_TIME_UNIT = TimeUnit.SECONDS;
// Initial value: MAX_WAIT_TIME
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_maxDownloadWaitTime> */
public long maxDownloadWaitTime = MAX_WAIT_TIME;
// Initial value: MAX_WAIT_TIME_UNIT
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_waitTimeUnits> */
public TimeUnit waitTimeUnits = MAX_WAIT_TIME_UNIT;
// ********************************************************************************************
// ********************************************************************************************
// Public-Fields 07: USER-AGENT
// ********************************************************************************************
// ********************************************************************************************
// This constant is an alias for Scrape.USER_AGENT, and is the initial value of 'userAgent'
/**
* <EMBED CLASS='external-html' DATA-FILE-ID=REQ_DEFAULT_USER_AGENT>
* @see Scrape#USER_AGENT
*/
public static final String DEFAULT_USER_AGENT = Scrape.USER_AGENT;
/**
* <EMBED CLASS='external-html' DATA-FILE-ID=REQ_userAgent>
* @see #DEFAULT_USER_AGENT
*/
public String userAgent = DEFAULT_USER_AGENT;
// Initial value: false
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_alwaysUseUserAgent> */
public boolean alwaysUseUserAgent = false;
// Initial value: true
/** <EMBED CLASS='external-html' DATA-FILE-ID=REQ_retryWithUserAgent> */
public boolean retryWithUserAgent = true;
// ********************************************************************************************
// ********************************************************************************************
// Check for Validity Method
// ********************************************************************************************
// ********************************************************************************************
// Package-visible: hands 'this' instance to RequestValidity.check, which performs the
// validity-check on the current field values.
void CHECK() { RequestValidity.check(this); }
// ********************************************************************************************
// ********************************************************************************************
// TURN ON **ALL** Exception-Skip Methods
// ********************************************************************************************
// ********************************************************************************************
/** Switches on every one of the six {@code 'skipOn'} flags with a single method call */
public void skipOnAllExceptions()
{
    // exceptions thrown by any of the User-Provided Lambda-Target / Functional-Interfaces
    skipOnUserLambdaException = true;

    // exceptions thrown when writing an already downloaded image to the file-system.
    skipOnImageWritingFail = true;

    // exception that's thrown when a downloaded image is null.
    skipOnNullImageException = true;

    // exception that's thrown when the Monitor-Thread has timed-out.
    skipOnTimeOutException = true;

    // if Java's Base-64 Image-Decoder throws an exception.
    skipOnB64DecodeException = true;

    // exceptions thrown by Java's ImageIO class when downloading an image
    skipOnDownloadException = true;
}
// ********************************************************************************************
// ********************************************************************************************
// Standard-Java Object Methods
// ********************************************************************************************
// ********************************************************************************************
/**
* Converts {@code 'this'} instance into a simple Java-{@code String}.  The conversion itself
* is performed by {@code RequestToString.toString}.
*
* @return A {@code String} where each field has had a 'best efforts' {@code String}-Conversion
*/
@Override
@LinkJavaSource(handle="RequestToString")
public String toString()
{ return RequestToString.toString(this); }
// ********************************************************************************************
// ********************************************************************************************
// Clone & Clone-Constructor
// ********************************************************************************************
// ********************************************************************************************
/**
* Builds a clone of {@code 'this'} instance
*
* @return The copied instance. Note that this is a <B STYLE='color: red;'>shallow</B> clone,
* rather than a <B STYLE='color: red;'>deep</B> clone. The references within the returned
* instances are <I>the exact same references as are in {@code 'this'} instance</I>.
*/
@Override
@LinkJavaSource(handle="RequestClone")
public Request clone()
{
    // The private copy-constructor transfers the constructor-assigned state and the two
    // vector-cursors
    final Request cloned = new Request(this);

    // The remaining copying is delegated to RequestClone.copy
    RequestClone.copy(this, cloned);

    return cloned;
}
}
|