Source code

001package Torello.HTML;
002
003import Torello.Java.Additional.Ret2;
004import Torello.Java.StringParse;
005
006import java.util.*;
007import java.util.regex.*;
008import java.io.*;
009import java.util.zip.*;
010import java.net.URL;
011import java.net.HttpURLConnection;
012import java.nio.charset.Charset;
013
014import Torello.JavaDoc.StaticFunctional;
015import Torello.JavaDoc.Excuse;
016
/**
 * Some standard utilities for transferring &amp; downloading HTML from web-sites and then storing
 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE>
 */
024@StaticFunctional(
025    Excused={"USER_AGENT", "USE_USER_AGENT"},
026    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
027)
028public class Scrape
029{
    // Private, empty constructor: prevents code outside this class from creating instances.
    private Scrape() { }
031
    /**
     * The value that the {@code openConn...} methods of this class assign to the
     * {@code "User-Agent"} request-property when opening an {@code HTTP URL} connection.  The
     * default value is {@code "Chrome/61.0.3163.100"}.
     * 
     * <BR /><BR /><DIV CLASS=JDHint>
     * This behavior may be changed by assigning a different {@code String} to this
     * {@code public static} variable.
     * 
     * <BR />
     * If the boolean {@link #USE_USER_AGENT} is set to {@code FALSE}, then the methods of this
     * class do not assign the {@code "User-Agent"} request-property at all, and this value is
     * ignored.
     * </DIV>
     */
    public static String USER_AGENT = "Chrome/61.0.3163.100";
046
    /**
     * Flag that decides whether the {@code openConn...} methods of this class assign the
     * {@code "User-Agent"} request-property when opening an {@code HTTP URL} connection.  The
     * default value is {@code TRUE}.
     * 
     * <BR /><BR /><DIV CLASS=JDHint>
     * When this flag is {@code TRUE}, the value of {@link #USER_AGENT} is assigned to the
     * {@code "User-Agent"} request-property of each connection that is opened.
     * 
     * <BR />
     * When this flag is {@code FALSE}, the methods of this class do not assign the
     * {@code "User-Agent"} request-property at all.
     * </DIV>
     */
    public static boolean USE_USER_AGENT = true;
061
062
063    // ********************************************************************************************
064    // ********************************************************************************************
065    // HTTP Headers stuff
066    // ********************************************************************************************
067    // ********************************************************************************************
068
069
070    /**
071     * This method will check whether the {@code HTTP Header} returned by a website has been
072     * encoded using the {@code GZIP Compression} encoding.  It expects the {@code java.util.Map}
073     * that is returned from an invocation of {@code HttpURLConnection.getHeaderFields()}.
074     * 
075     * <BR /><BR /><B CLASS=JDDescLabel>Case-Insensitive:</B>
076     * 
077     * <BR />Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
078     * comparisons done in this method shall ignore case.
079     * 
080     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
081     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
082     *
083     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
084     * this property has a <I>property-value</I> in it's list equal to {@code "gzip"}, then this
085     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
086     */
087    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
088    {
089        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
090        //       certain values are present - rather than the (more simple) Map.containsKey(...)
091
092        for (String prop : httpHeaders.keySet())
093
094            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
095            // NOTE: The Map's returned have been known to contain null keys, so check for that here.
096
097            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
098
099                // Check (Case Insensitive), if any of the properties assigned to "Content-Encoding"
100                // is "GZIP".  If this is found, return TRUE immediately.
101    
102                for (String vals : httpHeaders.get(prop))
103                    if (vals.equalsIgnoreCase("gzip")) return true;
104
105        // The property-value "GZIP" wasn't found, so return FALSE.
106        return false;
107    }
108
109    /**
110     * This method will check whether the {@code HTTP Header} returned by a website has been
111     * encoded using the {@code ZIP Compression (PKZIP, Deflate)} encoding.  It expects the
112     * {@code java.util.Map} that is returned from an invokation of
113     * {@code HttpURLConnection.getHeaderFields()}.
114     * 
115     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
116     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
117     *
118     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
119     * this property has a <I>property-value</I> in it's list equal to {@code "deflate"}, then this
120     * method will return {@code TRUE}.  Otherwise this method will return {@code FALSE}.
121     * 
122     * <BR /><BR /><DIV CLASS=JDHint>
123     * Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
124     * comparisons done in this method shall ignore case.
125     * </DIV>
126     */
127    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
128    {
129        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
130        //       certain values are present - rather than the (more simple) Map.containsKey(...)
131
132        for (String prop : httpHeaders.keySet())
133
134            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
135            // NOTE: The returned Maps have been known to contain null keys, so check for that here
136
137            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
138
139                // Check (Case Insensitive), if any properties assigned to "Content-Encoding" are
140                // "DEFLATE" - then return TRUE immediately.
141    
142                for (String vals : httpHeaders.get(prop))
143                    if (vals.equalsIgnoreCase("deflate")) return true;
144
145        // The property-value "deflate" wasn't found, so return FALSE.
146        return false;
147    }
148
149    /**
150     * This method will check whether the {@code HTTP Header} returned by a website has been
151     * encoded using compression.  It expects the
152     * {@code java.util.Map} that is returned from an invokation of
153     * {@code HttpURLConnection.getHeaderFields()}.
154     * 
155     * @param httpHeaders This is a simply {@code java.util.Map<String, List<String>>}.  It 
156     * <B><I>must be</I></B> the exact map that is returned by the {@code HttpURLConnection}.
157     * 
158     * @param is This should be the {@code InputStream} that is returned from the
159     * {@code HttpURLConnection} when reqesting the content from the web-server that is hosting the
160     * {@code URL}.  The {@code HTTP Headers} will be searched, and if a compression algorithm
161     * has been specified (<I>and the algorithm is one of the algorithm's automatically handled 
162     * by Java</I>) - then this {@code InputStream} shall be <B>wrapped</B> by the appropriate
163     * decompression algorithm.
164     *
165     * @return If this map contains a property named {@code "Content-Encoding"} <B><I>AND</I></B>
166     * this property has a <I>property-value</I> in it's list equal to either {@code "deflate"}
167     * or {@code "gzip"}, then this shall return a <I>wrapped {@code InputStream}</I> that is
168     * capable of handling the <I>decompression algorithm</I>.
169     * 
170     * <BR /><BR /><DIV CLASS=JDHint>
171     * Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
172     * comparisons done in this method shall ignore case.
173     * </DIV>
174     */
175    public static InputStream checkHTTPCompression
176        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
177    {
178        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if
179        //       certain values are present - rather than the (more simple) Map.containsKey(...)
180
181        for (String prop : httpHeaders.keySet())
182
183            // Check (Case Insensitive) if the HTTP Headers Map has the property "Content-Encoding"
184            // NOTE: The returned Maps have been known to contain null keys, so check for that here
185
186            if ((prop != null) && prop.equalsIgnoreCase("Content-Encoding"))
187
188                // Check (Case Insensitive), if any properties assigned to "Content-Encoding"
189                // are "DEFLATE" or "GZIP" - then return the compression-algorithm immediately.
190    
191                for (String vals : httpHeaders.get(prop))
192
193                    if (vals.equalsIgnoreCase("gzip"))          return new GZIPInputStream(is);
194                    else if (vals.equalsIgnoreCase("deflate"))  return new ZipInputStream(is);
195
196        // Neither of the property-values "gzip" or "deflate" were found.
197        // Return the original input stream.
198
199        return is;
200    }
201
202    /**
203     * This method shall simply take as input a {@code java.util.Map} which contains the
204     * {@code HTTP Header} properties that <I>must have been generated</I> by a call to the method
205     * {@code HttpURLConnection.getHeaderFields()}.  It will produce a Java {@code String} that
206     * lists these headers in text / readable format.
207     * 
208     * @param httpHeaders This parameter must be an instance of 
209     * {@code java.util.Map<String, List<String>>} and it should have been generated by a call to
210     * {@code HttpURLConnection.getHeaderFields()}.  The property names and values contained by
211     * this {@code Map} will be iterated and printed to a returned {@code java.lang.String}.
212     * 
213     * @return This shall return a printed version of the {@code Map}.
214     */
215    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
216    {
217        StringBuilder   sb  = new StringBuilder();
218        int             max = 0;
219
220        // To ensure that the output string is "aligned", check the length of each of the
221        // keys in the HTTP Header.
222
223        for (String key : httpHeaders.keySet()) if (key.length() > max) max = key.length();
224
225        max += 5;
226
227        // Iterate all of the Properties that are included in the 'httpHeaders' parameter
228        // It is important to note that the java "toString()" method for the List<String> that
229        // is used to store the Property-Values list works great, without any changes.
230
231        for (String key : httpHeaders.keySet()) sb.append(
232            StringParse.rightSpacePad(key + ':', max) +
233            httpHeaders.get(key).toString() + '\n'
234        );
235
236        return sb.toString();
237    }
238
239
240    // ********************************************************************************************
241    // ********************************************************************************************
242    // Some various ways to open a connection to a website.
243    // ********************************************************************************************
244    // ********************************************************************************************
245
246
247    /**
248     * Convenience Method.
249     * <BR />Invokes: {@link #openConn(URL)}
250     */
251    public static BufferedReader openConn(String url) throws IOException
252    { return openConn(new URL(url)); }
253
254    /**
255     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
256     * reading from it.
257     * 
258     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
259     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
260     * 
261     * @param url This may be an Internet-{@code URL.}
262     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
263     * @see #USER_AGENT
264     * @see #USE_USER_AGENT
265     * @see #checkHTTPCompression(Map, InputStream)
266     */
267    public static BufferedReader openConn(URL url) throws IOException
268    {
269        HttpURLConnection con = (HttpURLConnection) url.openConnection();
270
271        con.setRequestMethod("GET");
272
273        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
274
275        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
276
277        return new BufferedReader(new InputStreamReader(is));
278    }
279
280    /**
281     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
282     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
283     * {@code HTTP Server}.
284     * 
285     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
286     * 
287     * @param url This may be an Internet {@code URL}.
288     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
289     * @throws IOException
290     * @see #checkHTTPCompression(Map, InputStream)
291     */
292    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
293        throws IOException
294    {
295        HttpURLConnection con = (HttpURLConnection) url.openConnection();
296
297        con.setRequestMethod("GET");
298
299        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
300
301        Map<String, List<String>> httpHeaders = con.getHeaderFields();
302
303        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
304
305        return new Ret2<BufferedReader, Map<String, List<String>>>
306            (new BufferedReader(new InputStreamReader(is)), httpHeaders);
307    }
308
309    /**
310     * Convenience Method.
311     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
312     */
313    public static BufferedReader openConn_iso_8859_1(String url) throws IOException 
314    { return openConn_iso_8859_1(new URL(url)); }
315
316    /**
317     * Will open an {@code ISO-8859} connection to a {@code URL}, and returns a 
318     * {@code BufferedReader} for reading it.
319     * 
320     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP> 
321     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
322     * 
323     * @param url This may be an Internet {@code URL}. The site and page to which it points should
324     * return data encoded in the {@code ISO-8859} charset.
325     * 
326     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
327     * @see #USER_AGENT
328     * @see #USE_USER_AGENT
329     * @see #checkHTTPCompression(Map, InputStream)
330     */
331    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
332    {
333        HttpURLConnection con = (HttpURLConnection) url.openConnection();
334
335        con.setRequestMethod("GET");
336
337        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
338
339        con.setRequestProperty("Content-Type", "text/html; charset=iso-8859-1");
340
341        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
342
343        return new BufferedReader(new InputStreamReader(is, Charset.forName("iso-8859-1")));
344    }
345
346
347    /**
348     * Opens a {@code ISO-8859-1} connection to a {@code URL}, and returns a {@code BufferedReader}
349     * for reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
350     * {@code HTTP Server}.
351     * 
352     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
353     * 
354     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
355     * return data encoded in the {@code ISO-8859-1} charset.
356     * 
357     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
358     * @throws IOException
359     * @see #checkHTTPCompression(Map, InputStream)
360     */
361    public static Ret2<BufferedReader, Map<String, List<String>>>
362        openConnGetHeader_iso_8859_1(URL url)
363        throws IOException
364    {
365        HttpURLConnection con = (HttpURLConnection) url.openConnection();
366
367        con.setRequestMethod("GET");
368
369        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
370
371        con.setRequestProperty("Content-Type", "charset=iso-8859-1");
372
373        Map<String, List<String>> httpHeaders = con.getHeaderFields();
374
375        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
376
377        return new Ret2<BufferedReader, Map<String, List<String>>>(
378            new BufferedReader(new InputStreamReader(is, Charset.forName("charset=iso-8859-1"))),
379            httpHeaders
380        );
381    }
382
383    /**
384     * Convenience Method.
385     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
386     */
387    public static BufferedReader openConn_UTF8(String url) throws IOException
388    { return openConn_UTF8(new URL(url)); }
389
390    /**
391     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
392     * reading it.
393     * 
394     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
395     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
396     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
397     * 
398     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
399     * return data encoded in the {@code UTF-8} charset.
400     * 
401     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
402     * @see #USER_AGENT
403     * @see #USE_USER_AGENT
404     * @see #checkHTTPCompression(Map, InputStream)
405     */
406    public static BufferedReader openConn_UTF8(URL url) throws IOException
407    {
408        HttpURLConnection con = (HttpURLConnection) url.openConnection();
409
410        con.setRequestMethod("GET");
411
412        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
413
414        con.setRequestProperty("Content-Type", "charset=UTF-8");
415
416        InputStream is = checkHTTPCompression(con.getHeaderFields(), con.getInputStream());
417
418        return new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
419    }
420
421    /**
422     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
423     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
424     * {@code HTTP Server}.
425     * 
426     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
427     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
428     * 
429     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
430     * return data encoded in the {@code UTF-8} charet.
431     * 
432     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
433     * @throws IOException
434     * @see #checkHTTPCompression(Map, InputStream)
435     */
436    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
437        throws IOException
438    {
439        HttpURLConnection con = (HttpURLConnection) url.openConnection();
440
441        con.setRequestMethod("GET");
442
443        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);
444
445        con.setRequestProperty("Content-Type", "charset=UTF-8");
446
447        Map<String, List<String>> httpHeaders = con.getHeaderFields();
448
449        InputStream is = checkHTTPCompression(httpHeaders, con.getInputStream());
450
451        return new Ret2<BufferedReader, Map<String, List<String>>>(
452            new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8"))),
453            httpHeaders
454        );
455    }
456
457
458    // ********************************************************************************************
459    // ********************************************************************************************
460    // Some simple/easy HTML scrape functions, saves to a String.
461    // ********************************************************************************************
462    // ********************************************************************************************
463
464
465    /**
466     * Convenience Method.
467     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
468     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
469     */
470    public static String scrapePage(String url) throws IOException
471    { return scrapePage(openConn(url)); }
472
473    /**
474     * Convenience Method.
475     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
476     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
477     */
478    public static String scrapePage(URL url) throws IOException
479    { return scrapePage(openConn(url)); }
480
481    /**
482     * This scrapes a website and dumps the entire contents into a {@code java.lang.String}.
483     * 
484     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
485     * output text/html data.
486     * 
487     * @return The text/html data - returned inside a {@code String}
488     */
489    public static String scrapePage(BufferedReader br) throws IOException
490    {
491        StringBuffer sb = new StringBuffer();
492        String s;
493
494        while ((s = br.readLine()) != null) sb.append(s + "\n");
495
496        return sb.toString();
497    }
498
499
500    // ********************************************************************************************
501    // ********************************************************************************************
502    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
503    // ********************************************************************************************
504    // ********************************************************************************************
505
506
507    /**
508     * Convenience Method.
509     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
510     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
511     */
512    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
513        throws IOException
514    { return scrapePageToVector(openConn(url), includeNewLine); }
515
516    /**
517     * Convenience Method.
518     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
519     * <BR />Obtains: {@code Bufferedeader} from {@link #openConn(URL)}
520     */
521    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
522        throws IOException
523    { return scrapePageToVector(openConn(url), includeNewLine); }
524
525    /**
526     * This will scrape the entire contents of an HTML page to a {@code Vector<String>}  Each
527     * line of the text/HTML page is demarcated by the reception of a {@code '\n'} character
528     * from the web-server.
529     * 
530     * @param br  This is the input source of the HTML page.  It will query for String data.
531     * 
532     * @param includeNewLine This will append the {@code '\n'} character to the end of each
533     * {@code String} in the {@code Vector}.
534     * 
535     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
536     * web-page.
537     * 
538     * @see #scrapePageToVector(String, boolean)
539     */
540    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
541        throws IOException
542    {
543        Vector<String>  ret = new Vector<>();
544        String          s   = null;
545
546        if (includeNewLine)
547
548            while ((s = br.readLine()) != null)
549                ret.add(s + '\n');
550
551        else
552
553            while ((s = br.readLine()) != null)
554                ret.add(s);
555
556        return ret;
557    }
558
559
560    // ********************************************************************************************
561    // ********************************************************************************************
562    // Main HTML scrape functions - used by main class of "HTMLPage.getPageTokens()"
563    // ********************************************************************************************
564    // ********************************************************************************************
565
566
567    /**
568     * This receives an input stream that is contains a pipe to a website that will produce HTML.
569     * The HTML is read from the website, and returned as a {@code String.}
570     * This is called "scraping HTML."
571     * 
572     * @param startTag  If this is null, the scrape will begin with the first character received.
573     * If this contains a {@code String}, the scrape will not include any text/HTML data that
574     * occurs prior to the first occurrence of {@code 'startTag'}
575     * 
576     * @param endTag  If this is null, the scrape will read the entire contents of text/HTML data
577     * from the {@code Bufferedreader br} parameter.  If this contains a {@code String}, then data
578     * will be read and included in the result until {@code 'endTag'} is received.
579     * 
580     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
581     * Call {@code toString()} on the return value to retrieve that {@code String.} 
582     * 
583     * @throws ScrapeException If, after download completes, either the {@code 'startTag'} or the
584     * parameter {@code 'endTag'} do not represent {@code String's} that were found within the
585     * downloaded page, this exception is thrown.
586     */
587    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag)
588        throws IOException
589    {
590        StringBuffer    html = new StringBuffer();
591        String          s;
592
593        // Nice Long Name...  Guess what it means
594        boolean alreadyFoundEndTagInStartTagLine = false;
595
596        // If the startTag parameter is not null, skip all content, until the startTag is found!
597        if (startTag != null)
598        {
599            boolean foundStartTag = false;
600
601            while ((s = br.readLine()) != null)
602
603                if (s.contains(startTag))
604                {
605                    int startTagPos = s.indexOf(startTag);
606
607                    foundStartTag = true;
608
609                    // NOTE:    Sometimes the 'startTag' and 'endTag' are on the same line!
610                    //          This happens, for instance, on Yahoo Photos, when giant lines
611                    //          (no line-breaks) are transmitted
612                    //          Hence... *really* long variable name, this is confusing!
613
614                    s = s.substring(startTagPos);
615
616                    if ((endTag != null) && s.contains(endTag))
617                    {
618                        s = s.substring(0, s.indexOf(endTag) + endTag.length());
619
620                        alreadyFoundEndTagInStartTagLine = true;
621                    }
622
623                    html.append(s + "\n"); break;
624                }
625
626            if (! foundStartTag) throw new ScrapeException
627                ("Start Tag: '" + startTag + "' was Not Found on Page.");
628        }
629
630        // if the endTag parameter is not null, stop reading as soon as the end-tag is found
631        if (endTag != null)
632        {
633            // NOTE: This 'if' is inside curly-braces, because there is an 'else' that "goes with"
634            // the 'if' above... BUT NOT the following 'if'
635
636            if (! alreadyFoundEndTagInStartTagLine)
637            {
638                boolean foundEndTag = false;
639
640                while ((s = br.readLine()) != null)
641
642                    if (s.contains(endTag))
643                    {
644                        foundEndTag = true;
645                        int endTagPos = s.indexOf(endTag);
646                        html.append(s.substring(0, endTagPos + endTag.length()) + "\n");
647                        break;
648                    }
649
650                    else html.append(s + "\n");
651
652                if (! foundEndTag) throw new ScrapeException
653                    ("End Tag: '" + endTag + "' was Not Found on Page.");
654            }
655        }
656
657        // ELSE: (endTag *was* null) ... read all content until EOF ... or ... "EOWP" (end of web-page)
658        else
659
660            while ((s = br.readLine()) != null)
661                html.append(s + "\n");
662
663        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
664        return html;
665    }
666
667
668    /**
669     * This receives an input stream that is contains a pipe to a website that will produce HTML.
670     * The HTML is read from the website, and returned as a {@code String.}
671     * This is called "scraping HTML."
672     * 
673     * @param startLineNum  If this is {@code '0'} or {@code '1'}, the scrape will begin with the
674     * first character received.  If this contains a positive integer, the scrape will not include 
675     * any text/HTML data that occurs prior to {@code int startLineNum} lines of text/html having 
676     * been received.
677     * 
678     * @param endLineNum  If this is negative, the scrape will read the entire contents of
679     * text/HTML data from the {@code Bufferedreader br} parameter (until {@code EOF} is
680     * encountered).  If this contains a positive integer, then data will be read and included in
681     * the result until {@code int endLineNum} lines of text/html have been received.
682     * 
683     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
684     * Call {@code toString()} on the return value to retrieve that {@code String}
685     * 
686     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative or greater
687     * than {@code 'endLineNum'}  If {@code 'endLineNum'} was negative, this test is skipped.
688     * 
689     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
690     * parameter to be consistent with the values in {@code 'startLineNum'} and
691     * {@code 'endLineNum'}
692     */
693    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
694        throws IOException
695    {
696        StringBuffer    html    = new StringBuffer();
697        String          s       = "";
698
699        // NOTE: Arrays start at 0, **BUT** HTML page line counts start at 1!
700        int curLineNum = 1;
701
702        if (startLineNum < 0) throw new IllegalArgumentException(
703            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
704            "allowed."
705        );
706
707        if (endLineNum == 0) throw new IllegalArgumentException
708            ("The parameter endLineNum is zero, but this is not allowed.");
709
710        endLineNum      = (endLineNum < 0) ? 1 : endLineNum;
711        startLineNum    = (startLineNum == 0) ? 1 : startLineNum;
712
713        if ((endLineNum < startLineNum) && (endLineNum != 1)) throw new IllegalArgumentException(
714            "The parameter startLineNum is: " + startLineNum + "\n" +
715            "The parameter endLineNum is: " + endLineNum + "\n" +
716            "It is required that the latter is larger than the former, " +
717            "or it must be 0 or negative to signify read until EOF."
718        );
719
720        if (startLineNum > 1)
721        {
722            while (curLineNum++ < startLineNum)
723
724                if (br.readLine() == null) throw new ScrapeException(
725                    "The HTML Page that was given didn't even have enough lines to read " +
726                    "quantity in variable startLineNum.\nstartLineNum = " + startLineNum + 
727                    " and read " + (curLineNum-1) + " line(s) before EOF."
728                );
729
730            // Off-By-One computer science error correction - remember post-decrement, means the
731            // last loop iteration didn't read line, but did increment the loop counter!
732
733            curLineNum--;
734        }
735
736        // endLineNum==1  means/imples that we don't have to heed the
737        // endLineNum variable ==> read to EOF/null!
738
739        if (endLineNum == 1)
740
741            while ((s = br.readLine()) != null)
742                html.append(s + "\n");
743
744        // endLineNum > 1 ==> Head endLineNum variable!
745        else
746        {
747            // System.out.println("At START of LOOP: curLineNum = " + curLineNum +
748            // " and endLineNum = " + endLineNum);
749
750            for ( ;curLineNum <= endLineNum; curLineNum++)
751
752                if ((s = br.readLine()) != null) html.append(s + "\n");
753                else break;
754
755            // NOTE: curLineNum-1 and endLineNum+1 are used because:
756            //
757            //      ** The loop counter (curLineNum) breaks when the next line to read is the one
758            //          passed the endLineNum
759            //      ** endLineNum+1 is the appropriate state if enough lines were read from the
760            //           HTML Page
761            //      ** curLineNum-1 is the number of the last line read from the HTML
762
763            if (curLineNum != (endLineNum+1)) throw new ScrapeException(
764                "The HTML Page that was read didn't have enough lines to read to quantity in " +
765                "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
766                (curLineNum-1) + " line(s) before EOF."
767            );
768        }
769
770        // Kind of an annoying line, but this is the new "Multi-Threaded" thing I added.
771        return html;
772    }
773}