package Torello.HTML;

import Torello.Java.Additional.Ret2;
import Torello.Java.StringParse;

import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.util.zip.*;
import java.net.URL;
import java.net.HttpURLConnection;
import java.nio.charset.Charset;

import Torello.JavaDoc.StaticFunctional;
import Torello.JavaDoc.Excuse;

/**
 * Some standard utilities for transferring and downloading HTML from web-sites and then storing
 * that content in memory as a Java {@code String} - <I>which, subsequently, can be written to
 * disk, transferred elsewhere, or even parsed (using class {@link HTMLPage})</I>.
 *
 * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPE>
 */
@StaticFunctional(
    Excused={"USER_AGENT", "USE_USER_AGENT"},
    Excuses={Excuse.CONFIGURATION, Excuse.FLAG}
)
public class Scrape
{
    private Scrape() { }

    /**
     * The value sent as the {@code "User-Agent"} request header whenever this class opens an
     * {@code HTTP URL} connection and {@link #USE_USER_AGENT} is {@code TRUE}.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * This behavior may be changed by modifying these {@code public static} variables.
     *
     * <BR />
     * If the boolean {@link #USE_USER_AGENT} is set to {@code FALSE}, then no User-Agent will be
     * used at all.
     * </DIV>
     */
    public static String USER_AGENT = "Chrome/61.0.3163.100";

    /**
     * Flag that decides whether the {@code "User-Agent"} request header is sent at all.  When
     * {@code TRUE} (the default), every connection opened by this class sends the value that is
     * currently stored in {@link #USER_AGENT}.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * This behavior may be changed by modifying these {@code public static} variables.
     *
     * <BR />
     * If this boolean is set to {@code FALSE}, then no User-Agent will be used at all.
     * </DIV>
     */
    public static boolean USE_USER_AGENT = true;


    // ********************************************************************************************
    // ********************************************************************************************
    // HTTP Headers stuff
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Checks whether the {@code HTTP Header} returned by a website declares the
     * {@code GZIP Compression} encoding.  It expects the {@code java.util.Map} that is returned
     * from an invocation of {@code HttpURLConnection.getHeaderFields()}.
     *
     * <BR /><BR /><B CLASS=JDDescLabel>Case-Insensitive:</B>
     *
     * <BR />Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
     * comparisons done in this method shall ignore case.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * should be the map that is returned by the {@code HttpURLConnection}.  Null keys are
     * tolerated (and skipped).
     *
     * @return {@code TRUE} if this map contains a property named {@code "Content-Encoding"}
     * <B><I>AND</I></B> this property has a <I>property-value</I> in its list equal to
     * {@code "gzip"}.  Otherwise this method will return {@code FALSE}.
     */
    public static boolean usesGZIP(Map<String, List<String>> httpHeaders)
    { return hasContentEncoding(httpHeaders, "gzip"); }

    /**
     * Checks whether the {@code HTTP Header} returned by a website declares the
     * {@code "deflate"} encoding.  It expects the {@code java.util.Map} that is returned from an
     * invocation of {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * should be the map that is returned by the {@code HttpURLConnection}.  Null keys are
     * tolerated (and skipped).
     *
     * @return {@code TRUE} if this map contains a property named {@code "Content-Encoding"}
     * <B><I>AND</I></B> this property has a <I>property-value</I> in its list equal to
     * {@code "deflate"}.  Otherwise this method will return {@code FALSE}.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
     * comparisons done in this method shall ignore case.
     * </DIV>
     */
    public static boolean usesDeflate(Map<String, List<String>> httpHeaders)
    { return hasContentEncoding(httpHeaders, "deflate"); }

    // Returns TRUE when a "Content-Encoding" header (name matched ignoring case) lists a value
    // equal - ignoring case - to 'encoding'.
    private static boolean hasContentEncoding
        (Map<String, List<String>> httpHeaders, String encoding)
    {
        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if certain
        //       values are present - rather than the (more simple) Map.containsKey(...)

        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())
        {
            String name = header.getKey();

            // The returned Maps have been known to contain null keys, so check for that here.
            if ((name == null) || (! name.equalsIgnoreCase("Content-Encoding"))) continue;

            List<String> values = header.getValue();

            if (values == null) continue;

            // Literal-first comparison, so that a null list element cannot cause an exception.
            for (String value : values)
                if (encoding.equalsIgnoreCase(value)) return true;
        }

        return false;
    }

    /**
     * Inspects the {@code HTTP Header} returned by a website and, if a supported compression
     * encoding is declared, wraps the response stream in the matching decompressing stream.
     * It expects the {@code java.util.Map} that is returned from an invocation of
     * {@code HttpURLConnection.getHeaderFields()}.
     *
     * @param httpHeaders This is simply a {@code java.util.Map<String, List<String>>}.  It
     * should be the map that is returned by the {@code HttpURLConnection}.
     *
     * @param is This should be the {@code InputStream} that is returned from the
     * {@code HttpURLConnection} when requesting the content from the web-server that is hosting
     * the {@code URL}.
     *
     * @return If the map contains a property named {@code "Content-Encoding"}, the first of its
     * values that equals {@code "gzip"} or {@code "deflate"} decides the result: a
     * {@code GZIPInputStream} for {@code "gzip"}, or an {@code InflaterInputStream} for
     * {@code "deflate"}, each wrapping {@code 'is'}.  If neither value is present, {@code 'is'}
     * itself is returned, unwrapped.
     *
     * @throws IOException If constructing the {@code GZIPInputStream} fails.
     *
     * <BR /><BR /><DIV CLASS=JDHint>
     * Since {@code HTTP Headers} are considered <B>CASE INSENSITIVE</B>, all {@code String}
     * comparisons done in this method shall ignore case.
     * </DIV>
     */
    public static InputStream checkHTTPCompression
        (Map<String, List<String>> httpHeaders, InputStream is) throws IOException
    {
        // NOTE: HTTP Headers are CASE-INSENSITIVE, so a loop is needed to check if certain
        //       values are present - rather than the (more simple) Map.containsKey(...)

        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())
        {
            String name = header.getKey();

            // The returned Maps have been known to contain null keys, so check for that here.
            if ((name == null) || (! name.equalsIgnoreCase("Content-Encoding"))) continue;

            List<String> values = header.getValue();

            if (values == null) continue;

            for (String value : values)
            {
                if ("gzip".equalsIgnoreCase(value)) return new GZIPInputStream(is);

                // A "deflate" body is a compressed data stream, NOT a PKZIP archive.  A
                // ZipInputStream yields no bytes until getNextEntry() is called, and would
                // therefore produce an empty page here; InflaterInputStream is the class that
                // decompresses this format.

                if ("deflate".equalsIgnoreCase(value)) return new InflaterInputStream(is);
            }
        }

        // Neither of the property-values "gzip" or "deflate" was found.
        // Return the original input stream.

        return is;
    }

    /**
     * Takes as input a {@code java.util.Map} which contains the {@code HTTP Header} properties
     * generated by a call to the method {@code HttpURLConnection.getHeaderFields()}, and
     * produces a Java {@code String} that lists these headers in text / readable format.
     *
     * @param httpHeaders This parameter must be an instance of
     * {@code java.util.Map<String, List<String>>} and it should have been generated by a call to
     * {@code HttpURLConnection.getHeaderFields()}.  The property names and values contained by
     * this {@code Map} will be iterated and printed to a returned {@code java.lang.String}.
     * A {@code null} key is accepted, and is printed as the text {@code null}.
     *
     * @return One line per map entry: the property name followed by {@code ':'}, padded on the
     * right to a common width, and then the {@code toString()} of its value-list.
     */
    public static String httpHeadersToString(Map<String, List<String>> httpHeaders)
    {
        int width = 0;

        // To ensure that the output string is "aligned", find the length of the longest key.
        // String.valueOf is used because the Map may contain a null key, and calling length()
        // on it directly would throw a NullPointerException.

        for (String key : httpHeaders.keySet())
            width = Math.max(width, String.valueOf(key).length());

        width += 5;

        StringBuilder sb = new StringBuilder();

        // The java "toString()" method for the List<String> that is used to store the
        // Property-Values list works great, without any changes.

        for (Map.Entry<String, List<String>> header : httpHeaders.entrySet())

            sb  .append(StringParse.rightSpacePad(String.valueOf(header.getKey()) + ':', width))
                .append(header.getValue())
                .append('\n');

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some various ways to open a connection to a website.
    // ********************************************************************************************
    // ********************************************************************************************


    // Creates a GET connection to 'url', applying the User-Agent settings of this class and,
    // when 'contentType' is non-null, a "Content-Type" request property.
    private static HttpURLConnection prepareGet(URL url, String contentType) throws IOException
    {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();

        con.setRequestMethod("GET");

        if (USE_USER_AGENT) con.setRequestProperty("User-Agent", USER_AGENT);

        if (contentType != null) con.setRequestProperty("Content-Type", contentType);

        return con;
    }

    // Obtains the response stream of 'con', wrapped for decompression as directed by 'headers'.
    // If building the wrapper fails, the raw stream is closed instead of being leaked.
    private static InputStream bodyStream
        (HttpURLConnection con, Map<String, List<String>> headers) throws IOException
    {
        InputStream raw = con.getInputStream();

        try
            { return checkHTTPCompression(headers, raw); }

        catch (IOException e)
        {
            try
                { raw.close(); }

            catch (IOException closeFailure)
                { e.addSuppressed(closeFailure); }

            throw e;
        }
    }

    // Builds the BufferedReader over 'is'.  A null 'charset' selects the platform default.
    private static BufferedReader toReader(InputStream is, Charset charset)
    {
        return new BufferedReader(
            (charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)
        );
    }

    // Shared implementation of the public "openConn" variants.
    private static BufferedReader open(URL url, String contentType, Charset charset)
        throws IOException
    {
        HttpURLConnection con = prepareGet(url, contentType);

        // The headers are requested before the input-stream, exactly as they always have been.
        Map<String, List<String>> httpHeaders = con.getHeaderFields();

        return toReader(bodyStream(con, httpHeaders), charset);
    }

    // Shared implementation of the public "openConnGetHeader" variants.
    private static Ret2<BufferedReader, Map<String, List<String>>> openGetHeader
        (URL url, String contentType, Charset charset)
        throws IOException
    {
        HttpURLConnection           con         = prepareGet(url, contentType);
        Map<String, List<String>>   httpHeaders = con.getHeaderFields();
        BufferedReader              br          = toReader(bodyStream(con, httpHeaders), charset);

        return new Ret2<BufferedReader, Map<String, List<String>>>(br, httpHeaders);
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn(URL)}
     */
    public static BufferedReader openConn(String url) throws IOException
    { return openConn(new URL(url)); }

    /**
     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading from it.  The response is decoded using the platform's default charset.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet-{@code URL}.  The connection that it opens is cast to
     * {@code HttpURLConnection}.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * The caller is responsible for closing it.
     *
     * @throws IOException If the connection cannot be opened or read.
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn(URL url) throws IOException
    { return open(url, null, null); }

    /**
     * Opens a standard connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.  The response is decoded using the platform's default charset.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The connection that it opens is cast to
     * {@code HttpURLConnection}.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     * @throws IOException If the connection cannot be opened or read.
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader(URL url)
        throws IOException
    { return openGetHeader(url, null, null); }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_iso_8859_1(URL)}
     */
    public static BufferedReader openConn_iso_8859_1(String url) throws IOException
    { return openConn_iso_8859_1(new URL(url)); }

    /**
     * Will open an {@code ISO-8859-1} connection to a {@code URL}, and returns a
     * {@code BufferedReader} for reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * The caller is responsible for closing it.
     *
     * @throws IOException If the connection cannot be opened or read.
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_iso_8859_1(URL url) throws IOException
    { return open(url, "text/html; charset=iso-8859-1", Charset.forName("iso-8859-1")); }


    /**
     * Opens a {@code ISO-8859-1} connection to a {@code URL}, and returns a {@code BufferedReader}
     * for reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code ISO-8859-1} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     * @throws IOException If the connection cannot be opened or read.
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>>
        openConnGetHeader_iso_8859_1(URL url)
        throws IOException
    {
        // The charset NAME is "iso-8859-1".  Passing the header-style text
        // "charset=iso-8859-1" to Charset.forName throws IllegalCharsetNameException, because
        // the '=' character is not legal inside of a charset name.

        return openGetHeader(url, "charset=iso-8859-1", Charset.forName("iso-8859-1"));
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #openConn_UTF8(URL)}.
     */
    public static BufferedReader openConn_UTF8(String url) throws IOException
    { return openConn_UTF8(new URL(url)); }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCUA> <!-- User Agent, Browser War Note -->
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return A java {@code BufferedReader} for retrieving the data from the internet connection.
     * The caller is responsible for closing it.
     *
     * @throws IOException If the connection cannot be opened or read.
     * @see #USER_AGENT
     * @see #USE_USER_AGENT
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static BufferedReader openConn_UTF8(URL url) throws IOException
    { return open(url, "charset=UTF-8", Charset.forName("UTF-8")); }

    /**
     * Opens a {@code UTF8} connection to a {@code URL}, and returns a {@code BufferedReader} for
     * reading it, <B><I>and also</I></B> the {@code HTTP Header} that was returned by the
     * {@code HTTP Server}.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEUTF8>
     * <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPEGZIP>
     *
     * @param url This may be an Internet {@code URL}.  The site and page to which it points should
     * return data encoded in the {@code UTF-8} charset.
     *
     * @return <EMBED CLASS='external-html' DATA-FILE-ID=SCRAPERET2>
     * @throws IOException If the connection cannot be opened or read.
     * @see #checkHTTPCompression(Map, InputStream)
     */
    public static Ret2<BufferedReader, Map<String, List<String>>> openConnGetHeader_UTF8(URL url)
        throws IOException
    { return openGetHeader(url, "charset=UTF-8", Charset.forName("UTF-8")); }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a String.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     * <BR />The reader is closed before this method returns.
     */
    public static String scrapePage(String url) throws IOException
    {
        // The reader is created here and is never handed to the caller, so it is closed here.
        try (BufferedReader br = openConn(url))
            { return scrapePage(br); }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePage(BufferedReader)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     * <BR />The reader is closed before this method returns.
     */
    public static String scrapePage(URL url) throws IOException
    {
        // The reader is created here and is never handed to the caller, so it is closed here.
        try (BufferedReader br = openConn(url))
            { return scrapePage(br); }
    }

    /**
     * This scrapes a website and dumps the entire contents into a {@code java.lang.String}.
     *
     * @param br This is a {@code Reader} that needs to have been connected to a Website that will
     * output text/html data.  It is read until {@code readLine()} returns null, and it is
     * <B>not</B> closed by this method.
     *
     * @return The text/html data - returned inside a {@code String}.  Every line that was read
     * is terminated with a single {@code '\n'} character.
     *
     * @throws IOException If reading from {@code 'br'} fails.
     */
    public static String scrapePage(BufferedReader br) throws IOException
    {
        StringBuilder   sb = new StringBuilder();
        String          line;

        while ((line = br.readLine()) != null) sb.append(line).append('\n');

        return sb.toString();
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Some simple/easy HTML scrape functions, saves to a Vector<String>.
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(String)}
     * <BR />The reader is closed before this method returns.
     */
    public static Vector<String> scrapePageToVector(String url, boolean includeNewLine)
        throws IOException
    {
        // The reader is created here and is never handed to the caller, so it is closed here.
        try (BufferedReader br = openConn(url))
            { return scrapePageToVector(br, includeNewLine); }
    }

    /**
     * Convenience Method.
     * <BR />Invokes: {@link #scrapePageToVector(BufferedReader, boolean)}
     * <BR />Obtains: {@code BufferedReader} from {@link #openConn(URL)}
     * <BR />The reader is closed before this method returns.
     */
    public static Vector<String> scrapePageToVector(URL url, boolean includeNewLine)
        throws IOException
    {
        // The reader is created here and is never handed to the caller, so it is closed here.
        try (BufferedReader br = openConn(url))
            { return scrapePageToVector(br, includeNewLine); }
    }

    /**
     * This will scrape the entire contents of an HTML page to a {@code Vector<String>}.  Each
     * element is one line, as delimited by {@code BufferedReader.readLine()}.
     *
     * @param br This is the input source of the HTML page.  It is read until {@code readLine()}
     * returns null, and it is <B>not</B> closed by this method.
     *
     * @param includeNewLine When {@code TRUE}, this will append the {@code '\n'} character to the
     * end of each {@code String} in the {@code Vector}.
     *
     * @return a {@code Vector} of {@code String's} where each {@code String} is a line on the
     * web-page.
     *
     * @throws IOException If reading from {@code 'br'} fails.
     * @see #scrapePageToVector(String, boolean)
     */
    public static Vector<String> scrapePageToVector(BufferedReader br, boolean includeNewLine)
        throws IOException
    {
        Vector<String>  ret = new Vector<>();
        String          line;

        while ((line = br.readLine()) != null)
            ret.add(includeNewLine ? (line + '\n') : line);

        return ret;
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Main HTML scrape functions - used by main class of "HTMLPage.getPageTokens()"
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * This receives a reader that is connected to a source that will produce HTML.  The HTML is
     * read, line by line, and the portion between {@code 'startTag'} and {@code 'endTag'}
     * (both inclusive) is returned.  This is called "scraping HTML."
     *
     * @param br The source of the text/HTML.  It is <B>not</B> closed by this method, and it is
     * not read past the line on which {@code 'endTag'} is found.
     *
     * @param startTag If this is null, the scrape will begin with the first character received.
     * If this contains a {@code String}, the scrape will not include any text/HTML data that
     * occurs prior to the first occurrence of {@code 'startTag'}
     *
     * @param endTag If this is null, the scrape will read the entire contents of text/HTML data
     * from the {@code BufferedReader br} parameter.  If this contains a {@code String}, then data
     * will be read and included in the result until {@code 'endTag'} is received.  The text of
     * {@code 'endTag'} itself is included; anything after it on the same line is dropped.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Every line that is appended is terminated with a {@code '\n'} character.
     * Call {@code toString()} on the return value to retrieve that {@code String.}
     *
     * @throws IOException If reading from {@code 'br'} fails.
     *
     * @throws ScrapeException If either the {@code 'startTag'} or the parameter {@code 'endTag'}
     * was non-null, but was not found before the end of the input was reached.
     */
    public static StringBuffer getHTML(BufferedReader br, String startTag, String endTag)
        throws IOException
    {
        StringBuffer    html = new StringBuffer();
        String          line;

        // Sometimes the 'startTag' and 'endTag' are on the same line!  This happens, for
        // instance, when giant lines (no line-breaks) are transmitted.

        boolean endTagOnStartTagLine = false;

        // If the startTag parameter is not null, skip all content, until the startTag is found!
        if (startTag != null)
        {
            boolean foundStartTag = false;

            while ((line = br.readLine()) != null)
            {
                int startTagPos = line.indexOf(startTag);

                if (startTagPos < 0) continue;

                foundStartTag   = true;
                line            = line.substring(startTagPos);

                if (endTag != null)
                {
                    // The end-tag is searched for in the *trimmed* line - the text which
                    // begins at the start-tag.

                    int endTagPos = line.indexOf(endTag);

                    if (endTagPos >= 0)
                    {
                        line                    = line.substring(0, endTagPos + endTag.length());
                        endTagOnStartTagLine    = true;
                    }
                }

                html.append(line).append('\n');
                break;
            }

            if (! foundStartTag) throw new ScrapeException
                ("Start Tag: '" + startTag + "' was Not Found on Page.");
        }

        // Both tags were on one line, the scrape is already complete.
        if (endTagOnStartTagLine) return html;

        // endTag is null ==> read all content until EOF
        if (endTag == null)
        {
            while ((line = br.readLine()) != null) html.append(line).append('\n');

            return html;
        }

        // endTag is not null ==> stop reading as soon as the end-tag is found
        boolean foundEndTag = false;

        while ((line = br.readLine()) != null)
        {
            int endTagPos = line.indexOf(endTag);

            if (endTagPos >= 0)
            {
                html.append(line.substring(0, endTagPos + endTag.length())).append('\n');
                foundEndTag = true;
                break;
            }

            html.append(line).append('\n');
        }

        if (! foundEndTag) throw new ScrapeException
            ("End Tag: '" + endTag + "' was Not Found on Page.");

        return html;
    }


    /**
     * This receives a reader that is connected to a source that will produce HTML.  The HTML is
     * read, line by line, and the lines numbered {@code 'startLineNum'} through
     * {@code 'endLineNum'} (both inclusive, counting from 1) are returned.
     * This is called "scraping HTML."
     *
     * @param br The source of the text/HTML.  It is <B>not</B> closed by this method.
     *
     * @param startLineNum If this is {@code '0'} or {@code '1'}, the scrape will begin with the
     * first character received.  If this contains a larger integer, the scrape will not include
     * the {@code startLineNum - 1} lines of text/html which precede that line.
     *
     * @param endLineNum If this is negative, the scrape will read the entire contents of
     * text/HTML data from the {@code BufferedReader br} parameter (until {@code EOF} is
     * encountered).  If this contains a positive integer, then data will be read and included in
     * the result until {@code int endLineNum} lines of text/html have been received.
     *
     * <BR /><BR /><B>NOTE:</B> The value {@code 1} is treated exactly like a negative value - it
     * also means "read until {@code EOF}".  Zero is not allowed.
     *
     * @return a {@code StringBuffer} that is text/html data retrieved from the Reader.
     * Every line that is appended is terminated with a {@code '\n'} character.
     * Call {@code toString()} on the return value to retrieve that {@code String}
     *
     * @throws IOException If reading from {@code 'br'} fails.
     *
     * @throws IllegalArgumentException If parameter {@code 'startLineNum'} is negative, if
     * {@code 'endLineNum'} is zero, or if {@code 'startLineNum'} is greater than
     * {@code 'endLineNum'}.  That last test is skipped whenever {@code 'endLineNum'} means
     * "read until {@code EOF}".
     *
     * @throws ScrapeException If there were not enough lines read from the {@code BufferedReader}
     * parameter to be consistent with the values in {@code 'startLineNum'} and
     * {@code 'endLineNum'}
     */
    public static StringBuffer getHTML(BufferedReader br, int startLineNum, int endLineNum)
        throws IOException
    {
        if (startLineNum < 0) throw new IllegalArgumentException(
            "The parameter startLineNum is negative: " + startLineNum + " but this is not " +
            "allowed."
        );

        if (endLineNum == 0) throw new IllegalArgumentException
            ("The parameter endLineNum is zero, but this is not allowed.");

        // A negative endLineNum requests "read to EOF".  The value 1 has always been handled
        // the same way, and that behavior is retained here for backward compatibility.

        final boolean readToEOF = (endLineNum < 0) || (endLineNum == 1);

        // NOTE: Arrays start at 0, **BUT** HTML page line counts start at 1!
        final int firstLine = (startLineNum == 0) ? 1 : startLineNum;

        if ((! readToEOF) && (endLineNum < firstLine)) throw new IllegalArgumentException(
            "The parameter startLineNum is: " + firstLine + "\n" +
            "The parameter endLineNum is: " + endLineNum + "\n" +
            "It is required that the latter is not smaller than the former, " +
            "or that it is negative to signify read until EOF."
        );

        // Discard the lines which precede 'firstLine'.  The loop-counter is always the number
        // of lines that have been successfully read (and discarded) so far.

        for (int skipped = 0; skipped < (firstLine - 1); skipped++)

            if (br.readLine() == null) throw new ScrapeException(
                "The HTML Page that was given didn't even have enough lines to read " +
                "quantity in variable startLineNum.\nstartLineNum = " + firstLine +
                " and read " + skipped + " line(s) before EOF."
            );

        StringBuffer    html = new StringBuffer();
        String          line;

        if (readToEOF)
        {
            while ((line = br.readLine()) != null) html.append(line).append('\n');

            return html;
        }

        // This is the (1-based) number of the last line that has been consumed from 'br'
        int lastLineRead = firstLine - 1;

        while ((lastLineRead < endLineNum) && ((line = br.readLine()) != null))
        {
            html.append(line).append('\n');
            lastLineRead++;
        }

        if (lastLineRead < endLineNum) throw new ScrapeException(
            "The HTML Page that was read didn't have enough lines to read to quantity in " +
            "variable endLineNum.\nendLineNum = " + endLineNum + " but only read " +
            lastLineRead + " line(s) before EOF."
        );

        return html;
    }
}