001package Torello.HTML; 002 003import java.io.*; 004import java.util.Vector; 005import java.net.URL; 006 007import Torello.JavaDoc.Excuse; 008import Torello.Java.UnreachableError; 009 010/** 011 * Java HTML's flagship-parser class for converting HTML web-pages into plain Java {@code Vector's} 012 * of {@link HTMLNode}. 013 * 014 * <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE> 015 * 016 * @see Scrape#getHTML(BufferedReader, int, int) 017 * @see Scrape#getHTML(BufferedReader, String, String) 018 * @see HTMLPageMWT 019 */ 020@Torello.JavaDoc.StaticFunctional(Excused="parser", Excuses=Excuse.SINGLETON) 021@Torello.JavaDoc.JDHeaderBackgroundImg 022public class HTMLPage 023{ 024 private HTMLPage() { } 025 026 /** 027 * A function-pointer / lambda-target that (could) potentially be used to replace this 028 * library's current regular-expression based parser with something possibly faster or even 029 * more efficient. 030 * 031 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_PARSER> 032 * @see #parser 033 */ 034 @FunctionalInterface 035 public static interface Parser 036 { 037 /** 038 * Parse html source-text into a {@code Vector<HTMLNode>}. 039 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 040 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 041 * 042 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 043 * 044 * <BR /><BR /><DIV CLASS=JDHint> If you have decided to implement a parser, and you wish 045 * to ingore this parameter (and don't want to output such a file) - it is (hopefully) 046 * obvious that you may skip this step!</DIV> 047 * 048 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 049 * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV> 050 * 051 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 052 * <BR /><BR /><DIV CLASS=JDHint><B>As above,</B> you may skip implementing this.</DIV> 053 * 054 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 055 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 056 */ 057 public Vector<HTMLNode> parse( 058 CharSequence html, 059 boolean eliminateHTMLTags, 060 String rawHTMLFile, 061 String matchesFile, 062 String justTextFile 063 ) 064 throws IOException; 065 } 066 067 /** 068 * If needing to "swap a proprietary parser" comes up, this is possible. 069 * It just needs to accept the same parameters as the current parser, and produce a 070 * {@code Vector<HTMLNode>.} This is not an advised step to take, but if an alternative 071 * parser has been tested and happens to be generating different results, it can be easily 072 * 'swapped out' for the one used now. 073 * @see Parser 074 * @see Parser#parse 075 */ 076 public static Parser parser = ParserRE::parsePageTokens; 077 078 079 // ******************************************************************************************** 080 // ******************************************************************************************** 081 // These 6 functions presume that the HTML source needs to be downloaded & read from a URL 082 // ******************************************************************************************** 083 // ******************************************************************************************** 084 085 086 /** 087 * Convenience Method. 088 * <BR />Accepts: {@code URL} 089 * <BR />Passes null to parameters 090 * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}. 091 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 092 * String, String, String, String, String)} 093 * <BR />And Invokes: {@link Scrape#openConn(URL)} 094 */ 095 public static Vector<HTMLNode> getPageTokens 096 (URL url, boolean eliminateHTMLTags) 097 throws IOException 098 { 099 return getPageTokens 100 (Scrape.openConn(url), eliminateHTMLTags, null, null, null, null, null); 101 } 102 103 /** 104 * Convenience Method. 105 * <BR />Accepts: {@code URL} 106 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 107 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 108 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 109 * String, String, String, String, String)} 110 * <BR />And Invokes: {@link Scrape#openConn(URL)} 111 */ 112 public static Vector<HTMLNode> getPageTokens 113 (URL url, boolean eliminateHTMLTags, String startTag, String endTag) 114 throws IOException 115 { 116 return getPageTokens 117 (Scrape.openConn(url), eliminateHTMLTags, startTag, endTag, null, null, null); 118 } 119 120 /** 121 * Convenience Method. 122 * <BR />Accepts: {@code URL} 123 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 124 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 125 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 126 * int, int, String, String, String)} 127 * <BR />And Invokes: {@link Scrape#openConn(URL)} 128 */ 129 public static Vector<HTMLNode> getPageTokens 130 (URL url, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 131 throws IOException 132 { 133 return getPageTokens 134 (Scrape.openConn(url), eliminateHTMLTags, startLineNum, endLineNum, null, null, null); 135 } 136 137 /** 138 * Convenience Method. 139 * <BR />Accepts: {@code URL} 140 * <BR />Passes null to {@code startTag} & {@code endTag} parameters. 141 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 142 * String, String, String, String, String)} 143 * <BR />And Invokes: {@link Scrape#openConn(URL)} 144 */ 145 public static Vector<HTMLNode> getPageTokens( 146 URL url, boolean eliminateHTMLTags, 147 String rawHTMLFile, String matchesFile, String justTextFile 148 ) 149 throws IOException 150 { 151 return getPageTokens( 152 Scrape.openConn(url), eliminateHTMLTags, 153 null, null, 154 rawHTMLFile, matchesFile, justTextFile 155 ); 156 } 157 158 /** 159 * Convenience Method. 160 * <BR />Accepts: {@code URL} 161 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 162 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 163 * String, String, String, String, String)} 164 * <BR />And Invokes: {@link Scrape#openConn(URL)} 165 */ 166 public static Vector<HTMLNode> getPageTokens( 167 URL url, boolean eliminateHTMLTags, 168 String startTag, String endTag, 169 String rawHTMLFile, String matchesFile, String justTextFile 170 ) 171 throws IOException 172 { 173 return getPageTokens( 174 Scrape.openConn(url), eliminateHTMLTags, 175 startTag, endTag, 176 rawHTMLFile, matchesFile, justTextFile 177 ); 178 } 179 180 /** 181 * Convenience Method. 182 * <BR />Accepts: {@code URL} 183 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 184 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 185 * int, int, String, String, String)} 186 * <BR />And Invokes: {@link Scrape#openConn(URL)} 187 */ 188 public static Vector<HTMLNode> getPageTokens( 189 URL url, boolean eliminateHTMLTags, 190 int startLineNum, int endLineNum, 191 String rawHTMLFile, String matchesFile, String justTextFile 192 ) 193 throws IOException 194 { 195 return getPageTokens( 196 Scrape.openConn(url), eliminateHTMLTags, 197 startLineNum, endLineNum, 198 rawHTMLFile, matchesFile, justTextFile 199 ); 200 } 201 202 203 // ******************************************************************************************** 204 // ******************************************************************************************** 205 // These 6 functions presume that the HTML source is from a CharSequence 206 // ******************************************************************************************** 207 // ******************************************************************************************** 208 209 210 /** 211 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 212 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 213 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 214 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 215 * <BR /><BR /><B><SPAN STYLE="color: red;">NOTE:</B></SPAN> This method does not throw any 216 * checked-exceptions, there is no Input-Output involved here, it is strictly a computational 217 * method that neither invokes the file-system, nor the web. 218 */ 219 public static Vector<HTMLNode> getPageTokens 220 (CharSequence html, boolean eliminateHTMLTags) 221 // NO IOException... NO I/O! 222 { 223 try 224 { return parser.parse(html, eliminateHTMLTags, null, null, null); } 225 226 // This should never happen, when reading from a 'String' rather than a URL, or 227 // BufferedReader ==> IOException will not be thrown. 228 229 catch (IOException ioe) 230 { throw new UnreachableError(ioe); } 231 } 232 233 /** 234 * Convenience Method. 235 * <BR />Accepts: {@code CharSequence} 236 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 237 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 238 * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean, 239 * String, String, String, String, String)} 240 * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't 241 * possible! 242 */ 243 public static Vector<HTMLNode> getPageTokens 244 (CharSequence html, boolean eliminateHTMLTags, String startTag, String endTag) 245 // NO IOException... NO I/O! 246 { 247 try 248 { return getPageTokens(html, eliminateHTMLTags, startTag, endTag, null, null, null); } 249 250 // This should never happen, when reading from a 'String' rather than a URL, or 251 // BufferedReader ==> IOException will not be thrown. 252 253 catch (IOException ioe) 254 { throw new UnreachableError(ioe); } 255 } 256 257 /** 258 * Convenience Method. 259 * <BR />Accepts: {@code CharSequence} 260 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 261 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 262 * <BR />Invokes: {@link #getPageTokens(CharSequence, boolean, 263 * int, int, String, String, String)} 264 * <BR />Catches: {@code IOException} <B>{@code ==>}</B> No HTTP-I/O, so an IOException isn't 265 * possible! 266 */ 267 public static Vector<HTMLNode> getPageTokens 268 (CharSequence html, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 269 // NO IOException... NO I/O! 270 { 271 try 272 { 273 return getPageTokens 274 (html, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); 275 } 276 277 // This should never happen, when reading from a 'String' rather than a URL, or 278 // BufferedReader ==> IOException will not be thrown. 279 280 catch (IOException ioe) 281 { throw new UnreachableError(ioe); } 282 } 283 284 /** 285 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 286 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 287 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 288 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 289 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 290 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 291 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 292 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 293 */ 294 public static Vector<HTMLNode> getPageTokens( 295 CharSequence html, boolean eliminateHTMLTags, 296 String rawHTMLFile, String matchesFile, String justTextFile 297 ) 298 throws IOException 299 { return parser.parse(html, eliminateHTMLTags, rawHTMLFile, matchesFile, justTextFile); } 300 301 /** 302 * Parses and Vectorizes HTML from a {@code CharSequence} (usually a {@code String}) source. 303 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_HTML> 304 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 305 * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG> 306 * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG> 307 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 308 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 309 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 310 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 311 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 312 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2> 313 */ 314 public static Vector<HTMLNode> getPageTokens( 315 CharSequence html, boolean eliminateHTMLTags, 316 String startTag, String endTag, 317 String rawHTMLFile, String matchesFile, String justTextFile 318 ) 319 throws IOException 320 { 321 String htmlStr = html.toString(); 322 323 int sPos = htmlStr.indexOf(startTag); 324 325 if (sPos == -1) throw new IllegalArgumentException 326 ("Passed String-Parameter 'startTag' [" + startTag + "] was not found in HTML."); 327 328 int ePos = htmlStr.indexOf(endTag, sPos); 329 330 if (ePos == -1) throw new IllegalArgumentException 331 ("Passed String-Parameter 'endTag' [" + endTag + "] was not found in HTML."); 332 333 ePos += endTag.length(); 334 335 return parser.parse( 336 htmlStr.substring(sPos, ePos), eliminateHTMLTags, 337 rawHTMLFile, matchesFile, justTextFile 338 ); 339 } 340 341 /** 342 * Convenience Method. 343 * <BR />Accepts: {@code CharSequence} 344 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 345 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 346 * int, int, String, String, String)} 347 */ 348 public static Vector<HTMLNode> getPageTokens( 349 CharSequence html, boolean eliminateHTMLTags, 350 int startLineNum, int endLineNum, 351 String rawHTMLFile, String matchesFile, String justTextFile 352 ) 353 throws IOException 354 { 355 return getPageTokens( 356 new BufferedReader(new StringReader(html.toString())), 357 eliminateHTMLTags, startLineNum, endLineNum, rawHTMLFile, matchesFile, justTextFile 358 ); 359 } 360 361 362 // ******************************************************************************************** 363 // ******************************************************************************************** 364 // The next 6 functions presume that the input is from a BufferedReader 365 // ******************************************************************************************** 366 // ******************************************************************************************** 367 368 369 /** 370 * Convenience Method. 371 * <BR />Accepts: {@code BufferedReader} 372 * <BR />Passes null to parameters 373 * {@code startTag, endTag, rawHTMLFile, matchesFile & justTextFile}. 374 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 375 * String, String, String, String, String)} 376 */ 377 public static Vector<HTMLNode> getPageTokens 378 (BufferedReader br, boolean eliminateHTMLTags) 379 throws IOException 380 { return getPageTokens(br, eliminateHTMLTags, null, null, null, null, null); } 381 382 /** 383 * Convenience Method. 384 * <BR />Accepts: {@code BufferedReader} 385 * <BR />And-Accepts: {@code 'startTag'} and {@code 'endTag'} 386 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 387 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 388 * String, String, String, String, String)} 389 */ 390 public static Vector<HTMLNode> getPageTokens 391 (BufferedReader br, boolean eliminateHTMLTags, String startTag, String endTag) 392 throws IOException 393 { return getPageTokens(br, eliminateHTMLTags, startTag, endTag, null, null, null); } 394 395 /** 396 * Convenience Method. 397 * <BR />Accepts: {@code BufferedReader} 398 * <BR />And-Accepts: {@code 'startLineNum'} and {@code 'endLineNum'} 399 * <BR />Passes null to parameters {@code rawHTMLFile, matchesFile & justTextFile}. 400 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 401 * int, int, String, String, String)} 402 */ 403 public static Vector<HTMLNode> getPageTokens 404 (BufferedReader br, boolean eliminateHTMLTags, int startLineNum, int endLineNum) 405 throws IOException 406 { return getPageTokens(br, eliminateHTMLTags, startLineNum, endLineNum, null, null, null); } 407 408 409 /** 410 * Convenience Method. 411 * <BR />Accepts: {@code BufferedReader} 412 * <BR />Passes null to {@code startTag} & {@code endTag} parameters. 413 * <BR />Invokes: {@link #getPageTokens(BufferedReader, boolean, 414 * String, String, String, String, String)} 415 */ 416 public static Vector<HTMLNode> getPageTokens( 417 BufferedReader br, boolean eliminateHTMLTags, 418 String rawHTMLFile, String matchesFile, String justTextFile 419 ) 420 throws IOException 421 { 422 return getPageTokens 423 (br, eliminateHTMLTags, null, null, rawHTMLFile, matchesFile, justTextFile); 424 } 425 426 427 // ******************************************************************************************** 428 // ******************************************************************************************** 429 // 430 // ******************************************************************************************** 431 // ******************************************************************************************** 432 433 434 /** 435 * Parses and Vectorizes HTML from a {@code BufferedReader} source. 436 * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR> 437 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 438 * @param startTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_TAG> 439 * @param endTag <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_TAG> 440 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 441 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 442 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 443 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 444 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 445 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX2> 446 */ 447 public static Vector<HTMLNode> getPageTokens( 448 BufferedReader br, boolean eliminateHTMLTags, 449 String startTag, String endTag, 450 String rawHTMLFile, String matchesFile, String justTextFile 451 ) 452 throws IOException 453 { 454 return parser.parse( 455 Scrape.getHTML(br, startTag, endTag), eliminateHTMLTags, rawHTMLFile, 456 matchesFile, justTextFile 457 ); 458 } 459 460 /** 461 * Parses and Vectorizes HTML from a {@code BufferedReader} source. 462 * @param br <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_BR> 463 * @param eliminateHTMLTags <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_ELIM_HT> 464 * @param startLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_START_LN> 465 * @param endLineNum <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_END_LN> 466 * @param rawHTMLFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RAW_HTML> 467 * @param matchesFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_MATCHES_F> 468 * @param justTextFile <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_JUST_TEXT> 469 * @return <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_RETURN> 470 * @throws IOException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IOEX> 471 * @throws IllegalArgumentException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_IAEX> 472 * @throws ScrapeException <EMBED CLASS='external-html' DATA-FILE-ID=HTML_PAGE_SCEX1> 473 */ 474 public static Vector<HTMLNode> getPageTokens( 475 BufferedReader br, boolean eliminateHTMLTags, 476 int startLineNum, int endLineNum, 477 String rawHTMLFile, String matchesFile, String justTextFile 478 ) 479 throws IOException 480 { 481 return parser.parse( 482 Scrape.getHTML(br, startLineNum, endLineNum), eliminateHTMLTags, 483 rawHTMLFile, matchesFile, justTextFile 484 ); 485 } 486}