001package Torello.HTML; 002 003import Torello.Java.*; 004 005import Torello.HTML.NodeSearch.InnerTagFind; // Used for an @see reference 006import Torello.HTML.NodeSearch.TagNodeFind; // Used in getBaseURL 007 008import Torello.Java.Additional.Ret2; 009import Torello.Java.Additional.Ret3; 010 011import Torello.JavaDoc.LinkJavaSource; 012import static Torello.JavaDoc.Entity.METHOD; 013 014import java.net.URL; 015import java.net.MalformedURLException; 016 017import java.util.Vector; 018import java.util.stream.IntStream; 019 020/** 021 * Utilities for de-referencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}. 022 * 023 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=LINKS> 024 * @see ReplaceNodes 025 * @see ReplaceFunction 026 * @see HTMLPage 027 * @see InnerTagFind 028 * @see Ret2 029 */ 030@Torello.JavaDoc.StaticFunctional 031public class Links 032{ 033 private Links() { } 034 035 /** 036 * List of documented "starter-strings" that are sometimes used in Anchor URL 037 * {@code 'HREF=...'} attributes. 038 * 039 * @see #NON_URL_HREFS 040 */ 041 protected static final String[] _NON_URL_HREFS = 042 { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" }; 043 044 /** 045 * This small method just returns the complete list of commonly found Anchor 046 * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.} This method 047 * actually returns a "clone" of an internally stored {@code String[]} Array. 
This is to 048 * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes 049 * is not changed, doctored or modified 050 * 051 * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'} 052 * @see #_NON_URL_HREFS 053 */ 054 public static String[] NON_URL_HREFS() 055 { return _NON_URL_HREFS.clone(); } 056 057 /** 058 * The methods in this class <I><B>will not automatically extract</I></B> any HTML 059 * {@code <BASE HREF=URL>} definitions that are found on this page. If the user wishes to 060 * dereference partial / relative {@code URL} definitions that exist on the input page, all the 061 * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this 062 * method should be utilized. 063 * 064 * @param page This may be any HTML page or partial page. If this page has a valid HTML 065 * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of 066 * {@code class URL}. 067 * 068 * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available 069 * within the input-page parameter {@code 'page'}. If the page provided does not contain a 070 * {@code BASE URL} definition, then null shall be returned. 071 * 072 * <BR /><BR /><DIV CLASS=JDHint> 073 * The HTML Specification clearly states that only one {@code URL} may be defined using the 074 * HTML Element {@code <BASE>}. Clearly, due to the browser wars, unspecified / 075 * non-deterministic behavior is possible if multiple definitions are provided. For the 076 * purposes of this class, if such a situation arises, an exception is thrown. 077 * </DIV> 078 * 079 * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of 080 * the element {@code <BASE HREF=URL>}, then this exception will throw. 
081 * 082 * @throws MalformedURLException If the {@code <BASE HREF=URL>} found / identified within the 083 * input page, but that {@code URL} is invalid, then this exception shall throw. 084 * 085 * @see TagNodeFind 086 * @see Attributes#retrieve(Vector, int[], String) 087 */ 088 public static URL getBaseURL(Vector<? extends HTMLNode> page) 089 throws MalformedHTMLException, MalformedURLException 090 { 091 int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base"); 092 093 if (posArr.length == 0) return null; 094 095 096 // NOTE: The cast is all right because 'posArr' only points to TagNode's 097 // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode> 098 // Above, there will be nothing in the 'posArr' if either of those was passed. 099 100 @SuppressWarnings("unchecked") 101 String[] urls = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href"); 102 103 boolean found = false; 104 String ret = null; 105 106 for (String url : urls) 107 if ((url != null) && (url.length() > 0)) 108 if (found) 109 throw new MalformedHTMLException( 110 "The page you have provided has multiple <BASE HREF=URL> definitions. " + 111 "However, the HTML Specifications state that pages may provide just one " + 112 "definition. If you wish to proceed, retrieve the definitions manually " + 113 "using class TagNodeFind.all and Attributes.retrieve, as explained in " + 114 "the JavaDoc pages for this class." 
115 ); 116 else 117 { 118 found = true; 119 ret = url; 120 } 121 122 return new URL(ret); 123 } 124 125 126 // ******************************************************************************************** 127 // ******************************************************************************************** 128 // Complete Vector-Resolve Methods - SRC-ATTRIBUTE 129 // ******************************************************************************************** 130 // ******************************************************************************************** 131 132 133 /** 134 * Convenience Method. 135 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 136 */ 137 public static Ret3<int[], int[], int[]> resolveAllSRC( 138 Vector<? super TagNode> html, URL sourcePage, SD quote, 139 boolean askForReturnArraysOrReturnNull 140 ) 141 { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 142 143 /** 144 * Convenience Method. 145 * <BR />Accepts: {@code DotPair}. 146 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 147 */ 148 public static Ret3<int[], int[], int[]> resolveAllSRC( 149 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 150 boolean askForReturnArraysOrReturnNull 151 ) 152 { 153 return resolveAllSRC 154 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 155 } 156 157 /** 158 * This method shall resolve all partial {@code URL} addresses that are found within 159 * {@code TagNode} elements having {@code 'SRC=...'} attributes. Each instance of 160 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'} 161 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 162 * with a new {@code TagNode} with a fully resolved {@code URL}. 
163 * 164 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 165 * 166 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 167 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 168 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 169 * 170 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 171 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 172 * 173 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 174 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 175 * choice would work just fine, without exceptions. 176 * 177 * <BR /><BR /><DIV CLASS=JDHint> 178 * <B>null may be passed to this parameter</B>, and if it is, the original quotation marks 179 * found in the {@code TagNode's 'SRC'} attribute will be reused. Passing null to this 180 * parameter should almost always be easiest, safest. 181 * </DIV> 182 * 183 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 184 * facilitate retrieving more information from this method - <I>if necessary</I>. When this 185 * parameter receives the following values: 186 * 187 * <BR /><BR /><UL CLASS=JDUL> 188 * 189 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 190 * <B>{@code Returns:}</B> section of this method's documentation. 191 * </LI> 192 * 193 * <LI><B>FALSE:</B> This method shall return null.</LI> 194 * </UL> 195 * 196 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 197 * {@code FALSE}, this method shall return null. 
Otherwise, (if passed {@code TRUE}), then 198 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 199 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 200 * 201 * <BR /><BR /> 202 * Three arrays are returned as a result of this method's invocation. Keep in mind that 203 * though the information might be superfluous, rejecting these arrays away is easy. 204 * They are provided as a matter of convenience for cases where more details information is 205 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 206 * 207 * <BR /><BR /><OL CLASS=JDOL> 208 * 209 * <LI> {@code Ret3.a (int[])} 210 * <BR /><BR /> 211 * The first {@code int[] array} shall contain a list of the index of every 212 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 213 * </I> a non-null HTML {@code 'SRC'} Attribute. 214 * <BR /><BR /> 215 * </LI> 216 * 217 * <LI> {@code Ret3.b (int[])} 218 * <BR /><BR /> 219 * The second {@code int[] array} will contain an index-list of the indices 220 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 221 * internal-resolve logic. 222 * <BR /><BR /> 223 * </LI> 224 * 225 * <LI> {@code Ret3.c (int[])} 226 * <BR /><BR /> 227 * The third {@code int[] array} will contain an index-list of the indices 228 * which contained {@code TagNode's} whose {@code 'SRC=...'} attribute 229 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 230 * {@code QuotesException} to throw. 231 * </LI> 232 * 233 * </OL> 234 * 235 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 236 * @see #resolve(String, URL) 237 * @see TagNode#AV(String) 238 * @see TagNode#setAV(String, String, SD) 239 */ 240 public static Ret3<int[], int[], int[]> resolveAllSRC( 241 Vector<? 
super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 242 boolean askForReturnArraysOrReturnNull 243 ) 244 { 245 // Retrieve the Vector-location of any TagNode on the page that has 246 // a "SRC=..." attribute. These are almost always HTML <IMG> elements. 247 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 248 // The @SuppressWarnings is to overcome the cast of 'html' 249 250 @SuppressWarnings("unchecked") 251 int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src"); 252 253 254 // Java Stream's are convenient for keeping "Growing Lists" of return values. 255 // This builder shall keep a list of all URL's that failed to update - for any reason 256 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 257 258 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 259 ? IntStream.builder() 260 : null; 261 262 263 // This stream will keep a list of all URL's that were updated, and whose TagNode's 264 // were replaced inside the input HTML Vector 265 266 IntStream.Builder replaced = askForReturnArraysOrReturnNull 267 ? IntStream.builder() 268 : null; 269 270 for (int pos : hasSrcPosArr) 271 { 272 // Get the node at the index 273 TagNode tn = (TagNode) html.elementAt(pos); 274 275 276 // 1) Retrieve the SRC Attribute 277 // 2) if it is a partial-URL resolve it 278 // 3) Convert to a String 279 280 String oldURL = tn.AV("src"); 281 URL newURL = resolve(oldURL, sourcePage); 282 283 284 // Some URL's cannot be resolved, if so, just skip this TagNode. 285 // Log the index to the stream (if requested), and continue. 286 287 if (newURL == null) 288 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 289 290 291 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 292 // No logging needed here, the URL was *already* resolved... 
293 294 if (oldURL.length() == newURL.toString().length()) continue; 295 296 297 // Replace the SRC Attribute in the TagNode. This builds a new instance of TagNode 298 // If there is an exception, log the index to the stream (if requested), and continue. 299 300 try 301 { tn = tn.setAV("src", newURL.toString(), quote); } 302 303 catch (QuotesException qex) 304 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 305 306 // Replace the index in the Vector containing the old TagNode with the new one. 307 html.setElementAt(tn , pos); 308 309 310 // The Vector-Index at this position had it's old TagNode removed and replaced with a 311 // new updated one. Log this to the stream-list so to allow the user to know. 312 313 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 314 } 315 316 return askForReturnArraysOrReturnNull 317 318 ? new Ret3<int[], int[], int[]> 319 (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 320 : null; 321 } 322 323 324 // ******************************************************************************************** 325 // ******************************************************************************************** 326 // Complete Vector-Resolve Methods - HREF-ATTRIBUTE 327 // ******************************************************************************************** 328 // ******************************************************************************************** 329 330 331 /** 332 * Convenience Method. 333 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 334 */ 335 public static Ret3<int[], int[], int[]> resolveAllHREF( 336 Vector<? super TagNode> html, URL sourcePage, SD quote, 337 boolean askForReturnArraysOrReturnNull 338 ) 339 { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 340 341 /** 342 * Convenience Method. 343 * <BR />Accepts: {@code DotPair}. 
344 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 345 */ 346 public static Ret3<int[], int[], int[]> resolveAllHREF( 347 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 348 boolean askForReturnArraysOrReturnNull 349 ) 350 { 351 return resolveAllHREF 352 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 353 } 354 355 /** 356 * This method shall resolve all partial {@code URL} addresses that are found within 357 * {@code TagNode} elements having {@code 'HREF=...'} attributes. Each instance of 358 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'} 359 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 360 * with a new {@code TagNode} with a fully resolved {@code URL}. 361 * 362 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 363 * 364 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 365 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 366 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 367 * 368 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 369 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 370 * 371 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 372 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 373 * choice would work just fine, without exceptions. 374 * 375 * <BR /><BR /><DIV CLASS=JDHint> 376 * <B>null may be passed to this parameter</B>, and if it is the original quotation marks 377 * found in the {@code TagNode's 'HREF'} attribute will be reused. Passing null to this 378 * parameter should almost always be easiest, safest. 
379 * </DIV> 380 * 381 * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to 382 * facilitate retrieving more information from this method - <I>if necessary</I>. When this 383 * parameter receives the following values: 384 * 385 * <BR /><BR /><UL CLASS=JDUL> 386 * 387 * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the 388 * <B>{@code Returns:}</B> section of this method's documentation. 389 * </LI> 390 * 391 * <LI><B>FALSE:</B> This method shall return null. </LI> 392 * </UL> 393 * 394 * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed 395 * {@code FALSE}, this method shall return null. Otherwise, (if passed {@code TRUE}), then 396 * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is 397 * <I>returning three separate integer-arrays about what was found, and what has occurred.</I> 398 * 399 * <BR /><BR /> 400 * Three arrays are returned as a result of this method's invocation. Keep in mind that 401 * though the information might be superfluous, discarding these arrays is easy. 402 * They are provided as a matter of convenience for cases where more detailed information is 403 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 404 * 405 * <BR /><BR /><OL CLASS=JDOL> 406 * 407 * <LI> {@code Ret3.a (int[])} 408 * <BR /><BR /> 409 * The first {@code int[] array} shall contain a list of the index of every 410 * {@code TagNode} in the input-{@code Vector} parameter's range that <I><B>contained</B> 411 * </I> a non-null HTML {@code 'HREF'} Attribute. 412 * <BR /><BR /> 413 * </LI> 414 * 415 * <LI> {@code Ret3.b (int[])} 416 * <BR /><BR /> 417 * The second {@code int[] array} will contain an index-list of the indices 418 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 419 * internal-resolve logic. 
420 * <BR /><BR /> 421 * </LI> 422 * 423 * <LI> {@code Ret3.c (int[])} 424 * <BR /><BR /> 425 * The third {@code int[] array} will contain an index-list of the indices 426 * which contained {@code TagNode's} whose {@code 'HREF=...'} attribute 427 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 428 * {@code QuotesException} to throw. 429 * </LI> 430 * 431 * </OL> 432 * 433 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 434 * @see #resolve(String, URL) 435 * @see TagNode#AV(String) 436 * @see TagNode#setAV(String, String, SD) 437 */ 438 public static Ret3<int[], int[], int[]> resolveAllHREF( 439 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 440 boolean askForReturnArraysOrReturnNull 441 ) 442 { 443 // Retrieve the Vector-location of any TagNode on the page that has 444 // a "HREF=..." attribute. These are almost always HTML <IMG> elements. 445 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 446 // The @SuppressWarnings is to overcome the cast of 'html' 447 448 @SuppressWarnings("unchecked") 449 int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href"); 450 451 452 // Java Stream's are convenient for keeping "Growing Lists" of return values. 453 // This builder shall keep a list of all URL's that failed to update - for any reason 454 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 455 456 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 457 ? IntStream.builder() 458 : null; 459 460 461 // This stream will keep a list of all URL's that were updated, and whose TagNode's 462 // were replaced inside the input HTML Vector 463 464 IntStream.Builder replaced = askForReturnArraysOrReturnNull 465 ? 
IntStream.builder() 466 : null; 467 468 for (int pos : hasHRefPosArr) 469 { 470 // Get the node at the index 471 TagNode tn = (TagNode) html.elementAt(pos); 472 473 474 // 1) Retrieve the HREF Attribute 475 // 2) if it is a partial-URL resolve it 476 // 3) Convert to a String 477 478 String oldURL = tn.AV("HREF"); 479 URL newURL = resolve(oldURL, sourcePage); 480 481 482 // Some URL's cannot be resolved, if so, just skip this TagNode. 483 // Log the index to the stream (if requested), and continue. 484 485 if (newURL == null) 486 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 487 488 489 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 490 // No logging needed here, the URL was *already* resolved... 491 492 if (oldURL.length() == newURL.toString().length()) continue; 493 494 495 // Replace the HREF Attribute in the TagNode. This builds a new instance of TagNode 496 // If there is an exception, log the index to the stream (if requested), and continue. 497 498 try 499 { tn = tn.setAV("href", newURL.toString(), quote); } 500 501 catch (QuotesException qex) 502 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 503 504 // Replace the index in the Vector containing the old TagNode with the new one. 505 html.setElementAt(tn , pos); 506 507 508 // The Vector-Index at this position had it's old TagNode removed and replaced with a 509 // new updated one. Log this to the stream-list so to allow the user to know. 510 511 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 512 } 513 514 return askForReturnArraysOrReturnNull 515 516 ? 
new Ret3<int[], int[], int[]> 517 (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 518 : null; 519 } 520 521 522 // ******************************************************************************************** 523 // ******************************************************************************************** 524 // Resolve, Not Keep Exceptions 525 // ******************************************************************************************** 526 // ******************************************************************************************** 527 528 529 /** 530 * Convenience Method. 531 * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}. 532 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 533 */ 534 public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage) 535 { 536 URL url = resolveHREF(tnWithHREF, sourcePage); 537 538 return (url == null) 539 ? null 540 : tnWithHREF.setAV("href", url.toString(), null); 541 } 542 543 544 /** 545 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 546 * (attribute). 547 * 548 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 549 * 550 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 551 * (possibly-relative) {@code URL} will be resolved. 552 * 553 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 554 * directory. Null is returned if attempting to build the {@code URL} generated a 555 * {@code MalformedURLException}. 556 * 557 * <BR /><BR /><DIV CLASS=JDHint> 558 * <B>SPECIFICALLY:</B> This method shall catch all {@code MalformedURLException's}. 559 * </DIV> 560 * 561 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 562 * not actually contain an {@code HREF} attribute, then this exception shall throw. 
563 * 564 * @see #resolve(String, URL) 565 * @see TagNode#AV(String) 566 */ 567 public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage) 568 { 569 String href = tnWithHREF.AV("href"); 570 571 if (href == null) throw new HREFException( 572 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 573 "HREF attribute." 574 ); 575 576 return resolve(href, sourcePage); 577 } 578 579 580 /** 581 * Convenience Method. 582 * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 583 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 584 */ 585 public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage) 586 { 587 URL url = resolveSRC(tnWithSRC, sourcePage); 588 589 return (url == null) 590 ? null 591 : tnWithSRC.setAV("src", url.toString(), null); 592 } 593 594 595 /** 596 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 597 * (attribute). 598 * 599 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 600 * 601 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 602 * (possibly-relative) {@code URL} will be resolved. 603 * 604 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 605 * directory. Null is returned if attempting to build the {@code URL} generated a 606 * {@code MalformedURLException}. 607 * 608 * <BR /><BR /><DIV CLASS=JDHint> 609 * <B>SPECIFICALLY:</B> This method shall catch all {@code MalformedURLException's}. 610 * </DIV> 611 * 612 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 613 * actually contain a {@code SRC} attribute, then this exception shall throw. 
614 * 615 * @see #resolve(String, URL) 616 * @see TagNode#AV(String) 617 */ 618 public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage) 619 { 620 String src = tnWithSRC.AV("src"); 621 622 if (src == null) throw new SRCException( 623 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 624 "SRC attribute." 625 ); 626 627 return resolve(src, sourcePage); 628 } 629 630 /** 631 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 632 * inner-tag (attribute). 633 * 634 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 635 * 636 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 637 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 638 * 639 * @return A list of {@code URL's}, each of which have been completed/resolved with the 640 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 641 * result in a null value in the {@code Vector}. 642 * 643 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 644 * 645 * @see #resolve(String, URL) 646 * @see TagNode#AV(String) 647 */ 648 public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage) 649 { 650 Vector<URL> ret = new Vector<>(); 651 652 for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage)); 653 654 return ret; 655 } 656 657 658 /** 659 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 660 * inner-tag (attribute). 661 * 662 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 663 * 664 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 665 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 666 * 667 * @return A list of {@code URL's}, each of which have been completed/resolved with the 668 * {@code 'sourcePage'} parameter. 
Any {@code TagNode} which generated an exception, will 669 * result in a null value in the {@code Vector.} 670 * 671 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 672 * 673 * @see #resolve(String, URL) 674 * @see TagNode#AV(String) 675 */ 676 public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage) 677 { 678 Vector<URL> ret = new Vector<>(); 679 680 for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage)); 681 682 return ret; 683 } 684 685 686 /** 687 * This will use a "pointer array" - an array containing indexes into the downloaded page to 688 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - 689 * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}. 690 * 691 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 692 * 693 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 694 * 695 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 696 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 697 * are usually returned from the {@code package 'NodeSearch'} "Find" methods. 698 * 699 * <DIV CLASS="EXAMPLE">{@code 700 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 701 * // integer-indices into the vectorized-html variable 'page' 702 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 703 * 704 * // Extract each HREF inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 705 * // if the URL is only partially-resolved 706 * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage); 707 * }</DIV> 708 * 709 * <BR /><I>which would obtain a pointer-array / (a.k.a. 
a "vector-index-array") to every HTML 710 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 711 * {@code 'html'}, and then resolve any shortened {@code URL's}. 712 * 713 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 714 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 715 * 716 * @return A list of {@code URL's}, each of which have been completed/resolved with the 717 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 718 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 719 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 720 * this mistake shall generate {@code TagNodeExpectedException's}. 721 * 722 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 723 * 724 * @throws ArrayIndexOutOfBoundsException 725 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 726 * 727 * @throws OpeningTagNodeExpectedException 728 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 729 * 730 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 731 * 732 * @see #resolve(String, URL) 733 * @see TagNode#AV(String) 734 */ 735 public static Vector<URL> resolveHREFs 736 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 737 { 738 // Return Vector 739 Vector<URL> ret = new Vector<>(); 740 741 for (int nodePos : nodePosArr) 742 { 743 HTMLNode n = html.elementAt(nodePos); 744 745 // Must be an HTML TagNode 746 if (! 
n.isTagNode()) throw new TagNodeExpectedException(nodePos); 747 748 TagNode tn = (TagNode) n; 749 750 // Must be an "Opening" HTML TagNode 751 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 752 753 // Resolve the 'HREF', save the URL 754 ret.addElement(resolve(tn.AV("href"), sourcePage)); 755 } 756 757 return ret; 758 } 759 760 761 /** 762 * This will use a "pointer array" - an array containing indexes into the downloaded page to 763 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - must 764 * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}. 765 * 766 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 767 * 768 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 769 * 770 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 771 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 772 * usually returned from the {@code package 'NodeSearch'} "Find" methods. 773 * 774 * <DIV CLASS="EXAMPLE">{@code 775 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 776 * // integer-indices into the vectorized-html variable 'page' 777 * 778 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 779 * 780 * // Extract each SRC inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 781 * // if the URL is only partially-resolved 782 * 783 * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage); 784 * }</DIV> 785 * 786 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 787 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 788 * {@code 'html'}, and then resolve any shorted image {@code URL's}. 
789 * 790 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 791 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 792 * 793 * @return A list of {@code URL's}, each of which have been completed/resolved with the 794 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 795 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 796 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 797 * this mistake shall generate {@code TagNodeExpectedException's}. 798 * 799 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 800 * 801 * @throws ArrayIndexOutOfBoundsException 802 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 803 * 804 * @throws OpeningTagNodeExpectedException 805 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 806 * 807 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 808 * 809 * @see #resolve(String, URL) 810 * @see TagNode#AV(String) 811 */ 812 public static Vector<URL> resolveSRCs 813 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 814 { 815 // Return Vector 816 Vector<URL> ret = new Vector<>(); 817 818 for (int nodePos : nodePosArr) 819 { 820 HTMLNode n = html.elementAt(nodePos); 821 822 // Must be an HTML TagNode 823 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 824 825 TagNode tn = (TagNode) n; 826 827 // Must be an "Opening" HTML TagNode 828 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 829 830 // Resolve the "SRC", save the URL 831 ret.addElement(resolve(tn.AV("src"), sourcePage)); 832 } 833 834 return ret; 835 } 836 837 838 /** 839 * This will convert <I><B>a list of </B></I> simple java {@code String's} to a 840 * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the 841 * {@code 'sourcePage'} parameter. 
842 * 843 * @param src a list of strings - usually partially or totally completed Internet {@code URL's} 844 * 845 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 846 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 847 * 848 * @return A list of {@code URL's}, each of which have been completed/resolved with the 849 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 850 * null, then null is returned in the related {@code Vector} position. If any 851 * {@code TagNode} causes a {@code MalformedURLException}, then that position in the 852 * {@code Vector} will be null. 853 * 854 * @see #resolve(String, URL) 855 */ 856 public static Vector<URL> resolve(Vector<String> src, URL sourcePage) 857 { 858 Vector<URL> ret = new Vector<>(); 859 860 for (String s : src) ret.addElement(resolve(s, sourcePage)); 861 862 return ret; 863 } 864 865 /** 866 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 867 * information using the {@code 'sourcePage'} parameter. 868 * 869 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 870 * needs to be "completed." 871 * 872 * @param sourcePage This is the source page {@code URL} from which the String 873 * (possibly-relative) {@code URL} will be resolved. 874 * 875 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 876 * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also 877 * return null. If a {@code MalformedURLException} is generated, null will also be returned. 878 */ 879 public static URL resolve(String src, URL sourcePage) 880 { 881 if (sourcePage == null) throw new NullPointerException( 882 "Though you may provide null to the partial-URL to dereference parameter, null " + 883 "may not be passed to the Source-Page Parameter. 
The purpose of the 'resolve' " + 884 "operation is to resolve partial-URLs against a source-page (root) URL. " + 885 "Therefore this is not allowed." 886 ); 887 888 if (src == null) return null; 889 890 src = src.trim(); 891 892 if (src.length() == 0) return null; 893 894 String srcLC = src.toLowerCase(); 895 896 if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null; 897 898 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 899 900 try 901 { return new URL(src); } 902 903 catch (MalformedURLException e) { return null; } 904 905 if (src.startsWith("//") && (src.charAt(3) != '/')) 906 907 try 908 { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); } 909 910 catch (MalformedURLException e) { return null; } 911 912 if (src.startsWith("/")) 913 914 try 915 { 916 return new URL( 917 sourcePage.getProtocol().toLowerCase() + "://" + 918 sourcePage.getHost().toLowerCase() + 919 src 920 ); 921 } 922 923 catch (MalformedURLException e) { return null; } 924 925 if (src.startsWith("../")) 926 { 927 String sourcePageStr = sourcePage.toString(); 928 short nLevels = 0; 929 930 do { nLevels++; src = src.substring(3); } 931 while (src.startsWith("../")); 932 933 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 934 935 try { return new URL(directory + src); } 936 catch (Exception e) { return null; } 937 } 938 939 String root = 940 sourcePage.getProtocol().toLowerCase() + "://" + 941 sourcePage.getHost().toLowerCase(); 942 943 String path = sourcePage.getPath().trim(); 944 int pos = StringParse.findLastFrontSlashPos(path); 945 946 if (pos == -1) throw new StringIndexOutOfBoundsException( 947 "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " + 948 "front-slash character in it's path. Cannot proceed resolving relative-URL's " + 949 "without this." 
950 ); 951 952 path = path.substring(0, pos + 1); 953 954 try { return new URL(root + path + src); } 955 catch (MalformedURLException e) { return null; } 956 } 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 // ******************************************************************************************** 979 // ******************************************************************************************** 980 // Resolve, KE - Keep Exceptions 981 // ******************************************************************************************** 982 // ******************************************************************************************** 983 984 985 /** 986 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 987 * (attribute). 988 * 989 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 990 * 991 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 992 * 993 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 994 * (possibly-relative) {@code URL} will be resolved. 995 * 996 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 997 * directory. If there were no {@code HREF} tag, then null is returned. If 998 * the {@code TagNode} causes a {@code MalformedURLException}, that is returned in 999 * {@code Ret2.b} 1000 * 1001 * <BR /><BR /><DIV CLASS=JDHint> 1002 * <B>SPECIFICALLY:</B> This method shall catch all {@code MalformedURLException's}. 1003 * </DIV> 1004 * 1005 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1006 * 1007 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 1008 * not actually contain an {@code HREF} attribute, then this exception shall throw. 
1009 * 1010 * @see #resolve_KE(String, URL) 1011 * @see TagNode#AV(String) 1012 * @see Ret2 1013 */ 1014 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1015 public static Ret2<URL, MalformedURLException> resolveHREF_KE 1016 (TagNode tnWithHREF, URL sourcePage) 1017 { 1018 String href = tnWithHREF.AV("href"); 1019 1020 if (href == null) throw new HREFException( 1021 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 1022 "HREF attribute." 1023 ); 1024 1025 return LinksResolve_KE.resolve(href, sourcePage); 1026 } 1027 1028 1029 /** 1030 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 1031 * (attribute). 1032 * 1033 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1034 * 1035 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 1036 * 1037 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1038 * (possibly-relative) {@code URL} will be resolved. 1039 * 1040 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 1041 * directory. If there were no {@code SRC} tag, then null is returned. If the 1042 * {@code TagNode} causes a {@code MalformedURLException}, that is returned in {@code Ret2.b} 1043 * 1044 * <BR /><BR /><DIV CLASS=JDHint> 1045 * <B>SPECIFICALLY:</B> This method shall catch all {@code MalformedURLException's}. 1046 * </DIV> 1047 * 1048 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1049 * 1050 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 1051 * actually contain a {@code SRC} attribute, then this exception shall throw. 
1052 * 1053 * @see #resolve_KE(String, URL) 1054 * @see TagNode#AV(String) 1055 * @see Ret2 1056 */ 1057 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1058 public static Ret2<URL, MalformedURLException> resolveSRC_KE 1059 (TagNode tnWithSRC, URL sourcePage) 1060 { 1061 String src = tnWithSRC.AV("src"); 1062 1063 if (src == null) throw new SRCException( 1064 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 1065 "SRC attribute." 1066 ); 1067 1068 return LinksResolve_KE.resolve(src, sourcePage); 1069 } 1070 1071 1072 /** 1073 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 1074 * inner-tag (attribute). 1075 * 1076 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1077 * 1078 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 1079 * 1080 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1081 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1082 * 1083 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1084 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1085 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1086 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1087 * exception in {@code Ret2.b} 1088 * 1089 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1090 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1091 * 1092 * @see #resolve_KE(String, URL) 1093 * @see TagNode#AV(String) 1094 * @see Ret2 1095 */ 1096 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1097 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1098 (Iterable<TagNode> tnListWithHREF, URL sourcePage) 1099 { 1100 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1101 1102 for (TagNode tn : tnListWithHREF) 1103 ret.addElement(LinksResolve_KE.resolve(tn.AV("href"), sourcePage)); 1104 1105 return ret; 1106 } 1107 1108 1109 /** 1110 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 1111 * inner-tag (attribute). 1112 * 1113 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1114 * 1115 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 1116 * 1117 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1118 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1119 * 1120 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1121 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1122 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1123 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1124 * exception in {@code Ret2.b} 1125 * 1126 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1127 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1128 * 1129 * @see #resolve_KE(String, URL) 1130 * @see TagNode#AV(String) 1131 * @see Ret2 1132 */ 1133 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1134 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1135 (Iterable<TagNode> tnListWithSRC, URL sourcePage) 1136 { 1137 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1138 1139 for (TagNode tn : tnListWithSRC) 1140 ret.addElement(LinksResolve_KE.resolve(tn.AV("src"), sourcePage)); 1141 1142 return ret; 1143 } 1144 1145 1146 /** 1147 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1148 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1149 * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}. 1150 * 1151 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 1152 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1153 * 1154 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 1155 * 1156 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1157 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 1158 * are usually return from the {@code package 'NodeSearch'} "Find" methods. 1159 * 1160 * <DIV CLASS="EXAMPLE">{@code 1161 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 1162 * // integer-indices into the vectorized-html variable 'page' 1163 * 1164 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 1165 * 1166 * // Extract each HREF inner-tag, and construct a URL. 
Use the 'sourcePage' parameter if 1167 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, the 1168 * // method shall not crash, but save the exception instead. 1169 * 1170 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1171 * Links.resolveHREFs_KE(page, picturePosArr, mySourcePage); 1172 * 1173 * // Print out any "failed" urls 1174 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1175 * if (r.b != null) 1176 * System.out.println("There was an exception: " + r.b.toString()); 1177 * }</DIV> 1178 * 1179 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1180 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1181 * {@code 'html'}., and then resolve any shortened {@code URL's}. 1182 * 1183 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1184 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1185 * 1186 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1187 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1188 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1189 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1190 * exception in {@code Ret2.b} 1191 * 1192 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1193 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1194 * 1195 * @throws ArrayIndexOutOfBoundsException 1196 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1197 * 1198 * @throws OpeningTagNodeExpectedException 1199 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1200 * 1201 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1202 * 1203 * @see #resolve_KE(String, URL) 1204 * @see TagNode#AV(String) 1205 * @see Ret2 1206 */ 1207 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1208 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1209 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1210 { 1211 // Return Vector 1212 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1213 1214 for (int nodePos : nodePosArr) 1215 { 1216 HTMLNode n = html.elementAt(nodePos); 1217 1218 // Must be an HTML TagNode 1219 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1220 1221 TagNode tn = (TagNode) n; 1222 1223 // Must be an "Opening" HTML TagNode 1224 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1225 1226 // Resolve the "HREF", keep the URL 1227 ret.addElement(LinksResolve_KE.resolve(tn.AV("href"), sourcePage)); 1228 } 1229 1230 return ret; 1231 } 1232 1233 /** 1234 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1235 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1236 * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}. 
1237 * 1238 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 1239 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1240 * 1241 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 1242 * 1243 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1244 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 1245 * usually return from the {@code package 'NodeSearch'} "Find" methods. 1246 * 1247 * <DIV CLASS="EXAMPLE">{@code 1248 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 1249 * // integer-indices into the vectorized-html variable 'page' 1250 * 1251 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 1252 * 1253 * // Extract each SRC inner-tag, and construct a URL. Use the 'sourcePage' parameter if 1254 * // the URL is only partially-resolved. If any URL's on the original-page are invalid, 1255 * // the method shall not crash, but save the exception instead. 1256 * 1257 * Vector<Ret2<URL, MalformedURLException> urlsWithEx = 1258 * Links.resolveSRCs_KE(page, picturePosArr, mySourcePage); 1259 * 1260 * // Print out any "failed" urls 1261 * for (Ret2<URL, MalformedURLException> r : urlsWithEx) 1262 * if (r.b != null) 1263 * System.out.println("There was an exception: " + r.b.toString()); 1264 * }</DIV> 1265 * 1266 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 1267 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 1268 * {@code 'html'}, and then resolve any shortened {@code URL's}. 1269 * 1270 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1271 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 
1272 * 1273 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1274 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1275 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1276 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1277 * exception in {@code Ret2.b} 1278 * 1279 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1280 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1281 * 1282 * @throws ArrayIndexOutOfBoundsException 1283 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1284 * 1285 * @throws OpeningTagNodeExpectedException 1286 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1287 * 1288 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1289 * 1290 * @see #resolve_KE(String, URL) 1291 * @see TagNode#AV(String) 1292 * @see Ret2 1293 */ 1294 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1295 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1296 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1297 { 1298 // Return Vector 1299 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1300 1301 for (int nodePos : nodePosArr) 1302 { 1303 HTMLNode n = html.elementAt(nodePos); 1304 1305 // Must be an HTML TagNode 1306 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1307 1308 TagNode tn = (TagNode) n; 1309 1310 // Must be an "Opening" HTML TagNode 1311 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1312 1313 // Resolve "SRC" and keep URL's 1314 ret.addElement(LinksResolve_KE.resolve(tn.AV("src"), sourcePage)); 1315 } 1316 1317 return ret; 1318 } 1319 1320 /** 1321 * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}. 
1322 * 1323 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1324 * 1325 * @param src a list of {@code String's} - usually partially or totally completed Internet 1326 * {@code URL's} 1327 * 1328 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 1329 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1330 * 1331 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1332 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 1333 * null, then null is returned in the related {@code Vector} position. If any {@code TagNode} 1334 * causes a {@code MalformedURLException}, then that position in the {@code Vector} will 1335 * contain the exception in {@code Ret2.b} 1336 * 1337 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1338 * 1339 * @see #resolve_KE(String, URL) 1340 * @see Ret2 1341 */ 1342 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1343 public static Vector<Ret2<URL, MalformedURLException>> resolve_KE 1344 (Vector<String> src, URL sourcePage) 1345 { 1346 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1347 1348 for (String s : src) 1349 ret.addElement(LinksResolve_KE.resolve(s, sourcePage)); 1350 1351 return ret; 1352 } 1353 1354 /** 1355 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 1356 * information using the {@code 'sourcePage'} parameter. 1357 * 1358 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1359 * 1360 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 1361 * needs to be "completed." 1362 * 1363 * @param sourcePage This is the source page {@code URL} from which the String (possibly 1364 * relative) {@code URL} will be resolved. 1365 * 1366 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 1367 * parameter. 
If parameter {@code 'src'} is null or zero-length, null will be returned.  If a
     * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>}
     * result.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2>
     *
     * @see Ret2
     */
    @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve")
    public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage)
    {
        // Pure delegation: LinksResolve_KE.resolve builds the returned Ret2 instance
        final Ret2<URL, MalformedURLException> result = LinksResolve_KE.resolve(src, sourcePage);

        return result;
    }
}