package Torello.HTML;

import Torello.Java.*;

import Torello.HTML.NodeSearch.InnerTagFind; // Used in resolveAllSRC / resolveAllHREF, and in an @see
import Torello.HTML.NodeSearch.TagNodeFind; // Used in getBaseURL

import Torello.Java.Additional.Ret2;
import Torello.Java.Additional.Ret3;

import Torello.JavaDoc.LinkJavaSource;
import static Torello.JavaDoc.Entity.METHOD;

import java.net.URL;
import java.net.MalformedURLException;

import java.util.Vector;
import java.util.stream.IntStream;

/**
 * Utilities for de-referencing 'partially-completed' {@code URL's} in a Web-Page {@code Vector}.
 * 
 * <BR /><BR /><EMBED CLASS='external-html' DATA-FILE-ID=LINKS>
 * @see ReplaceNodes
 * @see ReplaceFunction
 * @see HTMLPage
 * @see InnerTagFind
 * @see Ret2
 */
@Torello.JavaDoc.StaticFunctional
public class Links
{
    // Private constructor: the class is used only through its static members.
    private Links() { }

    /**
     * List of documented "starter-strings" that are sometimes used in Anchor URL
     * {@code 'HREF=...'} attributes.
     *
     * <BR /><BR />The array itself is never handed out; {@link #NON_URL_HREFS()} returns a clone.
     *
     * <BR /><BR />NOTE(review): {@code "blog:"} may have been intended as {@code "blob:"} -
     * confirm before changing, since code outside this view may depend on the exact entries.
     * 
     * @see #NON_URL_HREFS
     */
    protected static final String[] _NON_URL_HREFS =
        { "tel:", "magnet:", "javascript:", "mailto:", "ftp:", "file:", "data:", "blog:", "#" };

    /**
     * This small method just returns the complete list of commonly found Anchor
     * {@code 'HREF' String's} that do not actually constitute an HTML {@code 'URL'.} This method
     * actually returns a "clone" of an internally stored {@code String[]} Array.
This is to 048 * protect and make sure that the list of potential HTML Anchor-Tag {@code 'HREF'} Attributes 049 * is not changed, doctored or modified 050 * 051 * @return A clone of the {@code String}-array {@code '_NON_URL_HREFS'} 052 * 053 * @see #_NON_URL_HREFS 054 */ 055 public static String[] NON_URL_HREFS() 056 { return _NON_URL_HREFS.clone(); } 057 058 /** 059 * The methods in this class <I><B>will not automatically extract</I></B> any HTML 060 * {@code <BASE HREF=URL>} definitions that are found on this page. If the user wishes to 061 * dereference partial / relative {@code URL} definitions that exist on the input page, all the 062 * while respecting any {@code <BASE HREF=URL>} definitions found on the input page, then this 063 * method should be utilized. 064 * 065 * @param page This may be any HTML page or partial page. If this page has a valid HTML 066 * {@code <BASE HREF=URL>}, it will be extracted and returned as an instance of 067 * {@code class URL}. 068 * 069 * @return This shall return the HTML {@code <BASE HREF="http://...">} element found available 070 * within the input-page parameter {@code 'page'}. If the page provided does not contain a 071 * {@code BASE URL} definition, then null shall be returned. 072 * 073 * <BR /><BR /><B>NOTE:</B> The HTML Specification clearly states that only one {@code URL} 074 * may be defined using the HTML Element {@code <BASE>}. Clearly, due to the browser wars, 075 * unspecified / non-deterministic behavior is possible if multiple definitions are provided. 076 * For the purposes of this class, if such a situation arises, an exception is thrown. 077 * 078 * @throws MalformedHTMLException If the HTML page provided contains multiple definitions of 079 * the element {@code <BASE HREF=URL>}, then this exception will throw. 080 * 081 * @throws MalformedURLException If the {@code <BASE HREF=URL>} found / identified within the 082 * input page, but that {@code URL} is invalid, then this exception shall throw. 
083 * 084 * @see TagNodeFind 085 * @see Attributes#retrieve(Vector, int[], String) 086 */ 087 public static URL getBaseURL(Vector<? extends HTMLNode> page) 088 throws MalformedHTMLException, MalformedURLException 089 { 090 int[] posArr = TagNodeFind.all(page, TC.OpeningTags, "base"); 091 092 if (posArr.length == 0) return null; 093 094 // NOTE: The cast is all right because 'posArr' only points to TagNode's 095 // Attributes expects to avoid processing Vector<TextNode>, and Vector<CommentNode> 096 // Above, there will be nothing in the 'posArr' if either of those was passed. 097 098 @SuppressWarnings("unchecked") 099 String[] urls = Attributes.retrieve((Vector<HTMLNode>) page, posArr, "href"); 100 101 boolean found = false; 102 String ret = null; 103 104 for (String url : urls) 105 if ((url != null) && (url.length() > 0)) 106 if (found) 107 throw new MalformedHTMLException( 108 "The page you have provided has multiple <BASE HREF=URL> definitions. " + 109 "However, the HTML Specifications state that pages may provide just one " + 110 "definition. If you wish to proceed, retrieve the definitions manually " + 111 "using class TagNodeFind.all and Attributes.retrieve, as explained in " + 112 "the JavaDoc pages for this class." 113 ); 114 else 115 { 116 found = true; 117 ret = url; 118 } 119 120 return new URL(ret); 121 } 122 123 124 // ******************************************************************************************** 125 // ******************************************************************************************** 126 // Complete Vector-Resolve Methods - SRC-ATTRIBUTE 127 // ******************************************************************************************** 128 // ******************************************************************************************** 129 130 131 /** 132 * Convenience Method. 
133 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 134 */ 135 public static Ret3<int[], int[], int[]> resolveAllSRC( 136 Vector<? super TagNode> html, URL sourcePage, SD quote, 137 boolean askForReturnArraysOrReturnNull 138 ) 139 { return resolveAllSRC(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 140 141 /** 142 * Convenience Method. 143 * <BR />Accepts: {@code DotPair}. 144 * <BR />Invokes: {@link #resolveAllSRC(Vector, int, int, URL, SD, boolean)} 145 */ 146 public static Ret3<int[], int[], int[]> resolveAllSRC( 147 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 148 boolean askForReturnArraysOrReturnNull 149 ) 150 { 151 return resolveAllSRC 152 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 153 } 154 155 /** 156 * This method shall resolve all partial {@code URL} addresses that are found within 157 * {@code TagNode} elements having {@code 'SRC=...'} attributes. Each instance of 158 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'SRC'} 159 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 160 * with a new {@code TagNode} with a fully resolved {@code URL}. 161 * 162 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 163 * 164 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 165 * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC> 166 * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC> 167 * 168 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 169 * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved. 170 * 171 * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute 172 * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either 173 * choice would work just fine, without exceptions. 
* 
     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is,
     * the original quotation marks found in the {@code TagNode's 'SRC'} attribute will be
     * reused. Passing null to this parameter should almost always be easiest, safest.
     * 
     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
     * facilitate retrieving more information from this method - <I>if necessary</I>. When this
     * parameter receives the following values:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
     * <B>{@code Returns:}</B> section of this method's documentation.
     * </LI>
     * 
     * <LI><B>FALSE:</B> This method shall return null.</LI>
     * </UL>
     * 
     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed
     * {@code FALSE}, this method shall return null. Otherwise, (if passed {@code TRUE}), then
     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
     * 
     * <BR /><BR />
     * Three arrays are returned as a result of this method's invocation. Keep in mind that
     * though the information might be superfluous, discarding these arrays is easy.
     * They are provided as a matter of convenience for cases where more detailed information is
     * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated.
     * 
     * <BR /><BR /><OL CLASS=JDOL>
     * <LI> {@code Ret3.a (int[])}
     * <BR /><BR />
     * The first {@code int[] array} shall contain a list of the index of every
     * {@code TagNode} in the input-{@code Vector} parameter's range that
     * <B><I>contained</I></B> a non-null HTML {@code 'SRC'} Attribute.
208 * <BR /><BR /> 209 * </LI> 210 * 211 * <LI> {@code Ret3.b (int[])} 212 * <BR /><BR /> 213 * The second {@code int[] array} will contain an index-list of the indices 214 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 215 * internal-resolve logic. 216 * <BR /><BR /> 217 * </LI> 218 * 219 * <LI> {@code Ret3.c (int[])} 220 * <BR /><BR /> 221 * The third {@code int[] array} will contain an index-list of the indices 222 * which contained {@code TagNode's} whose {@code 'SRC=...'} attribute 223 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 224 * {@code QuotesException} to throw. 225 * </LI> 226 * </OL> 227 * 228 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 229 * 230 * @see #resolve(String, URL) 231 * @see TagNode#AV(String) 232 * @see TagNode#setAV(String, String, SD) 233 */ 234 public static Ret3<int[], int[], int[]> resolveAllSRC( 235 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 236 boolean askForReturnArraysOrReturnNull 237 ) 238 { 239 // Retrieve the Vector-location of any TagNode on the page that has 240 // a "SRC=..." attribute. These are almost always HTML <IMG> elements. 241 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 242 // The @SuppressWarnings is to overcome the cast of 'html' 243 244 @SuppressWarnings("unchecked") 245 int[] hasSrcPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "src"); 246 247 // Java Stream's are convenient for keeping "Growing Lists" of return values. 248 // This builder shall keep a list of all URL's that failed to update - for any reason 249 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 250 251 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 252 ? 
IntStream.builder() 253 : null; 254 255 // This stream will keep a list of all URL's that were updated, and whose TagNode's 256 // were replaced inside the input HTML Vector 257 258 IntStream.Builder replaced = askForReturnArraysOrReturnNull 259 ? IntStream.builder() 260 : null; 261 262 for (int pos : hasSrcPosArr) 263 { 264 // Get the node at the index 265 TagNode tn = (TagNode) html.elementAt(pos); 266 267 // 1) Retrieve the SRC Attribute 268 // 2) if it is a partial-URL resolve it 269 // 3) Convert to a String 270 271 String oldURL = tn.AV("src"); 272 URL newURL = resolve(oldURL, sourcePage); 273 274 // Some URL's cannot be resolved, if so, just skip this TagNode. 275 // Log the index to the stream (if requested), and continue. 276 277 if (newURL == null) 278 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 279 280 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 281 // No logging needed here, the URL was *already* resolved... 282 283 if (oldURL.length() == newURL.toString().length()) continue; 284 285 // Replace the SRC Attribute in the TagNode. This builds a new instance of TagNode 286 // If there is an exception, log the index to the stream (if requested), and continue. 287 288 try 289 { tn = tn.setAV("src", newURL.toString(), quote); } 290 291 catch (QuotesException qex) 292 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 293 294 // Replace the index in the Vector containing the old TagNode with the new one. 295 html.setElementAt(tn , pos); 296 297 // The Vector-Index at this position had it's old TagNode removed and replaced with a 298 // new updated one. Log this to the stream-list so to allow the user to know. 299 300 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 301 } 302 303 return askForReturnArraysOrReturnNull 304 305 ? 
new Ret3<int[], int[], int[]> 306 (hasSrcPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 307 : null; 308 } 309 310 311 // ******************************************************************************************** 312 // ******************************************************************************************** 313 // Complete Vector-Resolve Methods - HREF-ATTRIBUTE 314 // ******************************************************************************************** 315 // ******************************************************************************************** 316 317 318 /** 319 * Convenience Method. 320 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 321 */ 322 public static Ret3<int[], int[], int[]> resolveAllHREF( 323 Vector<? super TagNode> html, URL sourcePage, SD quote, 324 boolean askForReturnArraysOrReturnNull 325 ) 326 { return resolveAllHREF(html, 0, -1, sourcePage, quote, askForReturnArraysOrReturnNull); } 327 328 /** 329 * Convenience Method. 330 * <BR />Accepts: {@code DotPair}. 331 * <BR />Invokes: {@link #resolveAllHREF(Vector, int, int, URL, SD, boolean)} 332 */ 333 public static Ret3<int[], int[], int[]> resolveAllHREF( 334 Vector<? super TagNode> html, DotPair dp, URL sourcePage, SD quote, 335 boolean askForReturnArraysOrReturnNull 336 ) 337 { 338 return resolveAllHREF 339 (html, dp.start, dp.end + 1, sourcePage, quote, askForReturnArraysOrReturnNull); 340 } 341 342 /** 343 * This method shall resolve all partial {@code URL} addresses that are found within 344 * {@code TagNode} elements having {@code 'HREF=...'} attributes. Each instance of 345 * {@code TagNode} found in the input HTML {@code Vector} that has an {@code 'HREF'} 346 * attribute - if the {@code 'URL'} is only partially resolve - shall be updated and replaced 347 * with a new {@code TagNode} with a fully resolved {@code URL}. 
* 
     * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE>
     * 
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP>
     * @param sPos <EMBED CLASS='external-html' DATA-FILE-ID=SPOSVEC>
     * @param ePos <EMBED CLASS='external-html' DATA-FILE-ID=EPOSVEC>
     * 
     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
     * (possibly-relative) {@code URL's} in the HTML-{@code Vector} will be resolved.
     * 
     * @param quote A choice for the quotes to use. In most cases, {@code URL} attribute
     * <B STYLE="color: red;">values</B> do not contain quotation-marks. So likely either
     * choice would work just fine, without exceptions.
     * 
     * <BR /><BR /><B>NOTE:</B> <I>null may be passed to this parameter</I>, and if it is,
     * the original quotation marks found in the {@code TagNode's 'HREF'} attribute will be
     * reused. Passing null to this parameter should almost always be easiest, safest.
     * 
     * @param askForReturnArraysOrReturnNull This (long-named) parameter is merely here to
     * facilitate retrieving more information from this method - <I>if necessary</I>. When this
     * parameter receives the following values:
     * 
     * <BR /><BR /><UL CLASS=JDUL>
     * <LI> <B>TRUE:</B> Three integer {@code int[]} arrays will be returned as listed in the
     * <B>{@code Returns:}</B> section of this method's documentation.
     * </LI>
     * 
     * <LI><B>FALSE:</B> This method shall return null. </LI>
     * </UL>
     * 
     * @return If input parameter {@code 'askForReturnArraysOrReturnNull'} has been passed
     * {@code FALSE}, this method shall return null. Otherwise, (if passed {@code TRUE}), then
     * this method shall return an instance of {@code 'Ret3<int[], int[], int[]>'} - which is
     * <I>returning three separate integer-arrays about what was found, and what has occurred.</I>
     * 
     * <BR /><BR />
     * Three arrays are returned as a result of this method's invocation.
Keep in mind that 385 * though the information might be superfluous, rejecting these arrays away is easy. 386 * They are provided as a matter of convenience for cases where more details information is 387 * mandatory for ensuring that long lists of {@code HTMLNode's} were properly updated. 388 * 389 * <BR /><BR /><OL CLASS=JDOL> 390 * <LI> {@code Ret3.a (int[])} 391 * <BR /><BR /> 392 * The first {@code int[] array} shall contain a list of the index of every 393 * {@code TagNode} in the input-{@code Vector} parameter's range that <B><I>contained</B> 394 * </I> a non-null HTML {@code 'HREF'} Attribute. 395 * <BR /><BR /> 396 * </LI> 397 * 398 * <LI> {@code Ret3.b (int[])} 399 * <BR /><BR /> 400 * The second {@code int[] array} will contain an index-list of the indices 401 * which contained {@code TagNode's} that were <B><I>replaced</I></B> by the 402 * internal-resolve logic. 403 * <BR /><BR /> 404 * </LI> 405 * 406 * <LI> {@code Ret3.c (int[])} 407 * <BR /><BR /> 408 * The third {@code int[] array} will contain an index-list of the indices 409 * which contained {@code TagNode's} whose {@code 'HREF=...'} attribute 410 * <I><B>failed</I></B> to be resolved by the internal-resolve logic, <I>or</I> caused a 411 * {@code QuotesException} to throw. 412 * </LI> 413 * </OL> 414 * 415 * @throws IndexOutOfBoundsException <EMBED CLASS='external-html' DATA-FILE-ID=VIOOBEX> 416 * 417 * @see #resolve(String, URL) 418 * @see TagNode#AV(String) 419 * @see TagNode#setAV(String, String, SD) 420 */ 421 public static Ret3<int[], int[], int[]> resolveAllHREF( 422 Vector<? super TagNode> html, int sPos, int ePos, URL sourcePage, SD quote, 423 boolean askForReturnArraysOrReturnNull 424 ) 425 { 426 // Retrieve the Vector-location of any TagNode on the page that has 427 // a "HREF=..." attribute. These are almost always HTML <IMG> elements. 428 // NOTE: FIND Method's are "READ ONLY" - the Cast will make no difference at run-time. 
429 // The @SuppressWarnings is to overcome the cast of 'html' 430 431 @SuppressWarnings("unchecked") 432 int[] hasHRefPosArr = InnerTagFind.all((Vector<HTMLNode>) html, sPos, ePos, "href"); 433 434 // Java Stream's are convenient for keeping "Growing Lists" of return values. 435 // This builder shall keep a list of all URL's that failed to update - for any reason 436 // **UNLESS** the reason is that the URL was already a fully-resolved, non-partial URL 437 438 IntStream.Builder failedUpdate = askForReturnArraysOrReturnNull 439 ? IntStream.builder() 440 : null; 441 442 // This stream will keep a list of all URL's that were updated, and whose TagNode's 443 // were replaced inside the input HTML Vector 444 445 IntStream.Builder replaced = askForReturnArraysOrReturnNull 446 ? IntStream.builder() 447 : null; 448 449 for (int pos : hasHRefPosArr) 450 { 451 // Get the node at the index 452 TagNode tn = (TagNode) html.elementAt(pos); 453 454 // 1) Retrieve the HREF Attribute 455 // 2) if it is a partial-URL resolve it 456 // 3) Convert to a String 457 458 String oldURL = tn.AV("HREF"); 459 URL newURL = resolve(oldURL, sourcePage); 460 461 // Some URL's cannot be resolved, if so, just skip this TagNode. 462 // Log the index to the stream (if requested), and continue. 463 464 if (newURL == null) 465 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 466 467 // If the URL was already a fully-resolved-URL, continue - don't replace the TagNode; 468 // No logging needed here, the URL was *already* resolved... 469 470 if (oldURL.length() == newURL.toString().length()) continue; 471 472 // Replace the HREF Attribute in the TagNode. This builds a new instance of TagNode 473 // If there is an exception, log the index to the stream (if requested), and continue. 
474 475 try 476 { tn = tn.setAV("href", newURL.toString(), quote); } 477 478 catch (QuotesException qex) 479 { if (askForReturnArraysOrReturnNull) failedUpdate.accept(pos); continue; } 480 481 // Replace the index in the Vector containing the old TagNode with the new one. 482 html.setElementAt(tn , pos); 483 484 // The Vector-Index at this position had it's old TagNode removed and replaced with a 485 // new updated one. Log this to the stream-list so to allow the user to know. 486 487 if (askForReturnArraysOrReturnNull) replaced.accept(pos); 488 } 489 490 return askForReturnArraysOrReturnNull 491 492 ? new Ret3<int[], int[], int[]> 493 (hasHRefPosArr, replaced.build().toArray(), failedUpdate.build().toArray()) 494 : null; 495 } 496 497 498 // ******************************************************************************************** 499 // ******************************************************************************************** 500 // Resolve, Not Keep Exceptions 501 // ******************************************************************************************** 502 // ******************************************************************************************** 503 504 505 /** 506 * Convenience Method. 507 * <BR />Invokes: {@link #resolveHREF(TagNode, URL)}. 508 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 509 */ 510 public static TagNode resolveHREFAndUpdate(TagNode tnWithHREF, URL sourcePage) 511 { 512 URL url = resolveHREF(tnWithHREF, sourcePage); 513 514 return (url == null) 515 ? null 516 : tnWithHREF.setAV("href", url.toString(), null); 517 } 518 519 520 /** 521 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 522 * (attribute). 523 * 524 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 525 * 526 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 527 * (possibly-relative) {@code URL} will be resolved. 
528 * 529 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 530 * directory. Null is returned if attempting to build the {@code URL} generated a 531 * {@code MalformedURLException}. 532 * 533 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 534 * {@code MalformedURLException's}. 535 * 536 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 537 * not actually contain an {@code HREF} attribute, then this exception shall throw. 538 * 539 * @see #resolve(String, URL) 540 * @see TagNode#AV(String) 541 */ 542 public static URL resolveHREF(TagNode tnWithHREF, URL sourcePage) 543 { 544 String href = tnWithHREF.AV("href"); 545 546 if (href == null) throw new HREFException( 547 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 548 "HREF attribute." 549 ); 550 551 return resolve(href, sourcePage); 552 } 553 554 555 /** 556 * Convenience Method. 557 * <BR />Invokes: {@link #resolveSRC(TagNode, URL)} 558 * <BR />And-Then: {@link TagNode#setAV(String, String, SD)} 559 */ 560 public static TagNode resolveSRCAndUpdate(TagNode tnWithSRC, URL sourcePage) 561 { 562 URL url = resolveSRC(tnWithSRC, sourcePage); 563 564 return (url == null) 565 ? null 566 : tnWithSRC.setAV("src", url.toString(), null); 567 } 568 569 570 /** 571 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 572 * (attribute). 573 * 574 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 575 * 576 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode} 577 * (possibly-relative) {@code URL} will be resolved. 578 * 579 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 580 * directory. Null is returned if attempting to build the {@code URL} generated a 581 * {@code MalformedURLException}. 
582 * 583 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 584 * {@code MalformedURLException's}. 585 * 586 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 587 * actually contain a {@code SRC} attribute, then this exception shall throw. 588 * 589 * @see #resolve(String, URL) 590 * @see TagNode#AV(String) 591 */ 592 public static URL resolveSRC(TagNode tnWithSRC, URL sourcePage) 593 { 594 String src = tnWithSRC.AV("src"); 595 596 if (src == null) throw new SRCException( 597 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 598 "SRC attribute." 599 ); 600 601 return resolve(src, sourcePage); 602 } 603 604 /** 605 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 606 * inner-tag (attribute). 607 * 608 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 609 * 610 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 611 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 612 * 613 * @return A list of {@code URL's}, each of which have been completed/resolved with the 614 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 615 * result in a null value in the {@code Vector}. 616 * 617 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 618 * 619 * @see #resolve(String, URL) 620 * @see TagNode#AV(String) 621 */ 622 public static Vector<URL> resolveHREFs(Iterable<TagNode> tnListWithHREF, URL sourcePage) 623 { 624 Vector<URL> ret = new Vector<>(); 625 626 for (TagNode tn : tnListWithHREF) ret.addElement(resolve(tn.AV("href"), sourcePage)); 627 628 return ret; 629 } 630 631 632 /** 633 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 634 * inner-tag (attribute). 
635 * 636 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 637 * 638 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 639 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 640 * 641 * @return A list of {@code URL's}, each of which have been completed/resolved with the 642 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 643 * result in a null value in the {@code Vector.} 644 * 645 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 646 * 647 * @see #resolve(String, URL) 648 * @see TagNode#AV(String) 649 */ 650 public static Vector<URL> resolveSRCs(Iterable<TagNode> tnListWithSRC, URL sourcePage) 651 { 652 Vector<URL> ret = new Vector<>(); 653 654 for (TagNode tn : tnListWithSRC) ret.addElement(resolve(tn.AV("src"), sourcePage)); 655 656 return ret; 657 } 658 659 660 /** 661 * This will use a "pointer array" - an array containing indexes into the downloaded page to 662 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - 663 * must each contain an {@code HREF} inner-tag with a {@code URL}, or a partial {@code URL}. 664 * 665 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 666 * 667 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> 668 * 669 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 670 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 671 * are usually returned from the {@code package 'NodeSearch'} "Find" methods. 672 * 673 * <DIV CLASS="EXAMPLE">{@code 674 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 675 * // integer-indices into the vectorized-html variable 'page' 676 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 677 * 678 * // Extract each HREF inner-tag, and construct a {@code URL}. 
Use the 'sourcePage' parameter 679 * // if the URL is only partially-resolved 680 * Vector<URL> urls = Links.resolveHREFs(page, anchorPosArr, mySourcePage); 681 * }</DIV> 682 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 683 * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 684 * {@code 'html'}, and then resolve any shortened {@code URL's}. 685 * 686 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 687 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 688 * 689 * @return A list of {@code URL's}, each of which have been completed/resolved with the 690 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 691 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 692 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 693 * this mistake shall generate {@code TagNodeExpectedException's}. 694 * 695 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 696 * 697 * @throws ArrayIndexOutOfBoundsException 698 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 699 * @throws OpeningTagNodeExpectedException 700 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 701 * 702 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 703 * 704 * @see #resolve(String, URL) 705 * @see TagNode#AV(String) 706 */ 707 public static Vector<URL> resolveHREFs 708 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 709 { 710 // Return Vector 711 Vector<URL> ret = new Vector<>(); 712 713 for (int nodePos : nodePosArr) 714 { 715 HTMLNode n = html.elementAt(nodePos); 716 717 // Must be an HTML TagNode 718 if (! 
n.isTagNode()) throw new TagNodeExpectedException(nodePos); 719 720 TagNode tn = (TagNode) n; 721 722 // Must be an "Opening" HTML TagNode 723 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 724 725 // Resolve the 'HREF', save the URL 726 ret.addElement(resolve(tn.AV("href"), sourcePage)); 727 } 728 729 return ret; 730 } 731 732 733 /** 734 * This will use a "pointer array" - an array containing indexes into the downloaded page to 735 * retrieve {@code TagNode's}. The {@code TagNode's} to which this pointer-array points - must 736 * each contain a {@code SRC} inner-tag with a {@code URL}, or a partial {@code URL}. 737 * 738 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 739 * 740 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 741 * 742 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 743 * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are 744 * usually returned from the {@code package 'NodeSearch'} "Find" methods. 745 * 746 * <DIV CLASS="EXAMPLE">{@code 747 * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to 748 * // integer-indices into the vectorized-html variable 'page' 749 * 750 * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img"); 751 * 752 * // Extract each SRC inner-tag, and construct a {@code URL}. Use the 'sourcePage' parameter 753 * // if the URL is only partially-resolved 754 * 755 * Vector<URL> urls = Links.resolveSRCs(page, picturePosArr, mySourcePage); 756 * }</DIV> 757 * 758 * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML 759 * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter 760 * {@code 'html'}, and then resolve any shorted image {@code URL's}. 
761 * 762 * @param sourcePage This is the source page {@code URL} from whence the (possibly relative) 763 * {@code TagNode URL's} in the {@code Vector} are to be resolved. 764 * 765 * @return A list of {@code URL's}, each of which have been completed/resolved with the 766 * {@code 'sourcePage'} parameter. Any {@code TagNode} which generated an exception, will 767 * result in a null value in the {@code Vector}. However, if any of the nodes pointed to by 768 * the {@code 'nodePosArr'} parameter do not contain opening {@code TagNode} elements, then 769 * this mistake shall generate {@code TagNodeExpectedException's}. 770 * 771 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 772 * 773 * @throws ArrayIndexOutOfBoundsException 774 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 775 * @throws OpeningTagNodeExpectedException 776 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 777 * 778 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 779 * 780 * @see #resolve(String, URL) 781 * @see TagNode#AV(String) 782 */ 783 public static Vector<URL> resolveSRCs 784 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 785 { 786 // Return Vector 787 Vector<URL> ret = new Vector<>(); 788 789 for (int nodePos : nodePosArr) 790 { 791 HTMLNode n = html.elementAt(nodePos); 792 793 // Must be an HTML TagNode 794 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 795 796 TagNode tn = (TagNode) n; 797 798 // Must be an "Opening" HTML TagNode 799 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 800 801 // Resolve the "SRC", save the URL 802 ret.addElement(resolve(tn.AV("src"), sourcePage)); 803 } 804 805 return ret; 806 } 807 808 809 /** 810 * This will convert <I><B>a list of </B></I> simple java {@code String's} to a 811 * list/{@code Vector} of {@code URL's}, de-referencing any missing information using the 812 * {@code 'sourcePage'} parameter. 
813 * 814 * @param src a list of strings - usually partially or totally completed Internet {@code URL's} 815 * 816 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 817 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 818 * 819 * @return A list of {@code URL's}, each of which have been completed/resolved with the 820 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 821 * null, then null is returned in the related {@code Vector} position. If any 822 * {@code TagNode} causes a {@code MalformedURLException}, then that position in the 823 * {@code Vector} will be null. 824 * 825 * @see #resolve(String, URL) 826 */ 827 public static Vector<URL> resolve(Vector<String> src, URL sourcePage) 828 { 829 Vector<URL> ret = new Vector<>(); 830 831 for (String s : src) ret.addElement(resolve(s, sourcePage)); 832 833 return ret; 834 } 835 836 /** 837 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 838 * information using the {@code 'sourcePage'} parameter. 839 * 840 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 841 * needs to be "completed." 842 * 843 * @param sourcePage This is the source page {@code URL} from which the String 844 * (possibly-relative) {@code URL} will be resolved. 845 * 846 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 847 * parameter. If parameter {@code 'src'} is null or zero-length, then this method will also 848 * return null. If a {@code MalformedURLException} is generated, null will also be returned. 849 */ 850 public static URL resolve(String src, URL sourcePage) 851 { 852 if (sourcePage == null) throw new NullPointerException( 853 "Though you may provide null to the partial-URL to dereference parameter, null " + 854 "may not be passed to the Source-Page Parameter. 
The purpose of the 'resolve' " + 855 "operation is to resolve partial-URLs against a source-page (root) URL. " + 856 "Therefore this is not allowed." 857 ); 858 859 if (src == null) return null; 860 861 src = src.trim(); 862 863 if (src.length() == 0) return null; 864 865 String srcLC = src.toLowerCase(); 866 867 if (StrCmpr.startsWithXOR(srcLC, _NON_URL_HREFS)) return null; 868 869 if (srcLC.startsWith("http://") || srcLC.startsWith("https://")) 870 871 try 872 { return new URL(src); } 873 874 catch (MalformedURLException e) { return null; } 875 876 if (src.startsWith("//") && (src.charAt(3) != '/')) 877 878 try 879 { return new URL(sourcePage.getProtocol().toLowerCase() + ":" + src); } 880 881 catch (MalformedURLException e) { return null; } 882 883 if (src.startsWith("/")) 884 885 try 886 { 887 return new URL( 888 sourcePage.getProtocol().toLowerCase() + "://" + 889 sourcePage.getHost().toLowerCase() + 890 src 891 ); 892 } 893 894 catch (MalformedURLException e) { return null; } 895 896 if (src.startsWith("../")) 897 { 898 String sourcePageStr = sourcePage.toString(); 899 short nLevels = 0; 900 901 do { nLevels++; src = src.substring(3); } 902 while (src.startsWith("../")); 903 904 String directory = StringParse.dotDotParentDirectory(sourcePage.toString(), nLevels); 905 906 try { return new URL(directory + src); } 907 catch (Exception e) { return null; } 908 } 909 910 String root = 911 sourcePage.getProtocol().toLowerCase() + "://" + 912 sourcePage.getHost().toLowerCase(); 913 914 String path = sourcePage.getPath().trim(); 915 int pos = StringParse.findLastFrontSlashPos(path); 916 917 if (pos == -1) throw new StringIndexOutOfBoundsException( 918 "The URL you have provided: " + sourcePage.toString() + " does not have a '/' " + 919 "front-slash character in it's path. Cannot proceed resolving relative-URL's " + 920 "without this." 
921 ); 922 923 path = path.substring(0, pos + 1); 924 925 try { return new URL(root + path + src); } 926 catch (MalformedURLException e) { return null; } 927 } 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 // ******************************************************************************************** 950 // ******************************************************************************************** 951 // Resolve, KE - Keep Exceptions 952 // ******************************************************************************************** 953 // ******************************************************************************************** 954 955 956 /** 957 * This should be used for {@code TagNode's} that contain an {@code 'HREF'} inner-tag 958 * (attribute). 959 * 960 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 961 * 962 * @param tnWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_HREF> 963 * 964 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 965 * (possibly-relative) {@code URL} will be resolved. 966 * 967 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 968 * directory. If there were no {@code HREF} tag, then null is returned. If 969 * the {@code TagNode} causes a {@code MalformedURLException}, that is returned in 970 * {@code Ret2.b} 971 * 972 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 973 * {@code MalformedURLException's}. 974 * 975 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 976 * 977 * @throws HREFException If the {@code TagNode} passed to parameter {@code 'tnWithHREF'} does 978 * not actually contain an {@code HREF} attribute, then this exception shall throw. 
979 * 980 * @see #resolve_KE(String, URL) 981 * @see TagNode#AV(String) 982 * @see Ret2 983 */ 984 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 985 public static Ret2<URL, MalformedURLException> resolveHREF_KE 986 (TagNode tnWithHREF, URL sourcePage) 987 { 988 String href = tnWithHREF.AV("href"); 989 990 if (href == null) throw new HREFException( 991 "The TagNode passed to parameter tnWithHREF does not actually contain an " + 992 "HREF attribute." 993 ); 994 995 return LinksResolve_KE.resolve(href, sourcePage); 996 } 997 998 999 /** 1000 * This should be used for {@code TagNode's} that contain a {@code 'SRC'} inner-tag 1001 * (attribute). 1002 * 1003 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1004 * 1005 * @param tnWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TN_SRC> 1006 * 1007 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1008 * (possibly-relative) {@code URL} will be resolved. 1009 * 1010 * @return A complete-{@code URL} without any missing "presumed data" - such as host/domain or 1011 * directory. If there were no {@code SRC} tag, then null is returned. If the 1012 * {@code TagNode} causes a {@code MalformedURLException}, that is returned in {@code Ret2.b} 1013 * 1014 * <BR /><BR /><B STYLE="color: red;">SPECIFICALLY:</B> This method shall catch all 1015 * {@code MalformedURLException's}. 1016 * 1017 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1018 * 1019 * @throws SRCException If the {@code TagNode} passed to parameter {@code 'tnWithSRC'} does not 1020 * actually contain a {@code SRC} attribute, then this exception shall throw. 
1021 * 1022 * @see #resolve_KE(String, URL) 1023 * @see TagNode#AV(String) 1024 * @see Ret2 1025 */ 1026 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1027 public static Ret2<URL, MalformedURLException> resolveSRC_KE 1028 (TagNode tnWithSRC, URL sourcePage) 1029 { 1030 String src = tnWithSRC.AV("src"); 1031 1032 if (src == null) throw new SRCException( 1033 "The TagNode passed to parameter tnWithSRC does not actually contain a " + 1034 "SRC attribute." 1035 ); 1036 1037 return LinksResolve_KE.resolve(src, sourcePage); 1038 } 1039 1040 1041 /** 1042 * This should be used for lists of {@code TagNode's}, each of which contain an {@code 'HREF'} 1043 * inner-tag (attribute). 1044 * 1045 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1046 * 1047 * @param tnListWithHREF <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_HREF> 1048 * 1049 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1050 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1051 * 1052 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1053 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag, 1054 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1055 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1056 * exception in {@code Ret2.b} 1057 * 1058 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1059 * 1060 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1061 * 1062 * @see #resolve_KE(String, URL) 1063 * @see TagNode#AV(String) 1064 * @see Ret2 1065 */ 1066 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1067 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1068 (Iterable<TagNode> tnListWithHREF, URL sourcePage) 1069 { 1070 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1071 1072 for (TagNode tn : tnListWithHREF) 1073 ret.addElement(LinksResolve_KE.resolve(tn.AV("href"), sourcePage)); 1074 1075 return ret; 1076 } 1077 1078 1079 /** 1080 * This should be used for lists of {@code TagNode's}, each of which contain a {@code 'SRC'} 1081 * inner-tag (attribute). 1082 * 1083 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1084 * 1085 * @param tnListWithSRC <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_TNLIST_SRC> 1086 * 1087 * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's} 1088 * (possibly-relative) {@code URL's} in the {@code Iterable} will be resolved. 1089 * 1090 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1091 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1092 * then null is returned in the related {@code Vector} position. 
If any {@code TagNode} causes 1093 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1094 * exception in {@code Ret2.b} 1095 * 1096 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1097 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1098 * 1099 * @see #resolve_KE(String, URL) 1100 * @see TagNode#AV(String) 1101 * @see Ret2 1102 */ 1103 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1104 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1105 (Iterable<TagNode> tnListWithSRC, URL sourcePage) 1106 { 1107 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1108 1109 for (TagNode tn : tnListWithSRC) 1110 ret.addElement(LinksResolve_KE.resolve(tn.AV("src"), sourcePage)); 1111 1112 return ret; 1113 } 1114 1115 1116 /** 1117 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1118 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1119 * contain {@code HREF} inner-tags with {@code URL's}, or partial {@code URL's}. 1120 * 1121 * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE> 1122 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1123 * 1124 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page) 1125 * 1126 * @param nodePosArr An array of pointers into the page or sub-page. The pointers must 1127 * reference {@code TagNode's} that contain {@code HREF} attributes. Integer-pointer Arrays 1128 * are usually return from the {@code package 'NodeSearch'} "Find" methods. 1129 * 1130 * <DIV CLASS="EXAMPLE">{@code 1131 * // Retrieve 'pointers' to all the '<A HREF=...>' TagNode's. The term 'pointer' refers to 1132 * // integer-indices into the vectorized-html variable 'page' 1133 * 1134 * int[] anchorPosArr = TagNodeFind.all(page, TC.OpeningTags, "a"); 1135 * 1136 * // Extract each HREF inner-tag, and construct a URL. 
Use the 'sourcePage' parameter if
     * // the URL is only partially-resolved. If any URL's on the original-page are invalid, the
     * // method shall not crash, but save the exception instead.
     *
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx =
     *      Links.resolveHREFs_KE(page, anchorPosArr, mySourcePage);
     *
     * // Print out any "failed" urls
     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
     *      if (r.b != null)
     *          System.out.println("There was an exception: " + r.b.toString());
     * }</DIV>
     *
     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
     * {@code "<A ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened {@code URL's}.
     *
     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
     *
     * @return A list of {@code URL's}, each of which have been completed/resolved with the
     * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code HREF} tag,
     * then null is returned in the related {@code Vector} position.
If any {@code TagNode} causes 1159 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1160 * exception in {@code Ret2.b} 1161 * 1162 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_HREF> 1163 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1164 * 1165 * @throws ArrayIndexOutOfBoundsException 1166 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1167 * @throws OpeningTagNodeExpectedException 1168 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1169 * 1170 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1171 * 1172 * @see #resolve_KE(String, URL) 1173 * @see TagNode#AV(String) 1174 * @see Ret2 1175 */ 1176 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1177 public static Vector<Ret2<URL, MalformedURLException>> resolveHREFs_KE 1178 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1179 { 1180 // Return Vector 1181 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1182 1183 for (int nodePos : nodePosArr) 1184 { 1185 HTMLNode n = html.elementAt(nodePos); 1186 1187 // Must be an HTML TagNode 1188 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1189 1190 TagNode tn = (TagNode) n; 1191 1192 // Must be an "Opening" HTML TagNode 1193 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1194 1195 // Resolve the "HREF", keep the URL 1196 ret.addElement(LinksResolve_KE.resolve(tn.AV("href"), sourcePage)); 1197 } 1198 1199 return ret; 1200 } 1201 1202 /** 1203 * This will use a "pointer array" - an array containing indexes into the downloaded page to 1204 * retrieve {@code TagNode's}. The {@code TagNode} to which this pointer-array points - must 1205 * contain {@code SRC} inner-tags with {@code URL's}, or partial {@code URL's}. 
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=BASE_URL_NOTE>
     * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE>
     *
     * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVEC> Any HTML page (or sub-page)
     *
     * @param nodePosArr An array of pointers into the page or sub-page. The pointers must
     * reference {@code TagNode's} that contain {@code SRC} attributes. Integer-pointer Arrays are
     * usually returned from the {@code package 'NodeSearch'} "Find" methods.
     *
     * <DIV CLASS="EXAMPLE">{@code
     * // Retrieve 'pointers' to all the '<IMG SRC=...>' TagNode's. The term 'pointer' refers to
     * // integer-indices into the vectorized-html variable 'page'
     *
     * int[] picturePosArr = TagNodeFind.all(page, TC.OpeningTags, "img");
     *
     * // Extract each SRC inner-tag, and construct a URL. Use the 'sourcePage' parameter if
     * // the URL is only partially-resolved. If any URL's on the original-page are invalid,
     * // the method shall not crash, but save the exception instead.
     *
     * Vector<Ret2<URL, MalformedURLException>> urlsWithEx =
     *      Links.resolveSRCs_KE(page, picturePosArr, mySourcePage);
     *
     * // Print out any "failed" urls
     * for (Ret2<URL, MalformedURLException> r : urlsWithEx)
     *      if (r.b != null)
     *          System.out.println("There was an exception: " + r.b.toString());
     * }</DIV>
     *
     * <BR /><I>which would obtain a pointer-array / (a.k.a. a "vector-index-array") to every HTML
     * {@code "<IMG ...>"} element</I> that was available in the HTML page-{@code Vector} parameter
     * {@code 'html'}, and then resolve any shortened {@code URL's}.
     *
     * @param sourcePage This is the source page {@code URL} from which the {@code TagNode's}
     * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved.
1241 * 1242 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1243 * {@code 'sourcePage'} parameter. If there were any {@code TagNode} with no {@code SRC} tag, 1244 * then null is returned in the related {@code Vector} position. If any {@code TagNode} causes 1245 * a {@code MalformedURLException}, then that position in the {@code Vector} will contain the 1246 * exception in {@code Ret2.b} 1247 * 1248 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_NO_SRC> 1249 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1250 * 1251 * @throws ArrayIndexOutOfBoundsException 1252 * <EMBED CLASS='external-html' DATA-FILE-ID=ATTR_AIOOB_EX> 1253 * @throws OpeningTagNodeExpectedException 1254 * <EMBED CLASS='external-html' DATA-FILE-ID=OPEN_TNE_EX> 1255 * 1256 * @throws TagNodeExpectedException <EMBED CLASS='external-html' DATA-FILE-ID=TNE_EX> 1257 * 1258 * @see #resolve_KE(String, URL) 1259 * @see TagNode#AV(String) 1260 * @see Ret2 1261 */ 1262 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1263 public static Vector<Ret2<URL, MalformedURLException>> resolveSRCs_KE 1264 (Vector<? extends HTMLNode> html, int[] nodePosArr, URL sourcePage) 1265 { 1266 // Return Vector 1267 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1268 1269 for (int nodePos : nodePosArr) 1270 { 1271 HTMLNode n = html.elementAt(nodePos); 1272 1273 // Must be an HTML TagNode 1274 if (! n.isTagNode()) throw new TagNodeExpectedException(nodePos); 1275 1276 TagNode tn = (TagNode) n; 1277 1278 // Must be an "Opening" HTML TagNode 1279 if (tn.isClosing) throw new OpeningTagNodeExpectedException(nodePos); 1280 1281 // Resolve "SRC" and keep URL's 1282 ret.addElement(LinksResolve_KE.resolve(tn.AV("src"), sourcePage)); 1283 } 1284 1285 return ret; 1286 } 1287 1288 /** 1289 * Resolve all {@code URL's}, represented as {@code String's}, inside of a {@code Vector}. 
1290 * 1291 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1292 * 1293 * @param src a list of {@code String's} - usually partially or totally completed Internet 1294 * {@code URL's} 1295 * 1296 * @param sourcePage This is the source page {@code URL} from which the {@code String's} 1297 * (possibly-relative) {@code URL's} in the {@code Vector} will be resolved. 1298 * 1299 * @return A list of {@code URL's}, each of which have been completed/resolved with the 1300 * {@code 'sourcePage'} parameter. If there were any {@code String's} that were zero-length or 1301 * null, then null is returned in the related {@code Vector} position. If any {@code TagNode} 1302 * causes a {@code MalformedURLException}, then that position in the {@code Vector} will 1303 * contain the exception in {@code Ret2.b} 1304 * 1305 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2> 1306 * 1307 * @see #resolve_KE(String, URL) 1308 * @see Ret2 1309 */ 1310 @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve") 1311 public static Vector<Ret2<URL, MalformedURLException>> resolve_KE 1312 (Vector<String> src, URL sourcePage) 1313 { 1314 Vector<Ret2<URL, MalformedURLException>> ret = new Vector<>(); 1315 1316 for (String s : src) 1317 ret.addElement(LinksResolve_KE.resolve(s, sourcePage)); 1318 1319 return ret; 1320 } 1321 1322 /** 1323 * This will convert a simple java {@code String} to a {@code URL}, de-referencing any missing 1324 * information using the {@code 'sourcePage'} parameter. 1325 * 1326 * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_KE> 1327 * 1328 * @param src Any java {@code String}, usually one which was scraped from an HTML-Page, and 1329 * needs to be "completed." 1330 * 1331 * @param sourcePage This is the source page {@code URL} from which the String (possibly 1332 * relative) {@code URL} will be resolved. 1333 * 1334 * @return A {@code URL}, which has been completed/resolved with the {@code 'sourcePage'} 1335 * parameter. 
If parameter {@code 'src'} is null or zero-length, null will be returned. If a
     * {@code MalformedURLException} is thrown, that will be included with the {@code Ret2<>}
     * result.
     *
     * <EMBED CLASS='external-html' DATA-FILE-ID=LINKS_RET2>
     *
     * @see Ret2
     */
    @LinkJavaSource(handle="LinksResolve_KE", entity=METHOD, name="resolve")
    public static Ret2<URL, MalformedURLException> resolve_KE(String src, URL sourcePage)
    {
        // Pure delegation: the exception-keeping resolver does all of the work
        final Ret2<URL, MalformedURLException> outcome = LinksResolve_KE.resolve(src, sourcePage);

        return outcome;
    }
}