001package Torello.HTML; 002 003import Torello.HTML.NodeSearch.*; 004import Torello.Java.FileRW; // used in @see comments 005import Torello.Java.StringParse; 006import Torello.Java.Additional.Ret2; 007 008import java.util.*; 009import java.util.stream.IntStream; 010 011/** 012 * Utilities for checking that opening and closing {@link TagNode} elements match up (that the HTML 013 * is balanced). 014 * 015 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE> 016 */ 017@Torello.JavaDoc.StaticFunctional 018public class Balance 019{ 020 private Balance() { } 021 022 /** 023 * Invokes: 024 * 025 * <BR /><BR /><UL CLASS=JDUL> 026 * <LI>{@link #check(Vector)}</LI> 027 * <LI>{@link #checkNonZero(Hashtable)}</LI> 028 * <LI>{@link #toStringBalance(Hashtable)}</LI> 029 * </UL> 030 * 031 * <DIV CLASS=EXAMPLE>{@code 032 * String b = Balance.CB(a.articleBody); 033 * System.out.println((b == null) ? "Page has Balanced HTML" : b); 034 * 035 * // If Page has equal number of open and close tags prints: 036 * // Page Has Balanced HTML 037 * // OTHERWISE PRINTS REPORT 038 * }</DIV> 039 * 040 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 041 * 042 * @return Will return null if the snippet or page has 'balanced' HTML, otherwise returns the 043 * trimmed balance-report as a {@code String}. 044 */ 045 public static String CB(Vector<HTMLNode> html) 046 { 047 String ret = toStringBalance(checkNonZero(check(html))); 048 049 return (ret.length() == 0) ? null : ret; 050 } 051 052 /** 053 * Creates a {@code Hashtable} that has a count of all open and closed HTML tags found on the 054 * page. 055 * 056 * <BR /><BR />This {@code Hashtable} may be regarded as maintaining "counts" on each-and-every 057 * HTML tag to identify whether there is <I><B>a one-to-one balance mapping between opening and 058 * closing tags</I></B> for each element. 
When the count stored in the {@code Hashtable} generated by 059 * this method is non-zero (for a particular HTML-Tag) it means that there are an unequal 060 * number of opening and closing elements for that tag. 061 * 062 * <BR /><BR />Suppose this method were to produce a {@code Hashtable}, and that 063 * {@code Hashtable} were queried for a count on the HTML <B CLASS=JDHTags>{@code <DIV>}</B> tag 064 * (dividers). If that count turned out to be a non-zero positive number it would mean that 065 * the Vectorized-HTML had more opening <B CLASS=JDHTags>{@code <DIV>}</B> tags than the 066 * number of closing <B CLASS=JDHTags>{@code </DIV>}</B> tags on that page. 067 * 068 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 069 * 070 * <BR /><BR />The following example will help explain the use of this method. If an HTML page 071 * needs to be checked to see that all elements are properly opened and closed, this method can 072 * be used to return a list of any HTML element tag that does not have an equal number of 073 * opening and closing tags. 074 * 075 * <BR /><BR />In this example, the generated Java-Doc HTML-Page for class {@code TagNode} is 076 * checked. 077 * 078 * <DIV CLASS="EXAMPLE">{@code 079 * String html = FileRW.loadFileToString(htmlFileName); 080 * Vector<HTMLNode> v = HTMLPage.getPageTokens(html, false); 081 * Hashtable<String, Integer> b = Balance.check(v); 082 * StringBuffer sb = new StringBuffer(); 083 * 084 * // This part just prints a text-output to a string buffer, which is printed to the screen. 085 * for (String key : b.keySet()) 086 * { 087 * Integer i = b.get(key); 088 * 089 * // Only print keys that had a "non-zero count" 090 * // A Non-Zero-Count implies Opening-Tag-Count and Closing-Tag-Count are not equal! 091 * 092 * if (i.intValue() != 0) sb.append(key + "\t" + i.intValue() + "\n"); 093 * } 094 * 095 * // This example output was: "i 1", because of an unclosed italics element (one more opening tag than closing tags). 
096 * // NOTE: To find where this unclosed element is, use method: nonNestedCheck(Vector, String) 097 * }</DIV> 098 * 099 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 100 * 101 * @return A {@code Hashtable} map of the count of each HTML-Tag present in the 102 * input {@code Vector}. 103 * 104 * <BR /><BR />For instance, if this {@code Vector} had five 105 * <B CLASS=JDHTags>{@code <A HREF=...>}</B> (Anchor-Link) tags, and six 106 * <B CLASS=JDHTags>{@code </A>}</B> tags, then the returned {@code Hashtable} would have a 107 * {@code String}-key equal to {@code "A"} with an integer value of {@code -1}. 108 * 109 * @see FileRW#loadFileToString(String) 110 * @see HTMLPage#getPageTokens(CharSequence, boolean) 111 */ 112 public static Hashtable<String, Integer> check(Vector<? super TagNode> html) 113 { 114 Hashtable<String, Integer> ht = new Hashtable<>(); 115 116 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 117 // not HTML Comments 118 119 for (Object o : html) if (o instanceof TagNode) 120 { 121 TagNode tn = (TagNode) o; 122 123 // Singleton tags are also known as 'self-closing' tags. BR, HR, IMG, etc... 124 if (HTMLTags.isSingleton(tn.tok)) continue; 125 126 Integer I = ht.get(tn.tok); 127 int i = (I != null) ? I.intValue() : 0; 128 129 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 130 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 131 132 i += tn.isClosing ? -1 : 1; 133 134 // Update the return result Hashtable for this particular HTML-Element (tn.tok) 135 ht.put(tn.tok, Integer.valueOf(i)); 136 } 137 138 return ht; 139 } 140 141 /** 142 * Creates an array that includes an open-and-close {@code 'count'} for each HTML-Tag / 143 * that was requested via the passed input {@code String[]}-Array parameter {@code 'htmlTags'}. 
144 * 145 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 146 * 147 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 148 * 149 * <BR /><BR />The HTML-Element Open-Close-Counts are computed from this page. 150 * 151 * @param htmlTags This may be one, or many, HTML-Tags whose open-close count needs to be 152 * computed. Any HTML Element that is not present in this list - <I>will not have a count 153 * computed.</I> 154 * 155 * <BR /><BR />The {@code count} results which are stored in an {@code int[]}-Array that should 156 * be considered "parallel" to this input Var-Args-Array. 157 * 158 * @return An array of the count of each html-element present in the input vectorized-html 159 * parameter {@code 'html'}. 160 * For instance, If the following values were passed to this method: 161 * 162 * <BR /><BR /><UL CLASS=JDUL> 163 * <LI> A Vectorized-HTML page that had 5 {@code '<SPAN ...>'} open-elements, and 6 164 * {@code '</SPAN>'} closing {@code SPAN}-Tags. 165 * </LI> 166 * 167 * <LI> And at least one of the {@code String's} in the Var-Args parameter {@code 'htmlTags'} 168 * was equal to the {@code String} {@code "SPAN"} (case insensitive). 169 * </LI> 170 * 171 * <LI> <B>==></B> Then the array-position corresponding to the position in array 172 * {@code 'htmlTags'} that had the {@code "SPAN"} would have a value of {@code '-1'}. 173 * </LI> 174 * </UL> 175 * 176 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 177 * 178 * @throws SingletonException If and of the {@code String}-Tags passed to parameter 179 * {@code 'htmlTags'} are {@code 'singleton'} (Self-Closing) Tags, then this exception throws 180 */ 181 public static int[] check(Vector<? super TagNode> html, String... htmlTags) 182 { 183 // Check that these are all valid HTML Tags, throw an exception if not. 
184 htmlTags = ARGCHECK.htmlTags(htmlTags); 185 186 // Temporary Hash-table, used to store the count of each htmlTag 187 Hashtable<String, Integer> ht = new Hashtable<>(); 188 189 // Initialize the temporary hash-table. This will be discarded at the end of the method, 190 // and converted into a parallel array. (Parallel to the input String... htmlTags array). 191 // Also, check to make sure the user hasn't requested a count of Singleton HTML Elements. 192 193 for (String htmlTag : htmlTags) 194 { 195 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 196 "One of the tags you have passed: [" + htmlTag + "] is a singleton-tag, " + 197 "and is only allowed opening versions of the tag." 198 ); 199 200 ht.put(htmlTag, Integer.valueOf(0)); 201 } 202 203 Integer I; 204 205 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 206 // not HTML Comments 207 for (Object o : html) if (o instanceof TagNode) 208 { 209 TagNode tn = (TagNode) o; 210 211 // Get the current count from the hash-table 212 I = ht.get(tn.tok); 213 214 // The hash-table only holds elements we are counting, if null, then skip. 215 if (I == null) continue; 216 217 // Save the new, computed count, in the hash-table 218 // 219 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 220 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 221 222 ht.put(tn.tok, Integer.valueOf(I.intValue() + (tn.isClosing ? -1 : 1))); 223 } 224 225 // Convert the hash-table to an integer-array, and return this to the user 226 int[] ret = new int[htmlTags.length]; 227 228 for (int i=0; i < ret.length; i++) 229 ret[i] = 0; 230 231 for (int i=0; i < htmlTags.length; i++) 232 if ((I = ht.get(htmlTags[i])) != null) 233 ret[i] = I.intValue(); 234 235 return ret; 236 } 237 238 /** 239 * Creates a {@code Hashtable} that has a count of all open and closed HTML-Tags found on 240 * the page - whose count-value is not equal to zero. 
241 * 242 * <BR /><BR />This method will report when there are unbalanced HTML-Tags on a page, <I><B>and 243 * strictly ignore any & all tags with a count of zero</B></I>. Specifically, if a tag has 244 * a {@code 1-to-1} open-close count, then it will not have any keys avialable in the returned 245 * {@code Hashtable}. 246 * 247 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 248 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_CLONE> <!-- Clone Note --> 249 * 250 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 251 * available {@code check(...)} methods. 252 * 253 * @return A {@code Hashtable} map of the count of each html-element present in this 254 * {@code Vector}. For instance, if this {@code Vector} had 5 {@code '<A ...>'} (Anchor-Link) 255 * elements, and six {@code '</A>'} then this {@code Hashtable} would have a {@code String}-key 256 * {@code 'a'} with an integer value of {@code '-1'}. 257 */ 258 public static Hashtable<String, Integer> checkNonZero(Hashtable<String, Integer> ht) 259 { 260 @SuppressWarnings("unchecked") 261 Hashtable<String, Integer> ret = (Hashtable<String, Integer>) ht.clone(); 262 Enumeration<String> keys = ret.keys(); 263 264 while (keys.hasMoreElements()) 265 { 266 String key = keys.nextElement(); 267 268 // Remove any keys (HTML element-names) that have a normal ('0') count. 269 if (ret.get(key).intValue() == 0) ret.remove(key); 270 } 271 272 return ret; 273 } 274 275 276 /** 277 * This will compute a {@code count} for just one, particular, HTML Element of whether that 278 * Element has been properly opened and closed. An open and close {@code count} (integer 279 * value) will be returned by this method. 
280 * 281 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE1> <!-- Validity Note --> 282 * 283 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 284 * 285 * @param htmlTag This the html element whose open-close count needs to be kept. 286 * 287 * @return The count of each html-element present in this {@code Vector}. For instance, if the 288 * user had requested that HTML Anchor Links be counted, and if the input {@code Vector} had 5 289 * {@code '<A ...>'} (Anchor-Link) elements, and six {@code '</A>'} then this method would 290 * return {@code -1}. 291 * 292 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 293 * 294 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 295 * Tag, this exception will throw. 296 */ 297 public static int checkTag(Vector<? super TagNode> html, String htmlTag) 298 { 299 // Check that this is a valid HTML Tag, throw an exception if invalid 300 htmlTag = ARGCHECK.htmlTag(htmlTag); 301 302 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 303 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 304 "allowed opening versions of the tag." 305 ); 306 307 TagNode tn; int i = 0; 308 309 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 310 // not HTML Comments 311 312 for (Object o : html) if (o instanceof TagNode) 313 314 // If we encounter an HTML Element whose tag is the tag whose count we are 315 // computing, then.... 316 317 if ((tn = (TagNode) o).tok.equals(htmlTag)) 318 319 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 320 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 321 322 i += tn.isClosing ? -1 : 1; 323 324 return i; 325 } 326 327 328 /** 329 * This method will calculate the "Maximum" and "Minimum" depth for every HTML 5.0 Tag found on 330 * a page. 
The Max-Depth is the "Maximum-Number" of HTML Element Opening Tags that were 331 * found for a particular element, before a matching closing version of the same Element is 332 * encountered. In the example below, the maximum "open-count" for the HTML 'divider' Element 333 * ({@code <DIV>}) is {@code '2'}. This is because a second {@code <DIV>} element is opened 334 * before the first is closed. 335 * 336 * <DIV CLASS="HTML">{@code 337 * <DIV class="MySection"><H1>These are my ideas:</H1> 338 * <!-- Above is an outer divider, below is an inner divider --> 339 * <DIV class="MyNumbers">Here are the points: 340 * <!-- HTML Content Here --> 341 * </DIV></DIV> 342 * }</DIV> 343 * 344 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 345 * 346 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 347 * 348 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 349 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 350 * for-loop which calculates the {@code count} shall hopefully explain this computation 351 * clearly enough. This may be viewed in this method's hilited source-code, below. 352 * 353 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 354 * 355 * @return The returned {@code Hashtable} will contain an integer-array for each HTML Element 356 * that was found on the page. Each of these arrays shall be of length {@code 3}. 
357 * 358 * <BR /><BR /><OL CLASS=JDUL> 359 * <LI>Minimum Depth: {@code return_array[0]}</LI> 360 * <LI>Maximum Depth: {@code return_array[1]}</LI> 361 * <LI>Total Count: {@code return_array[2]}</LI> 362 * </OL> 363 * 364 * <BR /><DIV CLASS=JDHint> 365 * <B>REDUNDANCY NOTE:</B> The third element of the returned array should be identical to the 366 * result produced by an invocation of method: {@code Balance.checkTag(html, htmlTag);} 367 * </DIV> 368 * 369 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 370 * 371 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 372 * Tag, this exception will throw. 373 */ 374 public static Hashtable<String, int[]> depth(Vector<? super TagNode> html) 375 { 376 Hashtable<String, int[]> ht = new Hashtable<>(); 377 378 // Iterate through the HTML List, we are only counting HTML Elements, not text, and not HTML Comments 379 for (Object o : html) if (o instanceof TagNode) 380 { 381 TagNode tn = (TagNode) o; 382 383 // Don't keep a count on singleton tags. 384 if (HTMLTags.isSingleton(tn.tok)) continue; 385 386 int[] curMaxAndMinArr = ht.get(tn.tok); 387 388 // If this is the first encounter of a particular HTML Element, create a MAX/MIN 389 // integer array, and initialize it's values to zero. 390 391 if (curMaxAndMinArr == null) 392 { 393 curMaxAndMinArr = new int[3]; 394 395 curMaxAndMinArr[0] = 0; // Current Min Depth Count for Element "tn.tok" is zero 396 curMaxAndMinArr[1] = 0; // Current Max Depth Count for Element "tn.tok" is zero 397 curMaxAndMinArr[2] = 0; // Current Computed Depth Count for "tn.tok" is zero 398 399 ht.put(tn.tok, curMaxAndMinArr); 400 } 401 402 // curCount += tn.isClosing ? -1 : 1; 403 // 404 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 405 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 406 407 curMaxAndMinArr[2] += tn.isClosing ? 
-1 : 1; 408 409 // If the current depth-count is a "New Minimum" (a new low! :), then save it in the 410 // minimum pos of the output-array. 411 412 if (curMaxAndMinArr[2] < curMaxAndMinArr[0]) curMaxAndMinArr[0] = curMaxAndMinArr[2]; 413 414 // If the current depth-count (for this tag) is a "New Maximum" (a new high), save it 415 // to the max-pos of the output-array. 416 417 if (curMaxAndMinArr[2] > curMaxAndMinArr[1]) curMaxAndMinArr[1] = curMaxAndMinArr[2]; 418 } 419 420 return ht; 421 } 422 423 424 425 /** 426 * This method will calculate the "Maximum" and "Minimum" depth for every HTML Tag listed in 427 * the {@code var-args String[] htmlTags} parameter. The Max-Depth is the "Maximum-Number" of 428 * Opening HTML Element Opening Tags were found for a particular element, before a matching 429 * closing version of the same Element is encountered. In the example below, the maximum 430 * {@code 'open-count'} for the HTML 'divider' Element ({@code <DIV>}) is {@code '2'}. This is 431 * because a second {@code <DIV>} element is opened before the first is closed. 432 * 433 * <DIV CLASS="HTML">{@code 434 * <DIV class="MySection"><H1>These are my ideas:</H1> 435 * <!-- Above is an outer divider, below is an inner divider --> 436 * <DIV class="MyNumbers">Here are the points: 437 * <!-- HTML Content Here --> 438 * </DIV></DIV> 439 * }</DIV> 440 * 441 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 442 * 443 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 444 * 445 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 446 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 447 * for-loop which calculaties the {@code count} shall hopefully explain this computation 448 * clearly enough. This may be viewed in this method's hilited source-code, below. 
449 * 450 * <BR /><BR /><B CLASS=JDDescLabel>Var-Args Addition:</B> 451 * 452 * <BR />This method differs from the method with an identical name (defined above) in that it 453 * adds a <I>{@code String}-VarArgs parameter</I> that allows a user to decide which tags he 454 * would like counted and returned in this {@code Hashtable}, and which he would like to ignore. 455 * 456 * <BR /><BR />If one of the requested HTML-Tags from this{@code String}-VarArgs parameter is not 457 * actually an HTML Element present on the page, the returned {@code Hashtable} will still 458 * contain an {@code int[]}-Array for that tag. The values in that array will be equal to 459 * zero. 460 * 461 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 462 * 463 * @return The returned {@code Hashtable} will contain an integer-array for each HTML Element 464 * that was found on the page. Each of these arrays shall be of length {@code 3}. 465 * 466 * <BR /><BR /><OL CLASS=JDUL> 467 * <LI>Minimum Depth: {@code return_array[0]}</LI> 468 * <LI>Maximum Depth: {@code return_array[1]}</LI> 469 * <LI>Total Count: {@code return_array[2]}</LI> 470 * </OL> 471 * 472 * <BR /><DIV CLASS=JDHint> 473 * <B>REDUNDANCY NOTE:</B> The third element of the returned array should be identical to the 474 * result produced by an invocation of method: {@code Balance.checkTag(html, htmlTag);} 475 * </DIV> 476 * 477 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 478 * 479 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} 480 * (Self-Closing) Tag, this exception will throw. 481 */ 482 public static Hashtable<String, int[]> depth(Vector<? super TagNode> html, String... htmlTags) 483 { 484 // Check that these are all valid HTML Tags, throw an exception if not. 485 htmlTags = ARGCHECK.htmlTags(htmlTags); 486 487 Hashtable<String, int[]> ht = new Hashtable<>(); 488 489 // Initialize the temporary hash-table. 
This will be discarded at the end of the method, 490 // and converted into a parallel array. (Parallel to the input String... htmlTags array). 491 // Also, check to make sure the user hasn't requested a count of Singleton HTML Elements. 492 493 for (String htmlTag : htmlTags) 494 { 495 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 496 "One of the tags you have passed: [" + htmlTag + "] is a singleton-tag, " + 497 "and is only allowed opening versions of the tag." 498 ); 499 500 // Insert an initialized array (init to zero) for this HTML Tag/Token 501 int[] arr = new int[3]; 502 503 arr[0] = 0; // Current Minimum Depth Count for HTML Element "tn.tok" is zero 504 arr[1] = 0; // Current Maximum Depth Count for HTML Element "tn.tok" is zero 505 arr[2] = 0; // Current Computed Depth Count is HTML Element "tn.tok" is zero 506 507 ht.put(htmlTag, arr); 508 } 509 510 // Iterate through the HTML List, we are only counting HTML Elements, not text, 511 // and not HTML Comments 512 513 for (Object o: html) if (o instanceof TagNode) 514 { 515 TagNode tn = (TagNode) o; 516 517 int[] curMaxAndMinArr = ht.get(tn.tok); 518 519 // If this is null, we are attempting to perform the count on an HTML Element that 520 // wasn't requested by the user with the var-args 'String... htmlTags' parameter. 521 // The Hashtable was initialized to only have those tags. (see about 5 lines above 522 // where the Hashtable is initialized) 523 524 if (curMaxAndMinArr == null) continue; 525 526 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 527 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 528 529 curMaxAndMinArr[2] += tn.isClosing ? -1 : 1; 530 531 // If the current depth-count is a "New Minimum" (a new low! :), then save it in the 532 // minimum pos of the output-array. 
533 534 if (curMaxAndMinArr[2] < curMaxAndMinArr[0]) curMaxAndMinArr[0] = curMaxAndMinArr[2]; 535 536 // If the current depth-count (for this tag) is a "New Maximum" (a new high), save it 537 // to the max-pos of the output-array. 538 539 if (curMaxAndMinArr[2] > curMaxAndMinArr[1]) curMaxAndMinArr[1] = curMaxAndMinArr[2]; 540 541 // NOTE: No need to update the hash-table, since this is an array - changing its 542 // values is already "reflected" into the Hashtable. 543 } 544 545 return ht; 546 } 547 548 549 /** 550 * Creates a {@code Hashtable} that has a maximum and minimum depth for all HTML tags found on 551 * the page. Any HTML Tags that meet ALL of these criteria shall be removed from the 552 * result-set {@code Hashtable} ... 553 * 554 * <BR /><BR /><UL CLASS=JDUL> 555 * <LI>Minimum Depth Is {@code '0'} - i.e. <I>closing tag never precedes opening.</I></LI> 556 * <LI>Count is {@code '0'} - i.ei. <I>there is a {@code 1-to-1} ratio of opening and closing 557 * tags</I> for the particular HTML Element.</LI> 558 * </UL> 559 * 560 * <BR /><DIV CLASS=JDHint> 561 * This means that there is a {@code 1:1} ratio of opening and closing versions of the tag, 562 * <B><I>and also</I></B> that there are no positions in the vector where a closing tag to come 563 * before an tag to open it. 564 * </DIV> 565 * 566 * <BR /><BR /><B CLASS=JDDescLabel>Cloned Input:</B> 567 * 568 * <BR />This method clones the original input {@code Hashtable}, and removes the tags whose 569 * depth-calculations are invalid - as described above. This allows the user to perform other 570 * operations with the original table, while this class is processing. 571 * 572 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 573 * available {@code depth(...)} methods. 574 * 575 * @return This shall a return a list of HTML Tags that are <I>potentially (but not guaranteed 576 * to be)</I> invalid. 
577 */ 578 public static Hashtable<String, int[]> depthInvalid(Hashtable<String, int[]> ht) 579 { 580 @SuppressWarnings("unchecked") 581 Hashtable<String, int[]> ret = (Hashtable<String, int[]>) ht.clone(); 582 Enumeration<String> keys = ret.keys(); 583 584 // Using the "Enumeration" class allows the situation where elements can be removed from 585 // the underlying data-structure - while iterating through that data-structure. This is 586 // not possible using a keySet Iterator. 587 588 while (keys.hasMoreElements()) 589 { 590 String key = keys.nextElement(); 591 int[] arr = ret.get(key); 592 593 if ((arr[1] >= 0) && (arr[2] == 0)) ret.remove(key); 594 } 595 596 return ret; 597 } 598 599 /** 600 * Creates a {@code Hashtable} that has a maximum and minimum depth for all HTML tags found on 601 * the page. Any HTML Tags that meet ALL of these criteria, below, shall be removed from the 602 * result-set {@code Hashtable} ... 603 * 604 * <BR /><BR /><UL CLASS=JDUL> 605 * <LI> Maximum Depth is precisely {@code '1'} - i.e. <I>Each element of this tag is closed 606 * before a second is open.</I> 607 * </LI> 608 * </UL> 609 * 610 * <BR /><BR /><B CLASS=JDDescLabel>Cloned Input:</B> 611 * 612 * <BR />This method clones the original input {@code Hashtable}, and removes the tags whose 613 * maximum-depth is not greater than one. This allows the user to perform other operations 614 * with the original table, while this class is processing. 615 * 616 * @param ht This should be a {@code Hashtable} that was produced by a call to one of the two 617 * available {@code depth(...)} methods. 618 * 619 * @return This shall a return a list of HTML Tags that are <I>potentially (but not guaranteed 620 * to be)</I> 621 * invalid. 
622 */ 623 public static Hashtable<String, int[]> depthGreaterThanOne(Hashtable<String, int[]> ht) 624 { 625 @SuppressWarnings("unchecked") 626 Hashtable<String, int[]> ret = (Hashtable<String, int[]>) ht.clone(); 627 Enumeration<String> keys = ret.keys(); 628 629 // Using the "Enumeration" class allows the situation where elements can be removed from 630 // the underlying data-structure - while iterating through that data-structure. This is not 631 // possible using a keySet Iterator. 632 633 while (keys.hasMoreElements()) 634 { 635 String key = keys.nextElement(); 636 int[] arr = ret.get(key); 637 638 if (arr[1] == 1) ret.remove(key); 639 } 640 641 return ret; 642 } 643 644 645 /** 646 * This method will calculate the "Maximum" and "Minimum" depth for a particular HTML Tag. 647 * The Max-Depth just means the number of Maximum-Number of Opening HTML Element Opening Tags 648 * were found, before a matching closing version of the same Element is encountered. For 649 * instance: {@code <DIV ...><DIV ..> Some Page</DIV></DIV>} has a maximum depth of 650 * {@code '2'}. This means there is a point in the vectorized-html where there are 2 651 * successive divider elements that are opened, before even one has been closed. 652 * 653 * <EMBED CLASS='external-html' DATA-FILE-ID=BALANCE_VALID_NOTE2> 654 * 655 * <BR /><BR /><B CLASS=JDDescLabel>'Count' Computation-Heuristic:</B> 656 * 657 * <BR />This maximum and minimum depth count will not pay any attention to whether HTML open 658 * and close tags "enclose each-other" or are "interleaved." The actual mechanics of the 659 * for-loop which calculaties the {@code count} shall hopefully explain this computation 660 * clearly enough. This may be viewed in this method's hilited source-code, below. 661 * 662 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 663 * 664 * @param htmlTag This the html element whose maximum and minimum depth-count needs to be 665 * computed. 
666 * 667 * @return The returned integer-array, shall be of length 3. 668 * 669 * <BR /><BR /><OL CLASS=JDUL> 670 * <LI>Minimum Depth: {@code return_array[0]}</LI> 671 * <LI>Maximum Depth: {@code return_array[1]}</LI> 672 * <LI>Total Count: {@code return_array[2]}</LI> 673 * </OL> 674 * 675 * <BR /><DIV CLASS=JDHint> 676 * <B>REDUNDANCY NOTE:</B> The third element of the returned array should be identical to the 677 * result produced by an invocation of method: {@code Balance.checkTag(html, htmlTag);} 678 * </DIV> 679 * 680 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 681 * 682 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 683 * Tag, this exception will throw. 684 */ 685 public static int[] depthTag(Vector<? super TagNode> html, String htmlTag) 686 { 687 // Check that this is a valid HTML Tag, throw an exception if invalid 688 htmlTag = ARGCHECK.htmlTag(htmlTag); 689 690 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 691 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only allowed " + 692 "opening versions of the tag." 693 ); 694 695 TagNode tn; int i = 0; int max = 0; int min = 0; 696 697 // Iterate through the HTML List, we are only counting HTML Elements, not text, and not HTML Comments 698 for (Object o : html) if (o instanceof TagNode) 699 700 if ((tn = (TagNode) o).tok.equals(htmlTag)) 701 { 702 // An opening-version (TC.OpeningTags, For Instance <DIV ...>) will ADD 1 to the count 703 // A closing-tag (For Instance: </DIV>) will SUBTRACT 1 from the count 704 705 i += tn.isClosing ? -1 : 1; 706 707 if (i > max) max = i; 708 if (i < min) min = i; 709 } 710 711 // Generate the output array, and return 712 int[] ret = new int[2]; 713 714 ret[0] = min; 715 ret[1] = max; 716 ret[2] = i; 717 718 return ret; 719 } 720 721 /** 722 * This will find the (likely) places where the "non-nested HTML Elements" have become nested. 
723 * For the purposes of finding mismatched elements - such as an unclosed "Italics" Element, or 724 * an "Extra" Italics Element - this method will find places where a new HTML Tag has opened 725 * before a previous one has been closed - <I>or vice-versa (where there is an 'extra' 726 * closed-tag).</I> 727 * 728 * <BR /><BR />Certainly, if "nesting" is usually acceptable (for instance the HTML divider 729 * {@code '<DIV>...</DIV>'} construct) <I><B>then the results of this method would not have any 730 * meaning.</I></B> Fortunately, for the vast majority of HTML Elements {@code <I>, <B>, <A>, 731 * etc...} nesting the tags is not allowed or encouraged. 732 * 733 * <BR /><BR />The following example use of this method should make clear the application. If 734 * a user has identified that there is an unclosed HTML italics element ({@code <I>...</I>}) 735 * somewhere on a page, for-example, and that page has numerous italics elements, this method 736 * can pinpoint the failure instantly, using this example. Note that the file-name is a 737 * Java-Doc generated output HTML file. The documentation for this package received a copious 738 * amount of attention due to the sheer number of method-names and class-names used throughout. 739 * 740 * <DIV CLASS="EXAMPLE">{@code 741 * String fStr = FileRW.loadFileToString("javadoc/Torello/HTML/TagNode.html"); 742 * Vector<HTMLNode> v = HTMLPage.getPageTokens(fStr, false); 743 * int[] posArr = Balance.nonNestedCheck(v, "i"); 744 * 745 * // Below, the class 'Debug' is used to pretty-print the vectorized-html page. Here, the 746 * // output will find the lone, non-closed, HTML italics <I> ... </I> tag-element, and output 747 * // it to the terminal-window. The parameter '5' means the nearest 5 elements (in either 748 * // direction) are printed, in addition to the elements at the indices in the posArr. 749 * // Parameter 'true' implies that two curly braces are printed surrounding the matched node. 
750 * 751 * System.out.println(Debug.print(v, posArr, 5, " Skip a few ", true, Debug::K)); 752 * }</DIV> 753 * 754 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 755 * 756 * @param htmlTag This the html element whose maximum and minimum depth-count was not {@code 1} 757 * and {@code 0}, respectively. The precise location where the depth achieved either a 758 * negative depth, or depth greater than {@code 1} will be returned in the integer array. In 759 * English: When two opening-tags or two closing-tags are identified, successively, then the 760 * index where the second tag was found is recorded into the output array. 761 * 762 * @return This will return an array of vectorized-html index-locations / index-pointers where 763 * the first instance of an extra opening, or an extra-closing tag, occurs. This will 764 * facilitate finding tags that are not intended to be nested. If "tag-nesting" (for example 765 * HTML divider, {@code 'DIV'}, elements), then the results returned by this method will not be 766 * useful. 767 * 768 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 769 * 770 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 771 * Tag, this exception will throw. 772 * 773 * @see FileRW#loadFileToString(String) 774 * @see HTMLPage#getPageTokens(CharSequence, boolean) 775 * @see Debug#print(Vector, int[], int, String, boolean, BiConsumer) 776 */ 777 public static int[] nonNestedCheck(Vector<? super TagNode> html, String htmlTag) 778 { 779 // Java Streams are an easier way to keep variable-length lists. 
They use 780 // "builders" - and this one is for an "IntStream" 781 782 IntStream.Builder b = IntStream.builder(); 783 784 // Check that this is a valid HTML Tag, throw an exception if invalid 785 htmlTag = ARGCHECK.htmlTag(htmlTag); 786 787 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 788 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 789 "allowed opening versions of the tag." 790 ); 791 792 Object o; TagNode tn; int len = html.size(); TC last = null; 793 794 // Iterate through the HTML List, we are only counting HTML Elements, not text, 795 // and not HTML Comments 796 797 for (int i=0; i < len; i++) 798 799 if ((o = html.elementAt(i)) instanceof TagNode) 800 if ((tn = (TagNode) o).tok.equals(htmlTag)) 801 { 802 if ((tn.isClosing) && (last == TC.ClosingTags)) b.add(i); 803 if ((! tn.isClosing) && (last == TC.OpeningTags)) b.add(i); 804 805 last = tn.isClosing ? TC.ClosingTags : TC.OpeningTags; 806 } 807 808 return b.build().toArray(); 809 } 810 811 /** 812 * For likely greater than 95% of HTML tags - finding situations where that tag has 813 * <I><B>'nested tags'</I></B> is highly unlikely. Unfortunately, two or three of the most 814 * common tags in use, for instance {@code <DIV>, <SPAN>}, finding where a mis-match has 815 * occurred (tracking down an "Unclosed divider") is an order of magnitude more difficult than 816 * finding an unclosed anchor {@code '<A HREF...>'}. This method shall return two parallel 817 * arrays. The first array will contain vector indices. The second array contains the depth 818 * (nesting level) of that tag at that position. In this way, finding an unclosed divider is 819 * tantamount to finding where all closing-dividers seem to evaluate to a depth of '1' (one) 820 * rather than '0' (zero). 
821 * 822 * <BR /><BR /><DIV CLASS=JDHint> 823 * This method can highly useful for {@code <SPAN>} and {@code DIV}, while the "non-standard 824 * depth locations" method can be extremely useful for simple, non-nested tags such as Anchor, 825 * Paragraph, Section, etc... - HTML Elements that are mostly never nested. 826 * </DIV> 827 * 828 * <DIV CLASS="EXAMPLE">{@code 829 * // Load an HTML File to a String 830 * String file = LFEC.loadFile("~/HTML/MyHTMLFile.html"); 831 * 832 * // Parse, and convert to vectorized-html 833 * Vector<HTMLNode> v = HTMLPage.getPageTokens(file, false); 834 * 835 * // Run this method 836 * Ret2<int[], int[]> r = Balance.locationsAndDepth(v, "div"); 837 * 838 * // This array has vector-indices 839 * int[] posArr = (int[]) r.a; 840 * 841 * // This (parallel) array has the depth at that index. 842 * int[] depthArr = (int[]) r.b; 843 * 844 * for (int i=0; i < posArr.length; i++) System.out.println( 845 * "(" + posArr[i] + ", " + depthArr[i] + "):\t" + // Prints the Vector-Index, and Depth 846 * C.BRED + v.elementAt(posArr[i]).str + C.RESET // Prints the actual HTML divider. 847 * ); 848 * }</DIV> 849 * 850 * <BR />The above code would produce a list of HTML Divider elements, along with their index 851 * in the {@code Vector}, and the exact depth (number of nested, open {@code 'DIV'} elements) 852 * at that location. This is usually helpful when trying to find unclosed HTML Tags. 853 * 854 * @param html <EMBED CLASS='external-html' DATA-FILE-ID=HTMLVECSUP> 855 * 856 * @param htmlTag This the html element that has an imbalanced OPEN-CLOSE ratio in the tree. 857 * 858 * @return Two parallel arrays, as follows: 859 * 860 * <BR /><BR /><OL CLASS=JDOL> 861 * <LI> {@code Ret2.a (int[])} 862 * <BR /><BR /> 863 * This shall be an integer array of {@code Vector}-indices where the HTML Element has 864 * been found. 
865 * <BR /><BR /> 866 * </LI> 867 * <LI> {@code Ret2.b (int[])} 868 * <BR /><BR /> 869 * This shall contain an array of the value of the depth for the {@code 'htmlTag'} 870 * at the particular {@code Vector}-index identified in the first-array. 871 * </LI> 872 * </OL> 873 * 874 * @throws HTMLTokException If any of the tags passed are not valid HTML tags. 875 * 876 * @throws SingletonException If this {@code 'htmlTag'} is a {@code 'singleton'} (Self-Closing) 877 * Tag, this exception will throw. 878 */ 879 public static Ret2<int[], int[]> locationsAndDepth(Vector<? super TagNode> html, String htmlTag) 880 { 881 // Java Streams are an easier way to keep variable-length lists. They use 882 // "builders" - and this one is for an "IntStream" 883 884 IntStream.Builder locations = IntStream.builder(); 885 IntStream.Builder depthAtLocation = IntStream.builder(); 886 887 // Check that this is a valid HTML Tag, throw an exception if invalid 888 htmlTag = ARGCHECK.htmlTag(htmlTag); 889 890 if (HTMLTags.isSingleton(htmlTag)) throw new SingletonException( 891 "The tag you have passed: [" + htmlTag + "] is a singleton-tag, and is only " + 892 "allowed opening versions of the tag." 893 ); 894 895 Object o; TagNode tn; int len = html.size(); int depth = 0; 896 897 // Iterate through the HTML List, we are only counting HTML Elements, not text, and 898 // not HTML Comments 899 900 for (int i=0; i < len; i++) if ((o = html.elementAt(i)) instanceof TagNode) 901 902 if ((tn = (TagNode) o).tok.equals(htmlTag)) 903 { 904 depth += tn.isClosing ? -1 : 1; 905 906 locations.add(i); 907 908 depthAtLocation.add(depth); 909 } 910 911 return new Ret2<int[], int[]> 912 (locations.build().toArray(), depthAtLocation.build().toArray()); 913 } 914 915 /** 916 * Converts a depth report to a {@code String}, for printing. 917 * 918 * @param depthReport This should be a {@code Hashtable} returned by any of the depth-methods. 919 * 920 * @return This shall return the report as a {@code String}. 
921 */ 922 public static String toStringDepth(Hashtable<String, int[]> depthReport) 923 { 924 StringBuilder sb = new StringBuilder(); 925 926 for (String htmlTag : depthReport.keySet()) 927 { 928 int[] arr = depthReport.get(htmlTag); 929 930 sb.append( 931 "HTML Element: [" + htmlTag + "]:\t" + 932 "Min-Depth: " + arr[0] + ",\tMax-Depth: " + arr[1] + ",\tCount: " + arr[2] + "\n" 933 ); 934 } 935 936 return sb.toString(); 937 } 938 939 940 /** 941 * Converts a balance report to a {@code String}, for printing. 942 * 943 * @param balanceCheckReport This should be a {@code Hashtable} returned by any of the 944 * balance-check methods. 945 * 946 * @return This shall return the report as a {@code String}. 947 */ 948 public static String toStringBalance(Hashtable<String, Integer> balanceCheckReport) 949 { 950 StringBuilder sb = new StringBuilder(); 951 int maxTagLen = 0; 952 int maxValStrLen = 0; 953 int maxAbsValStrLen = 0; 954 int val; 955 String valAsStr; 956 957 // For good spacing purposes, we need the length of the longest of the tags. 
958 for (String htmlTag : balanceCheckReport.keySet()) 959 if (htmlTag.length() > maxTagLen) 960 maxTagLen = htmlTag.length(); 961 962 // 17 is the length of the string below, 2 is the amount of extra-space needed 963 maxTagLen += 17 + 2; 964 965 for (int v : balanceCheckReport.values()) 966 if ((valAsStr = ("" + v)).length() > maxValStrLen) 967 maxValStrLen = valAsStr.length(); 968 969 for (int v : balanceCheckReport.values()) 970 if ((valAsStr = ("" + Math.abs(v))).length() > maxAbsValStrLen) 971 maxAbsValStrLen = valAsStr.length(); 972 973 for (String htmlTag : balanceCheckReport.keySet()) 974 975 sb.append( 976 StringParse.rightSpacePad("HTML Element: [" + htmlTag + "]:", maxTagLen) + 977 StringParse.rightSpacePad( 978 ("" + (val = balanceCheckReport.get(htmlTag).intValue())), 979 maxValStrLen 980 ) + 981 NOTE(val, htmlTag, maxAbsValStrLen) + 982 "\n" 983 ); 984 985 return sb.toString(); 986 } 987 988 private static String NOTE(int val, String htmlTag, int padding) 989 { 990 if (val == 0) return ""; 991 992 else if (val > 0) return 993 ", which implies " + StringParse.rightSpacePad("" + Math.abs(val), padding) + 994 " unclosed <" + htmlTag + "> element(s)"; 995 996 else return 997 ", which implies " + StringParse.rightSpacePad("" + Math.abs(val), padding) + 998 " extra </" + htmlTag + "> element(s)"; 999 } 1000 1001 /** 1002 * Converts a balance report to a {@code String}, for printing. 1003 * 1004 * @param balanceCheckReport This should be a {@code Hashtable} returned by any of the 1005 * balance-check methods. 1006 * 1007 * @return This shall return the report as a {@code String}. 1008 * 1009 * @throws IllegalArgumentException This exception throws if the length of the two input arrays 1010 * are not equal. It is imperative that the balance report being printed was created by the 1011 * html-tags that are listed in the HTML Token var-args parameter. 
If the two arrays are the 1012 * same length, but the tags used to create the report Hashtable are not the same ones being 1013 * passed to the var-args parameter {@code 'htmlTags'} - <I>the logic will not know the 1014 * difference, and no exception is thrown.</I> 1015 */ 1016 public static String toStringBalance(int[] balanceCheckReport, String... htmlTags) 1017 { 1018 if (balanceCheckReport.length != htmlTags.length) throw new IllegalArgumentException( 1019 "The balance report that you are checking was not generated using the html token " + 1020 "list provided, they are different lengths. balanceCheckReport.length: " + 1021 "[" + balanceCheckReport.length + "]\t htmlTags.length: [" + htmlTags.length + "]" 1022 ); 1023 1024 StringBuilder sb = new StringBuilder(); 1025 1026 for (int i=0; i < balanceCheckReport.length; i++) 1027 sb.append("HTML Element: [" + htmlTags[i] + "]:\t" + balanceCheckReport[i] + "\n"); 1028 1029 return sb.toString(); 1030 } 1031 1032}