package Torello.HTML;

import Torello.HTML.helper.AttrRegEx;
import Torello.Java.StringParse;

import java.util.stream.Stream;
import java.util.regex.Matcher;
import java.util.ArrayList;
import java.util.List;

// Package-private helper for finding "Key-Only" (also called "Boolean") attributes inside a
// TagNode - attribute names that appear without an "=value" part.
//
// Both entry points work the same way: the text that follows the element token is scanned
// with AttrRegEx.KEY_VALUE_REGEX, and whatever text is *NOT* covered by a Key-Value match is
// split on white-space.  Each resulting word is a candidate Key-Only attribute.
class KeyOnlyAttributes
{
    // Returns true when 'tn' cannot possibly carry an attribute, namely when it is a closing
    // tag, or when its 'str' field is longer than its token by 3 or fewer characters.
    //
    // The shortest TagNode that has an attribute is '<' + token + ' ' + one-character-name
    // + '>', which is (token-length + 4) characters long.
    private static boolean cannotHaveAttributes(final TagNode tn)
    {
        return tn.isClosing || (tn.str.length() <= (tn.tok.length() + 3));
    }

    // Returns the portion of 'tn.str' that follows the element token, with the trailing '>'
    // (and the "ending-forward-slash", if there is one) left off.
    //
    // IMPORTANT: The white-space character that follows the token is deliberately *INCLUDED*.
    // For the TagNode "<DIV CLASS=X>" this method returns " CLASS=X", rather than "CLASS=X".
    // The Key-Value Reg-Ex expects at least one white-space character in front of each
    // Key-Value Attribute, so omitting it leaves the first attribute un-matched.
    //
    // Callers must first verify that 'cannotHaveAttributes(tn)' is false; this guarantees that
    // the indices used below are in range.
    private static String attributeText(final TagNode tn)
    {
        final int len = tn.str.length();
        final int end = (tn.str.charAt(len - 2) == '/') ? (len - 2) : (len - 1);

        return tn.str.substring(tn.tok.length() + 1, end);
    }

    // The purpose of this Helper-Method is to collect all of the text that occurs
    // BETWEEN Reg-Ex Matches (before the first match, between consecutive matches, and after
    // the last match).
    //
    // Note that the Input-Parameter 'str' is the String that occurs after the Token of an
    // HTML TagNode.  If the TagNode were "<DIV CLASS=X>", then the 'str' would be " CLASS=X".
    //
    // It is imperative that the leading Space-Character after the "DIV" token be included.
    // The Reg-Ex expects mandatory (at least one) space characters to be between Key-Value
    // Attributes.
    //
    // About 75% of TagNode's do not have any attributes at all.  (50% of them are closing tags,
    // which cannot have Attributes at all).
    //
    // Of the 25% of TagNode's that have attributes, less than 1% of them will be "Boolean" or
    // "Key Only" Attributes.
    //
    // Returns: the un-matched chunks of 'str', in order of appearance.  Each chunk has been
    // trimmed, and chunks that are empty after trimming are left out.
    private static List<String> inverseMatches(final String str)
    {
        final Matcher       m   = AttrRegEx.KEY_VALUE_REGEX.matcher(str);
        final List<String>  ret = new ArrayList<>();

        // Index of the first character that has not yet been consumed by a match
        int prev = 0;

        while (m.find())
        {
            // Only look at the gap if there actually is one (matches may be adjacent)
            if (m.start() > prev)
            {
                final String inverseMatch = str.substring(prev, m.start()).trim();

                if (! inverseMatch.isEmpty())
                {
                    ret.add(inverseMatch);
                }
            }

            prev = m.end();
        }

        // Consume the last chunk of string that could still possibly remain...
        if (prev < str.length())
        {
            final String tail = str.substring(prev).trim();

            if (! tail.isEmpty())
            {
                ret.add(tail);
            }
        }

        return ret;
    }

    // Collects every Key-Only attribute name found in 'tn'.
    //
    // tn:                  The TagNode to inspect.
    // preserveKeysCase:    When false, the returned names are converted to lower-case.  When
    //                      true, they are returned exactly as they appear in 'tn.str'.
    //
    // Returns: a Stream of the words that lie outside of every Key-Value match and that pass
    // AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED.  The Stream is empty for closing tags and for
    // TagNode's that are too short to contain an attribute.
    static Stream<String> allKeyOnlyAttributes(
            final TagNode tn,
            final boolean preserveKeysCase
        )
    {
        // NOTE: OPTIMIZED, "closing-versions" of the TagNode, and TagNode's whose 'str'
        // field is only longer than the token, itself, by 3 or less characters cannot have
        // attributes.  In that case, just return an empty 'Stream' instance.
        if (cannotHaveAttributes(tn))
        {
            return Stream.empty();
        }

        // Leaves off the opening 'token' and less-than '<' symbol (leaves off "<DIV" for
        // example), but keeps the white-space that follows the token.
        String s = attributeText(tn);

        // If all lower-case is requested, do that here.  Locale.ROOT is used so that the
        // result does not depend on the default locale of the JVM (under a Turkish locale,
        // for instance, 'I' would otherwise not be converted to an ASCII 'i').
        if (! preserveKeysCase)
        {
            s = s.toLowerCase(java.util.Locale.ROOT);
        }

        // This is used to generate the Returned-Stream
        final Stream.Builder<String> b = Stream.builder();

        for (final String unMatchedStr : inverseMatches(s))
        {
            // Of that stuff, now do a white-space split for connected characters
            for (final String piece : unMatchedStr.split("\\s+"))
            {
                final String keyWord = piece.trim();

                // Check for valid Attribute-Name's only, and put them in the return stream.
                //
                // NOTE: This has the potential to slightly change the original HTML.
                // It will "leave out any guck" that was in the Element.
                if ((! keyWord.isEmpty()) && AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED.test(keyWord))
                {
                    b.add(keyWord);
                }
            }
        }

        // Build the Stream<String>, and return
        return b.build();
    }

    // Reports whether 'tn' contains the Key-Only attribute named by 'keyOnlyAttribute'.
    // The comparison is case-insensitive.
    //
    // tn:                  The TagNode to inspect.
    // keyOnlyAttribute:    The attribute name to look for.  It may not contain white-space.
    //
    // Returns: true if one of the words lying outside of every Key-Value match equals
    // 'keyOnlyAttribute' (ignoring case).  Always false for closing tags, and for TagNode's
    // that are too short to contain an attribute.
    //
    // Throws: IllegalArgumentException if 'keyOnlyAttribute' contains white-space.  Note that
    // for closing tags 'false' is returned before this check is made.
    static boolean hasKeyOnlyAttribute(
            final TagNode tn,
            final String keyOnlyAttribute
        )
    {
        // Closing TagNode's do not have attributes, return false immediately.
        if (tn.isClosing)
        {
            return false;
        }

        // ONLY CHECKS FOR WHITE-SPACE, *NOT* VALIDITY...
        if (StringParse.hasWhiteSpace(keyOnlyAttribute))
        {
            throw new IllegalArgumentException(
                "The attribute you have passed [" + keyOnlyAttribute + "] has white-space, " +
                "This is not allowed here, because the search routine splits on whitespace, and " +
                "therefore a match would never be found."
            );
        }

        // NOTE: TagNode's whose 'str' field is only longer than the token, itself, by 3 or less
        // characters cannot have attributes.  In that case, just return false.
        if (cannotHaveAttributes(tn))
        {
            return false;
        }

        // The attribute text *MUST* include the white-space that follows the token, exactly
        // as in 'allKeyOnlyAttributes'.  Without it, the first Key-Value Attribute is not
        // matched by the Reg-Ex, and the words inside of its (quoted) value would be treated
        // as if they were Key-Only Attributes.
        //
        // 'inverseMatches' => the text between the KEY-VALUE pairs
        for (final String unMatchedStr : inverseMatches(attributeText(tn)))
        {
            // Of that stuff, now do a white-space split for connected characters
            for (final String piece : unMatchedStr.split("\\s+"))
            {
                final String keyWord = piece.trim();

                if ((! keyWord.isEmpty()) && keyOnlyAttribute.equalsIgnoreCase(keyWord))
                {
                    return true;
                }
            }
        }

        // Was not found
        return false;
    }
}