1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
package Torello.HTML;

import Torello.HTML.helper.AttrRegEx;

import Torello.Java.StringParse;

import java.util.stream.Stream;
import java.util.regex.Matcher;
import java.util.ArrayList;
import java.util.List;

// Package-private helper that locates the "Key-Only" (also called "Boolean") attributes of an
// HTML TagNode.  These are attribute names that appear inside the tag without any '=value' part.
// 
// The approach used by both entry points is the same:
// 
//   1) Slice the attribute text out of the TagNode's 'str' field.
//   2) Run the Key-Value Reg-Ex over that text, and collect everything that was *NOT* matched.
//   3) Split those left-over chunks on white-space.  Each resulting word is a candidate
//      Key-Only attribute.

class KeyOnlyAttributes
{
    // Returns the text of 'tn' that lies between the element token and the closing '>'
    // (or closing "/>").  If the TagNode were "<DIV CLASS=X>", the result would be " CLASS=X".
    // 
    // It is imperative that the leading Space-Character after the "DIV" token be included.
    // The Reg-Ex expects mandatory (at least one) space characters to come before each of the
    // Key-Value Attributes.  If that character is left off, the first Key-Value pair in the tag
    // is not matched, and the words inside of it end up being treated as Key-Only candidates.
    // 
    // Returns the empty-string when the TagNode cannot have attributes: "closing-versions" of a
    // TagNode, and TagNode's whose 'str' field is longer than the token by 3 or less characters.
    // In every other case the returned String has at least one character, so callers may use
    // 'isEmpty()' to detect the "no attributes" case.

    private static String attributeText(final TagNode tn)
    {
        final int len       = tn.str.length();
        final int tokLen    = tn.tok.length();

        if (tn.isClosing || (len <= (tokLen + 3)))
        {
            return "";
        }

        // Leave off the "ending-forward-slash" (if there is one) and the ending '>'
        final int end = len - ((tn.str.charAt(len - 2) == '/') ? 2 : 1);

        // 'tokLen + 1' skips the less-than '<' symbol and the token, but keeps the white-space
        // character which follows the token.
        return tn.str.substring(tokLen + 1, end);
    }

    // The purpose of this Helper-Method is to collect all of the stuff that occurs
    // BETWEEN Reg-Ex Matches of the Key-Value Reg-Ex.
    // 
    // The Input-Parameter 'str' is the String that occurs after the Token of an HTML TagNode,
    // including the leading white-space (see 'attributeText').
    // 
    // Each returned chunk has been trimmed, and chunks that are empty after trimming are left
    // out.  The chunks are returned in the order in which they occur inside 'str'.
    // 
    // About 75% of TagNode's do not have any attributes at all.  (50% of them are closing tags,
    // which cannot have Attributes at all).  Of the 25% of TagNode's that have attributes, less
    // than 1% of them will be "Boolean" or "Key Only" Attributes.

    private static List<String> inverseMatches(final String str)
    {
        final Matcher       m       = AttrRegEx.KEY_VALUE_REGEX.matcher(str);
        final List<String>  ret     = new ArrayList<>();

        // Index of the first character that has not yet been consumed by a Reg-Ex Match
        int prev = 0;

        while (m.find())
        {
            // Anything between the end of the previous match and the start of this one is
            // "un-matched" text.  When the two matches are adjacent, there is nothing to add.
            if (m.start() > prev)
            {
                final String inverseMatch = str.substring(prev, m.start()).trim();

                if (inverseMatch.length() > 0)
                {
                    ret.add(inverseMatch);
                }
            }

            prev = m.end();
        }

        // Consume the last chunk of string that could still possibly remain.  It is handled
        // exactly like the chunks above: trimmed, and skipped if nothing is left.
        if (prev < str.length())
        {
            final String lastChunk = str.substring(prev).trim();

            if (lastChunk.length() > 0)
            {
                ret.add(lastChunk);
            }
        }

        return ret;
    }

    // Splits every un-matched chunk of 'attrText' (see 'inverseMatches') on white-space, and
    // returns the resulting non-empty words, in order.  No validity check is applied here.

    private static List<String> candidateWords(final String attrText)
    {
        final List<String> words = new ArrayList<>();

        for (String unMatchedStr : inverseMatches(attrText))
        {
            for (String word : unMatchedStr.split("\\s+"))
            {
                // Call String.trim() and String.length()
                word = word.trim();

                if (word.length() > 0)
                {
                    words.add(word);
                }
            }
        }

        return words;
    }

    // Returns a Stream containing every Key-Only attribute found in 'tn', in the order in which
    // they occur.  Only words accepted by 'AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED' are returned.
    // 
    // tn:                 The TagNode to search.  Closing TagNode's and TagNode's that are too
    //                     short to hold an attribute produce an empty Stream.
    // 
    // preserveKeysCase:   When FALSE, the attribute text is converted to lower-case before it is
    //                     searched, so the returned names are lower-case.  When TRUE, the names
    //                     are returned as they are spelled in the TagNode.
    // 
    // NOTE: This has the potential to slightly change the original HTML.  It will "leave out any
    //       guck" that was in the Element, because words that are not valid Attribute-Name's
    //       are silently skipped.

    static Stream<String> allKeyOnlyAttributes(
            final TagNode tn,
            final boolean preserveKeysCase
        )
    {
        String s = attributeText(tn);

        // OPTIMIZED: Most TagNode's have no attributes, return immediately for those.
        if (s.isEmpty())
        {
            return Stream.empty();
        }

        // If all lower-case is requested, do that here.  Locale.ROOT keeps the conversion of
        // ASCII Attribute-Name's independent of the JVM's default Locale (with a Turkish
        // default Locale, an upper-case 'I' would otherwise become a dot-less 'i').
        if (! preserveKeysCase)
        {
            s = s.toLowerCase(java.util.Locale.ROOT);
        }

        // This is used to generate the Returned-Stream
        final Stream.Builder<String> b = Stream.builder();

        for (String keyWord : candidateWords(s))
        {
            // Check for valid Attribute-Name's only
            if (AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED.test(keyWord))
            {
                b.add(keyWord);
            }
        }

        // Build the Stream<String>, and return;
        return b.build();
    }

    // Reports whether 'tn' contains the Key-Only attribute named by 'keyOnlyAttribute'.  The
    // comparison ignores case.
    // 
    // tn:                 The TagNode to search.  Closing TagNode's always produce FALSE, and
    //                     for them 'keyOnlyAttribute' is not inspected at all.
    // 
    // keyOnlyAttribute:   The attribute name to look for.  It is checked for white-space only,
    //                     *NOT* for validity as an Attribute-Name.
    // 
    // Throws IllegalArgumentException if 'keyOnlyAttribute' contains white-space (and 'tn' is not
    // a closing TagNode).

    static boolean hasKeyOnlyAttribute(
            final TagNode   tn,
            final String    keyOnlyAttribute
        )
    {
        // Closing TagNode's do not have attributes, return false immediately.
        if (tn.isClosing)
        {
            return false;
        }

        // ONLY CHECKS FOR WHITE-SPACE, *NOT* VALIDITY...
        if (StringParse.hasWhiteSpace(keyOnlyAttribute)) throw new IllegalArgumentException(
            "The attribute you have passed [" + keyOnlyAttribute + "] has white-space, " +
            "This is not allowed here, because the search routine splits on whitespace, and " +
            "therefore a match would never be found."
        );

        // The attribute text must start with the white-space that follows the token - exactly
        // as it does in 'allKeyOnlyAttributes' - or the Key-Value Reg-Ex will not recognize the
        // first Key-Value pair, and words from inside its value could be reported as a match.
        final String s = attributeText(tn);

        // TagNode's that are too short to have attributes.  In that case, just return false.
        if (s.isEmpty())
        {
            return false;
        }

        for (String keyWord : candidateWords(s))
        {
            if (keyOnlyAttribute.equalsIgnoreCase(keyWord))
            {
                return true;
            }
        }

        // Was not found, return false;
        return false;
    }
}