// TagNode-KeyOnlyAttributes.java.html

package Torello.HTML;

import Torello.HTML.helper.AttrRegEx;

import Torello.Java.StringParse;

import java.util.stream.Stream;
import java.util.regex.Matcher;
import java.util.ArrayList;
import java.util.List;

class KeyOnlyAttributes
{
    // A TagNode whose 'str' field is longer than its token by this many characters (or fewer)
    // cannot hold any attribute.  This is the same threshold both public methods have always
    // used.
    private static final int MAX_EXTRA_CHARS_WITHOUT_ATTRIBUTES = 3;

    // Returns the part of the TagNode's 'str' field that may contain attributes: everything
    // after the "<TOKEN" prefix, up to (but excluding) the closing '>' - or the closing "/>"
    // when the character before the '>' is a forward-slash.
    // 
    // It is imperative that the leading Space-Character after the token be included.  If the
    // TagNode were "<DIV CLASS=X>", the returned String must be " CLASS=X", rather than
    // "CLASS=X".  The Key-Value Reg-Ex expects at least one character of White-Space before
    // each Key-Value Attribute, so dropping that space would leave the first Key-Value pair
    // un-matched, and its pieces would be mistaken for "Key Only" Attributes.
    // 
    // Callers must already have verified that 'tn.str' is long enough to hold attributes.

    private static String attributeText(final TagNode tn)
    {
        final int len = tn.str.length();
        final int end = len - ((tn.str.charAt(len - 2) == '/') ? 2 : 1);

        return tn.str.substring(tn.tok.length() + 1, end);
    }

    // The purpose of this Helper-Method is to collect all of the text that occurs BETWEEN
    // (and before, and after) the Key-Value Reg-Ex Matches.
    // 
    // The Input-Parameter 'str' is the String that occurs after the Token of an HTML TagNode,
    // including its leading Space-Character (see 'attributeText').
    // 
    // Every returned chunk has been trimmed, and chunks that are empty after trimming are left
    // out.
    // 
    // About 75% of TagNode's do not have any attributes at all.  (50% of them are closing tags,
    // which cannot have Attributes at all).
    // 
    // Of the 25% of TagNode's that have attributes, less than 1% of them will be "Boolean" or 
    // "Key Only" Attributes.

    private static List<String> inverseMatches(final String str)
    {
        final Matcher       m       = AttrRegEx.KEY_VALUE_REGEX.matcher(str);
        final List<String>  ret     = new ArrayList<>();
        int                 prev    = 0;

        while (m.find())
        {
            // Text between the end of the previous match and the start of this one.  When the
            // matches are adjacent this is the empty String, which is simply skipped.
            final String inverseMatch = str.substring(prev, m.start()).trim();

            if (! inverseMatch.isEmpty()) ret.add(inverseMatch);

            prev = m.end();
        }

        // Consume the last chunk of string that could still possibly remain...
        if (prev < str.length())
        {
            final String lastChunk = str.substring(prev).trim();

            if (! lastChunk.isEmpty()) ret.add(lastChunk);
        }

        return ret;
    }

    // Splits everything that is NOT part of a Key-Value Attribute into white-space separated
    // words.  Each returned word is trimmed and non-empty.  The words are NOT checked for being
    // valid Attribute-Names - that decision is left to the caller.

    private static List<String> candidateKeyWords(final String attrText)
    {
        final List<String> ret = new ArrayList<>();

        for (final String unMatchedStr : inverseMatches(attrText))
        {
            // Of that stuff, now do a white-space split for connected characters
            for (final String word : unMatchedStr.split("\\s+"))
            {
                final String keyWord = word.trim();

                if (! keyWord.isEmpty()) ret.add(keyWord);
            }
        }

        return ret;
    }

    // Returns, in order of appearance, every "Key Only" (Boolean) Attribute of 'tn'.
    // 
    // tn:                  The TagNode to inspect.
    // preserveKeysCase:    When FALSE, the returned Attribute-Names are converted to lower-case.
    // 
    // Returns an empty Stream for closing TagNode's, and for TagNode's that are too short to
    // contain any attribute.
    // 
    // NOTE: Only words accepted by 'AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED' are returned.  Any other
    //       "guck" that was in the Element is silently left out.

    static Stream<String> allKeyOnlyAttributes(
            final TagNode tn,
            final boolean preserveKeysCase
        )
    {
        // NOTE: OPTIMIZED, "closing-versions" of the TagNode, and TagNode's whose 'str'
        //       field is only longer than the token, itself, by 3 or less characters cannot have
        //       attributes.  In that case, just return an empty 'Stream' instance.

        if (tn.isClosing) return Stream.empty();

        if (tn.str.length() <= (tn.tok.length() + MAX_EXTRA_CHARS_WITHOUT_ATTRIBUTES))
            return Stream.empty();

        String s = attributeText(tn);

        // If all lower-case is requested, do that here.  Locale.ROOT keeps the conversion
        // independent of the JVM's default locale (a Turkish default locale, for instance,
        // would otherwise turn 'I' into a dot-less 'ı').

        if (! preserveKeysCase) s = s.toLowerCase(java.util.Locale.ROOT);

        // This is used to generate the Returned-Stream
        final Stream.Builder<String> b = Stream.builder();

        for (final String keyWord : candidateKeyWords(s))
        {
            // Check for valid Attribute-Name's only
            if (AttrRegEx.ATTRIBUTE_KEY_REGEX_PRED.test(keyWord)) b.add(keyWord);
        }

        // Build the Stream<String>, and return;
        return b.build();
    }

    // Reports whether 'tn' contains the "Key Only" (Boolean) Attribute 'keyOnlyAttribute'.
    // The comparison ignores case.
    // 
    // tn:                  The TagNode to inspect.
    // keyOnlyAttribute:    The Attribute-Name to look for.  It may not contain white-space.
    // 
    // Returns FALSE immediately for closing TagNode's (before 'keyOnlyAttribute' is checked).
    // 
    // Throws IllegalArgumentException if 'keyOnlyAttribute' contains white-space.

    static boolean hasKeyOnlyAttribute(
            final TagNode   tn,
            final String    keyOnlyAttribute
        )
    {
        // Closing TagNode's do not have attributes, return false immediately.
        if (tn.isClosing) return false;

        // ONLY CHECKS FOR WHITE-SPACE, *NOT* VALIDITY...
        if (StringParse.hasWhiteSpace(keyOnlyAttribute)) throw new IllegalArgumentException(
            "The attribute you have passed [" + keyOnlyAttribute + "] has white-space, " +
            "This is not allowed here, because the search routine splits on whitespace, and " +
            "therefore a match would never be found."
        );

        // NOTE: TagNode's whose 'str' field is only longer than the token, itself, by 3 or less
        //       characters cannot have attributes.  In that case, just return false.

        if (tn.str.length() <= (tn.tok.length() + MAX_EXTRA_CHARS_WITHOUT_ATTRIBUTES))
            return false;

        // Look through every word that lies OUTSIDE of the Key-Value pairs.  The text handed to
        // the search keeps the leading space after the token, exactly as 'allKeyOnlyAttributes'
        // does, so that the first Key-Value pair is recognized by the Reg-Ex.

        for (final String keyWord : candidateKeyWords(attributeText(tn)))
        {
            if (keyOnlyAttribute.equalsIgnoreCase(keyWord)) return true;
        }

        // Was not found, return false;
        return false;
    }
}