// Escape.java.html

package Torello.HTML;

import java.util.*;
import java.util.regex.*;
import java.util.stream.*;

import Torello.Java.*;

import Torello.JavaDoc.LinkJavaSource;
import static Torello.JavaDoc.Entity.METHOD;

/**
 * Easy utilities for escaping and un-escaping HTML characters such as {@code &nbsp;}, and even
 * code-point based Emoji's.
 * 
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE>
 */
@Torello.JavaDoc.StaticFunctional
public final class Escape
{
    private Escape() { }


    // ********************************************************************************************
    // ********************************************************************************************
    // Internal Fields, used by this class only
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Regular Expression for characters represented in HTML as
     * <CODE>&amp;#x[Hexadecimal-Code];</CODE>
     * 
     * <BR /><BR />Capture-Group 1 contains only the hexadecimal digits (between one and eight of
     * them).  The character-class intentionally contains no commas: inside of a Regular
     * Expression character-class a comma is a literal character, and allowing it would let
     * text such as <CODE>&amp;#x1,2;</CODE> match, even though it cannot be parsed as a number.
     */
    private static final Pattern HEX_CODE = Pattern.compile("&#x([A-Fa-f\\d]{1,8});");

    /**
     * Regular Expression for characters represented in HTML as <CODE>&amp;#[Decimal-Code];</CODE>
     * 
     * <BR /><BR />Capture-Group 1 contains only the decimal digits (between one and eight of
     * them).
     */
    private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});");

    /**
     * Regular Expression (approximate, not exact) for hard-coded escape sequences such as
     * <CODE>"&amp;amp;"</CODE>
     * 
     * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence
     * up in the hash table.  This means, of course, that not everything which matches this Regular
     * Expression Pattern is actually an escaped HTML ASCII/UniCode character.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B>
     * 
     * <BR /><CODE>&amp;NotACode;</CODE> will match this Regular-Expression, but it is not an
     * actual HTML Escape-sequence.  For that, one needs to consult the internal
     * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves.
     * 
     * <BR /><BR />Only ASCII letters and digits (one to eight of them) are accepted between the
     * leading {@code '&'} and the trailing {@code ';'}.  A comma is <B>not</B> part of the
     * character-class.
     * 
     * @see #htmlEscChars
     * @see #htmlEscSeq
     */
    private static final Pattern TEXT_CODE = Pattern.compile("&[A-Za-z0-9]{1,8};");

    // The object read from the JAR Data-File "data-files/Escape.htdat".  Element 0 and element 1
    // of this Vector are the two lookup tables that are declared directly below.
    // 
    // NOTE(review): the meaning of the boolean 'true' argument is defined by
    // LFEC.readObjectFromFile_JAR, and cannot be seen from this file - confirm it there.

    @SuppressWarnings("rawtypes")
    private static final Vector data = LFEC.readObjectFromFile_JAR
        (Escape.class, "data-files/Escape.htdat", true, Vector.class);


    // This {@code Hashtable} contains all of the HTML escape characters which are represented by
    // a short Text-{@code String}.  The file listed above contains that list.
    // 
    // The keys are looked up in this class *WITHOUT* the leading '&' and the trailing ';'
    // (see 'escHTMLToChar' and 'replaceAll_TEXT').
    // 
    // This is "Package-Private", because it is used by a "Helper-Class" (EscapeRepl) and one of
    // the Replace-All Methods!

    @SuppressWarnings("unchecked")
    static final Hashtable<String, Character> htmlEscChars = 
        (Hashtable<String, Character>) data.elementAt(0);

    /**
     * This {@code Hashtable} is the reverse of the previous table.  It allows a user to look up
     * the escape sequence, given a particular {@code char}.
     * 
     * <BR /><BR />It is the second element (index 1) of the {@code Vector} that is loaded from
     * the JAR Data-File.
     * 
     * @see #htmlEscChars
     */
    @SuppressWarnings("unchecked")
    private static final Hashtable<Character, String> htmlEscSeq =
        (Hashtable<Character, String>) data.elementAt(1);


    // ********************************************************************************************
    // ********************************************************************************************
    // Some debug, and "View Data" methods
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Writes every entry of the HTML Escape-Character lookup table to {@code System.out}, one
     * entry per line, using the format <CODE>&amp;name; ==&gt; character</CODE>.  This is
     * intended as a debugging aid.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B>
     * 
     * <BR />The JAR Data-File List included within the page attached (below) is a complete list of
     * all <B><CODE>text-String</CODE> HTML Escape Sequences</B> that are known to this class.
     * That list does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE>
     * sequences.
     *
     * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html">
     * All HTML Escape Sequences</A></CODE></B>
     */
    public static void printHTMLEsc()
    {
        // Walk the table's keys (the escape names, stored without '&' and ';'), and print each
        // name next to the character that it maps to.

        for (Enumeration<String> names = htmlEscChars.keys(); names.hasMoreElements(); )
        {
            final String    name        = names.nextElement();
            final Character character   = htmlEscChars.get(name);

            System.out.println("&" + name + "; ==> " + character);
        }
    }


    // ********************************************************************************************
    // ********************************************************************************************
    // Main Part of the class
    // ********************************************************************************************
    // ********************************************************************************************


    /**
     * Converts a single {@code String} from an HTML-escape sequence into the appropriate
     * character.
     * 
     * <BR /><BR />
     * <CODE>&amp;[escape-sequence];</CODE> ==&gt; actual ASCII or UniCode character.
     * 
     * <BR /><BR />Three forms are recognized: hexadecimal (<CODE>&amp;#x41;</CODE>), decimal
     * (<CODE>&amp;#65;</CODE>) and named (<CODE>&amp;amp;</CODE>) escape sequences.  The input
     * must consist of exactly one escape sequence, and nothing else.
     *
     * @param escHTML An HTML escape sequence, including the leading {@code '&'} and the trailing
     * {@code ';'}.  This parameter may not be null.
     * 
     * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence.
     * 
     * <BR /><BR />This method will return the {@code NUL} character, {@code (char) 0} (and not
     * the digit {@code '0'}), if the input does not represent a valid HTML Escape sequence, or
     * if it is a numeric escape sequence whose value is larger than {@code Character.MAX_VALUE}
     * (for instance an Emoji), since such a value cannot be held by a single Java {@code char}.
     */
    public static char escHTMLToChar(String escHTML)
    {
        if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0;

        // The numeric patterns are written to recognize the *COMPLETE* escape sequence - the
        // leading '&' and the trailing ';' included.  They must therefore be applied to the
        // original input, not to the inner-text.  'matches()' (rather than 'find()') ensures
        // that the input is one single escape sequence, and nothing more.

        Matcher hex = HEX_CODE.matcher(escHTML);

        if (hex.matches()) return numericEscToChar(hex.group(1), 16);

        Matcher dec = DEC_CODE.matcher(escHTML);

        if (dec.matches()) return numericEscToChar(dec.group(1), 10);

        // Named escape sequence: the table's keys do not include the '&' and the ';'
        Character c = htmlEscChars.get(escHTML.substring(1, escHTML.length() - 1));

        // If the character was found in the table that lists all escape sequence characters,
        // then return it.  Otherwise just return ASCII zero.

        return (c != null) ? c.charValue() : (char) 0;
    }

    // Parses the digits of a numeric escape sequence.  Returns (char) 0 when the value cannot
    // be held by one Java 'char', or when the digits cannot be parsed at all.
    private static char numericEscToChar(String digits, int radix)
    {
        try
        {
            // A 'long' is used because eight hexadecimal digits may exceed Integer.MAX_VALUE
            long codePoint = Long.parseLong(digits, radix);

            // Emoji's (and everything else above the 16-bit range) do not fit into a 'char'
            return (codePoint <= Character.MAX_VALUE) ? (char) codePoint : (char) 0;
        }

        // Unparseable digits simply mean "not a valid escape sequence"
        catch (NumberFormatException ignored) { return (char) 0; }
    }

    /**
     * Will generate a {@code String} whereby any &amp; all <B STYLE='color: red;'><I>Hexadecimal
     * Escape Sequences</I></B> have been removed and subsequently replaced with their actual
     * ASCII/UniCode un-escaped characters!
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B>
     * 
     * <BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR>
     * <TR><TD><CODE>&amp;#xAA;</CODE></TD><TD><CODE>'&#xAA;'</CODE> within a browser</TD></TR>
     * <TR><TD><CODE>&amp;#x67;</CODE></TD><TD><CODE>'&#x67;'</CODE> within a browser</TD></TR>
     * <TR><TD><CODE>&amp;#x84;</CODE></TD><TD><CODE>'&#x84;'</CODE> within a browser</TD></TR>
     * </TABLE>
     * 
     * <BR />Escape sequences whose value is larger than {@code Character.MAX_VALUE} (Emoji's, for
     * instance) are left in the {@code String}, un-modified, because they cannot be represented
     * by a single Java {@code char}.  The same is true for any matched text whose digits cannot
     * be parsed as a hexadecimal number.
     * 
     * @param str any {@code String} that contains an HTML Escape Sequence
     * &amp;#x[HEXADECIMAL VALUE];
     * 
     * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced
     * with their equivalent ASCII or UniCode Characters.  If {@code 'str'} contains no such
     * sequence, then {@code 'str'} itself is returned.
     * 
     * @see #replaceAll_DEC(String str)
     * @see StrReplace#r(String, String[], char[])
     */
    public static String replaceAll_HEX(String str)
    {
        // This is the RegEx Matcher from the top.  It matches string's that look like: &#x41;
        Matcher m = HEX_CODE.matcher(str);

        // Maps the complete text of each escape sequence that was found to its replacement
        // character.  A Map is used *RATHER THAN* two parallel arrays so that an escape
        // sequence which occurs several times is only listed once.

        TreeMap<String, Character> escMap = new TreeMap<>();

        while (m.find())
        {
            long codePoint;

            // Use a Base-16 Long-Parse.  Eight hexadecimal digits may exceed
            // Integer.MAX_VALUE, which would cause an Integer-Parse to throw.

            try
                { codePoint = Long.parseLong(m.group(1), 16); }

            // Text that cannot be parsed is not an escape sequence.  Leave it alone.
            catch (NumberFormatException ignored)
                { continue; }

            // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters
            // not single characters.

            if (codePoint > Character.MAX_VALUE) continue;

            // Only the first occurrence of a particular escape sequence needs to be recorded
            escMap.putIfAbsent(m.group(), Character.valueOf((char) codePoint));
        }

        // Nothing to replace
        if (escMap.isEmpty()) return str;

        // Build the matchStr's and replaceChar's arrays.  These are just the KEY's and
        // the VALUE's of the TreeMap<String, Character> which was just built.

        String[]    matchStrs       = new String[escMap.size()];
        char[]      replaceChars    = new char[escMap.size()];
        int         pos             = 0;

        for (Map.Entry<String, Character> entry : escMap.entrySet())
        {
            matchStrs[pos]      = entry.getKey();
            replaceChars[pos]   = entry.getValue().charValue();
            pos++;
        }

        // The class StrReplace will replace all the escape sequences with the actual characters.
        return StrReplace.r(str, matchStrs, replaceChars);
    }

    /**
     * The decimal (base-10) counterpart of {@code replaceAll_HEX(String)}.  Every HTML Escape
     * Sequence that is written using a decimal number is replaced by the character it
     * represents.  Hexadecimal (base-16) sequences are not touched by this method.
     * 
     * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B>
     * 
     * <BR /><TABLE CLASS=JDBriefTable>
     * <TR><TH>Substring from Input:</TH>   <TH>Web-Browser Converts To:</TH></TR>
     * <TR><TD><CODE>&amp;#48;</CODE></TD>  <TD><CODE>'&#48;'</CODE> in your browser</TD></TR>
     * <TR><TD><CODE>&amp;#64;</CODE></TD>  <TD><CODE>'&#64;'</CODE> in your browser</TD></TR>
     * <TR><TD><CODE>&amp;#123;</CODE></TD> <TD><CODE>'&#123;'</CODE> in your browser</TD></TR>
     * <TR><TD><CODE>&amp;#125;</CODE></TD> <TD><CODE>'&#125;'</CODE> in your browser</TD></TR>
     * </TABLE>
     * 
     * <BR /><B CLASS=JDDescLabel>Base-10 &amp; Base-16 Escape-Sequence Difference:</B>
     * 
     * <BR /><UL CLASS=JDUL>
     * 
     * <LI> <CODE>&amp;#x[hex base-16 value];</CODE>  The third character of the {@code String}
     *      is an {@code 'x'}
     *      </LI>
     * 
     * <LI> <CODE>&amp;#[decimal base-10 value];</CODE>  The escape-sequence {@code String} does
     *      not contain an {@code 'x'}
     *      </LI>
     * 
     * </UL>
     * 
     * <BR />Sequences whose value is larger than {@code Character.MAX_VALUE} (Emoji's, for
     * instance) are skipped, and remain in the returned {@code String}.
     * 
     * @param str any {@code String} that contains the HTML Escape Sequence 
     * <CODE>&amp;#[DECIMAL VALUE];</CODE>.
     * 
     * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
     * ASCII UniCode Characters.
     * 
     * <BR /><BR />The value returned is whatever {@code StrReplace.r} produces.  The earlier
     * documentation for this method states that the same input-{@code String} reference is
     * returned when there is nothing to replace; that depends on {@code StrReplace.r}, and should
     * be confirmed there.
     * 
     * @see #replaceAll_HEX(String str)
     * @see StrReplace#r(String, String[], char[])
     */
    public static String replaceAll_DEC(String str)
    {
        // Sorted and duplicate-free: complete escape-sequence text ==> replacement character
        TreeMap<String, Character> found = new TreeMap<>();

        // DEC_CODE matches text such as "&#65;", and group 1 is the run of decimal digits
        for (Matcher dec = DEC_CODE.matcher(str); dec.find(); )
        {
            int value = Integer.parseInt(dec.group(1));

            // Anything above the 16-bit range (Emoji's) cannot be a single 'char'.  An escape
            // sequence that was already seen keeps the entry from its first occurrence.

            if (value <= Character.MAX_VALUE)
                found.putIfAbsent(dec.group(), Character.valueOf((char) value));
        }

        // Unpack the map into the two parallel arrays that StrReplace expects
        String[]    matchStrs       = new String[found.size()];
        char[]      replaceChars    = new char[found.size()];
        int         slot            = 0;

        for (Map.Entry<String, Character> pair : found.entrySet())
        {
            matchStrs[slot]     = pair.getKey();
            replaceChars[slot]  = pair.getValue();
            slot++;
        }

        return StrReplace.r(str, matchStrs, replaceChars);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT>
     * 
     * <BR /><BR />Only named escape sequences (such as <CODE>&amp;amp;</CODE>) that are present in
     * the internal lookup table are replaced.  Text that merely looks like an escape sequence is
     * left as it is.
     * 
     * @param str any {@code String} that contains HTML Escape Sequences that need to be converted
     * to their ASCII-UniCode character representations.
     * 
     * @return a {@code String}, with all of the named (text) escape sequences that are known to
     * this class removed and replaced with ASCII UniCode Characters.
     * 
     * @see #replaceAll_HEX(String str)
     * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc)
     * 
     * @throws IllegalStateException NOTE(review): nothing in this method's own body throws this
     * exception.  It is presumably thrown by {@code StrReplace.r} - confirm the condition there.
     */
    public static String replaceAll_TEXT(String str)
    {
        // Maps the complete text of each known escape sequence found in 'str' (for instance
        // "&amp;") to its table-key (for instance "amp").  Sorted and duplicate-free.

        TreeMap<String, String> known    = new TreeMap<>();
        Matcher                 scanner  = TEXT_CODE.matcher(str);

        while (scanner.find())
        {
            String entity = scanner.group();

            // This one has already been recorded
            if (known.containsKey(entity)) continue;

            // Strip the leading '&' and the trailing ';' to obtain the table-key
            String name = entity.substring(1, entity.length() - 1);

            // TEXT_CODE is only approximate.  The table decides what is a real escape sequence.
            if (htmlEscChars.containsKey(name)) known.put(entity, name);
        }

        // The match-strings are the keys of the map, in their sorted order
        String[] entities = known.keySet().toArray(new String[known.size()]);

        // The lambda is handed the text to replace, and answers with the table's character.
        // NOTE(review): the meaning of the boolean 'false' and of the lambda's 'int' argument
        // are defined by StrReplace.r - confirm them there.

        return StrReplace.r(
            str, false, entities,
            (int position, String matched) -> htmlEscChars.get(known.get(matched))
        );
    }

    /**
     * Runs the three HTML Escape Sequence replace-functions of this class, one after the other:
     * first {@link #replaceAll_TEXT(String)}, then {@link #replaceAll_DEC(String)}, and lastly
     * {@link #replaceAll_HEX(String)}.
     * 
     * <BR /><BR /><DIV CLASS=JDHint>
     * Because these are three separate passes, the output of an earlier pass is scanned again by
     * the later ones.  If the lookup table maps {@code "amp"} to {@code '&'}, then the input
     * <CODE>&amp;amp;#65;</CODE> becomes <CODE>&amp;#65;</CODE> after the first pass, and
     * {@code 'A'} after the second.
     * </DIV>
     * 
     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
     * sequences.
     * 
     * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 
     * with their natural character representations.
     * 
     * @see #replaceAll_DEC(String)
     * @see #replaceAll_HEX(String)
     * @see #replaceAll_TEXT(String)
     * 
     * @deprecated NOTE(review): presumably superseded by {@link #replace(String)} - confirm.
     */
    @Deprecated
    public static String replaceAll(String s)
    {
        final String textDone   = replaceAll_TEXT(s);
        final String decDone    = replaceAll_DEC(textDone);

        return replaceAll_HEX(decDone);
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE>
     * 
     * <BR /><BR />This method is a direct delegate to the package-private helper
     * {@code EscapeRepl.replace(String)}.
     * 
     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
     * sequences.
     * 
     * @return the {@code String} that is returned by {@code EscapeRepl.replace}: a
     * {@code String} where all HTML escape-sequence substrings have been replaced with their
     * natural character representations.
     */
    @LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace")
    public static String replace(String s)
    {
        final String unescaped = EscapeRepl.replace(s);

        return unescaped;
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR>
     * 
     * @param c Any Java Character.  Note that the Java <B>Primitive Type</B> {@code 'char'}
     * is a 16-bit type.  This parameter equates to the <B>UNICODE</B> Characters
     * {@code 0x0000} up to {@code 0xFFFF}.
     * 
     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
     * digits, then pass {@code FALSE} to this parameter.
     * 
     * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape
     * Sequence.  For instance if the character <CODE>'&#x6211;'</CODE>, which is the Chinese
     * Character for <I>I, Me, Myself</I> were passed to this method (with {@code FALSE} passed
     * to {@code 'use16BitEscapeSequence'}), then the {@code String "&#25105;"} would be
     * returned.
     * 
     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
     * then this method would, instead, return the {@code String "&#x6211;"}.  The hexadecimal
     * digits are always upper-case.
     */
    public static String escChar(char c, boolean use16BitEscapeSequence)
    {
        // TRUE ==> Base-16, which is the form having the 'x' after the "&#"
        // FALSE ==> Base-10

        return use16BitEscapeSequence
            ? "&#x" + Integer.toHexString((int) c).toUpperCase() + ";"
            : "&#" + ((int) c) + ";";
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT>
     * 
     * @param codePoint This will take any integer.  It will be interpreted as a {@code UNICODE}
     * {@code code point}.  
     * 
     * <BR /><BR /><DIV CLASS=JDHint>
     * Java uses <B>16-bit</B> values for its primitive {@code 'char'} type.  This is also the
     * "first plane" of the <B>UNICODE Space</B> and actually referred to as the <B>Basic Multi
     * Lingual Plane</B>.  Any value passed to this method that is not greater than
     * {@code 65,535} would receive the same escape-{@code String} that it would from a call to
     * the method {@link #escChar(char, boolean)}.
     * </DIV>
     * 
     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
     * digits, then pass {@code FALSE} to this parameter.
     * 
     * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 
     * {@code java.lang.String}.  For instance if the {@code code point} for "the snowman" glyph
     * (character &#x2603;), which happens to be represented by a {@code code point} that is below
     * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) were
     * passed, along with {@code FALSE} for {@code 'use16BitEscapeSequence'} - this method would
     * return the {@code String "&#9731;"}. 
     * 
     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
     * then this method would, instead, return the {@code String "&#x2603;"}.  The hexadecimal
     * digits are always upper-case.
     * 
     * @throws IllegalArgumentException Java has a method for determining whether any integer is a
     * valid {@code code point}.  Not all of the integers "fit" into the 17 Unicode "planes".  
     * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,536}
     * (or {@code 2^16}) code points.  This exception throws whenever
     * {@code Character.isValidCodePoint(codePoint)} returns {@code FALSE}.
     */
    public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence)
    {
        if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException(
            "The integer you have passed to this method [" + codePoint + "] was deemed an " +
            "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)].  " +
            "Therefore this method is unable to provide an HTML Escape Sequence."
        );

        // TRUE ==> Base-16, which is the form having the 'x' after the "&#"
        // FALSE ==> Base-10

        return use16BitEscapeSequence
            ? "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";"
            : "&#" + codePoint + ";";
    }
    
    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML>
     *
     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
     * 
     * @return {@code TRUE} if the internal lookup table holds a {@code String} escape sequence
     * for this character, and {@code FALSE} otherwise.
     * 
     * @see #htmlEsc(char)
     */
    public static boolean hasHTMLEsc(char c)
    {
        // A Hashtable cannot store null values, so "has a key" and "get is non-null" are the
        // very same question.

        return htmlEscSeq.containsKey(Character.valueOf(c));
    }

    /**
     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC>
     *
     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
     * 
     * @return The escape-{@code String} that is stored for this character in the
     * <B>internal</B> <CODE>Lookup Table</CODE>.  If the table has no entry for the character
     * provided, then 'null' is returned.
     * 
     * <BR /><BR /><DIV CLASS=JDHint>
     * The entire escape-{@code String} is not provided, just the inner-characters.  The leading
     * {@code '&'} (Ampersand) and the trailing {@code ';'} (Semi-Colon) are not appended to the
     * returned {@code String}.
     * </DIV>
     * 
     * @see #hasHTMLEsc(char)
     */
    public static String htmlEsc(char c)
    {
        final Character key = Character.valueOf(c);

        return htmlEscSeq.get(key);
    }
}