001package Torello.HTML;
002
003import java.util.*;
004import java.util.regex.*;
005import java.util.stream.*;
006
007import Torello.Java.*;
008
009import Torello.JavaDoc.LinkJavaSource;
010import static Torello.JavaDoc.Entity.METHOD;
011
012/**
013 * Easy utilities for escaping and un-escaping HTML characters such as {@code  }, and even
014 * code-point based Emoji's.
015 * 
016 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE>
017 */
018@Torello.JavaDoc.StaticFunctional
019public final class Escape
020{
021    private Escape() { }
022
023
024    // ********************************************************************************************
025    // ********************************************************************************************
026    // Internal Fields, used by this class only
027    // ********************************************************************************************
028    // ********************************************************************************************
029
030
031    /**
032     * Regular Expression for characters represented in HTML as
033     * <CODE>&amp;#x[Hexadecimal-Code];</CODE>
034     */
035    private static final Pattern HEX_CODE = Pattern.compile("&#x([A-F,a-f,\\d]{1,8});");
036
037    /**
038     * Regular Expression for characters represented in HTML as <CODE>&amp;#[Decimal-Code];</CODE>
039     */
040    private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});");
041
042    /**
043     * Regular Expression (approximate, not exact) for hard-coded escape sequences such as
044     * <CODE>"&amp;amp;"</CODE>
045     * 
046     * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence
047     * up in the hash table.  This means, of course, that not everything which matches this Regular
048     * Expression Pattern is actually an escaped HTML ASCII/UniCode character.
049     * 
050     * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B>
051     * 
052     * <BR /><CODE>&amp;NotACode;</CODE> will match this Regular-Expression, but it is not an
053     * actual HTML Escape-sequence.  For that, one needs to consult the internal
054     * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves.
055     * 
056     * @see #htmlEscChars
057     * @see #htmlEscSeq
058     */
059    private static final Pattern TEXT_CODE = Pattern.compile("&[A-Z,a-z,0-9]{1,8};");
060
061    @SuppressWarnings("rawtypes")
062    private static final Vector data = LFEC.readObjectFromFile_JAR
063        (Escape.class, "data-files/Escape.htdat", true, Vector.class);
064
065
066    // This {@code Hashtable} contains all of the HTML escape characters which are represented by
067    // a short Text-{@code String}.  The file listed above contains that list.
068    // 
069    // This is "Package-Private", because it used by a "Helper-Class" (EscapeRepl) and one of the
070    // Replace-All Methods!
071
072    @SuppressWarnings("unchecked")
073    static final Hashtable<String, Character> htmlEscChars = 
074        (Hashtable<String, Character>) data.elementAt(0);
075
076    /**
077     * This {@code Hashtable} is the reverse of the previous table.  It allows a user to look up
078     * the escape sequence, given a particular ASCII {@code char}.
079     * 
080     * @see HTML_ESC_CHARS
081     * @see #htmlEscChars
082     */
083    @SuppressWarnings("unchecked")
084    private static final Hashtable<Character, String> htmlEscSeq =
085        (Hashtable<Character, String>) data.elementAt(1);
086
087
088    // ********************************************************************************************
089    // ********************************************************************************************
090    // Some debug, and "View Data" methods
091    // ********************************************************************************************
092    // ********************************************************************************************
093
094
095    /**
096     * Print's the HTML Escape Character lookup table to {@code System.out}.
097     * This is useful for debugging.
098     * 
099     * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B>
100     * 
101     * <BR />The JAR Data-File List included within the page attached (below) is a complete list of
102     * all <B><CODE>text-String</B> HTML Escape Sequences </CODE> that are known to this class.  
103     * This list, does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE>
104     * sequences.
105     *
106     * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html">
107     * All HTML Escape Sequences</A></CODE></B>
108     */
109    public static void printHTMLEsc()
110    {
111        Enumeration<String> e = htmlEscChars.keys();
112
113        while (e.hasMoreElements())
114        {
115            String tag = e.nextElement();
116            System.out.println("&" + tag + "; ==> " + htmlEscChars.get(tag));
117        }
118    }
119
120
121    // ********************************************************************************************
122    // ********************************************************************************************
123    // Main Part of the class
124    // ********************************************************************************************
125    // ********************************************************************************************
126
127
128    /**
129     * Converts a single {@code String} from an HTML-escape sequence into the appropriate
130     * character.
131     * 
132     * <BR /><BR />
133     * <CODE>&amp;[escape-sequence];</CODE> ==&gt; actual ASCII or UniCode character.
134     *
135     * @param escHTML An HTML escape sequence.
136     * 
137     * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence.
138     * 
139     * <BR /><BR />This method will return {@code '0'} if the input it does not represent a valid
140     * HTML Escape sequence.
141     */
142    public static char escHTMLToChar(String escHTML)
143    {
144        if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0;
145
146        String  s = escHTML.substring(1, escHTML.length() - 1);
147
148        // Temporary Variable.
149        int     i = 0;
150
151        // Since the EMOJI Escape Sequences use Code Point, they cannot, generally be
152        // converted into a single Character.  Skip them.
153
154        if (HEX_CODE.matcher(s).find())
155        {
156            if ((i = Integer.parseInt(s.substring(2), 16)) < Character.MAX_VALUE)
157                return (char) i;
158            else
159                return 0;
160        }
161
162        // Again, deal with Emoji's here...  Parse the integer, and make sure it is a
163        // character in the standard UNICODE range.
164
165        if (DEC_CODE.matcher(s).find()) 
166        {
167            if ((i = Integer.parseInt(s.substring(1))) < Character.MAX_VALUE)
168                return (char) i;
169            else
170                return 0;
171        }
172
173        // Now check if the provided Escape String is listed in the htmlEscChars Hashtable.
174        Character c = htmlEscChars.get(s);
175
176        // If the character was found in the table that lists all escape sequence characters,
177        // then return it.  Otherwise just return ASCII zero.
178
179        return (c != null) ? c.charValue() : 0;
180    }
181
182    /**
183     * Will generate a {@code String} whereby any &amp; all <B STYLE='color: red;'><I>Hexadecimal
184     * Escape Sequences</I></B> have been removed and subsequently replaced with their actual
185     * ASCII/UniCode un-escaped characters!
186     * 
187     * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B>
188     * 
189     * <BR /><TABLE CLASS=JDBriefTable>
190     * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR>
191     * <TR><TD><CODE>&amp;#xAA;</CODE></TD><TD><CODE>'&#xAA;'</CODE> within a browser</TD></TR>
192     * <TR><TD><CODE>&amp;#x67;</CODE></TD><TD><CODE>'&#x67;'</CODE> within a browser</TD></TR>
193     * <TR><TD><CODE>&amp;#x84;</CODE></TD><TD><CODE>'&#x84;'</CODE> within a browser</TD></TR>
194     * </TABLE>
195     * 
196     * <BR />This method might be thought of as similar to the older C/C++ {@code 'Ord()'}
197     * function, except it is for HTML.
198     * 
199     * @param str any {@code String} that contains an HTML Escape Sequence
200     * &amp;#x[HEXADECIMAL VALUE];
201     * 
202     * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced
203     * with their equivalent ASCII or UniCode Characters.
204     * 
205     * @see #replaceAll_DEC(String str)
206     * @see StrReplace#r(String, String[], char[])
207     */
208    public static String replaceAll_HEX(String str)
209    {
210        // This is the RegEx Matcher from the top.  It matches string's that look like: &#x\d+;
211        Matcher m = HEX_CODE.matcher(str);
212
213        // Save the escape-string regex search matches in a TreeMap.  We need to use a
214        // TreeMap because it is much easier to check if a particular escape sequence has already
215        // been found.  It is easier to find duplicates with TreeMap's.
216
217        TreeMap<String, Character> escMap = new TreeMap<>();
218
219        while (m.find())
220        {
221            // Use Base-16 Integer-Parse
222            int i = Integer.valueOf(m.group(1), 16);
223
224            // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters
225            // not single characters.
226
227            if (i > Character.MAX_VALUE) continue;
228
229            // Retrieve the Text Information about the HTML Escape Sequence
230            String text = m.group();
231
232            // Check if it is a valid HTML 5 Escape Sequence.
233            if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i));
234        }
235        
236        // Build the matchStr's and replaceChar's arrays.  These are just the KEY's and
237        // the VALUE's of the TreeMap<String, Character> which was just built.
238        // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping
239        //       duplicates when the replacement occurs.
240
241        String[]    matchStrs       = escMap.keySet().toArray(new String[escMap.size()]);
242        char[]      replaceChars    = new char[escMap.size()];
243
244        // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars"
245        // array.  The class StrReplace will replace all the escape squences with the actual
246        // characters.
247
248        for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]);
249
250        return StrReplace.r(str, matchStrs, replaceChars);
251    }
252
253    /**
254     * This method functions the same as {@code replaceAll_HEX(String)} - except it replaces only
255     * HTML Escape sequences that are represented using decimal (base-10) values.
256     * {@code 'replaceAll_HEX(...)'} works on hexadecimal (base-16) values.
257     * 
258     * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B>
259     * 
260     * <BR /><TABLE CLASS=JDBriefTable>
261     * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR>
262     * <TR><TD><CODE>&amp;#48;</CODE></TD><TD><CODE>'&#48;'</CODE> in your browser</TD></TR>
263     * <TR><TD><CODE>&amp;#64;</CODE></TD><TD><CODE>'&#64;'</CODE> in your browser</TD></TR>
264     * <TR><TD><CODE>&amp;#123;</CODE></TD><TD><CODE>'&#123;'</CODE> in your browser</TD></TR>
265     * <TR><TD><CODE>&amp;#125;</CODE></TD><TD><CODE>'&#125;'</CODE> in your browser</TD></TR>
266     * </TABLE>
267     * 
268     * <BR /><B CLASS=JDDescLabel>Base-10 &amp; Base-16 Escape-Sequence Difference:</B>
269     * 
270     * <BR /><UL CLASS=JDUL>
271     * <LI> <CODE>&amp;#x[hex base-16 value];</CODE>  There is an {@code 'x'} as the third character
272     *      in  the {@code String}
273     * </LI>
274     * <LI> <CODE>&amp;#[decimal base-10 value];</CODE>  There is no {@code 'x'} in the
275     *      escape-sequence  {@code String!}
276     * </LI>
277     * </UL>
278     * 
279     * <BR />This short example delineates the difference between an HTML escape-sequence that
280     * employs {@code Base-10} numbers, and one using {@code Base-16} (Hexadecimal) numbers.
281     * 
282     * @param str any {@code String} that contains the HTML Escape Sequence 
283     * <CODE>&amp;#[DECIMAL VALUE];</CODE>.
284     * 
285     * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
286     * ASCII UniCode Characters.
287     * 
288     * <BR /><BR />If this parameter does not contain such a sequence, then this method will return
289     * the same input-{@code String} reference as its return value.  
290     * 
291     * @see #replaceAll_HEX(String str)
292     * @see StrReplace#r(String, String[], char[])
293     */
294    public static String replaceAll_DEC(String str)
295    {
296        // This is the RegEx Matcher from the top.  It matches string's that look like: &#\d+;
297        Matcher m = DEC_CODE.matcher(str);
298
299        // Save the escape-string regex search matches in a TreeMap.  We need to use a
300        // TreeMap because it is much easier to check if a particular escape sequence has already
301        // been found.  It is easier to find duplicates with TreeMap's.
302
303        TreeMap<String, Character> escMap = new TreeMap<>();
304
305        while (m.find())
306        {
307            // Use Base-10 Integer-Parse
308            int i = Integer.valueOf(m.group(1));
309
310            // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters
311            // not single characters.
312
313            if (i > Character.MAX_VALUE) continue;
314
315            // Retrieve the Text Information about the HTML Escape Sequence
316            String text = m.group();
317
318            // Check if it is a valid HTML 5 Escape Sequence.
319            if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i));
320        }
321        
322        // Build the matchStr's and replaceChar's arrays.  These are just the KEY's and
323        // the VALUE's of the TreeMap<String, Character> which was just built.
324        // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping
325        //       duplicates when the replacement occurs.
326
327        String[]    matchStrs       = escMap.keySet().toArray(new String[escMap.size()]);
328        char[]      replaceChars    = new char[escMap.size()];
329
330        // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars"
331        // array.  The class StrReplace will replace all the escape sequences with the actual
332        // characters.
333
334        for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]);
335
336        return StrReplace.r(str, matchStrs, replaceChars);
337    }
338
339    /**
340     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT>
341     * 
342     * @param str any {@code String} that contains HTML Escape Sequences that need to be converted
343     * to their ASCII-UniCode character representations.
344     * 
345     * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
346     * ASCII UniCode Characters.
347     * 
348     * @see #replaceAll_HEX(String str)
349     * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc)
350     * 
351     * @throws IllegalStateException
352     */
353    public static String replaceAll_TEXT(String str)
354    {
355        // We only need to find which escape sequences are in this string.
356        // use a TreeSet<String> to list them.  It will
357
358        Matcher                 m        = TEXT_CODE.matcher(str);
359        TreeMap<String, String> escMap   = new TreeMap<>();
360
361        while (m.find())
362        {
363            // Retrieve the Text Information about the HTML Escape Sequence
364            String text     = m.group();
365            String sequence = text.substring(1, text.length() - 1);
366
367            // Check if it is a valid HTML 5 Escape Sequence.
368            if ((! escMap.containsKey(text)) && htmlEscChars.containsKey(sequence))
369                escMap.put(text, sequence);
370        }
371        
372        // Convert the TreeSet to a String[] array... and use StrReplace
373        String[] escArr = new String[escMap.size()];
374
375        return StrReplace.r(
376            str, false, escMap.keySet().toArray(escArr),
377            (int i, String sequence) -> htmlEscChars.get(escMap.get(sequence))
378        );
379    }
380
381    /**
382     * Calls all of the HTML Escape Sequence convert/replace {@code String} functions at once.
383     * 
384     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
385     * sequences.
386     * 
387     * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 
388     * with their natural character representations.
389     * 
390     * @see #replaceAll_DEC(String)
391     * @see #replaceAll_HEX(String)
392     * @see #replaceAll_TEXT(String)
393     */
394    @Deprecated
395    public static String replaceAll(String s)
396    { return replaceAll_HEX(replaceAll_DEC(replaceAll_TEXT(s))); }
397
398    /**
399     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE>
400     * 
401     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
402     * sequences.
403     * 
404     * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 
405     * with their natural character representations.
406     */
407    @LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace")
408    public static String replace(String s)
409    { return EscapeRepl.replace(s); }
410
411    /**
412     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR>
413     * 
414     * @param c Any Java Character.  Note that the Java <B>Primitive Type</B> {@code 'char'}
415     * is a 16-bit type.  This parameter equates to the <B>UNICODE</B> Characters
416     * {@code 0x0000} up to {@code 0xFFFF}.
417     * 
418     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
419     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
420     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
421     * digits, then pass {@code FALSE} to this parameter.
422     * 
423     * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape
424     * Sequence.  For instance if the character <CODE>'&#6211;'</CODE>, which is the Chinese
425     * Character for <I>I, Me, Myself</I> were passed to this method, then the {@code String}
426     * {@code "&#25105;"} would be returned.
427     * 
428     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
429     * then this method would, instead, return the {@code String "&#x6211;"}.
430     */
431    public static String escChar(char c, boolean use16BitEscapeSequence)
432    {
433        return use16BitEscapeSequence
434            ? "&#" + ((int) c) + ";"
435            : "&#x" + Integer.toHexString((int) c).toUpperCase() + ";";
436    }
437
438    /**
439     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT>
440     * 
441     * @param codePoint This will take any integer.  It will be interpreted as a {@code UNICODE}
442     * {@code code point}.  
443     * 
444     * <BR /><BR /><B STYLE="color:red;">NOTE:</B> Java uses <B>16-bit</B> values for it's
445     * primitive {@code 'char'} type.  This is also the "first plane" of the <B>UNICODE Space</B>
446     * and actually referred to as the <B>Basic Multi Lingual Plane</B>.  Any value passed to this
447     * method that is lower than {@code 65,535} would receive the same escape-{@code String} that
448     * it would from a call to the method {@link #escChar(char, boolean)}.
449     * 
450     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
451     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
452     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
453     * digits, then pass {@code FALSE} to this parameter.
454     * 
455     * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 
456     * {@code java.lang.String}.  For instance if the {@code code point} for "the snowman" glyph
457     * (character &#x2603;), which happens to be represented by a {@code code point} that is below
458     * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) - this
459     * method would return the {@code String "&#9731;"}. 
460     * 
461     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
462     * then this method would, instead, return the {@code String "&#x2603;"}.
463     * 
464     * @throws IllegalArgumentException Java has a method for determining whether any integer is a
465     * valid {@code code point}.  Not all of the integers "fit" into the 17 Unicode "planes".  
466     * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,535}
467     * (or {@code 2^16}) characters.
468     */
469    public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence)
470    {
471        if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException(
472            "The integer you have passed to this method [" + codePoint + "] was deemed an " +
473            "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)].  " +
474            "Therefore this method is unable to provide an HTML Escape Sequence."
475        );
476
477        return use16BitEscapeSequence
478            ? "&#" + codePoint + ";"
479            : "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";";
480    }
481    
482    /**
483     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML>
484     *
485     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
486     * 
487     * @return {@code TRUE} if there is a {@code String} escape sequence for this character, and
488     * {@code FALSE} otherwise.
489     * 
490     * @see #htmlEsc(char)
491     */
492    public static boolean hasHTMLEsc(char c)
493    { return htmlEscSeq.get(Character.valueOf(c)) != null; }
494
495    /**
496     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC>
497     *
498     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
499     * 
500     * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code
501     * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>.
502     * If the character provided does not have an associated {@code HTML Escape String}, then
503     * 'null' is returned.
504     * 
505     * <BR /><BR /><B>NOTE:</B> The entire escape-{@code String} is not provided, just the
506     * inner-characters.  The leading {@code '&'} (Ampersand) and the trailing {@code ';'} 
507     * (Semi-Colon) are not appended to the returned {@code String}.
508     * 
509     * @see #hasHTMLEsc(char)
510     */
511    public static String htmlEsc(char c)
512    { return htmlEscSeq.get(Character.valueOf(c)); }
513}