Source code

001package Torello.HTML;
002
003import java.util.*;
004import java.util.regex.*;
005import java.util.stream.*;
006
007import Torello.Java.*;
008
009import Torello.JavaDoc.LinkJavaSource;
010import static Torello.JavaDoc.Entity.METHOD;
011
012/**
013 * Easy utilities for escaping and un-escaping HTML characters such as {@code &nbsp;}, and even
014 * code-point based Emoji's.
015 * 
016 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE>
017 */
018@Torello.JavaDoc.StaticFunctional
019public final class Escape
020{
    // Static-utility class: the private, empty constructor prevents instantiation.
    private Escape() { }
022
023
024    // ********************************************************************************************
025    // ********************************************************************************************
026    // Internal Fields, used by this class only
027    // ********************************************************************************************
028    // ********************************************************************************************
029
030
031    /**
032     * Regular Expression for characters represented in HTML as
033     * <CODE>&amp;#x[Hexadecimal-Code];</CODE>
034     */
035    private static final Pattern HEX_CODE = Pattern.compile("&#x([A-F,a-f,\\d]{1,8});");
036
037    /**
038     * Regular Expression for characters represented in HTML as <CODE>&amp;#[Decimal-Code];</CODE>
039     */
040    private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});");
041
042    /**
043     * Regular Expression (approximate, not exact) for hard-coded escape sequences such as
044     * <CODE>"&amp;amp;"</CODE>
045     * 
046     * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence
047     * up in the hash table.  This means, of course, that not everything which matches this Regular
048     * Expression Pattern is actually an escaped HTML ASCII/UniCode character.
049     * 
050     * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B>
051     * 
052     * <BR /><CODE>&amp;NotACode;</CODE> will match this Regular-Expression, but it is not an
053     * actual HTML Escape-sequence.  For that, one needs to consult the internal
054     * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves.
055     * 
056     * @see #htmlEscChars
057     * @see #htmlEscSeq
058     */
059    private static final Pattern TEXT_CODE = Pattern.compile("&[A-Z,a-z,0-9]{1,8};");
060
    // The raw lookup-data for this class.  It is read once, during class initialization, from
    // the data-file "data-files/Escape.htdat" through LFEC.readObjectFromFile_JAR.  The two
    // tables declared below are elements 0 and 1 of this Vector, so this field has to be
    // declared (and therefore initialized) before them.
    //
    // NOTE(review): the meaning of the boolean 'true' argument is defined by LFEC and is not
    // visible from this file - confirm against LFEC.readObjectFromFile_JAR.

    @SuppressWarnings("rawtypes")
    private static final Vector data = LFEC.readObjectFromFile_JAR
        (Escape.class, "data-files/Escape.htdat", true, Vector.class);


    // This {@code Hashtable} contains all of the HTML escape characters which are represented by
    // a short Text-{@code String}.  The file listed above contains that list.
    //
    // The keys are used by this class WITHOUT the leading '&' and WITHOUT the trailing ';'
    // (see 'escHTMLToChar' and 'printHTMLEsc').  The value is the un-escaped character.
    // 
    // This is "Package-Private", because it is used by a "Helper-Class" (EscapeRepl) and one of
    // the Replace-All Methods!
    //
    // The cast is unchecked: the element-type of the de-serialized Vector cannot be verified by
    // the compiler.

    @SuppressWarnings("unchecked")
    static final Hashtable<String, Character> htmlEscChars = 
        (Hashtable<String, Character>) data.elementAt(0);

    /**
     * This {@code Hashtable} is the reverse of the previous table.  It allows a user to look up
     * the escape sequence, given a particular ASCII {@code char}.
     * 
     * <BR /><BR />It is the second element (index {@code 1}) of the {@code Vector} that is read
     * from the data-file.  The cast is unchecked, since the element-type of the de-serialized
     * {@code Vector} cannot be verified by the compiler.
     * 
     * @see #htmlEscChars
     */
    @SuppressWarnings("unchecked")
    private static final Hashtable<Character, String> htmlEscSeq =
        (Hashtable<Character, String>) data.elementAt(1);
086
087
088    // ********************************************************************************************
089    // ********************************************************************************************
090    // Some debug, and "View Data" methods
091    // ********************************************************************************************
092    // ********************************************************************************************
093
094
095    /**
096     * Print's the HTML Escape Character lookup table to {@code System.out}.
097     * This is useful for debugging.
098     * 
099     * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B>
100     * 
101     * <BR />The JAR Data-File List included within the page attached (below) is a complete list of
102     * all <B><CODE>text-String</B> HTML Escape Sequences </CODE> that are known to this class.  
103     * This list, does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE>
104     * sequences.
105     *
106     * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html">
107     * All HTML Escape Sequences</A></CODE></B>
108     */
109    public static void printHTMLEsc()
110    {
111        Enumeration<String> e = htmlEscChars.keys();
112
113        while (e.hasMoreElements())
114        {
115            String tag = e.nextElement();
116            System.out.println("&" + tag + "; ==> " + htmlEscChars.get(tag));
117        }
118    }
119
120
121    // ********************************************************************************************
122    // ********************************************************************************************
123    // Main Part of the class
124    // ********************************************************************************************
125    // ********************************************************************************************
126
127
128    /**
129     * Converts a single {@code String} from an HTML-escape sequence into the appropriate
130     * character.
131     * 
132     * <BR /><BR />
133     * <CODE>&amp;[escape-sequence];</CODE> ==&gt; actual ASCII or UniCode character.
134     *
135     * @param escHTML An HTML escape sequence.
136     * 
137     * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence.
138     * 
139     * <BR /><BR />This method will return {@code '0'} if the input it does not represent a valid
140     * HTML Escape sequence.
141     */
142    public static char escHTMLToChar(String escHTML)
143    {
144        if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0;
145
146        String  s = escHTML.substring(1, escHTML.length() - 1);
147
148        // Temporary Variable.
149        int     i = 0;
150
151        // Since the EMOJI Escape Sequences use Code Point, they cannot, generally be
152        // converted into a single Character.  Skip them.
153
154        if (HEX_CODE.matcher(s).find())
155        {
156            if ((i = Integer.parseInt(s.substring(2), 16)) < Character.MAX_VALUE)
157                return (char) i;
158            else
159                return 0;
160        }
161
162
163        // Again, deal with Emoji's here...  Parse the integer, and make sure it is a
164        // character in the standard UNICODE range.
165
166        if (DEC_CODE.matcher(s).find()) 
167        {
168            if ((i = Integer.parseInt(s.substring(1))) < Character.MAX_VALUE)
169                return (char) i;
170            else
171                return 0;
172        }
173
174        // Now check if the provided Escape String is listed in the htmlEscChars Hashtable.
175        Character c = htmlEscChars.get(s);
176
177
178        // If the character was found in the table that lists all escape sequence characters,
179        // then return it.  Otherwise just return ASCII zero.
180
181        return (c != null) ? c.charValue() : 0;
182    }
183
184    /**
185     * Will generate a {@code String} whereby any &amp; all <B STYLE='color: red;'><I>Hexadecimal
186     * Escape Sequences</I></B> have been removed and subsequently replaced with their actual
187     * ASCII/UniCode un-escaped characters!
188     * 
189     * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B>
190     * 
191     * <BR /><TABLE CLASS=JDBriefTable>
192     * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR>
193     * <TR><TD><CODE>&amp;#xAA;</CODE></TD><TD><CODE>'&#xAA;'</CODE> within a browser</TD></TR>
194     * <TR><TD><CODE>&amp;#x67;</CODE></TD><TD><CODE>'&#x67;'</CODE> within a browser</TD></TR>
195     * <TR><TD><CODE>&amp;#x84;</CODE></TD><TD><CODE>'&#x84;'</CODE> within a browser</TD></TR>
196     * </TABLE>
197     * 
198     * <BR />This method might be thought of as similar to the older C/C++ {@code 'Ord()'}
199     * function, except it is for HTML.
200     * 
201     * @param str any {@code String} that contains an HTML Escape Sequence
202     * &amp;#x[HEXADECIMAL VALUE];
203     * 
204     * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced
205     * with their equivalent ASCII or UniCode Characters.
206     * 
207     * @see #replaceAll_DEC(String str)
208     * @see StrReplace#r(String, String[], char[])
209     */
210    public static String replaceAll_HEX(String str)
211    {
212        // This is the RegEx Matcher from the top.  It matches string's that look like: &#x\d+;
213        Matcher m = HEX_CODE.matcher(str);
214
215
216        // Save the escape-string regex search matches in a TreeMap.  We need to use a
217        // TreeMap because it is much easier to check if a particular escape sequence has already
218        // been found.  It is easier to find duplicates with TreeMap's.
219
220        TreeMap<String, Character> escMap = new TreeMap<>();
221
222        while (m.find())
223        {
224            // Use Base-16 Integer-Parse
225            int i = Integer.valueOf(m.group(1), 16);
226
227            // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters
228            // not single characters.
229
230            if (i > Character.MAX_VALUE) continue;
231
232            // Retrieve the Text Information about the HTML Escape Sequence
233            String text = m.group();
234
235            // Check if it is a valid HTML 5 Escape Sequence.
236            if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i));
237        }
238
239
240        // Build the matchStr's and replaceChar's arrays.  These are just the KEY's and
241        // the VALUE's of the TreeMap<String, Character> which was just built.
242        // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping
243        //       duplicates when the replacement occurs.
244
245        String[]    matchStrs       = escMap.keySet().toArray(new String[escMap.size()]);
246        char[]      replaceChars    = new char[escMap.size()];
247
248
249        // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars"
250        // array.  The class StrReplace will replace all the escape squences with the actual
251        // characters.
252
253        for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]);
254
255        return StrReplace.r(str, matchStrs, replaceChars);
256    }
257
258    /**
259     * This method functions the same as {@code replaceAll_HEX(String)} - except it replaces only
260     * HTML Escape sequences that are represented using decimal (base-10) values.
261     * {@code 'replaceAll_HEX(...)'} works on hexadecimal (base-16) values.
262     * 
263     * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B>
264     * 
265     * <BR /><TABLE CLASS=JDBriefTable>
266     * <TR><TH>Substring from Input:</TH>   <TH>Web-Browser Converts To:</TH></TR>
267     * <TR><TD><CODE>&amp;#48;</CODE></TD>  <TD><CODE>'&#48;'</CODE> in your browser</TD></TR>
268     * <TR><TD><CODE>&amp;#64;</CODE></TD>  <TD><CODE>'&#64;'</CODE> in your browser</TD></TR>
269     * <TR><TD><CODE>&amp;#123;</CODE></TD> <TD><CODE>'&#123;'</CODE> in your browser</TD></TR>
270     * <TR><TD><CODE>&amp;#125;</CODE></TD> <TD><CODE>'&#125;'</CODE> in your browser</TD></TR>
271     * </TABLE>
272     * 
273     * <BR /><B CLASS=JDDescLabel>Base-10 &amp; Base-16 Escape-Sequence Difference:</B>
274     * 
275     * <BR /><UL CLASS=JDUL>
276     * 
277     * <LI> <CODE>&amp;#x[hex base-16 value];</CODE>  There is an {@code 'x'} as the third character
278     *      in  the {@code String}
279     *      </LI>
280     * 
281     * <LI> <CODE>&amp;#[decimal base-10 value];</CODE>  There is no {@code 'x'} in the
282     *      escape-sequence  {@code String!}
283     *      </LI>
284     * 
285     * </UL>
286     * 
287     * <BR />This short example delineates the difference between an HTML escape-sequence that
288     * employs {@code Base-10} numbers, and one using {@code Base-16} (Hexadecimal) numbers.
289     * 
290     * @param str any {@code String} that contains the HTML Escape Sequence 
291     * <CODE>&amp;#[DECIMAL VALUE];</CODE>.
292     * 
293     * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
294     * ASCII UniCode Characters.
295     * 
296     * <BR /><BR />If this parameter does not contain such a sequence, then this method will return
297     * the same input-{@code String} reference as its return value.  
298     * 
299     * @see #replaceAll_HEX(String str)
300     * @see StrReplace#r(String, String[], char[])
301     */
302    public static String replaceAll_DEC(String str)
303    {
304        // This is the RegEx Matcher from the top.  It matches string's that look like: &#\d+;
305        Matcher m = DEC_CODE.matcher(str);
306
307
308        // Save the escape-string regex search matches in a TreeMap.  We need to use a
309        // TreeMap because it is much easier to check if a particular escape sequence has already
310        // been found.  It is easier to find duplicates with TreeMap's.
311
312        TreeMap<String, Character> escMap = new TreeMap<>();
313
314        while (m.find())
315        {
316            // Use Base-10 Integer-Parse
317            int i = Integer.valueOf(m.group(1));
318
319            // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters
320            // not single characters.
321
322            if (i > Character.MAX_VALUE) continue;
323
324            // Retrieve the Text Information about the HTML Escape Sequence
325            String text = m.group();
326
327            // Check if it is a valid HTML 5 Escape Sequence.
328            if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i));
329        }
330
331
332        // Build the matchStr's and replaceChar's arrays.  These are just the KEY's and
333        // the VALUE's of the TreeMap<String, Character> which was just built.
334        // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping
335        //       duplicates when the replacement occurs.
336
337        String[]    matchStrs       = escMap.keySet().toArray(new String[escMap.size()]);
338        char[]      replaceChars    = new char[escMap.size()];
339
340
341        // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars"
342        // array.  The class StrReplace will replace all the escape sequences with the actual
343        // characters.
344
345        for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]);
346
347        return StrReplace.r(str, matchStrs, replaceChars);
348    }
349
350    /**
351     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT>
352     * 
353     * @param str any {@code String} that contains HTML Escape Sequences that need to be converted
354     * to their ASCII-UniCode character representations.
355     * 
356     * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
357     * ASCII UniCode Characters.
358     * 
359     * @see #replaceAll_HEX(String str)
360     * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc)
361     * 
362     * @throws IllegalStateException
363     */
364    public static String replaceAll_TEXT(String str)
365    {
366        // We only need to find which escape sequences are in this string.
367        // use a TreeSet<String> to list them.  It will
368
369        Matcher                 m        = TEXT_CODE.matcher(str);
370        TreeMap<String, String> escMap   = new TreeMap<>();
371
372        while (m.find())
373        {
374            // Retrieve the Text Information about the HTML Escape Sequence
375            String text     = m.group();
376            String sequence = text.substring(1, text.length() - 1);
377
378            // Check if it is a valid HTML 5 Escape Sequence.
379            if ((! escMap.containsKey(text)) && htmlEscChars.containsKey(sequence))
380                escMap.put(text, sequence);
381        }
382        
383        // Convert the TreeSet to a String[] array... and use StrReplace
384        String[] escArr = new String[escMap.size()];
385
386        return StrReplace.r(
387            str, false, escMap.keySet().toArray(escArr),
388            (int i, String sequence) -> htmlEscChars.get(escMap.get(sequence))
389        );
390    }
391
392    /**
393     * Calls all of the HTML Escape Sequence convert/replace {@code String} functions at once.
394     * 
395     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
396     * sequences.
397     * 
398     * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 
399     * with their natural character representations.
400     * 
401     * @see #replaceAll_DEC(String)
402     * @see #replaceAll_HEX(String)
403     * @see #replaceAll_TEXT(String)
404     */
405    @Deprecated
406    public static String replaceAll(String s)
407    { return replaceAll_HEX(replaceAll_DEC(replaceAll_TEXT(s))); }
408
409    /**
410     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE>
411     * 
412     * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
413     * sequences.
414     * 
415     * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 
416     * with their natural character representations.
417     */
418    @LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace")
419    public static String replace(String s)
420    { return EscapeRepl.replace(s); }
421
422    /**
423     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR>
424     * 
425     * @param c Any Java Character.  Note that the Java <B>Primitive Type</B> {@code 'char'}
426     * is a 16-bit type.  This parameter equates to the <B>UNICODE</B> Characters
427     * {@code 0x0000} up to {@code 0xFFFF}.
428     * 
429     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
430     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
431     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
432     * digits, then pass {@code FALSE} to this parameter.
433     * 
434     * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape
435     * Sequence.  For instance if the character <CODE>'&#6211;'</CODE>, which is the Chinese
436     * Character for <I>I, Me, Myself</I> were passed to this method, then the {@code String}
437     * {@code "&#25105;"} would be returned.
438     * 
439     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
440     * then this method would, instead, return the {@code String "&#x6211;"}.
441     */
442    public static String escChar(char c, boolean use16BitEscapeSequence)
443    {
444        return use16BitEscapeSequence
445            ? "&#" + ((int) c) + ";"
446            : "&#x" + Integer.toHexString((int) c).toUpperCase() + ";";
447    }
448
449    /**
450     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT>
451     * 
452     * @param codePoint This will take any integer.  It will be interpreted as a {@code UNICODE}
453     * {@code code point}.  
454     * 
455     * <BR /><BR /><DIV CLASS=JDHint>
456     * Java uses <B>16-bit</B> values for it's primitive {@code 'char'} type.  This is also the
457     * "first plane" of the <B>UNICODE Space</B> and actually referred to as the <B>Basic Multi
458     * Lingual Plane</B>.  Any value passed to this method that is lower than {@code 65,535} would
459     * receive the same escape-{@code String} that it would from a call to the method
460     * {@link #escChar(char, boolean)}.
461     * </DIV>
462     * 
463     * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
464     * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
465     * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
466     * digits, then pass {@code FALSE} to this parameter.
467     * 
468     * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 
469     * {@code java.lang.String}.  For instance if the {@code code point} for "the snowman" glyph
470     * (character &#x2603;), which happens to be represented by a {@code code point} that is below
471     * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) - this
472     * method would return the {@code String "&#9731;"}. 
473     * 
474     * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
475     * then this method would, instead, return the {@code String "&#x2603;"}.
476     * 
477     * @throws IllegalArgumentException Java has a method for determining whether any integer is a
478     * valid {@code code point}.  Not all of the integers "fit" into the 17 Unicode "planes".  
479     * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,535}
480     * (or {@code 2^16}) characters.
481     */
482    public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence)
483    {
484        if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException(
485            "The integer you have passed to this method [" + codePoint + "] was deemed an " +
486            "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)].  " +
487            "Therefore this method is unable to provide an HTML Escape Sequence."
488        );
489
490        return use16BitEscapeSequence
491            ? "&#" + codePoint + ";"
492            : "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";";
493    }
494    
495    /**
496     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML>
497     *
498     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
499     * 
500     * @return {@code TRUE} if there is a {@code String} escape sequence for this character, and
501     * {@code FALSE} otherwise.
502     * 
503     * @see #htmlEsc(char)
504     */
505    public static boolean hasHTMLEsc(char c)
506    { return htmlEscSeq.get(Character.valueOf(c)) != null; }
507
508    /**
509     * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC>
510     *
511     * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
512     * 
513     * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code
514     * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>.
515     * If the character provided does not have an associated {@code HTML Escape String}, then
516     * 'null' is returned.
517     * 
518     * <BR /><BR /><DIV CLASS=JDHint>
519     * The entire escape-{@code String} is not provided, just the inner-characters.  The leading
520     * {@code '&'} (Ampersand) and the trailing {@code ';'} (Semi-Colon) are not appended to the
521     * returned {@code String}.
522     * </DIV>
523     * 
524     * @see #hasHTMLEsc(char)
525     */
526    public static String htmlEsc(char c)
527    { return htmlEscSeq.get(Character.valueOf(c)); }
528}