package Torello.HTML;
import java.util.*;
import java.util.regex.*;
import java.util.stream.*;
import Torello.Java.*;
import Torello.JavaDoc.LinkJavaSource;
import static Torello.JavaDoc.Entity.METHOD;
/**
* Easy utilities for escaping and un-escaping HTML characters such as {@code &amp;}, and even
* code-point based Emojis.
*
* <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE>
*/
@Torello.JavaDoc.StaticFunctional
public final class Escape
{
// Private, empty constructor: the members visible in this class are all static, so no instance
// is ever needed.
private Escape() { }
// ********************************************************************************************
// ********************************************************************************************
// Internal Fields, used by this class only
// ********************************************************************************************
// ********************************************************************************************
/**
 * Regular Expression for characters represented in HTML as
 * <CODE>&amp;#x[Hexadecimal-Code];</CODE>
 *
 * <BR /><BR />Capture-group 1 holds the hexadecimal digits (between one and eight of them),
 * without the leading <CODE>&amp;#x</CODE> or the trailing semi-colon.
 */
// The character class lists hexadecimal digits only.  A comma inside a character class is a
// literal comma, not a separator, and would let text such as "&#x,;" match and then fail the
// base-16 parse.
private static final Pattern HEX_CODE = Pattern.compile("&#x([A-Fa-f\\d]{1,8});");
/**
 * Regular Expression for characters represented in HTML as
 * <CODE>&amp;#[Decimal-Code];</CODE>
 *
 * <BR /><BR />Capture-group 1 holds the decimal digits (between one and eight of them), without
 * the leading <CODE>&amp;#</CODE> or the trailing semi-colon.
 */
private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});");
/**
 * Regular Expression (approximate, not exact) for named escape sequences such as
 * <CODE>"&amp;amp;"</CODE>: an ampersand, one to eight ASCII letters or digits, and a
 * semi-colon.
 *
 * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence
 * up in the hash table.  This means, of course, that not everything which matches this Regular
 * Expression Pattern is actually an escaped HTML ASCII/UniCode character.
 *
 * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B>
 *
 * <BR /><CODE>&amp;NotACode;</CODE> will match this Regular-Expression, but it is not an
 * actual HTML Escape-sequence.  For that, one needs to consult the internal
 * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves.
 *
 * @see #htmlEscChars
 * @see #htmlEscSeq
 */
// Letters and digits only.  A comma inside a character class is a literal comma, so it must not
// be used to separate the ranges.
private static final Pattern TEXT_CODE = Pattern.compile("&[A-Za-z0-9]{1,8};");
// The serialized lookup data for this class, read through LFEC.readObjectFromFile_JAR using
// Escape.class and the path "data-files/Escape.htdat".  Elements 0 and 1 of this Vector are
// cast to the two Hashtable's declared directly below.
//
// NOTE(review): the meaning of the boolean argument ('true') is defined by LFEC and is not
// visible from this file - confirm against LFEC's documentation.
@SuppressWarnings("rawtypes")
private static final Vector data = LFEC.readObjectFromFile_JAR
(Escape.class, "data-files/Escape.htdat", true, Vector.class);
// This Hashtable maps the text of a named HTML escape sequence to the character it stands for.
// It is element 0 of the Vector that was read from the data-file listed above.
//
// The keys are looked up WITHOUT the leading '&' and the trailing ';' - see escHTMLToChar and
// replaceAll_TEXT, which both strip those two characters before consulting this table.
//
// This is "Package-Private", because it is used by a "Helper-Class" (EscapeRepl) and one of the
// Replace-All Methods!
@SuppressWarnings("unchecked")
static final Hashtable<String, Character> htmlEscChars =
(Hashtable<String, Character>) data.elementAt(0);
/**
 * This {@code Hashtable} is the reverse of the previous table.  It allows a user to look up
 * the escape sequence, given a particular ASCII {@code char}.
 *
 * <BR /><BR />It is element 1 of the {@code Vector} read from the data-file, and it is the
 * table consulted by {@link #hasHTMLEsc(char)} and {@link #htmlEsc(char)}.
 *
 * @see #htmlEscChars
 */
@SuppressWarnings("unchecked")
private static final Hashtable<Character, String> htmlEscSeq =
(Hashtable<Character, String>) data.elementAt(1);
// ********************************************************************************************
// ********************************************************************************************
// Some debug, and "View Data" methods
// ********************************************************************************************
// ********************************************************************************************
/**
 * Prints the HTML Escape Character lookup table to {@code System.out}, one entry per line.
 * This is useful for debugging.
 *
 * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B>
 *
 * <BR />The JAR Data-File List included within the page attached (below) is a complete list of
 * all <B><CODE>text-String</CODE></B> <CODE>HTML Escape Sequences</CODE> that are known to this
 * class.  This list does not include any <CODE>Code Point, Hex</CODE> or
 * <CODE>Decimal Number</CODE> sequences.
 *
 * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html">
 * All HTML Escape Sequences</A></CODE></B>
 */
public static void printHTMLEsc()
{
    // Walk the table's keys in the order the Hashtable enumerates them.
    for (Enumeration<String> names = htmlEscChars.keys(); names.hasMoreElements(); )
    {
        final String name = names.nextElement();
        final Character value = htmlEscChars.get(name);

        // The table stores bare names, so the '&' and ';' are added back for display.
        System.out.println("&" + name + "; ==> " + value);
    }
}
// ********************************************************************************************
// ********************************************************************************************
// Main Part of the class
// ********************************************************************************************
// ********************************************************************************************
/**
 * Converts a single {@code String} holding one HTML-escape sequence into the appropriate
 * character.
 *
 * <BR /><BR />
 * <CODE>&amp;[escape-sequence];</CODE> ==> actual ASCII or UniCode character.
 *
 * <BR /><BR />Three forms are recognized:
 *
 * <BR /><UL CLASS=JDUL>
 * <LI>Hexadecimal, such as <CODE>&amp;#x41;</CODE></LI>
 * <LI>Decimal, such as <CODE>&amp;#65;</CODE></LI>
 * <LI>Named sequences that are listed in the internal {@code 'htmlEscChars'} table</LI>
 * </UL>
 *
 * @param escHTML An HTML escape sequence, including its leading {@code '&'} and its trailing
 * {@code ';'}.
 *
 * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence.
 *
 * <BR /><BR />This method will return the NUL character, {@code (char) 0}, if the input is
 * {@code null}, if it does not represent a valid HTML Escape sequence, or if it is a numeric
 * sequence whose value is above {@code Character.MAX_VALUE} (for instance an Emoji code-point,
 * which cannot be held by a single Java {@code char}).
 */
public static char escHTMLToChar(String escHTML)
{
    if (escHTML == null) return (char) 0;

    if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0;

    // The numeric patterns contain the '&' and ';' delimiters themselves, so they have to be
    // applied to the complete input rather than to the stripped inner text.  Using matches()
    // guarantees that the whole input is one single escape sequence.
    Matcher hex = HEX_CODE.matcher(escHTML);
    if (hex.matches()) return numericEscToChar(hex.group(1), 16);

    Matcher dec = DEC_CODE.matcher(escHTML);
    if (dec.matches()) return numericEscToChar(dec.group(1), 10);

    // Named sequences are stored in the table without the '&' and the ';'
    Character c = htmlEscChars.get(escHTML.substring(1, escHTML.length() - 1));

    // If the character was found in the table that lists all escape sequence characters,
    // then return it.  Otherwise just return ASCII zero.
    return (c != null) ? c.charValue() : (char) 0;
}

// Parses the digits of a numeric escape sequence; returns NUL when the value cannot be parsed
// or does not fit into a single 16-bit Java 'char'.
private static char numericEscToChar(String digits, int radix)
{
    try
    {
        // A 'long' is used because eight hexadecimal digits may exceed Integer.MAX_VALUE
        long value = Long.parseLong(digits, radix);

        // Same upper bound that replaceAll_HEX and replaceAll_DEC apply
        return (value <= Character.MAX_VALUE) ? (char) value : (char) 0;
    }

    catch (NumberFormatException ignored) { return (char) 0; }
}
/**
 * Will generate a {@code String} whereby any &amp; all <B STYLE='color: red;'><I>Hexadecimal
 * Escape Sequences</I></B> have been removed and subsequently replaced with their actual
 * ASCII/UniCode un-escaped characters!
 *
 * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B>
 *
 * <BR /><TABLE CLASS=JDBriefTable>
 * <TR><TH>Substring from Input:</TH><TH>Replaced With:</TH></TR>
 * <TR><TD><CODE>&amp;#xAA;</CODE></TD><TD>the character whose code is {@code 0xAA}</TD></TR>
 * <TR><TD><CODE>&amp;#x67;</CODE></TD><TD><CODE>'g'</CODE></TD></TR>
 * </TABLE>
 *
 * <BR />Sequences whose value is greater than {@code Character.MAX_VALUE} (Emoji's, for
 * instance) are left in the {@code String} exactly as they were found, because they cannot be
 * represented by one single Java {@code char}.  A sequence whose digits cannot be parsed is
 * also left untouched.
 *
 * @param str any {@code String} that contains an HTML Escape Sequence
 * <CODE>&amp;#x[HEXADECIMAL VALUE];</CODE>
 *
 * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced
 * with their equivalent ASCII or UniCode Characters.
 *
 * <BR /><BR />If no replaceable sequence is found, the input-{@code String} reference itself
 * is returned.
 *
 * @see #replaceAll_DEC(String str)
 * @see StrReplace#r(String, String[], char[])
 */
public static String replaceAll_HEX(String str)
{
    // This is the RegEx Matcher from the top.  It matches text that looks like: &#x[hex];
    Matcher m = HEX_CODE.matcher(str);

    // Escape-text ==> replacement character.  A map is used (rather than two parallel lists)
    // so that an escape sequence which occurs several times is only recorded once.
    TreeMap<String, Character> escMap = new TreeMap<>();

    while (m.find())
    {
        long value;

        // Eight hexadecimal digits can exceed Integer.MAX_VALUE, so parse as a 'long'.
        // Anything that cannot be parsed is not a real escape sequence; leave it alone.
        try
            { value = Long.parseLong(m.group(1), 16); }

        catch (NumberFormatException ignored)
            { continue; }

        // Do not un-escape EMOJI's...  They are sequences of characters, not single
        // characters.
        if (value > Character.MAX_VALUE) continue;

        escMap.putIfAbsent(m.group(), Character.valueOf((char) value));
    }

    // Nothing to replace: hand back the very same reference.
    if (escMap.isEmpty()) return str;

    // Build the two parallel arrays expected by StrReplace from the KEY's and the VALUE's of
    // the map.
    String[] matchStrs = new String[escMap.size()];
    char[] replaceChars = new char[escMap.size()];
    int pos = 0;

    for (Map.Entry<String, Character> entry : escMap.entrySet())
    {
        matchStrs[pos] = entry.getKey();
        replaceChars[pos] = entry.getValue();
        pos++;
    }

    return StrReplace.r(str, matchStrs, replaceChars);
}
/**
 * Works like {@code replaceAll_HEX(String)}, but for HTML Escape sequences that are written
 * with decimal (base-10) values.  {@code 'replaceAll_HEX(...)'} handles the hexadecimal
 * (base-16) ones.
 *
 * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B>
 *
 * <BR /><TABLE CLASS=JDBriefTable>
 * <TR><TH>Substring from Input:</TH> <TH>Replaced With:</TH></TR>
 * <TR><TD><CODE>&amp;#48;</CODE></TD> <TD><CODE>'0'</CODE></TD></TR>
 * <TR><TD><CODE>&amp;#64;</CODE></TD> <TD><CODE>'@'</CODE></TD></TR>
 * <TR><TD><CODE>&amp;#123;</CODE></TD> <TD><CODE>'{'</CODE></TD></TR>
 * <TR><TD><CODE>&amp;#125;</CODE></TD> <TD><CODE>'}'</CODE></TD></TR>
 * </TABLE>
 *
 * <BR /><B CLASS=JDDescLabel>Base-10 &amp; Base-16 Escape-Sequence Difference:</B>
 *
 * <BR /><UL CLASS=JDUL>
 *
 * <LI> <CODE>&amp;#x[hex base-16 value];</CODE> There is an {@code 'x'} as the third
 *      character in the {@code String}
 *      </LI>
 *
 * <LI> <CODE>&amp;#[decimal base-10 value];</CODE> There is no {@code 'x'} in the
 *      escape-sequence {@code String!}
 *      </LI>
 *
 * </UL>
 *
 * <BR />Sequences whose value is greater than {@code Character.MAX_VALUE} are not replaced.
 *
 * @param str any {@code String} that contains the HTML Escape Sequence
 * <CODE>&amp;#[DECIMAL VALUE];</CODE>.
 *
 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with
 * ASCII UniCode Characters.
 *
 * <BR /><BR />If this parameter does not contain such a sequence, then this method will return
 * the same input-{@code String} reference as its return value.
 *
 * @see #replaceAll_HEX(String str)
 * @see StrReplace#r(String, String[], char[])
 */
public static String replaceAll_DEC(String str)
{
    // Escape-text ==> replacement character.  The map keeps one entry per distinct escape
    // sequence, no matter how many times that sequence occurs in the input.
    final TreeMap<String, Character> replacements = new TreeMap<>();

    for (Matcher dec = DEC_CODE.matcher(str); dec.find(); )
    {
        // Base-10 parse of the captured digits
        final int value = Integer.parseInt(dec.group(1));

        // Values above the 16-bit range (EMOJI's) are sequences of characters, not single
        // characters - they are skipped.
        if (value <= Character.MAX_VALUE)
            replacements.putIfAbsent(dec.group(), Character.valueOf((char) value));
    }

    // Unpack the map into the two parallel arrays that StrReplace works with.
    final String[] matchStrs = new String[replacements.size()];
    final char[] replaceChars = new char[replacements.size()];
    int pos = 0;

    for (Map.Entry<String, Character> entry : replacements.entrySet())
    {
        matchStrs[pos] = entry.getKey();
        replaceChars[pos] = entry.getValue();
        pos++;
    }

    return StrReplace.r(str, matchStrs, replaceChars);
}
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT>
 *
 * @param str any {@code String} that contains HTML Escape Sequences that need to be converted
 * to their ASCII-UniCode character representations.
 *
 * @return a {@code String}, with all of the named (text) escape sequences that are listed in
 * the internal lookup table removed and replaced with ASCII UniCode Characters.  Text that
 * merely looks like an escape sequence, but is not in the table, is left unchanged.
 *
 * @see #replaceAll_HEX(String str)
 * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc)
 *
 * @throws IllegalStateException NOTE(review): no statement in this method throws this
 * directly; presumably it originates in {@code StrReplace} - confirm.
 */
public static String replaceAll_TEXT(String str)
{
    // Complete escape-text (with '&' and ';') ==> the character it stands for.  Only the
    // candidates that are really present in the lookup table are recorded.
    final TreeMap<String, Character> found = new TreeMap<>();

    for (Matcher candidates = TEXT_CODE.matcher(str); candidates.find(); )
    {
        final String matched = candidates.group();

        // The table is keyed by the bare name, without the '&' and the ';'
        final Character ch = htmlEscChars.get(matched.substring(1, matched.length() - 1));

        if (ch != null) found.putIfAbsent(matched, ch);
    }

    final String[] escArr = found.keySet().toArray(new String[found.size()]);

    // StrReplace hands each matched escape-text to the lambda, which supplies its character.
    return StrReplace.r(str, false, escArr, (int pos, String matched) -> found.get(matched));
}
/**
 * Runs the three HTML Escape Sequence replace-functions one after the other: first the named
 * (text) sequences, then the decimal sequences and finally the hexadecimal sequences.
 *
 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
 * sequences.
 *
 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced
 * with their natural character representations.
 *
 * @deprecated The three passes are chained, so characters produced by an earlier pass are
 * scanned again by the later ones.  Presumably {@link #replace(String)} is the intended
 * substitute - confirm before migrating callers.
 *
 * @see #replaceAll_DEC(String)
 * @see #replaceAll_HEX(String)
 * @see #replaceAll_TEXT(String)
 */
@Deprecated
public static String replaceAll(String s)
{
    final String afterText = replaceAll_TEXT(s);
    final String afterDec = replaceAll_DEC(afterText);

    return replaceAll_HEX(afterDec);
}
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE>
 *
 * <BR /><BR />The work is delegated, unchanged, to the helper-class method
 * {@code EscapeRepl.replace(String)}.
 *
 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape
 * sequences.
 *
 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced
 * with their natural character representations.
 */
@LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace")
public static String replace(String s)
{ return EscapeRepl.replace(s); }
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR>
 *
 * @param c Any Java Character.  Note that the Java <B>Primitive Type</B> {@code 'char'}
 * is a 16-bit type.  This parameter equates to the <B>UNICODE</B> Characters
 * {@code 0x0000} up to {@code 0xFFFF}.
 *
 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
 * digits, then pass {@code FALSE} to this parameter.
 *
 * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape
 * Sequence.  For instance if the character <CODE>'我'</CODE>, which is the Chinese
 * Character for <I>I, Me, Myself</I> were passed to this method with {@code FALSE}, then the
 * {@code String} {@code "&#25105;"} would be returned.
 *
 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
 * then this method would, instead, return the {@code String "&#x6211;"}.  Hexadecimal digits
 * are written in upper-case.
 */
public static String escChar(char c, boolean use16BitEscapeSequence)
{
    // TRUE selects the hexadecimal (base-16) form, FALSE the decimal (base-10) form - as the
    // parameter documentation promises.
    return use16BitEscapeSequence
        ? "&#x" + Integer.toHexString((int) c).toUpperCase() + ";"
        : "&#" + ((int) c) + ";";
}
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT>
 *
 * @param codePoint This will take any integer.  It will be interpreted as a {@code UNICODE}
 * {@code code point}.
 *
 * <BR /><BR /><DIV CLASS=JDHint>
 * Java uses <B>16-bit</B> values for its primitive {@code 'char'} type.  This is also the
 * "first plane" of the <B>UNICODE Space</B> and actually referred to as the <B>Basic Multi
 * Lingual Plane</B>.  Any value passed to this method that is not greater than {@code 65,535}
 * fits into one single Java {@code 'char'}.
 * </DIV>
 *
 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String}
 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter.  If the
 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B>
 * digits, then pass {@code FALSE} to this parameter.
 *
 * @return The {@code code point} will be converted to an HTML Escape Sequence, as a
 * {@code java.lang.String}.  For instance if the {@code code point} for "the snowman" glyph
 * (character ☃), which happens to be represented by a {@code code point} that is below
 * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) were passed
 * with {@code FALSE} - this method would return the {@code String "&#9731;"}.
 *
 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE},
 * then this method would, instead, return the {@code String "&#x2603;"}.  Hexadecimal digits
 * are written in upper-case.
 *
 * @throws IllegalArgumentException Java has a method for determining whether any integer is a
 * valid {@code code point}.  Not all of the integers "fit" into the 17 Unicode "planes".
 * Note that each of the planes in {@code 'Unicode Space'} contains {@code 65,536}
 * (or {@code 2^16}) code points.  This exception throws whenever
 * {@code Character.isValidCodePoint(codePoint)} returns {@code FALSE}.
 *
 * @see #escChar(char, boolean)
 */
public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence)
{
    if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException(
        "The integer you have passed to this method [" + codePoint + "] was deemed an " +
        "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)].  " +
        "Therefore this method is unable to provide an HTML Escape Sequence."
    );

    // TRUE selects the hexadecimal (base-16) form, FALSE the decimal (base-10) form - as the
    // parameter documentation promises.
    return use16BitEscapeSequence
        ? "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";"
        : "&#" + codePoint + ";";
}
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML>
 *
 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
 *
 * @return {@code TRUE} if the internal lookup table holds a {@code String} escape sequence for
 * this character, and {@code FALSE} otherwise.
 *
 * @see #htmlEsc(char)
 */
public static boolean hasHTMLEsc(char c)
{
    // A Hashtable never stores a null value, so "key is present" and "get(...) is non-null"
    // are one and the same question.
    return htmlEscSeq.containsKey(c);
}
/**
 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC>
 *
 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character
 *
 * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code
 * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>.
 * If the character provided does not have an associated {@code HTML Escape String}, then
 * 'null' is returned.
 *
 * <BR /><BR /><DIV CLASS=JDHint>
 * Only the inner-characters of the escape-{@code String} are provided.  The leading
 * {@code '&'} (Ampersand) and the trailing {@code ';'} (Semi-Colon) are not appended to the
 * returned {@code String}.
 * </DIV>
 *
 * @see #hasHTMLEsc(char)
 */
public static String htmlEsc(char c)
{
    final Character key = c;

    return htmlEscSeq.get(key);
}
}