001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006 007import Torello.Java.*; 008 009import Torello.JavaDoc.LinkJavaSource; 010import static Torello.JavaDoc.Entity.METHOD; 011 012/** 013 * Easy utilities for escaping and un-escaping HTML characters such as {@code }, and even 014 * code-point based Emoji's. 015 * 016 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE> 017 */ 018@Torello.JavaDoc.StaticFunctional 019public final class Escape 020{ 021 private Escape() { } 022 023 024 // ******************************************************************************************** 025 // ******************************************************************************************** 026 // Internal Fields, used by this class only 027 // ******************************************************************************************** 028 // ******************************************************************************************** 029 030 031 /** 032 * Regular Expression for characters represented in HTML as 033 * <CODE>&#x[Hexadecimal-Code];</CODE> 034 */ 035 private static final Pattern HEX_CODE = Pattern.compile("&#x([A-F,a-f,\\d]{1,8});"); 036 037 /** 038 * Regular Expression for characters represented in HTML as <CODE>&#[Decimal-Code];</CODE> 039 */ 040 private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});"); 041 042 /** 043 * Regular Expression (approximate, not exact) for hard-coded escape sequences such as 044 * <CODE>"&amp;"</CODE> 045 * 046 * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence 047 * up in the hash table. This means, of course, that not everything which matches this Regular 048 * Expression Pattern is actually an escaped HTML ASCII/UniCode character. 049 * 050 * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B> 051 * 052 * <BR /><CODE>&NotACode;</CODE> will match this Regular-Expression, but it is not an 053 * actual HTML Escape-sequence. For that, one needs to consult the internal 054 * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves. 055 * 056 * @see #htmlEscChars 057 * @see #htmlEscSeq 058 */ 059 private static final Pattern TEXT_CODE = Pattern.compile("&[A-Z,a-z,0-9]{1,8};"); 060 061 @SuppressWarnings("rawtypes") 062 private static final Vector data = LFEC.readObjectFromFile_JAR 063 (Escape.class, "data-files/Escape.htdat", true, Vector.class); 064 065 066 // This {@code Hashtable} contains all of the HTML escape characters which are represented by 067 // a short Text-{@code String}. The file listed above contains that list. 068 // 069 // This is "Package-Private", because it used by a "Helper-Class" (EscapeRepl) and one of the 070 // Replace-All Methods! 071 072 @SuppressWarnings("unchecked") 073 static final Hashtable<String, Character> htmlEscChars = 074 (Hashtable<String, Character>) data.elementAt(0); 075 076 /** 077 * This {@code Hashtable} is the reverse of the previous table. It allows a user to look up 078 * the escape sequence, given a particular ASCII {@code char}. 079 * 080 * @see HTML_ESC_CHARS 081 * @see #htmlEscChars 082 */ 083 @SuppressWarnings("unchecked") 084 private static final Hashtable<Character, String> htmlEscSeq = 085 (Hashtable<Character, String>) data.elementAt(1); 086 087 088 // ******************************************************************************************** 089 // ******************************************************************************************** 090 // Some debug, and "View Data" methods 091 // ******************************************************************************************** 092 // ******************************************************************************************** 093 094 095 /** 096 * Print's the HTML Escape Character lookup table to {@code System.out}. 097 * This is useful for debugging. 098 * 099 * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B> 100 * 101 * <BR />The JAR Data-File List included within the page attached (below) is a complete list of 102 * all <B><CODE>text-String</B> HTML Escape Sequences </CODE> that are known to this class. 103 * This list, does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE> 104 * sequences. 105 * 106 * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html"> 107 * All HTML Escape Sequences</A></CODE></B> 108 */ 109 public static void printHTMLEsc() 110 { 111 Enumeration<String> e = htmlEscChars.keys(); 112 113 while (e.hasMoreElements()) 114 { 115 String tag = e.nextElement(); 116 System.out.println("&" + tag + "; ==> " + htmlEscChars.get(tag)); 117 } 118 } 119 120 121 // ******************************************************************************************** 122 // ******************************************************************************************** 123 // Main Part of the class 124 // ******************************************************************************************** 125 // ******************************************************************************************** 126 127 128 /** 129 * Converts a single {@code String} from an HTML-escape sequence into the appropriate 130 * character. 131 * 132 * <BR /><BR /> 133 * <CODE>&[escape-sequence];</CODE> ==> actual ASCII or UniCode character. 134 * 135 * @param escHTML An HTML escape sequence. 136 * 137 * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence. 138 * 139 * <BR /><BR />This method will return {@code '0'} if the input it does not represent a valid 140 * HTML Escape sequence. 141 */ 142 public static char escHTMLToChar(String escHTML) 143 { 144 if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0; 145 146 String s = escHTML.substring(1, escHTML.length() - 1); 147 148 // Temporary Variable. 149 int i = 0; 150 151 // Since the EMOJI Escape Sequences use Code Point, they cannot, generally be 152 // converted into a single Character. Skip them. 153 154 if (HEX_CODE.matcher(s).find()) 155 { 156 if ((i = Integer.parseInt(s.substring(2), 16)) < Character.MAX_VALUE) 157 return (char) i; 158 else 159 return 0; 160 } 161 162 // Again, deal with Emoji's here... Parse the integer, and make sure it is a 163 // character in the standard UNICODE range. 164 165 if (DEC_CODE.matcher(s).find()) 166 { 167 if ((i = Integer.parseInt(s.substring(1))) < Character.MAX_VALUE) 168 return (char) i; 169 else 170 return 0; 171 } 172 173 // Now check if the provided Escape String is listed in the htmlEscChars Hashtable. 174 Character c = htmlEscChars.get(s); 175 176 // If the character was found in the table that lists all escape sequence characters, 177 // then return it. Otherwise just return ASCII zero. 178 179 return (c != null) ? c.charValue() : 0; 180 } 181 182 /** 183 * Will generate a {@code String} whereby any & all <B STYLE='color: red;'><I>Hexadecimal 184 * Escape Sequences</I></B> have been removed and subsequently replaced with their actual 185 * ASCII/UniCode un-escaped characters! 186 * 187 * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B> 188 * 189 * <BR /><TABLE CLASS=JDBriefTable> 190 * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR> 191 * <TR><TD><CODE>&#xAA;</CODE></TD><TD><CODE>'ª'</CODE> within a browser</TD></TR> 192 * <TR><TD><CODE>&#x67;</CODE></TD><TD><CODE>'g'</CODE> within a browser</TD></TR> 193 * <TR><TD><CODE>&#x84;</CODE></TD><TD><CODE>'„'</CODE> within a browser</TD></TR> 194 * </TABLE> 195 * 196 * <BR />This method might be thought of as similar to the older C/C++ {@code 'Ord()'} 197 * function, except it is for HTML. 198 * 199 * @param str any {@code String} that contains an HTML Escape Sequence 200 * &#x[HEXADECIMAL VALUE]; 201 * 202 * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced 203 * with their equivalent ASCII or UniCode Characters. 204 * 205 * @see #replaceAll_DEC(String str) 206 * @see StrReplace#r(String, String[], char[]) 207 */ 208 public static String replaceAll_HEX(String str) 209 { 210 // This is the RegEx Matcher from the top. It matches string's that look like: &#x\d+; 211 Matcher m = HEX_CODE.matcher(str); 212 213 // Save the escape-string regex search matches in a TreeMap. We need to use a 214 // TreeMap because it is much easier to check if a particular escape sequence has already 215 // been found. It is easier to find duplicates with TreeMap's. 216 217 TreeMap<String, Character> escMap = new TreeMap<>(); 218 219 while (m.find()) 220 { 221 // Use Base-16 Integer-Parse 222 int i = Integer.valueOf(m.group(1), 16); 223 224 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 225 // not single characters. 226 227 if (i > Character.MAX_VALUE) continue; 228 229 // Retrieve the Text Information about the HTML Escape Sequence 230 String text = m.group(); 231 232 // Check if it is a valid HTML 5 Escape Sequence. 233 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 234 } 235 236 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 237 // the VALUE's of the TreeMap<String, Character> which was just built. 238 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 239 // duplicates when the replacement occurs. 240 241 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 242 char[] replaceChars = new char[escMap.size()]; 243 244 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 245 // array. The class StrReplace will replace all the escape squences with the actual 246 // characters. 247 248 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 249 250 return StrReplace.r(str, matchStrs, replaceChars); 251 } 252 253 /** 254 * This method functions the same as {@code replaceAll_HEX(String)} - except it replaces only 255 * HTML Escape sequences that are represented using decimal (base-10) values. 256 * {@code 'replaceAll_HEX(...)'} works on hexadecimal (base-16) values. 257 * 258 * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B> 259 * 260 * <BR /><TABLE CLASS=JDBriefTable> 261 * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR> 262 * <TR><TD><CODE>&#48;</CODE></TD><TD><CODE>'0'</CODE> in your browser</TD></TR> 263 * <TR><TD><CODE>&#64;</CODE></TD><TD><CODE>'@'</CODE> in your browser</TD></TR> 264 * <TR><TD><CODE>&#123;</CODE></TD><TD><CODE>'{'</CODE> in your browser</TD></TR> 265 * <TR><TD><CODE>&#125;</CODE></TD><TD><CODE>'}'</CODE> in your browser</TD></TR> 266 * </TABLE> 267 * 268 * <BR /><B CLASS=JDDescLabel>Base-10 & Base-16 Escape-Sequence Difference:</B> 269 * 270 * <BR /><UL CLASS=JDUL> 271 * <LI> <CODE>&#x[hex base-16 value];</CODE> There is an {@code 'x'} as the third character 272 * in the {@code String} 273 * </LI> 274 * <LI> <CODE>&#[decimal base-10 value];</CODE> There is no {@code 'x'} in the 275 * escape-sequence {@code String!} 276 * </LI> 277 * </UL> 278 * 279 * <BR />This short example delineates the difference between an HTML escape-sequence that 280 * employs {@code Base-10} numbers, and one using {@code Base-16} (Hexadecimal) numbers. 281 * 282 * @param str any {@code String} that contains the HTML Escape Sequence 283 * <CODE>&#[DECIMAL VALUE];</CODE>. 284 * 285 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 286 * ASCII UniCode Characters. 287 * 288 * <BR /><BR />If this parameter does not contain such a sequence, then this method will return 289 * the same input-{@code String} reference as its return value. 290 * 291 * @see #replaceAll_HEX(String str) 292 * @see StrReplace#r(String, String[], char[]) 293 */ 294 public static String replaceAll_DEC(String str) 295 { 296 // This is the RegEx Matcher from the top. It matches string's that look like: &#\d+; 297 Matcher m = DEC_CODE.matcher(str); 298 299 // Save the escape-string regex search matches in a TreeMap. We need to use a 300 // TreeMap because it is much easier to check if a particular escape sequence has already 301 // been found. It is easier to find duplicates with TreeMap's. 302 303 TreeMap<String, Character> escMap = new TreeMap<>(); 304 305 while (m.find()) 306 { 307 // Use Base-10 Integer-Parse 308 int i = Integer.valueOf(m.group(1)); 309 310 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 311 // not single characters. 312 313 if (i > Character.MAX_VALUE) continue; 314 315 // Retrieve the Text Information about the HTML Escape Sequence 316 String text = m.group(); 317 318 // Check if it is a valid HTML 5 Escape Sequence. 319 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 320 } 321 322 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 323 // the VALUE's of the TreeMap<String, Character> which was just built. 324 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 325 // duplicates when the replacement occurs. 326 327 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 328 char[] replaceChars = new char[escMap.size()]; 329 330 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 331 // array. The class StrReplace will replace all the escape sequences with the actual 332 // characters. 333 334 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 335 336 return StrReplace.r(str, matchStrs, replaceChars); 337 } 338 339 /** 340 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT> 341 * 342 * @param str any {@code String} that contains HTML Escape Sequences that need to be converted 343 * to their ASCII-UniCode character representations. 344 * 345 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 346 * ASCII UniCode Characters. 347 * 348 * @see #replaceAll_HEX(String str) 349 * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc) 350 * 351 * @throws IllegalStateException 352 */ 353 public static String replaceAll_TEXT(String str) 354 { 355 // We only need to find which escape sequences are in this string. 356 // use a TreeSet<String> to list them. It will 357 358 Matcher m = TEXT_CODE.matcher(str); 359 TreeMap<String, String> escMap = new TreeMap<>(); 360 361 while (m.find()) 362 { 363 // Retrieve the Text Information about the HTML Escape Sequence 364 String text = m.group(); 365 String sequence = text.substring(1, text.length() - 1); 366 367 // Check if it is a valid HTML 5 Escape Sequence. 368 if ((! escMap.containsKey(text)) && htmlEscChars.containsKey(sequence)) 369 escMap.put(text, sequence); 370 } 371 372 // Convert the TreeSet to a String[] array... and use StrReplace 373 String[] escArr = new String[escMap.size()]; 374 375 return StrReplace.r( 376 str, false, escMap.keySet().toArray(escArr), 377 (int i, String sequence) -> htmlEscChars.get(escMap.get(sequence)) 378 ); 379 } 380 381 /** 382 * Calls all of the HTML Escape Sequence convert/replace {@code String} functions at once. 383 * 384 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 385 * sequences. 386 * 387 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 388 * with their natural character representations. 389 * 390 * @see #replaceAll_DEC(String) 391 * @see #replaceAll_HEX(String) 392 * @see #replaceAll_TEXT(String) 393 */ 394 @Deprecated 395 public static String replaceAll(String s) 396 { return replaceAll_HEX(replaceAll_DEC(replaceAll_TEXT(s))); } 397 398 /** 399 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE> 400 * 401 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 402 * sequences. 403 * 404 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 405 * with their natural character representations. 406 */ 407 @LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace") 408 public static String replace(String s) 409 { return EscapeRepl.replace(s); } 410 411 /** 412 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR> 413 * 414 * @param c Any Java Character. Note that the Java <B>Primitive Type</B> {@code 'char'} 415 * is a 16-bit type. This parameter equates to the <B>UNICODE</B> Characters 416 * {@code 0x0000} up to {@code 0xFFFF}. 417 * 418 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 419 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 420 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 421 * digits, then pass {@code FALSE} to this parameter. 422 * 423 * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape 424 * Sequence. For instance if the character <CODE>'ᡃ'</CODE>, which is the Chinese 425 * Character for <I>I, Me, Myself</I> were passed to this method, then the {@code String} 426 * {@code "我"} would be returned. 427 * 428 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 429 * then this method would, instead, return the {@code String "我"}. 430 */ 431 public static String escChar(char c, boolean use16BitEscapeSequence) 432 { 433 return use16BitEscapeSequence 434 ? "&#" + ((int) c) + ";" 435 : "&#x" + Integer.toHexString((int) c).toUpperCase() + ";"; 436 } 437 438 /** 439 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT> 440 * 441 * @param codePoint This will take any integer. It will be interpreted as a {@code UNICODE} 442 * {@code code point}. 443 * 444 * <BR /><BR /><B STYLE="color:red;">NOTE:</B> Java uses <B>16-bit</B> values for it's 445 * primitive {@code 'char'} type. This is also the "first plane" of the <B>UNICODE Space</B> 446 * and actually referred to as the <B>Basic Multi Lingual Plane</B>. Any value passed to this 447 * method that is lower than {@code 65,535} would receive the same escape-{@code String} that 448 * it would from a call to the method {@link #escChar(char, boolean)}. 449 * 450 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 451 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 452 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 453 * digits, then pass {@code FALSE} to this parameter. 454 * 455 * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 456 * {@code java.lang.String}. For instance if the {@code code point} for "the snowman" glyph 457 * (character ☃), which happens to be represented by a {@code code point} that is below 458 * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) - this 459 * method would return the {@code String "☃"}. 460 * 461 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 462 * then this method would, instead, return the {@code String "☃"}. 463 * 464 * @throws IllegalArgumentException Java has a method for determining whether any integer is a 465 * valid {@code code point}. Not all of the integers "fit" into the 17 Unicode "planes". 466 * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,535} 467 * (or {@code 2^16}) characters. 468 */ 469 public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence) 470 { 471 if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException( 472 "The integer you have passed to this method [" + codePoint + "] was deemed an " + 473 "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)]. " + 474 "Therefore this method is unable to provide an HTML Escape Sequence." 475 ); 476 477 return use16BitEscapeSequence 478 ? "&#" + codePoint + ";" 479 : "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";"; 480 } 481 482 /** 483 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML> 484 * 485 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 486 * 487 * @return {@code TRUE} if there is a {@code String} escape sequence for this character, and 488 * {@code FALSE} otherwise. 489 * 490 * @see #htmlEsc(char) 491 */ 492 public static boolean hasHTMLEsc(char c) 493 { return htmlEscSeq.get(Character.valueOf(c)) != null; } 494 495 /** 496 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC> 497 * 498 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 499 * 500 * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code 501 * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>. 502 * If the character provided does not have an associated {@code HTML Escape String}, then 503 * 'null' is returned. 504 * 505 * <BR /><BR /><B>NOTE:</B> The entire escape-{@code String} is not provided, just the 506 * inner-characters. The leading {@code '&'} (Ampersand) and the trailing {@code ';'} 507 * (Semi-Colon) are not appended to the returned {@code String}. 508 * 509 * @see #hasHTMLEsc(char) 510 */ 511 public static String htmlEsc(char c) 512 { return htmlEscSeq.get(Character.valueOf(c)); } 513}