001package Torello.HTML; 002 003import java.util.*; 004import java.util.regex.*; 005import java.util.stream.*; 006 007import Torello.Java.*; 008 009import Torello.JavaDoc.LinkJavaSource; 010import static Torello.JavaDoc.Entity.METHOD; 011 012/** 013 * Easy utilities for escaping and un-escaping HTML characters such as {@code }, and even 014 * code-point based Emoji's. 015 * 016 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE> 017 */ 018@Torello.JavaDoc.StaticFunctional 019public final class Escape 020{ 021 private Escape() { } 022 023 024 // ******************************************************************************************** 025 // ******************************************************************************************** 026 // Internal Fields, used by this class only 027 // ******************************************************************************************** 028 // ******************************************************************************************** 029 030 031 /** 032 * Regular Expression for characters represented in HTML as 033 * <CODE>&#x[Hexadecimal-Code];</CODE> 034 */ 035 private static final Pattern HEX_CODE = Pattern.compile("&#x([A-F,a-f,\\d]{1,8});"); 036 037 /** 038 * Regular Expression for characters represented in HTML as <CODE>&#[Decimal-Code];</CODE> 039 */ 040 private static final Pattern DEC_CODE = Pattern.compile("&#(\\d{1,8});"); 041 042 /** 043 * Regular Expression (approximate, not exact) for hard-coded escape sequences such as 044 * <CODE>"&amp;"</CODE> 045 * 046 * <BR /><BR />This is <I>"approximate"</I> - because it does not actually look the sequence 047 * up in the hash table. This means, of course, that not everything which matches this Regular 048 * Expression Pattern is actually an escaped HTML ASCII/UniCode character. 049 * 050 * <BR /><BR /><B CLASS=JDDescLabel>For Example:</B> 051 * 052 * <BR /><CODE>&NotACode;</CODE> will match this Regular-Expression, but it is not an 053 * actual HTML Escape-sequence. For that, one needs to consult the internal 054 * {@code 'htmlEscSeq'} or {@code 'htmlEscChars'} tables themselves. 055 * 056 * @see #htmlEscChars 057 * @see #htmlEscSeq 058 */ 059 private static final Pattern TEXT_CODE = Pattern.compile("&[A-Z,a-z,0-9]{1,8};"); 060 061 @SuppressWarnings("rawtypes") 062 private static final Vector data = LFEC.readObjectFromFile_JAR 063 (Escape.class, "data-files/Escape.htdat", true, Vector.class); 064 065 066 // This {@code Hashtable} contains all of the HTML escape characters which are represented by 067 // a short Text-{@code String}. The file listed above contains that list. 068 // 069 // This is "Package-Private", because it used by a "Helper-Class" (EscapeRepl) and one of the 070 // Replace-All Methods! 071 072 @SuppressWarnings("unchecked") 073 static final Hashtable<String, Character> htmlEscChars = 074 (Hashtable<String, Character>) data.elementAt(0); 075 076 /** 077 * This {@code Hashtable} is the reverse of the previous table. It allows a user to look up 078 * the escape sequence, given a particular ASCII {@code char}. 079 * 080 * @see HTML_ESC_CHARS 081 * @see #htmlEscChars 082 */ 083 @SuppressWarnings("unchecked") 084 private static final Hashtable<Character, String> htmlEscSeq = 085 (Hashtable<Character, String>) data.elementAt(1); 086 087 088 // ******************************************************************************************** 089 // ******************************************************************************************** 090 // Some debug, and "View Data" methods 091 // ******************************************************************************************** 092 // ******************************************************************************************** 093 094 095 /** 096 * Print's the HTML Escape Character lookup table to {@code System.out}. 097 * This is useful for debugging. 098 * 099 * <BR /><BR /><B CLASS=JDDescLabel>View Escape-Codes:</B> 100 * 101 * <BR />The JAR Data-File List included within the page attached (below) is a complete list of 102 * all <B><CODE>text-String</B> HTML Escape Sequences </CODE> that are known to this class. 103 * This list, does not include any <CODE>Code Point, Hex</CODE> or <CODE>Decimal Number</CODE> 104 * sequences. 105 * 106 * <BR /><BR /><B><CODE><A HREF="doc-files/EscapeCodes.html"> 107 * All HTML Escape Sequences</A></CODE></B> 108 */ 109 public static void printHTMLEsc() 110 { 111 Enumeration<String> e = htmlEscChars.keys(); 112 113 while (e.hasMoreElements()) 114 { 115 String tag = e.nextElement(); 116 System.out.println("&" + tag + "; ==> " + htmlEscChars.get(tag)); 117 } 118 } 119 120 121 // ******************************************************************************************** 122 // ******************************************************************************************** 123 // Main Part of the class 124 // ******************************************************************************************** 125 // ******************************************************************************************** 126 127 128 /** 129 * Converts a single {@code String} from an HTML-escape sequence into the appropriate 130 * character. 131 * 132 * <BR /><BR /> 133 * <CODE>&[escape-sequence];</CODE> ==> actual ASCII or UniCode character. 134 * 135 * @param escHTML An HTML escape sequence. 136 * 137 * @return the {@code ASCII} or {@code Unicode} character represented by this escape sequence. 138 * 139 * <BR /><BR />This method will return {@code '0'} if the input it does not represent a valid 140 * HTML Escape sequence. 141 */ 142 public static char escHTMLToChar(String escHTML) 143 { 144 if (! escHTML.startsWith("&") || ! escHTML.endsWith(";")) return (char) 0; 145 146 String s = escHTML.substring(1, escHTML.length() - 1); 147 148 // Temporary Variable. 149 int i = 0; 150 151 // Since the EMOJI Escape Sequences use Code Point, they cannot, generally be 152 // converted into a single Character. Skip them. 153 154 if (HEX_CODE.matcher(s).find()) 155 { 156 if ((i = Integer.parseInt(s.substring(2), 16)) < Character.MAX_VALUE) 157 return (char) i; 158 else 159 return 0; 160 } 161 162 163 // Again, deal with Emoji's here... Parse the integer, and make sure it is a 164 // character in the standard UNICODE range. 165 166 if (DEC_CODE.matcher(s).find()) 167 { 168 if ((i = Integer.parseInt(s.substring(1))) < Character.MAX_VALUE) 169 return (char) i; 170 else 171 return 0; 172 } 173 174 // Now check if the provided Escape String is listed in the htmlEscChars Hashtable. 175 Character c = htmlEscChars.get(s); 176 177 178 // If the character was found in the table that lists all escape sequence characters, 179 // then return it. Otherwise just return ASCII zero. 180 181 return (c != null) ? c.charValue() : 0; 182 } 183 184 /** 185 * Will generate a {@code String} whereby any & all <B STYLE='color: red;'><I>Hexadecimal 186 * Escape Sequences</I></B> have been removed and subsequently replaced with their actual 187 * ASCII/UniCode un-escaped characters! 188 * 189 * <BR /><BR /><B CLASS=JDDescLabel>Hexadecimal HTML Escape-Sequence Examples:</B> 190 * 191 * <BR /><TABLE CLASS=JDBriefTable> 192 * <TR><TH>Substring from Input:</TH><TH>Web-Browser Converts To:</TH></TR> 193 * <TR><TD><CODE>&#xAA;</CODE></TD><TD><CODE>'ª'</CODE> within a browser</TD></TR> 194 * <TR><TD><CODE>&#x67;</CODE></TD><TD><CODE>'g'</CODE> within a browser</TD></TR> 195 * <TR><TD><CODE>&#x84;</CODE></TD><TD><CODE>'„'</CODE> within a browser</TD></TR> 196 * </TABLE> 197 * 198 * <BR />This method might be thought of as similar to the older C/C++ {@code 'Ord()'} 199 * function, except it is for HTML. 200 * 201 * @param str any {@code String} that contains an HTML Escape Sequence 202 * &#x[HEXADECIMAL VALUE]; 203 * 204 * @return a {@code String}, with all of the hexadecimal escape sequences removed and replaced 205 * with their equivalent ASCII or UniCode Characters. 206 * 207 * @see #replaceAll_DEC(String str) 208 * @see StrReplace#r(String, String[], char[]) 209 */ 210 public static String replaceAll_HEX(String str) 211 { 212 // This is the RegEx Matcher from the top. It matches string's that look like: &#x\d+; 213 Matcher m = HEX_CODE.matcher(str); 214 215 216 // Save the escape-string regex search matches in a TreeMap. We need to use a 217 // TreeMap because it is much easier to check if a particular escape sequence has already 218 // been found. It is easier to find duplicates with TreeMap's. 219 220 TreeMap<String, Character> escMap = new TreeMap<>(); 221 222 while (m.find()) 223 { 224 // Use Base-16 Integer-Parse 225 int i = Integer.valueOf(m.group(1), 16); 226 227 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 228 // not single characters. 229 230 if (i > Character.MAX_VALUE) continue; 231 232 // Retrieve the Text Information about the HTML Escape Sequence 233 String text = m.group(); 234 235 // Check if it is a valid HTML 5 Escape Sequence. 236 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 237 } 238 239 240 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 241 // the VALUE's of the TreeMap<String, Character> which was just built. 242 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 243 // duplicates when the replacement occurs. 244 245 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 246 char[] replaceChars = new char[escMap.size()]; 247 248 249 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 250 // array. The class StrReplace will replace all the escape squences with the actual 251 // characters. 252 253 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 254 255 return StrReplace.r(str, matchStrs, replaceChars); 256 } 257 258 /** 259 * This method functions the same as {@code replaceAll_HEX(String)} - except it replaces only 260 * HTML Escape sequences that are represented using decimal (base-10) values. 261 * {@code 'replaceAll_HEX(...)'} works on hexadecimal (base-16) values. 262 * 263 * <BR /><BR /><B CLASS=JDDescLabel>Base-10 HTML Escape-Sequence Examples:</B> 264 * 265 * <BR /><TABLE CLASS=JDBriefTable> 266 * <TR><TH>Substring from Input:</TH> <TH>Web-Browser Converts To:</TH></TR> 267 * <TR><TD><CODE>&#48;</CODE></TD> <TD><CODE>'0'</CODE> in your browser</TD></TR> 268 * <TR><TD><CODE>&#64;</CODE></TD> <TD><CODE>'@'</CODE> in your browser</TD></TR> 269 * <TR><TD><CODE>&#123;</CODE></TD> <TD><CODE>'{'</CODE> in your browser</TD></TR> 270 * <TR><TD><CODE>&#125;</CODE></TD> <TD><CODE>'}'</CODE> in your browser</TD></TR> 271 * </TABLE> 272 * 273 * <BR /><B CLASS=JDDescLabel>Base-10 & Base-16 Escape-Sequence Difference:</B> 274 * 275 * <BR /><UL CLASS=JDUL> 276 * 277 * <LI> <CODE>&#x[hex base-16 value];</CODE> There is an {@code 'x'} as the third character 278 * in the {@code String} 279 * </LI> 280 * 281 * <LI> <CODE>&#[decimal base-10 value];</CODE> There is no {@code 'x'} in the 282 * escape-sequence {@code String!} 283 * </LI> 284 * 285 * </UL> 286 * 287 * <BR />This short example delineates the difference between an HTML escape-sequence that 288 * employs {@code Base-10} numbers, and one using {@code Base-16} (Hexadecimal) numbers. 289 * 290 * @param str any {@code String} that contains the HTML Escape Sequence 291 * <CODE>&#[DECIMAL VALUE];</CODE>. 292 * 293 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 294 * ASCII UniCode Characters. 295 * 296 * <BR /><BR />If this parameter does not contain such a sequence, then this method will return 297 * the same input-{@code String} reference as its return value. 298 * 299 * @see #replaceAll_HEX(String str) 300 * @see StrReplace#r(String, String[], char[]) 301 */ 302 public static String replaceAll_DEC(String str) 303 { 304 // This is the RegEx Matcher from the top. It matches string's that look like: &#\d+; 305 Matcher m = DEC_CODE.matcher(str); 306 307 308 // Save the escape-string regex search matches in a TreeMap. We need to use a 309 // TreeMap because it is much easier to check if a particular escape sequence has already 310 // been found. It is easier to find duplicates with TreeMap's. 311 312 TreeMap<String, Character> escMap = new TreeMap<>(); 313 314 while (m.find()) 315 { 316 // Use Base-10 Integer-Parse 317 int i = Integer.valueOf(m.group(1)); 318 319 // Do not un-escape EMOJI's... It makes a mess - they are sequences of characters 320 // not single characters. 321 322 if (i > Character.MAX_VALUE) continue; 323 324 // Retrieve the Text Information about the HTML Escape Sequence 325 String text = m.group(); 326 327 // Check if it is a valid HTML 5 Escape Sequence. 328 if (! escMap.containsKey(text)) escMap.put(text, Character.valueOf((char) i)); 329 } 330 331 332 // Build the matchStr's and replaceChar's arrays. These are just the KEY's and 333 // the VALUE's of the TreeMap<String, Character> which was just built. 334 // NOTE: A TreeMap is used *RATHER THAN* two parallel arrays in order to avoid keeping 335 // duplicates when the replacement occurs. 336 337 String[] matchStrs = escMap.keySet().toArray(new String[escMap.size()]); 338 char[] replaceChars = new char[escMap.size()]; 339 340 341 // Lookup each "ReplaceChar" in the TreeMap, and put it in the output "replaceChars" 342 // array. The class StrReplace will replace all the escape sequences with the actual 343 // characters. 344 345 for (int i=0; i < matchStrs.length; i++) replaceChars[i] = escMap.get(matchStrs[i]); 346 347 return StrReplace.r(str, matchStrs, replaceChars); 348 } 349 350 /** 351 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_ALL_TEXT> 352 * 353 * @param str any {@code String} that contains HTML Escape Sequences that need to be converted 354 * to their ASCII-UniCode character representations. 355 * 356 * @return a {@code String}, with all of the decimal escape sequences removed and replaced with 357 * ASCII UniCode Characters. 358 * 359 * @see #replaceAll_HEX(String str) 360 * @see StrReplace#r(String, boolean, String[], Torello.Java.Function.ToCharIntTFunc) 361 * 362 * @throws IllegalStateException 363 */ 364 public static String replaceAll_TEXT(String str) 365 { 366 // We only need to find which escape sequences are in this string. 367 // use a TreeSet<String> to list them. It will 368 369 Matcher m = TEXT_CODE.matcher(str); 370 TreeMap<String, String> escMap = new TreeMap<>(); 371 372 while (m.find()) 373 { 374 // Retrieve the Text Information about the HTML Escape Sequence 375 String text = m.group(); 376 String sequence = text.substring(1, text.length() - 1); 377 378 // Check if it is a valid HTML 5 Escape Sequence. 379 if ((! escMap.containsKey(text)) && htmlEscChars.containsKey(sequence)) 380 escMap.put(text, sequence); 381 } 382 383 // Convert the TreeSet to a String[] array... and use StrReplace 384 String[] escArr = new String[escMap.size()]; 385 386 return StrReplace.r( 387 str, false, escMap.keySet().toArray(escArr), 388 (int i, String sequence) -> htmlEscChars.get(escMap.get(sequence)) 389 ); 390 } 391 392 /** 393 * Calls all of the HTML Escape Sequence convert/replace {@code String} functions at once. 394 * 395 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 396 * sequences. 397 * 398 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 399 * with their natural character representations. 400 * 401 * @see #replaceAll_DEC(String) 402 * @see #replaceAll_HEX(String) 403 * @see #replaceAll_TEXT(String) 404 */ 405 @Deprecated 406 public static String replaceAll(String s) 407 { return replaceAll_HEX(replaceAll_DEC(replaceAll_TEXT(s))); } 408 409 /** 410 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_REPLACE> 411 * 412 * @param s This may be any Java {@code String} which may (or may not) contain HTML Escape 413 * sequences. 414 * 415 * @return a new {@code String} where all HTML escape-sequence substrings have been replaced 416 * with their natural character representations. 417 */ 418 @LinkJavaSource(handle="EscapeRepl", entity=METHOD, name="replace") 419 public static String replace(String s) 420 { return EscapeRepl.replace(s); } 421 422 /** 423 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CHAR> 424 * 425 * @param c Any Java Character. Note that the Java <B>Primitive Type</B> {@code 'char'} 426 * is a 16-bit type. This parameter equates to the <B>UNICODE</B> Characters 427 * {@code 0x0000} up to {@code 0xFFFF}. 428 * 429 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 430 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 431 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 432 * digits, then pass {@code FALSE} to this parameter. 433 * 434 * @return The passed character parameter {@code 'c'} will be converted to an HTML Escape 435 * Sequence. For instance if the character <CODE>'ᡃ'</CODE>, which is the Chinese 436 * Character for <I>I, Me, Myself</I> were passed to this method, then the {@code String} 437 * {@code "我"} would be returned. 438 * 439 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 440 * then this method would, instead, return the {@code String "我"}. 441 */ 442 public static String escChar(char c, boolean use16BitEscapeSequence) 443 { 444 return use16BitEscapeSequence 445 ? "&#" + ((int) c) + ";" 446 : "&#x" + Integer.toHexString((int) c).toUpperCase() + ";"; 447 } 448 449 /** 450 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_CODE_PT> 451 * 452 * @param codePoint This will take any integer. It will be interpreted as a {@code UNICODE} 453 * {@code code point}. 454 * 455 * <BR /><BR /><DIV CLASS=JDHint> 456 * Java uses <B>16-bit</B> values for it's primitive {@code 'char'} type. This is also the 457 * "first plane" of the <B>UNICODE Space</B> and actually referred to as the <B>Basic Multi 458 * Lingual Plane</B>. Any value passed to this method that is lower than {@code 65,535} would 459 * receive the same escape-{@code String} that it would from a call to the method 460 * {@link #escChar(char, boolean)}. 461 * </DIV> 462 * 463 * @param use16BitEscapeSequence If the user would like the returned, escaped, {@code String} 464 * to use <B>Base 16</B> for the escaped digits, pass {@code TRUE} to this parameter. If the 465 * user would like to retrieve an escaped {@code String} that uses standard <B>Base 10</B> 466 * digits, then pass {@code FALSE} to this parameter. 467 * 468 * @return The {@code code point} will be converted to an HTML Escape Sequence, as a 469 * {@code java.lang.String}. For instance if the {@code code point} for "the snowman" glyph 470 * (character ☃), which happens to be represented by a {@code code point} that is below 471 * {@code 65,535} (and, incidentally, does "fit" into a single Java {@code 'char'}) - this 472 * method would return the {@code String "☃"}. 473 * 474 * <BR /><BR />If the parameter {@code 'use16BitEscapeSequence'} had been passed {@code TRUE}, 475 * then this method would, instead, return the {@code String "☃"}. 476 * 477 * @throws IllegalArgumentException Java has a method for determining whether any integer is a 478 * valid {@code code point}. Not all of the integers "fit" into the 17 Unicode "planes". 479 * Note that each of the planes in {@code 'Unicode Space'} contain {@code 65,535} 480 * (or {@code 2^16}) characters. 481 */ 482 public static String escCodePoint(int codePoint, boolean use16BitEscapeSequence) 483 { 484 if (! Character.isValidCodePoint(codePoint)) throw new IllegalArgumentException( 485 "The integer you have passed to this method [" + codePoint + "] was deemed an " + 486 "invalid Code Point after a call to: [java.lang.Character.isValidCodePoint(int)]. " + 487 "Therefore this method is unable to provide an HTML Escape Sequence." 488 ); 489 490 return use16BitEscapeSequence 491 ? "&#" + codePoint + ";" 492 : "&#x" + Integer.toHexString(codePoint).toUpperCase() + ";"; 493 } 494 495 /** 496 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HAS_HTML> 497 * 498 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 499 * 500 * @return {@code TRUE} if there is a {@code String} escape sequence for this character, and 501 * {@code FALSE} otherwise. 502 * 503 * @see #htmlEsc(char) 504 */ 505 public static boolean hasHTMLEsc(char c) 506 { return htmlEscSeq.get(Character.valueOf(c)) != null; } 507 508 /** 509 * <EMBED CLASS='external-html' DATA-FILE-ID=ESCAPE_HTML_ESC> 510 * 511 * @param c Any <B>ASCII</B> or <B>UNICODE</B> Character 512 * 513 * @return The {@code String} that is used by web-browsers to escape this ASCII / Uni-Code 514 * character - <I>if there is one saved</I> in the <B>internal</B> <CODE>Lookup Table</CODE>. 515 * If the character provided does not have an associated {@code HTML Escape String}, then 516 * 'null' is returned. 517 * 518 * <BR /><BR /><DIV CLASS=JDHint> 519 * The entire escape-{@code String} is not provided, just the inner-characters. The leading 520 * {@code '&'} (Ampersand) and the trailing {@code ';'} (Semi-Colon) are not appended to the 521 * returned {@code String}. 522 * </DIV> 523 * 524 * @see #hasHTMLEsc(char) 525 */ 526 public static String htmlEsc(char c) 527 { return htmlEscSeq.get(Character.valueOf(c)); } 528}