1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | package Torello.HTML; import static Torello.HTML.Escape.htmlEscChars; class EscapeRepl { static String replace(String s) { // The primary optimization is to do this the "C" way (As in The C Programming Language) // The String to Escape is converted to a character array, and the characters are shifted // as the Escape Sequences are replaced. This is all done "in place" without creating // new substring's in memory. char[] c = s.toCharArray(); // These two pointers are kept as the "Source Character" - as in the next character to // "Read" ... and the "Destination Character" - as in the next location to write. int sourcePos = 0; int destPos = 0; while (sourcePos < c.length) // All Escape Sequences begin with the Ampersand Symbol. If the next character // does not begin with the Ampersand, we should skip and move on. Copy the next source // character to the next destination location, and continue the loop. if (c[sourcePos] != '&') { c[destPos++]=c[sourcePos++]; continue; } // Here, an Ampersand has been found. Now check if the character immediately // following the Ampersand is a Pound Sign. If it is a Pound Sign, that implies // this escape sequence is simply going to be a number. else if ((sourcePos < (c.length-1)) && (c[sourcePos + 1] == '#')) { int evaluatingPos = sourcePos + 1; boolean isHex = false; // If the Character after the Pound Sign is an 'X', it means that the number // that has been escaped is a Base 16 (Hexadecimal) number. // IMPORTANT: Check to see that the Ampersand wasn't the last char in the String if (evaluatingPos + 1 < c.length) if (c[evaluatingPos + 1] == 'x') { isHex = true; evaluatingPos++; } // Keep skipping the numbers, until a non-digit character is identified. while ((++evaluatingPos < c.length) && Character.isDigit(c[evaluatingPos])); // If the character immediately after the last digit isn't a ';' (Semicolon), // then this entire thing is NOT an escaped HTML character. In this case, make // sure to copy the next source-character to the next destination location in the // char[] array... Then continue the loop to the next 'char' (after Ampersand) if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';')) { c[destPos++]=c[sourcePos++]; continue; } int escapedChar; try { // Make sure to convert 16-bit numbers using the 16-bit radix using the // standard java parse integer way. escapedChar = isHex ? Integer.parseInt(s.substring(sourcePos + 3, evaluatingPos), 16) : Integer.parseInt(s.substring(sourcePos + 2, evaluatingPos)); } // If for whatever reason java was unable to parse the digits in the escape // sequence, then copy the next source-character to the next destination-location // and move on in the loop. catch (NumberFormatException e) { c[destPos++]=c[sourcePos++]; continue; } // If the character was an Emoji, then it would be a number greater than // 2^16. Emoji's use Code Points - which are multiple characters used up // together. Their escape sequences are always characters larger than 65,535. // If so, just copy the next source-character to the next destination location, and // move on in the loop. if (escapedChar > Character.MAX_VALUE) { c[destPos++]=c[sourcePos++]; continue; } // Replace the next "Destination Location" with the (un) escaped char. c[destPos++] = (char) escapedChar; // Skip the entire HTML Escape Sequence by skipping to the location after the // position where the "evaluation" (all this processing) was occurring. This // just happens to be the next-character immediately after the semi-colon sourcePos = evaluatingPos + 1; // will be pointing at the ';' (semicolon) } // An Ampersand was just found, but it was not followed by a '#' (Pound Sign). This // means that it is not a "numbered" (to invent a term) HTML Escape Sequence. Instead // we shall check if there is a valid Escape-String (before the next semi-colon) that // can be identified in the Hashtable 'htmlEscChars' else if (sourcePos < (c.length - 1)) { // We need to create a 'temp variable' and it will be called "evaluating position" int evaluatingPos = sourcePos; // All text (non "Numbered") HTML Escape String's are comprised of letter or digits while ((++evaluatingPos < c.length) && Character.isLetterOrDigit(c[evaluatingPos])); // If the character immediately after the last letter or digit is not a semi-colon, // then there is no way this is an HTML Escape Sequence. Copy the next source to // the next destination location, and continue with the loop. if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';')) { c[destPos++]=c[sourcePos++]; continue; } // Get the replacement character from the lookup table. Character replacement = htmlEscChars.get(s.substring(sourcePos + 1, evaluatingPos)); // The lookup table will return null if there this was not a valid escape sequence. // If this was not a valid sequence, just copy the next character from the source // location, and move on in the loop. if (replacement == null) { c[destPos++]=c[sourcePos++]; continue; } c[destPos++] = replacement; sourcePos = evaluatingPos + 1; } else { c[destPos++]=c[sourcePos++]; continue; } return new String(c, 0, destPos); } } |