1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package Torello.HTML;

import static Torello.HTML.Escape.htmlEscChars;

/**
 * Replaces HTML Escape Sequences found in a {@code String} with the characters they represent.
 *
 * <p>Three forms are recognised: decimal numeric references ({@code &#65;}), hexadecimal
 * numeric references ({@code &#x41;} or {@code &#X41;}), and named references whose name is a
 * key of the lookup table {@code htmlEscChars} (statically imported from {@code Escape}).
 * Anything that starts with an Ampersand but cannot be decoded is copied through unchanged.
 */
class EscapeRepl
{
    /**
     * Un-escapes every decodable HTML Escape Sequence in {@code s}.
     *
     * <p>The work is done "in place" on a single {@code char[]} copy of the input.  Because an
     * Escape Sequence is always longer than the one character that replaces it, the write
     * position can never overtake the read position.
     *
     * <p>A numeric reference whose value is larger than {@code Character.MAX_VALUE} (for example
     * an Emoji Code Point) is <b>not</b> decoded; its text is left exactly as it was.
     *
     * @param s The text to un-escape.  Must not be null.
     *
     * @return The un-escaped text.  When {@code s} contains no Ampersand at all, {@code s} itself
     * is returned.
     *
     * @throws NullPointerException if {@code s} is null.
     */
    static String replace(String s)
    {
        // All Escape Sequences begin with the Ampersand Symbol.  If there is not a single
        // Ampersand in the input, there is nothing to replace and no copy needs to be made.

        if (s.indexOf('&') < 0) return s;


        // The primary optimization is to do this the "C" way (As in The C Programming Language)
        // The String to Escape is converted to a character array, and the characters are shifted
        // as the Escape Sequences are replaced.

        char[] c = s.toCharArray();


        // These two pointers are kept as the "Source Character" - as in the next character to
        // "Read" ... and the "Destination Character" - as in the next location to write.

        int sourcePos   = 0;
        int destPos     = 0;

        while (sourcePos < c.length)
        {
            // An ordinary character (not an Ampersand), or an Ampersand that is the very last
            // character of the String, can never start an Escape Sequence.  Copy it and move on.

            if ((c[sourcePos] != '&') || (sourcePos == (c.length - 1)))
            { c[destPos++] = c[sourcePos++];  continue; }


            // Here, an Ampersand has been found.  Now check if the character immediately
            // following the Ampersand is a Pound Sign.  If it is a Pound Sign, that implies
            // this escape sequence is simply going to be a number.

            if (c[sourcePos + 1] == '#')
            {
                int digitsStart = sourcePos + 2;
                int radix       = 10;


                // If the Character after the Pound Sign is an 'x' or an 'X', the number that
                // has been escaped is a Base 16 (Hexadecimal) number.
                // IMPORTANT: Check to see that the Pound Sign wasn't the last char in the String

                if (    (digitsStart < c.length)
                    &&  ((c[digitsStart] == 'x') || (c[digitsStart] == 'X'))
                )
                { radix = 16;  digitsStart++; }


                // Keep skipping digits, until a non-digit character is identified.  The test
                // must use the radix of the number: a Hexadecimal number may contain the letters
                // 'a' through 'f', which a plain "is this a decimal digit" check would reject.

                int evaluatingPos = digitsStart;

                while ((evaluatingPos < c.length) && (Character.digit(c[evaluatingPos], radix) >= 0))
                    evaluatingPos++;


                // If the character immediately after the last digit isn't a ';' (Semicolon),
                // then this entire thing is NOT an escaped HTML character.  Copy the Ampersand
                // and continue the loop with the character after it.

                if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';'))
                { c[destPos++] = c[sourcePos++];  continue; }

                int escapedChar;

                try
                    { escapedChar = Integer.parseInt(s.substring(digitsStart, evaluatingPos), radix); }


                // Thrown when there were no digits at all ("&#;" or "&#x;"), or when the number
                // does not fit into an 'int'.  Either way this is not decodable: copy the
                // Ampersand and move on in the loop.

                catch (NumberFormatException e)
                    { c[destPos++] = c[sourcePos++];  continue; }


                // If the character was an Emoji, then it would be a number greater than
                // 2^16.  Such Code Points do not fit into a single 'char', and they are
                // deliberately left un-decoded.  Copy the Ampersand and move on in the loop.

                if (escapedChar > Character.MAX_VALUE)
                    { c[destPos++] = c[sourcePos++];  continue; }

                // Replace the next "Destination Location" with the (un) escaped char.
                c[destPos++] = (char) escapedChar;


                // 'evaluatingPos' is pointing at the ';' (Semicolon).  Resume reading at the
                // character immediately after it, which skips the entire Escape Sequence.

                sourcePos = evaluatingPos + 1;
            }


            // An Ampersand was just found, but it was not followed by a '#' (Pound Sign).  This
            // means that it is not a "numbered" HTML Escape Sequence.  Instead check whether the
            // text before the next Semicolon is a name known to the lookup table 'htmlEscChars'

            else
            {
                int evaluatingPos = sourcePos + 1;

                // All text (non "Numbered") HTML Escape String's are comprised of letter or digits
                while ((evaluatingPos < c.length) && Character.isLetterOrDigit(c[evaluatingPos]))
                    evaluatingPos++;


                // If the character immediately after the last letter or digit is not a Semicolon,
                // then there is no way this is an HTML Escape Sequence.  Copy the Ampersand and
                // continue with the loop.

                if ((evaluatingPos == c.length) || (c[evaluatingPos] != ';'))
                    { c[destPos++] = c[sourcePos++];  continue; }

                // Get the replacement character from the lookup table.
                Character replacement = htmlEscChars.get(s.substring(sourcePos + 1, evaluatingPos));


                // The lookup table returns null when the name is not a known Escape Sequence.
                // In that case copy the Ampersand and move on in the loop.

                if (replacement == null)
                    { c[destPos++] = c[sourcePos++];  continue; }

                c[destPos++]    = replacement;
                sourcePos       = evaluatingPos + 1;
            }
        }

        return new String(c, 0, destPos);
    }
}