// EscapeRepl.java.html

package Torello.HTML;

import static Torello.HTML.Escape.htmlEscChars;

/**
 * Replaces HTML escape sequences in a {@code String} with the characters they represent.
 *
 * <p>Two kinds of sequence are recognised:
 * <ul>
 *   <li>numeric: {@code &#NNN;} (decimal) and {@code &#xHHH;} / {@code &#XHHH;} (hexadecimal)</li>
 *   <li>named: {@code &name;}, where {@code name} is a key of the {@code htmlEscChars} table</li>
 * </ul>
 *
 * <p>Anything that looks like an escape sequence but cannot be decoded is copied to the output
 * unchanged.
 */
class EscapeRepl
{
    /** Value returned by the private helpers when the text is not a decodable escape sequence. */
    private static final int NOT_AN_ESCAPE = -1;

    /**
     * Un-escapes every decodable HTML escape sequence found in {@code s}.
     *
     * <p>The work is done "in place" on a single {@code char[]} copy of the input.  A read
     * position and a write position walk the array; because a decoded sequence is always
     * shorter than its escaped form, the write position never overtakes the read position.
     *
     * <p>Numeric sequences whose value is larger than {@link Character#MAX_VALUE} (code points
     * that do not fit in one {@code char}, such as Emoji) are deliberately left untouched.
     *
     * @param s the text to un-escape
     * @return a new {@code String} in which each decodable escape sequence has been replaced by
     *         the single character it stands for
     * @throws NullPointerException if {@code s} is {@code null}
     */
    static String replace(String s)
    {
        char[] c = s.toCharArray();

        int sourcePos   = 0;    // next character to read
        int destPos     = 0;    // next location to write

        while (sourcePos < c.length)
        {
            // Every escape sequence starts with an Ampersand, and needs at least one more
            // character after it.  Anything else is copied straight through (see loop end).

            if ((c[sourcePos] == '&') && (sourcePos < (c.length - 1)))
            {
                // A Pound Sign right after the Ampersand selects the numeric form.  A failed
                // numeric sequence is never retried as a named sequence.

                boolean numeric = (c[sourcePos + 1] == '#');

                int semicolonPos = numeric
                    ? numericEscapeEnd(c, sourcePos)
                    : namedEscapeEnd(c, sourcePos);

                if (semicolonPos != NOT_AN_ESCAPE)
                {
                    int decoded = numeric
                        ? decodeNumericEscape(c, sourcePos, semicolonPos)
                        : decodeNamedEscape(s, sourcePos, semicolonPos);

                    if (decoded != NOT_AN_ESCAPE)
                    {
                        // Write the un-escaped character, then resume reading at the first
                        // character AFTER the terminating semicolon.

                        c[destPos++]    = (char) decoded;
                        sourcePos       = semicolonPos + 1;
                        continue;
                    }
                }
            }

            // Ordinary character, or an Ampersand that did not start a decodable sequence.
            c[destPos++] = c[sourcePos++];
        }

        return new String(c, 0, destPos);
    }

    /**
     * Finds the semicolon that terminates a numeric escape sequence.
     *
     * @param c      the characters being scanned
     * @param ampPos index of the {@code '&'}; {@code c[ampPos + 1]} must be {@code '#'}
     * @return index of the terminating {@code ';'}, or {@link #NOT_AN_ESCAPE} if the run of
     *         digits is not immediately followed by a semicolon
     */
    private static int numericEscapeEnd(char[] c, int ampPos)
    {
        int pos = ampPos + 2;   // first character after "&#"

        // An 'x' or 'X' here marks a Base 16 (Hexadecimal) number.
        boolean isHex = (pos < c.length) && ((c[pos] == 'x') || (c[pos] == 'X'));
        if (isHex) pos++;

        // Skip the digits.  For hexadecimal numbers the letters a-f / A-F are digits too.
        while ((pos < c.length) && isEscapeDigit(c[pos], isHex)) pos++;

        return ((pos < c.length) && (c[pos] == ';')) ? pos : NOT_AN_ESCAPE;
    }

    /**
     * Finds the semicolon that terminates a named escape sequence.
     *
     * @param c      the characters being scanned
     * @param ampPos index of the {@code '&'}
     * @return index of the terminating {@code ';'}, or {@link #NOT_AN_ESCAPE} if the run of
     *         letters / digits after the Ampersand is not immediately followed by a semicolon
     */
    private static int namedEscapeEnd(char[] c, int ampPos)
    {
        int pos = ampPos + 1;

        while ((pos < c.length) && Character.isLetterOrDigit(c[pos])) pos++;

        return ((pos < c.length) && (c[pos] == ';')) ? pos : NOT_AN_ESCAPE;
    }

    /**
     * Computes the value of a numeric escape sequence whose extent is already known.
     *
     * @param c            the characters being scanned
     * @param ampPos       index of the {@code '&'}
     * @param semicolonPos index of the terminating {@code ';'}, as returned by
     *                     {@link #numericEscapeEnd(char[], int)}
     * @return the decoded value in the range {@code 0 .. Character.MAX_VALUE}, or
     *         {@link #NOT_AN_ESCAPE} if there are no digits or the value does not fit in a
     *         single {@code char}
     */
    private static int decodeNumericEscape(char[] c, int ampPos, int semicolonPos)
    {
        int start = ampPos + 2;     // first character after "&#"
        int radix = 10;

        // 'start' is always a valid index here, because semicolonPos >= ampPos + 2.
        if ((c[start] == 'x') || (c[start] == 'X'))
        {
            radix = 16;
            start++;
        }

        // "&#;" and "&#x;" contain no digits at all.
        if (start == semicolonPos) return NOT_AN_ESCAPE;

        int value = 0;

        for (int i = start; i < semicolonPos; i++)
        {
            value = (value * radix) + Character.digit(c[i], radix);

            // Stopping as soon as the limit is passed also keeps 'value' far away from
            // int-overflow, no matter how many digits the sequence has.

            if (value > Character.MAX_VALUE) return NOT_AN_ESCAPE;
        }

        return value;
    }

    /**
     * Looks up a named escape sequence in the {@code htmlEscChars} table.
     *
     * @param s            the original, unmodified input text
     * @param ampPos       index of the {@code '&'}
     * @param semicolonPos index of the terminating {@code ';'}
     * @return the replacement character, or {@link #NOT_AN_ESCAPE} if the table has no entry
     *         for the name between the Ampersand and the semicolon
     */
    private static int decodeNamedEscape(String s, int ampPos, int semicolonPos)
    {
        Character replacement = htmlEscChars.get(s.substring(ampPos + 1, semicolonPos));

        if (replacement == null) return NOT_AN_ESCAPE;

        return replacement.charValue();
    }

    /**
     * Tells whether {@code ch} may appear in the digit part of a numeric escape sequence.
     *
     * @param ch    the character to test
     * @param isHex {@code true} if the sequence is hexadecimal
     * @return {@code true} for any character accepted by {@link Character#isDigit(char)}, and
     *         additionally for {@code a-f} / {@code A-F} when {@code isHex} is set
     */
    private static boolean isEscapeDigit(char ch, boolean isHex)
    {
        if (Character.isDigit(ch)) return true;

        return isHex && (((ch >= 'a') && (ch <= 'f')) || ((ch >= 'A') && (ch <= 'F')));
    }
}