001package Torello.Java;
002
003import java.util.*;
004import java.io.*;
005import java.util.regex.*;
006import java.util.zip.*;
007
008/**
009 * A utility for saving Regular-Expressions in a text-file that may be lazily-loaded at run-time
010 * when needed.
011 * 
012 * <EMBED CLASS='external-html' DATA-FILE-ID=REGEX_FILES>
013 */
014@Torello.JavaDoc.StaticFunctional
015public class RegExFiles
016{
017    private RegExFiles() { }
018
019    /**
020     * This loads a regular expression text file.  Each line is interpreted as a new Regular
021     * Expression {@code Pattern}.
022     *
023     * <BR /><BR />This method expects the <B><I>entire regular expression to fit on a single
024     * line</I></B>, and therefore, each new line containing text-data (without a starting
025     * <B>{@code '#'}</B>) will be compile into a new regular expression.  Use the {@code '\n'}
026     * within the expression to generated newlines.
027     *
028     * <BR /><BR /><B CLASS=JDDescLabel>Some Syntax Rules:</B>
029     * 
030     * <BR /><UL CLASS=JDUL>
031     * 
032     * <LI> <B>Comment lines</B> are lines beginning with the <I>POUND</I> <B>({@code '#'})</B>
033     *      sign.
034     *      <BR /><BR />
035     *      </LI>
036     * 
037     * <LI> <B>Blank lines</B> are ignored by the file-parse completely.
038     *      <BR /><BR />
039     *      </LI>
040     * 
041     * <LI> Lines with <B>only white-space</B> are considered blank.
042     *      <BR /><BR />
043     *      </LI>
044     * 
045     * <LI> <B>Flag Lines</B> are lines that begin with two, successive, <I>POUND</I>
046     *      <B>({@code '##'})</B> signs.
047     *      <BR /><BR />
048     *      </LI>
049     * 
050     * <LI> All non-comment, non-blank and non-flag lines are converted into Regular-Expression
051     *      {@code Pattern's}
052     *      </LI>
053     * </UL>
054     * 
055     * <BR /><BR /><B CLASS=JDDescLabel>LFEC Note:</B>
056     * 
057     * <BR />This method will <I>halt program execution</I> if any exceptions occur when loading a
058     * Regular-Expression text file!  This is the primary-purpose of all {@code 'LFEC'} - Load File
059     * Exception Catch methods.
060     * 
061     * @param f Filename for a Regular Expression
062     * 
063     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
064     * &amp; blank lines will all be ignored.
065     *
066     * @see java.util.regex.Pattern
067     * @see #generateFlags(String)
068     * @see LFEC#ERROR_EXIT(Throwable, String)
069     */
070    public static Vector<Pattern> LFEC(String f)
071    {
072        try
073            { return parse(FileRW.loadFileToVector(f, false), f); }
074
075        catch (Throwable t)
076        {
077            LFEC.ERROR_EXIT(t, "Attempt to load Regular Expression file: [" + f + "], failed.\n");
078        }
079
080        return null; // Should NOT be possible to reach this statement...
081    }
082
083    /**
084     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but loads the file into a
085     * {@code Vector} using the "JAR File" information included here.  In this case, parameter
086     * {@code f} indicates a jar-file class-loader pointer.  It will not load from the standard
087     * file-system.
088     *
089     * <BR /><BR /><B CLASS=JDDescLabel>Java's <CODE>getResourceAsStream</CODE>:</B>
090     * 
091     * <BR />The JAR implies that Java's "load resource as stream" features are being used in place
092     * of standard file i/o routines.  Specifically, this loads from a JAR file, as seen below:
093     *
094     * <DIV CLASS=SNIP>{@code
095     * BufferedReader br =
096     *     new BufferedReader(new InputStreamReader(c.getResourceAsStream(f)));
097     * }</DIV>
098     *
099     * @param c This contains the class that is loading the file.  It is not too important to use
100     * the "exact class" - since the only reason the class doing the loading is because the
101     * "Class Loader" employs the exact "Package Name" of the class for figuring out the
102     * directory / sub-directory where the data-file is stored.  This variable may not be null.
103     *
104     * <BR /><BR /><DIV CLASS=JDHint>
105     * <B STYLE='color:red;'>Example:</B> If you wanted to load a "Regular Expressions.txt" file
106     * that was in the same BASH/Debian/etc...  directory as the following class - the following
107     * call to {@code 'RegExFiles'} would load the text-file "Regular Expressions.txt" into memory
108     * quickly.  The primary purpose being that text files are <B>much easier to read than
109     * 'double-escaped' Java {@code String's}.</B>
110     * </DIV>
111     *
112     * <BR /><DIV CLASS=JDHintAlt>
113     * <B>Note:</B> It might be important to read the Java Doc's about the
114     * {@code 'getResourceAsStream(String)'} method for retrieving data that was stored to a JAR
115     * file instead of a UNIX/BASH/MS-DOS system file.  Oracle's Java 8 would help.
116     * </DIV>
117     *
118     * <EMBED CLASS='external-html' DATA-FILE-ID=REF_RAW_TYPES_NOTE>
119     *
120     * @param f This is a file-pointer to a file stored inside a Java JAR file.
121     *
122     * @return A Vector containing one compiled regular expression per line.  Comment lines &amp;
123     * blank lines will all be ignored.
124     *
125     * @see #LFEC(String)
126     * @see #parse(Vector, String)
127     * @see LFEC#ERROR_EXIT(Throwable, String)
128     */
129    public static Vector<Pattern> LFEC_JAR(Class<?> c, String f)
130    {
131        try (
132            InputStream     is = c.getResourceAsStream(f);
133            BufferedReader  br = new BufferedReader(new InputStreamReader(is));
134        )
135        {
136            String          s       = "";
137            StringBuilder   sb      = new StringBuilder();
138            Vector<String>  file    = new Vector<String>();
139
140            while ((s = br.readLine()) != null) file.addElement(s);
141
142            return parse(file, f);
143        }
144
145        catch (Throwable t)
146        { 
147            LFEC.ERROR_EXIT(
148                t,
149                "Attempted to load Regular Expression file: [" + f + "]\n" +
150                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
151                "Did not load successfully."
152            );
153        }
154
155
156        // Should NOT be possible to reach this statement...
157        // Compiler does not recognize LFEC.ERROR_EXIT
158
159        return null;
160    }
161
162    /**
163     * This is identical to {@code LFEC_JAR}, except that it presumes the file was compressed
164     * before saving.
165     *
166     * @param c This contains the class that is loading the file.  It is not too important to use
167     * the "exact class" - since the only reason the class doing the loading is because the "Class
168     * Loader" employs the exact "Package Name" of the class for figuring out the directory /
169     * sub-directory where the data-file is stored.  This variable may not be null.  Again, the
170     * class-loader looks in the directory of the package that contains this class!
171     *
172     * <BR /><BR /><DIV CLASS=JDHintAlt>
173     * <B STYLE='color:red;'>Note:</B> The method {@code public static Vector<Pattern> 
174     * LFEC_JAR(Class, String;)} has a more detailed look at the particular use of this parameter.
175     * The easy way to understand is: just pass the class that is doing the actual loading of the
176     * regular-expression <B>(presuming the regex.dat file is in the same directory as the 
177     * {@code '.class'} file!)</B>
178     * </DIV>
179     *
180     * <EMBED CLASS='external-html' DATA-FILE-ID=REF_RAW_TYPES_NOTE>
181     * 
182     * @param f This is a file-pointer to a file stored inside a Java JAR file.
183     *
184     * @return A {@code Vector} containing one compiled regular expression per line.  Comment
185     * lines &amp; blank lines will all be ignored.
186     *
187     * @see #LFEC_JAR(Class, String)
188     * @see #parse(Vector, String)
189     * @see LFEC#ERROR_EXIT(Throwable, String)
190     */
191    public static Vector<Pattern> LFEC_JAR_ZIP(Class<?> c, String f)
192    {
193        try (
194            InputStream         is      = c.getResourceAsStream(f);
195            GZIPInputStream     gzip    = new GZIPInputStream(is);
196            ObjectInputStream   ois     = new ObjectInputStream(gzip);
197        )
198        {
199            Object              ret         = ois.readObject();
200            String              fileStr     = (String) ret;
201            Vector<String>      file        = new Vector<>();
202            int                 newLinePos  = 0;
203
204            while ((newLinePos = fileStr.indexOf('\n')) != -1)
205            {
206                file.addElement(fileStr.substring(0, newLinePos));
207                fileStr = fileStr.substring(newLinePos + 1);
208            }
209
210            return parse(file, f);
211        }
212
213        catch (Throwable t)
214        {
215            LFEC.ERROR_EXIT(t,
216                "Attempted to load Regular Expression file: [" + f + "]\n" +
217                "From jar-file using class: [" + c.getCanonicalName() + "]\n" +
218                "Content was zipped, but failed to load."
219            );
220        }
221
222        return null; // Should NOT be possible to reach this statement...
223    }
224
225    /**
226     * This does the <B><I>exact same thing</I></B> as {@link LFEC}, but takes a "pre-loaded file"
227     * as a {@code Vector}.  This is an internal class - used to ensure that the methods:
228     * {@code LFEC_JAR} and {@code LFEC} do the exact same thing.
229     *
230     * @param file This presumes that the regular-expression text-file has been loaded into a
231     * {@code Vector<String>} (w/out the "include newlines" option!)
232     *
233     * @param name The name of the file loading is required so that error-printing-information is
234     * easier.
235     *
236     * @return A {@code Vector} containing one compiled regular expression per line.  Comment lines
237     * &amp; blank lines will all be ignored.
238     *
239     * @see #LFEC(String)
240     */
241    protected static Vector<Pattern> parse(Vector<String> file, String name)
242    {
243        try
244        {
245            Vector<Pattern> ret     = new Vector<Pattern>();
246            int             flags   = 0;
247
248            for (String line : file)
249            {
250                if (line.trim().length() == 0) continue;
251
252                if (line.charAt(0) == '#')
253                {
254                    if (line.length() > 1) if (line.charAt(1) == '#') flags = generateFlags(line);
255                    continue;
256                }
257
258                if (flags != 0) ret.add(Pattern.compile(line, flags));
259                else            ret.add(Pattern.compile(line));
260
261                flags = 0;
262            }
263
264            return ret;
265        }
266
267        catch (Throwable t)
268            { LFEC.ERROR_EXIT(t, "error parsing regular expression file: " + name); }
269
270        return null; // Should NOT be possible to reach this statement...
271    }
272
273    /**
274     * This information has been copied from Java's regular expression: {@code Pattern}. This is a
275     * Helper function as it converts the text-{@code String's} into their constants, so that a
276     * user may include these text {@code String's} in a regular expression file.
277     *
278     * <BR /><BR /><DIV CLASS=JDHint>
279     * <B STYLE='color:red;'>Note:</B> The regular expression loader will only load regular
280     * expressions that fit on a single line of text.  Other than lines that begin with a comment,
281     * each line is intended/interpreted as an independent Regular Expression.
282     * </DIV>
283     *
284     * @see java.util.regex.Pattern
285     */
286    protected static int generateFlags(String line)
287    {
288        int mask = 0;
289
290        if (line.contains("CANON_EQ"))          mask |= Pattern.CANON_EQ;
291        if (line.contains("CASE_INSENSITIVE"))  mask |= Pattern.CASE_INSENSITIVE;
292        if (line.contains("DOTALL"))            mask |= Pattern.DOTALL;
293        if (line.contains("COMMENTS"))          mask |= Pattern.COMMENTS;
294        if (line.contains("LITERAL"))           mask |= Pattern.LITERAL;
295        if (line.contains("MULTILINE"))         mask |= Pattern.MULTILINE;
296        if (line.contains("UNICODE_CASE"))      mask |= Pattern.UNICODE_CASE;
297
298        return mask;
299    }
300}