001package Torello.Languages;
002
003import java.io.IOException;
004import java.util.Vector;
005
006import java.util.regex.Pattern;
007import java.util.regex.Matcher;
008
009import Torello.Java.FileRW;
010import Torello.Java.RegExFiles;
011
012public class Helper
013{
014    private static final String DATA_FILE = "../data-files/FNA-Helper.sdat";
015    private static final Pattern WHITE_SPACE, PUNCTUATION;
016
017    static
018    {
019        Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP(Helper.class, DATA_FILE);
020
021        WHITE_SPACE = v.elementAt(0);
022        PUNCTUATION = v.elementAt(1);
023    }
024
025    public static void main(String[] argv) throws IOException
026    {
027        System.out.println(
028            "WHITE_SPACE: " + WHITE_SPACE + '\n' +
029            "PUNCTUATION: " + PUNCTUATION + '\n' +
030            "Exiting..."
031        );
032
033        System.exit(1);
034
035        // The File "Regular Expressions.txt" is missing.  I still don't have time
036        FileRW.writeObjectToFileNOCNFE(
037            FileRW.loadFileToString
038                ("Torello/Languages/FNA/Regular Expressions.txt"),
039            DATA_FILE,
040            true
041        );
042    }
043
044    /**
045     * This will split a sentenceinto words.  Also, all punctuation surrounding each word will be
046     * removed!
047     * 
048     * @param text A sentence, usually in a foreign language.  Will work on any String.
049     * 
050     * @param sbDOUT This is a "developer notes" or "debug notes" output stream.  If null, notes
051     * will simply be discarded.
052     * 
053     * @return A list of words.  Each will be trimmed of white-space and leading or trailing
054     * punctuation.
055     */
056    public static Vector<String> splitOnWhiteSpace(String text, StringBuilder sbDOUT)
057    {
058        Vector<String>  ret = new Vector<String>();
059        Matcher         m1  = WHITE_SPACE.matcher(text);
060
061        while (m1.find())
062        {
063            String word = m1.group(1).trim();
064            
065            DOUT(sbDOUT, "\nP1: [" + word + "], len=" + word.length() + " ");
066
067            if (word.length() == 0)
068            {
069                DOUT(sbDOUT, "\tSkipping, zero length word.\n");
070                continue;
071            }
072
073            Matcher m2 = PUNCTUATION.matcher(word);
074
075            if (m2.find())
076                word = m2.group(2).trim();
077
078            else
079            {
080                DOUT(sbDOUT, "\tSkipping, PUNCTUATION RegEx found no match.\n");
081                continue;
082            }
083
084            DOUT(sbDOUT, "\nP2: [" + word + "], len=" + word.length() + " ");
085
086            if (word.length() == 0)
087            {
088                DOUT(sbDOUT, "\tSkipping, zero-length, punctuation-stripped word\n");
089                continue;
090            }
091            
092            ret.addElement(word);
093        }
094
095        DOUT(sbDOUT, "\n");
096        return ret;
097    }
098
099    static void DOUT(StringBuilder debug, String... sArr)
100    {
101        if (debug == null) return;
102
103        for (int i=0; i < sArr.length; i++) debug.append(sArr[i]);
104    }
105}