1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package Torello.Languages;

import java.io.IOException;
import java.util.Vector;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

import Torello.Java.FileRW;
import Torello.Java.RegExFiles;

/**
 * Word-splitting helper that relies on two pre-compiled regular expressions, both of which are
 * loaded once (at class-initialization time) from the data-file named by {@code DATA_FILE}.
 */
public class Helper
{
    /** Location of the data-file handed to {@code RegExFiles.LFEC_JAR_ZIP}. */
    private static final String DATA_FILE = "../data-files/FNA-Helper.sdat";

    /**
     * {@code WHITE_SPACE} is used to find the candidate words (capture-group 1) in a piece of
     * text.  {@code PUNCTUATION} is applied to each candidate, and its capture-group 2 is kept as
     * the punctuation-stripped word.
     */
    private static final Pattern WHITE_SPACE, PUNCTUATION;

    static
    {
        // The first two entries of the loaded Vector are the two patterns, in this order.
        Vector<Pattern> patterns = RegExFiles.LFEC_JAR_ZIP(Helper.class, DATA_FILE);

        WHITE_SPACE = patterns.elementAt(0);
        PUNCTUATION = patterns.elementAt(1);
    }

    /**
     * Developer entry-point.  Prints both loaded patterns and then terminates the JVM with exit
     * status 1.  The data-file writing code that follows the exit call is, therefore, never
     * executed.
     *
     * @param argv Command-line arguments.  These are not used.
     *
     * @throws IOException Declared for the file-handling calls that follow the exit.
     */
    public static void main(String[] argv) throws IOException
    {
        String report =
            "WHITE_SPACE: " + WHITE_SPACE + '\n' +
            "PUNCTUATION: " + PUNCTUATION + '\n' +
            "Exiting...";

        System.out.println(report);

        // Everything beneath this line is skipped on purpose (see the note below).
        System.exit(1);

        // The source file "Regular Expressions.txt" is missing, and there has not yet been time
        // to restore it.  Until then, the data-file cannot be rebuilt from here.
        FileRW.writeObjectToFileNOCNFE(
            FileRW.loadFileToString
                ("Torello/Languages/FNA/Regular Expressions.txt"),
            DATA_FILE,
            true
        );
    }

    /**
     * Splits a sentence into words, and removes the punctuation that surrounds each word.
     *
     * <BR /><BR />Each match of {@code WHITE_SPACE} supplies a candidate word (capture-group 1,
     * trimmed).  A candidate is dropped when it is empty, when {@code PUNCTUATION} finds no match
     * inside of it, or when the stripped result (capture-group 2, trimmed) is empty.
     *
     * @param text A sentence, usually in a foreign language.  Any {@code String} may be passed.
     *
     * @param sbDOUT A "developer notes" / "debug notes" output buffer.  When this is null, the
     * notes are simply discarded.
     *
     * @return The list of words, in the order they were found.  Each one has been trimmed of
     * white-space, and of leading or trailing punctuation.
     */
    public static Vector<String> splitOnWhiteSpace(String text, StringBuilder sbDOUT)
    {
        Vector<String> words = new Vector<String>();

        for (Matcher tokens = WHITE_SPACE.matcher(text); tokens.find(); )
        {
            // Pass 1: the raw, white-space delimited token
            String raw = tokens.group(1).trim();

            DOUT(sbDOUT, "\nP1: [", raw, "], len=", String.valueOf(raw.length()), " ");

            if (raw.isEmpty())
            {
                DOUT(sbDOUT, "\tSkipping, zero length word.\n");
                continue;
            }

            // Pass 2: strip the surrounding punctuation from the token
            Matcher stripper = PUNCTUATION.matcher(raw);

            if (! stripper.find())
            {
                DOUT(sbDOUT, "\tSkipping, PUNCTUATION RegEx found no match.\n");
                continue;
            }

            String stripped = stripper.group(2).trim();

            DOUT(sbDOUT, "\nP2: [", stripped, "], len=", String.valueOf(stripped.length()), " ");

            if (stripped.isEmpty())
            {
                DOUT(sbDOUT, "\tSkipping, zero-length, punctuation-stripped word\n");
                continue;
            }

            words.add(stripped);
        }

        DOUT(sbDOUT, "\n");

        return words;
    }

    /**
     * Appends each of the given strings, in order, to the debug buffer.
     *
     * @param debug The buffer that receives the text.  When null, this method does nothing.
     *
     * @param sArr The pieces of text to append.
     */
    static void DOUT(StringBuilder debug, String... sArr)
    {
        if (debug != null)
            for (String piece : sArr)
                debug.append(piece);
    }
}