1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105 | package Torello.Languages;
import java.io.IOException;
import java.util.Vector;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import Torello.Java.FileRW;
import Torello.Java.RegExFiles;
/**
 * Word-splitting utility built on two regular expressions, {@code WHITE_SPACE} and
 * {@code PUNCTUATION}, which are obtained once at class-load time from the data file named by
 * {@code DATA_FILE}.
 */
public class Helper
{
    /** Location of the data file handed to {@code RegExFiles.LFEC_JAR_ZIP}. */
    private static final String DATA_FILE = "../data-files/FNA-Helper.sdat";

    /** First pattern in the loaded list; its capture group 1 is read as a candidate word. */
    private static final Pattern WHITE_SPACE;

    /** Second pattern in the loaded list; its capture group 2 is read as the stripped word. */
    private static final Pattern PUNCTUATION;

    static
    {
        // NOTE(review): presumably this reads a serialized list of patterns from the jar /
        // class-path relative to Helper.class - confirm against RegExFiles.
        Vector<Pattern> loaded = RegExFiles.LFEC_JAR_ZIP(Helper.class, DATA_FILE);

        WHITE_SPACE = loaded.get(0);
        PUNCTUATION = loaded.get(1);
    }

    /**
     * Prints the two loaded patterns to standard output and then terminates the JVM with exit
     * status 1.
     *
     * <p>The statement that follows the {@code System.exit} call (re-building the data file
     * from a text file) is therefore not executed at run time.
     *
     * @param argv Command line arguments; not read by this method.
     *
     * @throws IOException Declared for the file-writing statement after the exit call.
     */
    public static void main(String[] argv) throws IOException
    {
        StringBuilder report = new StringBuilder();

        report.append("WHITE_SPACE: ").append(WHITE_SPACE).append('\n');
        report.append("PUNCTUATION: ").append(PUNCTUATION).append('\n');
        report.append("Exiting...");

        System.out.println(report.toString());
        System.exit(1);

        // The File "Regular Expressions.txt" is missing, and there has not yet been time to
        // restore it.  This is why the method exits before reaching the statement below.
        FileRW.writeObjectToFileNOCNFE(
            FileRW.loadFileToString("Torello/Languages/FNA/Regular Expressions.txt"),
            DATA_FILE,
            true
        );
    }

    /**
     * Splits a sentence into words, and removes the punctuation surrounding each word.
     *
     * <p>Every match of {@code WHITE_SPACE} in {@code text} contributes one candidate (capture
     * group 1, trimmed).  Each non-empty candidate is then matched against {@code PUNCTUATION},
     * and capture group 2 of the first match (trimmed) becomes the word.  Candidates that are
     * empty, that {@code PUNCTUATION} does not match, or that are empty after stripping are
     * skipped.
     *
     * @param text A sentence, usually in a foreign language.  Will work on any String.
     *
     * @param sbDOUT A "developer notes" / "debug notes" output buffer.  When null, the notes
     * are simply discarded.
     *
     * @return The list of words, in the order they were found.  Each has been trimmed of
     * white-space and of leading or trailing punctuation.
     */
    public static Vector<String> splitOnWhiteSpace(String text, StringBuilder sbDOUT)
    {
        Vector<String> words = new Vector<String>();

        for (Matcher tokens = WHITE_SPACE.matcher(text); tokens.find(); )
        {
            // Pass 1: the raw, white-space delimited token
            String token = tokens.group(1).trim();
            DOUT(sbDOUT, "\nP1: [" + token + "], len=" + token.length() + " ");

            if (token.isEmpty())
            {
                DOUT(sbDOUT, "\tSkipping, zero length word.\n");
                continue;
            }

            // Pass 2: strip the punctuation that surrounds the token
            Matcher stripper = PUNCTUATION.matcher(token);

            if (! stripper.find())
            {
                DOUT(sbDOUT, "\tSkipping, PUNCTUATION RegEx found no match.\n");
                continue;
            }

            String stripped = stripper.group(2).trim();
            DOUT(sbDOUT, "\nP2: [" + stripped + "], len=" + stripped.length() + " ");

            if (stripped.isEmpty())
            {
                DOUT(sbDOUT, "\tSkipping, zero-length, punctuation-stripped word\n");
                continue;
            }

            words.add(stripped);
        }

        DOUT(sbDOUT, "\n");

        return words;
    }

    /**
     * Appends each of the given strings, in order, to the debug buffer.
     *
     * @param debug The buffer receiving the notes.  When null, nothing is done.
     *
     * @param sArr The strings to append.
     */
    static void DOUT(StringBuilder debug, String... sArr)
    {
        if (debug != null)
            for (String note : sArr)
                debug.append(note);
    }
}
|