001package Torello.Languages; 002 003import java.io.IOException; 004import java.util.Vector; 005 006import java.util.regex.Pattern; 007import java.util.regex.Matcher; 008 009import Torello.Java.FileRW; 010import Torello.Java.RegExFiles; 011 012public class Helper 013{ 014 private static final String DATA_FILE = "../data-files/FNA-Helper.sdat"; 015 private static final Pattern WHITE_SPACE, PUNCTUATION; 016 017 static 018 { 019 Vector<Pattern> v = RegExFiles.LFEC_JAR_ZIP(Helper.class, DATA_FILE); 020 021 WHITE_SPACE = v.elementAt(0); 022 PUNCTUATION = v.elementAt(1); 023 } 024 025 public static void main(String[] argv) throws IOException 026 { 027 System.out.println( 028 "WHITE_SPACE: " + WHITE_SPACE + '\n' + 029 "PUNCTUATION: " + PUNCTUATION + '\n' + 030 "Exiting..." 031 ); 032 033 System.exit(1); 034 035 // The File "Regular Expressions.txt" is missing. I still don't have time 036 FileRW.writeObjectToFileNOCNFE( 037 FileRW.loadFileToString 038 ("Torello/Languages/FNA/Regular Expressions.txt"), 039 DATA_FILE, 040 true 041 ); 042 } 043 044 /** 045 * This will split a sentenceinto words. Also, all punctuation surrounding each word will be 046 * removed! 047 * 048 * @param text A sentence, usually in a foreign language. Will work on any String. 049 * 050 * @param sbDOUT This is a "developer notes" or "debug notes" output stream. If null, notes 051 * will simply be discarded. 052 * 053 * @return A list of words. Each will be trimmed of white-space and leading or trailing 054 * punctuation. 055 */ 056 public static Vector<String> splitOnWhiteSpace(String text, StringBuilder sbDOUT) 057 { 058 Vector<String> ret = new Vector<String>(); 059 Matcher m1 = WHITE_SPACE.matcher(text); 060 061 while (m1.find()) 062 { 063 String word = m1.group(1).trim(); 064 065 DOUT(sbDOUT, "\nP1: [" + word + "], len=" + word.length() + " "); 066 067 if (word.length() == 0) 068 { 069 DOUT(sbDOUT, "\tSkipping, zero length word.\n"); 070 continue; 071 } 072 073 Matcher m2 = PUNCTUATION.matcher(word); 074 075 if (m2.find()) 076 word = m2.group(2).trim(); 077 078 else 079 { 080 DOUT(sbDOUT, "\tSkipping, PUNCTUATION RegEx found no match.\n"); 081 continue; 082 } 083 084 DOUT(sbDOUT, "\nP2: [" + word + "], len=" + word.length() + " "); 085 086 if (word.length() == 0) 087 { 088 DOUT(sbDOUT, "\tSkipping, zero-length, punctuation-stripped word\n"); 089 continue; 090 } 091 092 ret.addElement(word); 093 } 094 095 DOUT(sbDOUT, "\n"); 096 return ret; 097 } 098 099 static void DOUT(StringBuilder debug, String... sArr) 100 { 101 if (debug == null) return; 102 103 for (int i=0; i < sArr.length; i++) debug.append(sArr[i]); 104 } 105}