/**
 * Portions Copyright 2001 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute,
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package com.sun.speech.freetts.en;

import com.sun.speech.freetts.Token;
import com.sun.speech.freetts.Tokenizer;
import java.io.Reader;
import java.io.IOException;


/**
 * Implements the tokenizer interface. Breaks an input sequence of
 * characters into a set of tokens. Input comes either from a string
 * (see {@link #setInputText(String)}) or from a reader
 * (see {@link #setInputReader(Reader)}); when both are set the reader
 * takes precedence.
 */
public class TokenizerImpl implements Tokenizer {

    /** A constant indicating that the end of the stream has been read. */
    public static final int EOF = -1;

    /** A string containing the default whitespace characters. */
    public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";

    /** A string containing the default single characters. */
    public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";

    /** A string containing the default pre-punctuation characters. */
    public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";

    /** A string containing the default post-punctuation characters. */
    public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS
        = "\"'`.,:;!?(){}[]";


    /** The line number; incremented each time a newline is read. */
    private int lineNumber;

    /** The input text (from the Utterance) to tokenize. */
    private String inputText;

    /** The file to read input text from, if using file mode. */
    private Reader reader;

    /**
     * The current character, whether it is from the file or the input
     * text. Starts out as {@link #EOF} so that a tokenizer without any
     * input source reports that it has no tokens.
     */
    private int currentChar = EOF;

    /**
     * The current char position for the input text (not the file);
     * this is called "file_pos" in flite.
     */
    private int currentPosition;


    /** The delimiting symbols of this tokenizer. */
    private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
    private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
    private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
    private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;

    /** The error description; <code>null</code> while no error occurred. */
    private String errorDescription;

    /** The token most recently returned by {@link #getNextToken()}. */
    private Token token;

    /** The token returned by the call before the most recent one. */
    private Token lastToken;


    /**
     * Constructs a Tokenizer without an input source. Until an input
     * is set, {@link #hasMoreTokens()} returns <code>false</code>.
     */
    public TokenizerImpl() {
    }


    /**
     * Creates a tokenizer that will return tokens from
     * the given string.
     *
     * @param string the string to tokenize
     */
    public TokenizerImpl(String string) {
        setInputText(string);
    }

    /**
     * Creates a tokenizer that will return tokens from
     * the given file.
     *
     * @param file where to read the input from
     */
    public TokenizerImpl(Reader file) {
        setInputReader(file);
    }


    /**
     * Sets the whitespace symbols of this Tokenizer to the given symbols.
     *
     * @param symbols the whitespace symbols
     */
    public void setWhitespaceSymbols(String symbols) {
        whitespaceSymbols = symbols;
    }


    /**
     * Sets the single character symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the single character symbols
     */
    public void setSingleCharSymbols(String symbols) {
        singleCharSymbols = symbols;
    }


    /**
     * Sets the prepunctuation symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the prepunctuation symbols
     */
    public void setPrepunctuationSymbols(String symbols) {
        prepunctuationSymbols = symbols;
    }


    /**
     * Sets the postpunctuation symbols of this Tokenizer to the given
     * symbols.
     *
     * @param symbols the postpunctuation symbols
     */
    public void setPostpunctuationSymbols(String symbols) {
        postpunctuationSymbols = symbols;
    }


    /**
     * Sets the text to tokenize and reads its first character.
     *
     * @param inputString the string to tokenize; may be
     *        <code>null</code>, in which case nothing is read
     */
    public void setInputText(String inputString) {
        inputText = inputString;
        currentPosition = 0;

        if (inputText != null) {
            getNextChar();
        } else if (reader == null) {
            // No source at all: make sure a stale character left over
            // from a previous input is not reported as available.
            currentChar = EOF;
        }
    }

    /**
     * Sets the input reader and reads its first character.
     *
     * @param reader the input source
     */
    public void setInputReader(Reader reader) {
        this.reader = reader;
        getNextChar();
    }


    /**
     * Returns the next token. The token consists of the leading
     * whitespace, the prepunctuation, the word itself and the
     * postpunctuation split off the end of the word.
     *
     * @return the next token; at the end of the input its parts are
     *         empty strings
     */
    public Token getNextToken() {
        lastToken = token;
        token = new Token();

        // Skip whitespace
        token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));

        // quoted strings currently ignored

        // get prepunctuation
        token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));

        // get the symbol itself
        if (singleCharSymbols.indexOf(currentChar) != -1) {
            token.setWord(String.valueOf((char) currentChar));
            getNextChar();
        } else {
            token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
        }

        token.setPosition(currentPosition);
        token.setLineNumber(lineNumber);

        // At this point the word is the token *plus* postpunctuation;
        // split the postpunctuation off.
        removeTokenPostpunctuation();

        return token;
    }


    /**
     * Returns <code>true</code> if there are more tokens,
     * <code>false</code> otherwise.
     *
     * @return <code>true</code> if there are more tokens
     *         <code>false</code> otherwise
     */
    public boolean hasMoreTokens() {
        return currentChar != EOF;
    }


    /**
     * Reads the next character from the reader if one is set, otherwise
     * from the input text, and stores it as the current character.
     * Advances the current position for every character read and the
     * line number for every newline. If reading fails, the error
     * description is recorded and {@link #EOF} is reported.
     *
     * @return the next character, or {@link #EOF} if no more characters
     *         exist or no input source is set
     */
    private int getNextChar() {
        if (reader != null) {
            try {
                int readVal = reader.read();
                if (readVal == -1) {
                    currentChar = EOF;
                } else {
                    currentChar = (char) readVal;
                }
            } catch (IOException ioe) {
                currentChar = EOF;
                // getMessage() may be null; keep hasErrors() truthful.
                String message = ioe.getMessage();
                errorDescription = (message != null) ? message : ioe.toString();
            }
        } else if (inputText != null) {
            if (currentPosition < inputText.length()) {
                currentChar = inputText.charAt(currentPosition);
            } else {
                currentChar = EOF;
            }
        } else {
            // No input source: without this the previous character would
            // be returned forever and the scanning loops would never end.
            currentChar = EOF;
        }
        if (currentChar != EOF) {
            currentPosition++;
        }
        if (currentChar == '\n') {
            lineNumber++;
        }
        return currentChar;
    }


    /**
     * Starting from the current position of the input,
     * returns the subsequent characters of type charClass,
     * and not of type singleCharSymbols.
     *
     * @param charClass the type of characters to look for
     *
     * @return a string of characters starting from the current position
     *         of the input, until it encounters a character not
     *         in the string charClass
     */
    private String getTokenOfCharClass(String charClass) {
        return getTokenByCharClass(charClass, true);
    }

    /**
     * Starting from the current position of the input text/file,
     * returns the subsequent characters, not of type singleCharSymbols,
     * and ended at characters of type endingCharClass.  E.g., if the
     * current string is "xxxxyyy", endingCharClass is "yz", and
     * singleCharSymbols is "abc", then this method will return "xxxx".
     *
     * @param endingCharClass the type of characters to stop at
     *
     * @return a string of characters from the current position until
     *         it encounters characters in endingCharClass
     */
    private String getTokenNotOfCharClass(String endingCharClass) {
        return getTokenByCharClass(endingCharClass, false);
    }

    /**
     * Shared implementation of getTokenOfCharClass() and
     * getTokenNotOfCharClass().
     * If containThisCharClass is <code>true</code>, the consecutive
     * characters from the current position that are in charClass are
     * returned. If containThisCharClass is <code>false</code>, the
     * characters before the first occurrence of a character in
     * charClass are returned. In both cases scanning also stops at a
     * single character symbol and at the end of the input.
     *
     * @param charClass the string of characters you want included or
     *        excluded in your return
     * @param containThisCharClass determines if you want characters
     *        in charClass in the returned string or not
     *
     * @return the collected characters; empty if none matched
     */
    private String getTokenByCharClass(String charClass,
                                       boolean containThisCharClass) {
        final StringBuilder buffer = new StringBuilder();

        // (charClass.indexOf(currentChar) != -1) is true when the current
        // character belongs to charClass. Comparing it with
        // containThisCharClass keeps the loop running while the character
        // is in charClass (TRUE) or while it is not in charClass (FALSE).
        while ((charClass.indexOf(currentChar) != -1) == containThisCharClass
                && singleCharSymbols.indexOf(currentChar) == -1
                && currentChar != EOF) {
            buffer.append((char) currentChar);
            getNextChar();
        }
        return buffer.toString();
    }

    /**
     * Removes the trailing postpunctuation characters from the word of
     * the current token and stores them as the token's postpunctuation.
     * The first character of the word is never removed, so a word made
     * only of punctuation keeps its first character.
     */
    private void removeTokenPostpunctuation() {
        if (token == null) {
            return;
        }
        final String tokenWord = token.getWord();

        int tokenLength = tokenWord.length();
        int position = tokenLength - 1;

        while (position > 0
                && postpunctuationSymbols.indexOf(
                        tokenWord.charAt(position)) != -1) {
            position--;
        }

        if (tokenLength - 1 != position) {
            // Copy postpunctuation from token
            token.setPostpunctuation(tokenWord.substring(position + 1));

            // truncate token at postpunctuation
            token.setWord(tokenWord.substring(0, position + 1));
        } else {
            token.setPostpunctuation("");
        }
    }

    /**
     * Returns <code>true</code> if there were errors while reading tokens
     *
     * @return <code>true</code> if there were errors;
     *         <code>false</code> otherwise
     */
    public boolean hasErrors() {
        return errorDescription != null;
    }

    /**
     * if hasErrors returns <code>true</code>, this will return a
     * description of the error encountered, otherwise
     * it will return <code>null</code>
     *
     * @return a description of the last error that occurred.
     */
    public String getErrorDescription() {
        return errorDescription;
    }

    /**
     * Determines if the current token should start a new sentence.
     * A break is reported when the current token is preceded by more
     * than one newline, when the previous token ended in ':', '?' or
     * '!', or when the previous token ended in '.' and the current word
     * starts with an upper case letter while either more than one
     * whitespace character separates them or the previous word does not
     * look like an abbreviation.
     *
     * @return <code>true</code> if a new sentence should be started;
     *         <code>false</code> otherwise, including when fewer than
     *         two tokens have been read
     */
    public boolean isBreak() {
        // Check for missing tokens before touching either of them.
        if (lastToken == null || token == null) {
            return false;
        }

        final String tokenWhiteSpace = token.getWhitespace();
        final String lastTokenPostpunctuation = lastToken.getPostpunctuation();

        // Two or more newlines: first and last newline index differ.
        if (tokenWhiteSpace.indexOf('\n')
                != tokenWhiteSpace.lastIndexOf('\n')) {
            return true;
        }
        if (lastTokenPostpunctuation.indexOf(':') != -1
                || lastTokenPostpunctuation.indexOf('?') != -1
                || lastTokenPostpunctuation.indexOf('!') != -1) {
            return true;
        }

        // The remaining rules all need a period before a capitalized word.
        if (lastTokenPostpunctuation.indexOf('.') == -1) {
            return false;
        }
        final String word = token.getWord();
        if (word == null || word.length() == 0
                || !Character.isUpperCase(word.charAt(0))) {
            // An empty word (e.g. trailing whitespace at the end of the
            // input) cannot start a sentence.
            return false;
        }
        if (tokenWhiteSpace.length() > 1) {
            return true;
        }
        return !looksLikeAbbreviation(lastToken.getWord());
    }

    /**
     * Tells whether the given word looks like an abbreviation: it ends
     * in an upper case letter, or it is shorter than four characters
     * and starts with an upper case letter.
     *
     * @param word the word to examine; may be <code>null</code> or empty
     *
     * @return <code>true</code> if the word looks like an abbreviation;
     *         <code>false</code> otherwise, including for an empty word
     */
    private boolean looksLikeAbbreviation(String word) {
        if (word == null || word.length() == 0) {
            return false;
        }
        final int length = word.length();
        return Character.isUpperCase(word.charAt(length - 1))
                || (length < 4 && Character.isUpperCase(word.charAt(0)));
    }
}