/**
 * Portions Copyright 2001-2003 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute,
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package com.sun.speech.freetts.en.us;

import java.util.Hashtable;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.sun.speech.freetts.FeatureSet;
import com.sun.speech.freetts.Item;
import com.sun.speech.freetts.ProcessException;
import com.sun.speech.freetts.Relation;
import com.sun.speech.freetts.Utterance;
import com.sun.speech.freetts.UtteranceProcessor;
import com.sun.speech.freetts.cart.CART;
import com.sun.speech.freetts.util.Utilities;


/**
 * Converts the Tokens (in US English words) in an
 * Utterance into a list of words. It puts the produced list back
 * into the Utterance. Usually, the tokens that get expanded are numbers
 * like "23" (to "twenty" "three").
 * <p>
 * It translates the following code from flite:
 * <br>
 * <code>
 * lang/usenglish/us_text.c
 * </code>
 * <p>
 * An instance keeps the token and word relation currently being processed
 * in instance fields, so a single instance must not process several
 * utterances concurrently.
 */
public class TokenToWords implements UtteranceProcessor {

    /**
     * Regular expression for something that has a vowel.
     * Not referenced by the code in this class.
     */
    private static final String RX_HAS_VOWEL = ".*[aeiouAEIOU].*";

    // Patterns for regular expression matching; compiled once and shared.
    private static final Pattern alphabetPattern =
        Pattern.compile(USEnglish.RX_ALPHABET);
    private static final Pattern commaIntPattern =
        Pattern.compile(USEnglish.RX_COMMAINT);
    private static final Pattern digits2DashPattern =
        Pattern.compile(USEnglish.RX_DIGITS2DASH);
    private static final Pattern digitsPattern =
        Pattern.compile(USEnglish.RX_DIGITS);
    private static final Pattern digitsSlashDigitsPattern =
        Pattern.compile(USEnglish.RX_DIGITSSLASHDIGITS);
    private static final Pattern dottedAbbrevPattern =
        Pattern.compile(USEnglish.RX_DOTTED_ABBREV);
    private static final Pattern doublePattern =
        Pattern.compile(USEnglish.RX_DOUBLE);
    private static final Pattern drStPattern =
        Pattern.compile(USEnglish.RX_DRST);
    private static final Pattern fourDigitsPattern =
        Pattern.compile(USEnglish.RX_FOUR_DIGIT);
    // Not referenced by the code in this class.
    private static final Pattern hasVowelPattern =
        Pattern.compile(USEnglish.RX_HAS_VOWEL);
    private static final Pattern illionPattern =
        Pattern.compile(USEnglish.RX_ILLION);
    private static final Pattern numberTimePattern =
        Pattern.compile(USEnglish.RX_NUMBER_TIME);
    private static final Pattern numessPattern =
        Pattern.compile(USEnglish.RX_NUMESS);
    private static final Pattern ordinalPattern =
        Pattern.compile(USEnglish.RX_ORDINAL_NUMBER);
    private static final Pattern romanNumbersPattern =
        Pattern.compile(USEnglish.RX_ROMAN_NUMBER);
    private static final Pattern sevenPhoneNumberPattern =
        Pattern.compile(USEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER);
    private static final Pattern threeDigitsPattern =
        Pattern.compile(USEnglish.RX_THREE_DIGIT);
    private static final Pattern usMoneyPattern =
        Pattern.compile(USEnglish.RX_US_MONEY);

    // King-like words
    private static final String[] kingNames = {
        "louis", "henry", "charles", "philip", "george",
        "edward", "pius", "william", "richard", "ptolemy",
        "john", "paul", "peter", "nicholas", "frederick",
        "james", "alfonso", "ivan", "napoleon", "leo",
        "gregory", "catherine", "alexandria", "pierre", "elizabeth",
        "mary" };

    private static final String[] kingTitles = {
        "king", "queen", "pope", "duke", "tsar",
        "emperor", "shah", "caesar", "duchess", "tsarina",
        "empress", "baron", "baroness", "sultan", "count",
        "countess" };

    // Section-like words
    private static final String[] sectionTypes = {
        "section", "chapter", "part", "phrase", "verse",
        "scene", "act", "book", "volume", "chap",
        "war", "apollo", "trek", "fortran" };

    /**
     * Maps each king name, king title and section type to the name of the
     * list it came from. A hashtable gives constant time matching, instead
     * of using if (A.equals(B) || A.equals(C) || ...) to match Strings.
     */
    private static final Hashtable kingSectionLikeHash = new Hashtable();

    private static final String KING_NAMES = "kingNames";
    private static final String KING_TITLES = "kingTitles";
    private static final String SECTION_TYPES = "sectionTypes";

    // Hashtable initialization
    static {
        for (int i = 0; i < kingNames.length; i++) {
            kingSectionLikeHash.put(kingNames[i], KING_NAMES);
        }
        for (int i = 0; i < kingTitles.length; i++) {
            kingSectionLikeHash.put(kingTitles[i], KING_TITLES);
        }
        for (int i = 0; i < sectionTypes.length; i++) {
            kingSectionLikeHash.put(sectionTypes[i], SECTION_TYPES);
        }
    }

    // Apostrophe suffixes that are emitted as a word of their own
    private static final String[] postrophes = {
        "'s", "'ll", "'ve", "'d" };

    // Finite state machines to check if a Token is pronounceable
    private PronounceableFSM prefixFSM = null;
    private PronounceableFSM suffixFSM = null;

    /**
     * List of US state abbreviations and their full names. Each row is
     * { abbreviation, "ambiguous" or "", word1 [, word2] }; rows marked
     * "ambiguous" are only expanded when the surrounding tokens look like
     * an address (see isStateName()).
     */
    private static final String[][] usStates =
    {
        { "AL", "ambiguous", "alabama" },
        { "Al", "ambiguous", "alabama" },
        { "Ala", "", "alabama" },
        { "AK", "", "alaska" },
        { "Ak", "", "alaska" },
        { "AZ", "", "arizona" },
        { "Az", "", "arizona" },
        { "CA", "", "california" },
        { "Ca", "", "california" },
        { "Cal", "ambiguous", "california" },
        { "Calif", "", "california" },
        { "CO", "ambiguous", "colorado" },
        { "Co", "ambiguous", "colorado" },
        { "Colo", "", "colorado" },
        { "DC", "", "d" , "c" },
        { "DE", "", "delaware" },
        { "De", "ambiguous", "delaware" },
        { "Del", "ambiguous", "delaware" },
        { "FL", "", "florida" },
        { "Fl", "ambiguous", "florida" },
        { "Fla", "", "florida" },
        { "GA", "", "georgia" },
        { "Ga", "", "georgia" },
        { "HI", "ambiguous", "hawaii" },
        { "Hi", "ambiguous", "hawaii" },
        { "IA", "", "iowa" },
        { "Ia", "ambiguous", "iowa" },
        { "IN", "ambiguous", "indiana" },
        { "In", "ambiguous", "indiana" },
        { "Ind", "ambiguous", "indiana" },
        { "ID", "ambiguous", "idaho" },
        { "IL", "ambiguous", "illinois" },
        { "Il", "ambiguous", "illinois" },
        { "ILL", "ambiguous", "illinois" },
        { "KS", "", "kansas" },
        { "Ks", "", "kansas" },
        { "Kans", "", "kansas" },
        { "KY", "ambiguous", "kentucky" },
        { "Ky", "ambiguous", "kentucky" },
        { "LA", "ambiguous", "louisiana" },
        { "La", "ambiguous", "louisiana" },
        { "Lou", "ambiguous", "louisiana" },
        { "Lous", "ambiguous", "louisiana" },
        { "MA", "ambiguous", "massachusetts" },
        { "Mass", "ambiguous", "massachusetts" },
        { "Ma", "ambiguous", "massachusetts" },
        { "MD", "ambiguous", "maryland" },
        { "Md", "ambiguous", "maryland" },
        { "ME", "ambiguous", "maine" },
        { "Me", "ambiguous", "maine" },
        { "MI", "", "michigan" },
        { "Mi", "ambiguous", "michigan" },
        { "Mich", "ambiguous", "michigan" },
        { "MN", "ambiguous", "minnesota" },
        { "Minn", "ambiguous", "minnesota" },
        { "MS", "ambiguous", "mississippi" },
        { "Miss", "ambiguous", "mississippi" },
        { "MT", "ambiguous", "montana" },
        { "Mt", "ambiguous", "montana" },
        { "MO", "ambiguous", "missouri" },
        { "Mo", "ambiguous", "missouri" },
        { "NC", "ambiguous", "north" , "carolina" },
        { "ND", "ambiguous", "north" , "dakota" },
        { "NE", "ambiguous", "nebraska" },
        { "Ne", "ambiguous", "nebraska" },
        { "Neb", "ambiguous", "nebraska" },
        { "NH", "ambiguous", "new" , "hampshire" },
        { "NV", "", "nevada" },
        { "Nev", "", "nevada" },
        { "NY", "", "new" , "york" },
        { "OH", "ambiguous", "ohio" },
        { "OK", "ambiguous", "oklahoma" },
        { "Okla", "", "oklahoma" },
        { "OR", "ambiguous", "oregon" },
        { "Or", "ambiguous", "oregon" },
        { "Ore", "ambiguous", "oregon" },
        { "PA", "ambiguous", "pennsylvania" },
        { "Pa", "ambiguous", "pennsylvania" },
        { "Penn", "ambiguous", "pennsylvania" },
        { "RI", "ambiguous", "rhode" , "island" },
        { "SC", "ambiguous", "south" , "carolina" },
        { "SD", "ambiguous", "south" , "dakota" },
        { "TN", "ambiguous", "tennessee" },
        { "Tn", "ambiguous", "tennessee" },
        { "Tenn", "ambiguous", "tennessee" },
        { "TX", "ambiguous", "texas" },
        { "Tx", "ambiguous", "texas" },
        { "Tex", "ambiguous", "texas" },
        { "UT", "ambiguous", "utah" },
        { "VA", "ambiguous", "virginia" },
        { "WA", "ambiguous", "washington" },
        { "Wa", "ambiguous", "washington" },
        { "Wash", "ambiguous", "washington" },
        { "WI", "ambiguous", "wisconsin" },
        { "Wi", "ambiguous", "wisconsin" },
        { "WV", "ambiguous", "west" , "virginia" },
        { "WY", "ambiguous", "wyoming" },
        { "Wy", "ambiguous", "wyoming" },
        { "Wyo", "", "wyoming" },
        { "PR", "ambiguous", "puerto" , "rico" }
    };

    // Again hashtable for constant time searching: abbreviation -> row
    private static final Hashtable usStatesHash = new Hashtable();

    // initialize the Hashtable for usStates
    static {
        for (int i = 0; i < usStates.length; i++) {
            usStatesHash.put(usStates[i][0], usStates[i]);
        }
    }


    // instance state

    // the word relation that we are building
    private WordRelation wordRelation;

    // the current token Item
    private Item tokenItem;

    // a CART for classifying numbers
    private CART cart;


    /**
     * Constructs a TokenToWords processor.
     *
     * @param usNumbersCART the cart to use to classify numbers
     * @param prefixFSM the FSM consulted by isPronounceable()
     * @param suffixFSM the second FSM consulted by isPronounceable();
     *    a word is pronounceable only if both FSMs accept it
     */
    public TokenToWords(CART usNumbersCART,
                        PronounceableFSM prefixFSM,
                        PronounceableFSM suffixFSM) {
        this.cart = usNumbersCART;
        this.prefixFSM = prefixFSM;
        this.suffixFSM = suffixFSM;
    }


    /**
     * Returns the currently processing token Item.
     *
     * @return the current token Item; null if no item
     */
    public Item getTokenItem() {
        return tokenItem;
    }


    /**
     * Processes the utterance: creates the word relation and expands
     * every token of the token relation into it.
     *
     * @param  utterance  the utterance containing the tokens
     *
     * @throws IllegalStateException if the utterance has no token relation
     * @throws ProcessException declared by the UtteranceProcessor
     *    interface; not thrown directly by this method
     */
    public void processUtterance(Utterance utterance) throws ProcessException {
        Relation tokenRelation = utterance.getRelation(Relation.TOKEN);
        if (tokenRelation == null) {
            throw new IllegalStateException
                ("TokenToWords: Token relation does not exist");
        }

        wordRelation = WordRelation.createWordRelation(utterance, this);

        for (tokenItem = tokenRelation.getHead();
             tokenItem != null;
             tokenItem = tokenItem.getNext()) {

            FeatureSet featureSet = tokenItem.getFeatures();
            String tokenVal = featureSet.getString("name");

            // convert the token into a list of words
            tokenToWords(tokenVal);
        }
    }


    /**
     * Returns true if the given token value, together with the names of
     * the neighbouring tokens of the current token item, looks like part
     * of a phone number.
     *
     * @param tokenVal the string value of the token
     *
     * @return true or false
     */
    private boolean matchesPartPhoneNumber(String tokenVal) {

        String nextName = (String) tokenItem.findFeature("n.name");
        String nextNextName = (String) tokenItem.findFeature("n.n.name");
        String prevName = (String) tokenItem.findFeature("p.name");
        String prevPrevName = (String) tokenItem.findFeature("p.p.name");

        boolean prevIsThreeDigits = matches(threeDigitsPattern, prevName);

        if (matches(threeDigitsPattern, tokenVal)) {
            // NNN followed by NNN NNNN, with no number in front
            boolean leadingGroup = !matches(digitsPattern, prevName)
                && matches(threeDigitsPattern, nextName)
                && matches(fourDigitsPattern, nextNextName);
            // NNN followed by NNN-NNNN
            boolean beforeSevenDigits =
                matches(sevenPhoneNumberPattern, nextName);
            // NNN between NNN and NNNN
            boolean middleGroup = !matches(digitsPattern, prevPrevName)
                && prevIsThreeDigits
                && matches(fourDigitsPattern, nextName);
            if (leadingGroup || beforeSevenDigits || middleGroup) {
                return true;
            }
        }

        // NNNN preceded by NNN NNN, with no number behind
        return matches(fourDigitsPattern, tokenVal)
            && !matches(digitsPattern, nextName)
            && prevIsThreeDigits
            && matches(threeDigitsPattern, prevPrevName);
    }


    /**
     * Returns true if the given string is in the given string array.
     *
     * @param value the string to check
     * @param stringArray the array to check
     *
     * @return true if the string is in the array, false otherwise
     */
    private static boolean inStringArray(String value, String[] stringArray) {
        for (int i = 0; i < stringArray.length; i++) {
            if (stringArray[i].equals(value)) {
                return true;
            }
        }
        return false;
    }


    /**
     * Returns the first character of the given string, or the NUL
     * character if the string is null or empty. NUL is neither a letter
     * nor an ASCII digit, so the character tests in this class simply
     * fail for it instead of throwing.
     *
     * @param s the string to inspect
     *
     * @return the first character, or '\0'
     */
    private static char firstChar(String s) {
        return (s == null || s.length() == 0) ? '\0' : s.charAt(0);
    }


    /**
     * Converts the given Token into (word) Items in the WordRelation.
     * The branches are tried in order; the first one that applies wins.
     *
     * @param tokenVal the String value of the token, which may or may not be
     *                 same as the one in called "name" in flite
     */
    private void tokenToWords(String tokenVal) {

        FeatureSet tokenFeatures = tokenItem.getFeatures();
        String itemName = tokenFeatures.getString("name");
        int tokenLength = tokenVal.length();

        if (tokenFeatures.isPresent("phones")) {
            wordRelation.addWord(tokenVal);

        } else if ((tokenVal.equals("a") || tokenVal.equals("A")) &&
                   ((tokenItem.getNext() == null) ||
                    !(tokenVal.equals(itemName)) ||
                    !("".equals(tokenItem.findFeature("punc"))))) {
            /* if A is a sub part of a token, then its ey not ah */
            wordRelation.addWord("_a");

        } else if (matches(alphabetPattern, tokenVal)) {

            alphabeticToWords(tokenVal, tokenFeatures);

        } else if (matches(dottedAbbrevPattern, tokenVal)) {

            /* U.S.A. */
            // remove all dots
            String letters = Utilities.deleteChar(tokenVal, '.');
            NumberExpander.expandLetters(letters, wordRelation);

        } else if (matches(commaIntPattern, tokenVal)) {

            /* 99,999,999 */
            String number = Utilities.deleteChar(tokenVal, ',');
            NumberExpander.expandReal(number, wordRelation);

        } else if (matches(sevenPhoneNumberPattern, tokenVal)) {

            /* 234-3434  telephone numbers */
            int dashIndex = tokenVal.indexOf('-');
            String exchange = tokenVal.substring(0, dashIndex);
            String line = tokenVal.substring(dashIndex + 1);

            NumberExpander.expandDigits(exchange, wordRelation);
            wordRelation.addBreak();
            NumberExpander.expandDigits(line, wordRelation);

        } else if (matchesPartPhoneNumber(tokenVal)) {

            /* part of a telephone number */
            if ("".equals(tokenItem.findFeature("punc"))) {
                tokenItem.getFeatures().setString("punc", ",");
            }
            NumberExpander.expandDigits(tokenVal, wordRelation);
            wordRelation.addBreak();

        } else if (matches(numberTimePattern, tokenVal)) {

            /* 12:35 */
            int colonIndex = tokenVal.indexOf(':');
            String hours = tokenVal.substring(0, colonIndex);
            String minutes = tokenVal.substring(colonIndex + 1);

            NumberExpander.expandNumber(hours, wordRelation);
            if (!(minutes.equals("00"))) {
                NumberExpander.expandID(minutes, wordRelation);
            }

        } else if (matches(digits2DashPattern, tokenVal)) {

            /* 999-999-999 */
            digitsDashToWords(tokenVal);

        } else if (matches(digitsPattern, tokenVal)) {

            digitsToWords(tokenVal);

        } else if (isInitialBeforeCapital(tokenVal)) {

            // NOTE(review): same test as in alphabeticToWords(); this copy
            // can only fire for a capital letter that alphabetPattern
            // rejects -- kept to preserve the original branch order.
            initialToWords(tokenVal, tokenFeatures);

        } else if (matches(doublePattern, tokenVal)) {

            NumberExpander.expandReal(tokenVal, wordRelation);

        } else if (matches(ordinalPattern, tokenVal)) {

            /* explicit ordinals: drop the two-letter suffix */
            String number = tokenVal.substring(0, tokenLength - 2);
            NumberExpander.expandOrdinal(number, wordRelation);

        } else if (matches(usMoneyPattern, tokenVal)) {

            /* US money */
            usMoneyToWords(tokenVal);

        } else if (tokenLength > 0
                   && tokenVal.charAt(tokenLength - 1) == '%') {

            /* Y% */
            tokenToWords(tokenVal.substring(0, tokenLength - 1));
            wordRelation.addWord("per");
            wordRelation.addWord("cent");

        } else if (matches(numessPattern, tokenVal)) {

            /* 60s and 7s and 9s */
            tokenToWords(tokenVal.substring(0, tokenLength - 1));
            wordRelation.addWord("'s");

        } else if (tokenVal.indexOf('\'') != -1) {

            postropheToWords(tokenVal);

        } else if (matches(digitsSlashDigitsPattern, tokenVal) &&
                   tokenVal.equals(itemName)) {

            digitsSlashDigitsToWords(tokenVal);

        } else if (tokenVal.indexOf('-') != -1) {

            dashToWords(tokenVal);

        } else if (tokenLength > 1 &&
                   !matches(alphabetPattern, tokenVal)) {

            notJustAlphasToWords(tokenVal);

        } else {
            /* just a word */
            wordRelation.addWord(tokenVal.toLowerCase(Locale.ENGLISH));
        }
    }


    /**
     * Converts a token value that matched the alphabet pattern into
     * (word) Items in the WordRelation.
     *
     * @param tokenVal the token value
     * @param tokenFeatures the features of the current token item
     */
    private void alphabeticToWords(String tokenVal, FeatureSet tokenFeatures) {

        if (matches(romanNumbersPattern, tokenVal)) {

            /* XVIII */
            romanToWords(tokenVal);

        } else if (matches(illionPattern, tokenVal) &&
                   matches(usMoneyPattern,
                           (String) tokenItem.findFeature("p.name"))) {
            /* $ X -illion */
            wordRelation.addWord(tokenVal);
            wordRelation.addWord("dollars");

        } else if (matches(drStPattern, tokenVal)) {

            /* St Andrew's St, Dr King Dr */
            drStToWords(tokenVal);

        } else if (tokenVal.equals("Mr")) {

            tokenFeatures.setString("punc", "");
            wordRelation.addWord("mister");

        } else if (tokenVal.equals("Mrs")) {

            tokenFeatures.setString("punc", "");
            wordRelation.addWord("missus");

        } else if (isInitialBeforeCapital(tokenVal)) {

            initialToWords(tokenVal, tokenFeatures);

        } else if (isStateName(tokenVal)) {
            /*
              The name of a US state
              isStateName() has already added the full name of the
              state, so we're all set.
            */
        } else if (tokenVal.length() > 1 && !isPronounceable(tokenVal)) {
            /* Need common exception list */
            /* unpronouncable list of alphas */
            NumberExpander.expandLetters(tokenVal, wordRelation);

        } else {
            /* just a word */
            wordRelation.addWord(tokenVal.toLowerCase(Locale.ENGLISH));
        }
    }


    /**
     * Returns true if the token value is a single uppercase letter that is
     * followed by exactly one space and then by a token starting with an
     * uppercase letter.
     *
     * @param tokenVal the token value
     *
     * @return true or false
     */
    private boolean isInitialBeforeCapital(String tokenVal) {
        return tokenVal.length() == 1
            && isUppercaseLetter(tokenVal.charAt(0))
            && " ".equals(tokenItem.findFeature("n.whitespace"))
            && isUppercaseLetter
                (firstChar((String) tokenItem.findFeature("n.name")));
    }


    /**
     * Adds a single-letter token as a lowercase word ("_a" for the letter
     * A) and clears the punctuation of the current token.
     *
     * @param tokenVal the single-letter token value
     * @param tokenFeatures the features of the current token item
     */
    private void initialToWords(String tokenVal, FeatureSet tokenFeatures) {
        tokenFeatures.setString("punc", "");
        String letter = tokenVal.toLowerCase(Locale.ENGLISH);
        if (letter.equals("a")) {
            wordRelation.addWord("_a");
        } else {
            wordRelation.addWord(letter);
        }
    }


    /**
     * Convert the given digit token with dashes (e.g. 999-999-999)
     * into (word) Items in the WordRelation. Each dash-separated group is
     * expanded as digits and followed by a break.
     *
     * @param tokenVal the digit string
     */
    private void digitsDashToWords(String tokenVal) {
        int tokenLength = tokenVal.length();
        int groupStart = 0;
        for (int p = 0; p <= tokenLength; p++) {
            if (p == tokenLength || tokenVal.charAt(p) == '-') {
                String group = tokenVal.substring(groupStart, p);
                NumberExpander.expandDigits(group, wordRelation);
                wordRelation.addBreak();
                groupStart = p + 1;
            }
        }
    }


    /**
     * Convert the given digit token into (word) Items in the WordRelation.
     * Tokens whose "nsw" feature is "nide" are expanded as an ID; all
     * others are classified by the numbers CART.
     *
     * @param tokenVal the digit string
     */
    private void digitsToWords(String tokenVal) {
        FeatureSet featureSet = tokenItem.getFeatures();
        String nsw = "";
        if (featureSet.isPresent("nsw")) {
            nsw = featureSet.getString("nsw");
        }

        if (nsw.equals("nide")) {
            NumberExpander.expandID(tokenVal, wordRelation);
            return;
        }

        String realName = featureSet.getString("name");
        String digitsType;

        if (tokenVal.equals(realName)) {
            digitsType = (String) cart.interpret(tokenItem);
        } else {
            // the CART reads the token's "name", so expose the sub-token
            // to it temporarily and always put the real name back
            featureSet.setString("name", tokenVal);
            try {
                digitsType = (String) cart.interpret(tokenItem);
            } finally {
                featureSet.setString("name", realName);
            }
        }

        if ("ordinal".equals(digitsType)) {
            NumberExpander.expandOrdinal(tokenVal, wordRelation);
        } else if ("digits".equals(digitsType)) {
            NumberExpander.expandDigits(tokenVal, wordRelation);
        } else if ("year".equals(digitsType)) {
            NumberExpander.expandID(tokenVal, wordRelation);
        } else {
            NumberExpander.expandNumber(tokenVal, wordRelation);
        }
    }


    /**
     * Converts the given Roman numeral string into (word) Items in the
     * WordRelation: as "the" + ordinal in a king-like context, as a number
     * in a section-like context, and letter by letter otherwise (or when
     * the previous token carries punctuation).
     *
     * @param romanString the roman numeral string
     */
    private void romanToWords(String romanString) {
        if (!"".equals(tokenItem.findFeature("p.punc"))) {
            NumberExpander.expandLetters(romanString, wordRelation);
            return;
        }

        /* no preceeding punctuation */
        String n = String.valueOf(NumberExpander.expandRoman(romanString));

        if (kingLike(tokenItem)) {
            wordRelation.addWord("the");
            NumberExpander.expandOrdinal(n, wordRelation);
        } else if (sectionLike(tokenItem)) {
            NumberExpander.expandNumber(n, wordRelation);
        } else {
            NumberExpander.expandLetters(romanString, wordRelation);
        }
    }


    /**
     * Returns true if the given key is in the kingSectionLikeHash
     * Hashtable, and the value is the same as the given value.
     *
     * @param key key to look for in the hashtable
     * @param value the value to match
     *
     * @return true if it matches, or false if it does not or if
     * the key is not mapped to any value in the hashtable.
     */
    private static boolean inKingSectionLikeHash(String key, String value) {
        String hashValue = (String) kingSectionLikeHash.get(key);
        return hashValue != null && hashValue.equals(value);
    }


    /**
     * Returns true if the given token item is in a king-like context:
     * the previous token is a king name (e.g., "Louis"), or the token
     * before that is a king title (e.g., "King").
     *
     * @param tokenItem the token item to check
     *
     * @return true or false
     */
    public static boolean kingLike(Item tokenItem) {
        String kingName = ((String) tokenItem.findFeature("p.name"))
            .toLowerCase(Locale.ENGLISH);
        if (inKingSectionLikeHash(kingName, KING_NAMES)) {
            return true;
        }
        String kingTitle = ((String) tokenItem.findFeature("p.p.name"))
            .toLowerCase(Locale.ENGLISH);
        return inKingSectionLikeHash(kingTitle, KING_TITLES);
    }


    /**
     * Returns true if the given token item is in a section-like context,
     * i.e. the previous token is a word such as "chapter" or "act".
     *
     * @param tokenItem the token item to check
     *
     * @return true or false
     */
    public static boolean sectionLike(Item tokenItem) {
        String sectionType = ((String) tokenItem.findFeature("p.name"))
            .toLowerCase(Locale.ENGLISH);
        return inKingSectionLikeHash(sectionType, SECTION_TYPES);
    }


    /**
     * Converts the given "St" or "Dr" token to (word) Items in the
     * WordRelation: "street"/"saint" for St, "drive"/"doctor" for Dr,
     * chosen from the punctuation and capitalization around the token.
     * A trailing "." on the token is cleared afterwards.
     *
     * @param drStString the string with "St" and "Dr"
     */
    private void drStToWords(String drStString) {
        String street;
        String saint;
        char c0 = drStString.charAt(0);

        if (c0 == 's' || c0 == 'S') {
            street = "street";
            saint = "saint";
        } else {
            street = "drive";
            saint = "doctor";
        }

        FeatureSet featureSet = tokenItem.getFeatures();
        String punctuation = featureSet.getString("punc");

        // NOTE(review): the original looked up "punc" again here, which
        // repeats the test of the first branch below and left the "saint"
        // branch unreachable. flite's us_text.c appears to consult the
        // previous token's punctuation ("p.punc") -- confirm against flite.
        String previousPunctuation =
            (String) tokenItem.findFeature("p.punc");

        if (tokenItem.getNext() == null ||
            (punctuation != null && punctuation.indexOf(',') != -1)) {
            wordRelation.addWord(street);
        } else if (previousPunctuation != null
                   && previousPunctuation.indexOf(',') != -1) {
            wordRelation.addWord(saint);
        } else {
            char p0 = firstChar((String) tokenItem.findFeature("p.name"));
            char n0 = firstChar((String) tokenItem.findFeature("n.name"));

            if (isUppercaseLetter(p0) && isLowercaseLetter(n0)) {
                wordRelation.addWord(street);
            } else if (NumberExpander.isDigit(p0) && isLowercaseLetter(n0)) {
                wordRelation.addWord(street);
            } else if (isLowercaseLetter(p0) && isUppercaseLetter(n0)) {
                wordRelation.addWord(saint);
            } else if (" ".equals(tokenItem.findFeature("n.whitespace"))) {
                wordRelation.addWord(saint);
            } else {
                wordRelation.addWord(street);
            }
        }

        if (".".equals(punctuation)) {
            featureSet.setString("punc", "");
        }
    }


    /**
     * Converts US money string into (word) Items in the WordRelation.
     * The first character of the token (the currency sign) is skipped.
     *
     * @param tokenVal the US money string
     */
    private void usMoneyToWords(String tokenVal) {

        int dotIndex = tokenVal.indexOf('.');

        if (matches(illionPattern,
                    (String) tokenItem.findFeature("n.name"))) {
            // "$1.2 million": "dollars" is added when the -illion token
            // itself is processed
            NumberExpander.expandReal(tokenVal.substring(1), wordRelation);

        } else if (dotIndex == -1) {

            String dollars = tokenVal.substring(1);
            tokenToWords(dollars);
            wordRelation.addWord(dollars.equals("1") ? "dollar" : "dollars");

        } else if (dotIndex == (tokenVal.length() - 1) ||
                   (tokenVal.length() - dotIndex) > 3) {
            /* simply read as mumble point mumble */
            NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
            wordRelation.addWord("dollars");

        } else {
            String dollars = Utilities.deleteChar
                (tokenVal.substring(1, dotIndex), ',');
            String cents = tokenVal.substring(dotIndex + 1);

            NumberExpander.expandNumber(dollars, wordRelation);
            wordRelation.addWord(dollars.equals("1") ? "dollar" : "dollars");

            // ".00" adds nothing to the word list
            if (!cents.equals("00")) {
                NumberExpander.expandNumber(cents, wordRelation);
                wordRelation.addWord(cents.equals("01") ? "cent" : "cents");
            }
        }
    }


    /**
     * Convert the given apostrophed word into (word) Items in the Word
     * Relation.
     *
     * @param tokenVal the apostrophed word string
     */
    private void postropheToWords(String tokenVal) {
        int index = tokenVal.indexOf('\'');
        String suffix = tokenVal.substring(index).toLowerCase(Locale.ENGLISH);

        if (inStringArray(suffix, postrophes)) {
            tokenToWords(tokenVal.substring(0, index));
            wordRelation.addWord(suffix);

        } else if (suffix.equals("'tve") && index >= 2) {
            // drops the two characters before the apostrophe as well;
            // the index guard keeps substring() inside the string
            tokenToWords(tokenVal.substring(0, index - 2));
            wordRelation.addWord("'ve");

        } else {
            /* internal single quote deleted */
            StringBuffer buffer = new StringBuffer(tokenVal);
            buffer.deleteCharAt(index);
            tokenToWords(buffer.toString());
        }
    }


    /**
     * Parses a string of digits, returning -1 if it cannot be represented
     * as an int (for example because it is too long).
     *
     * @param digits the digit string
     *
     * @return the parsed value, or -1
     */
    private static int parseDigits(String digits) {
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return -1;
        }
    }


    /**
     * Convert the given digits/digits string into word (Items) in the
     * WordRelation. "1/2" becomes "a half", a smaller numerator over a
     * larger denominator is read as a fraction, and anything else
     * (including values too large for an int) as "X slash Y".
     *
     * @param tokenVal the digits/digits string
     */
    private void digitsSlashDigitsToWords(String tokenVal) {

        /* might be fraction, or not */
        int index = tokenVal.indexOf('/');
        String aaa = tokenVal.substring(0, index);
        String bbb = tokenVal.substring(index + 1);

        // if the previous token is a number, add an "and"
        if (matches(digitsPattern, (String) tokenItem.findFeature("p.name"))
            && tokenItem.getPrevious() != null) {
            wordRelation.addWord("and");
        }

        int a = parseDigits(aaa);
        int b = parseDigits(bbb);

        if (aaa.equals("1") && bbb.equals("2")) {
            wordRelation.addWord("a");
            wordRelation.addWord("half");
        } else if (a >= 0 && b >= 0 && a < b) {
            NumberExpander.expandNumber(aaa, wordRelation);
            NumberExpander.expandOrdinal(bbb, wordRelation);
            if (a > 1) {
                wordRelation.addWord("'s");
            }
        } else {
            NumberExpander.expandNumber(aaa, wordRelation);
            wordRelation.addWord("slash");
            NumberExpander.expandNumber(bbb, wordRelation);
        }
    }


    /**
     * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items
     * in the WordRelation. Two digit strings are read as "aaa to bbb";
     * anything else is expanded part by part, split at the first dash.
     *
     * @param tokenVal the dashed string
     */
    private void dashToWords(String tokenVal) {

        int index = tokenVal.indexOf('-');
        String aaa = tokenVal.substring(0, index);
        String bbb = tokenVal.substring(index + 1);

        if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) {
            FeatureSet featureSet = tokenItem.getFeatures();
            // each half temporarily stands in as the token's "name" while
            // it is expanded; the real name is put back afterwards
            // (the original left the name set to "")
            String realName = featureSet.getString("name");
            try {
                featureSet.setString("name", aaa);
                tokenToWords(aaa);
                wordRelation.addWord("to");
                featureSet.setString("name", bbb);
                tokenToWords(bbb);
            } finally {
                featureSet.setString("name", realName);
            }
        } else {
            tokenToWords(aaa);
            tokenToWords(bbb);
        }
    }


    /**
     * Convert the given string (which does not only consist of alphabet)
     * into (word) Items in the WordRelation. The string is split after the
     * first splittable position and both parts are expanded separately;
     * the token's "nsw" feature is set to "nide".
     *
     * @param tokenVal the string
     */
    private void notJustAlphasToWords(String tokenVal) {

        /* its not just alphas */
        int tokenLength = tokenVal.length();
        int splitIndex = -1;

        // the last character has no successor, so it is never a split point
        for (int i = 0; i < tokenLength - 1; i++) {
            if (isTextSplitable(tokenVal, i)) {
                splitIndex = i;
                break;
            }
        }

        if (splitIndex < 0) {
            // nothing to split on: recursing on the whole string would
            // never terminate, so treat it as a plain word
            wordRelation.addWord(tokenVal.toLowerCase(Locale.ENGLISH));
            return;
        }

        String head = tokenVal.substring(0, splitIndex + 1);
        String tail = tokenVal.substring(splitIndex + 1, tokenLength);

        FeatureSet featureSet = tokenItem.getFeatures();
        featureSet.setString("nsw", "nide");
        tokenToWords(head);
        tokenToWords(tail);
    }


    /**
     * Returns true if the given word is pronounceable, i.e. its lowercase
     * form is accepted by both the prefix and the suffix FSM.
     * This method is originally called us_aswd() in Flite 1.1.
     *
     * @param word the word to test
     *
     * @return true if the word is pronounceable, false otherwise
     */
    public boolean isPronounceable(String word) {
        String lowerCaseWord = word.toLowerCase(Locale.ENGLISH);
        return (prefixFSM.accept(lowerCaseWord) &&
                suffixFSM.accept(lowerCaseWord));
    }


    /**
     * Returns true if the given token is the name of a US state.
     * If it is, it will add the name of the state to (word) Items in the
     * WordRelation. Abbreviations marked "ambiguous" are only expanded in
     * an address-like context.
     *
     * @param tokenVal the token string
     *
     * @return true if the state name was added, false otherwise
     */
    private boolean isStateName(String tokenVal) {
        String[] state = (String[]) usStatesHash.get(tokenVal);
        if (state == null) {
            return false;
        }

        // check to see if the state initials are ambiguous
        // in the English language
        if (state[1].equals("ambiguous") && !isInStateContext()) {
            return false;
        }

        for (int j = 2; j < state.length; j++) {
            if (state[j] != null) {
                wordRelation.addWord(state[j]);
            }
        }
        return true;
    }


    /**
     * Returns true if the tokens around the current token item make it
     * plausible that it is a state abbreviation: the previous token looks
     * like a city followed by a comma, and the next token does not start
     * a new capitalized word.
     *
     * @return true or false
     */
    private boolean isInStateContext() {
        String previous = (String) tokenItem.findFeature("p.name");
        String next = (String) tokenItem.findFeature("n.name");

        // check if the previous word starts with a capital letter,
        // is at least 3 letters long, is an alphabet sequence,
        // and has a comma.
        boolean previousIsCity =
            (previous != null
             && previous.length() > 2
             && isUppercaseLetter(previous.charAt(0))
             && matches(alphabetPattern, previous)
             && ",".equals(tokenItem.findFeature("p.punc")));
        if (!previousIsCity) {
            return false;
        }

        int nextLength = (next == null) ? 0 : next.length();

        // check if next token starts with a lower case, or
        // this is the end of sentence, or if this token is followed
        // by a period ("."), or if the next token consists of
        // 5 or 10 digits.
        return isLowercaseLetter(firstChar(next))
            || tokenItem.getNext() == null
            || tokenItem.getFeatures().getString("punc").equals(".")
            || ((nextLength == 5 || nextLength == 10) &&
                matches(digitsPattern, next));
    }


    /**
     * Determines if the given input matches the given Pattern.
     *
     * @param pattern the pattern to match
     * @param input the string to test; may be null
     *
     * @return <code>true</code> if the input string matches the given Pattern;
     *         <code>false</code> otherwise, including for a null input
     */
    private static boolean matches(Pattern pattern, String input) {
        if (input == null) {
            return false;
        }
        Matcher m = pattern.matcher(input);
        return m.matches();
    }


    /**
     * Determines if the character at the given position of the given
     * input text is splittable. A position is splittable unless:
     * <p>
     * 1) the character and the following character are both letters
     *    in the English alphabet (A-Z and a-z), or
     * <p>
     * 2) the character and the following character are both digits
     * <p>
     * @param text the text containing the character of interest
     * @param index the index of the character of interest
     *
     * @return true if the position of the given text is splittable;
     *         false otherwise, or if there is no following character
     */
    private static boolean isTextSplitable(String text, int index) {
        if (index < 0 || index + 1 >= text.length()) {
            return false;
        }

        char c0 = text.charAt(index);
        char c1 = text.charAt(index + 1);

        if (isLetter(c0) && isLetter(c1)) {
            return false;
        }
        return !(NumberExpander.isDigit(c0) && NumberExpander.isDigit(c1));
    }


    /**
     * Returns true if the given character is a letter (a-z or A-Z).
     *
     * @param ch the character to test
     *
     * @return true or false
     */
    private static boolean isLetter(char ch) {
        return isLowercaseLetter(ch) || isUppercaseLetter(ch);
    }


    /**
     * Returns true if the given character is an uppercase letter (A-Z).
     *
     * @param ch the character to test
     *
     * @return true or false
     */
    private static boolean isUppercaseLetter(char ch) {
        return ('A' <= ch && ch <= 'Z');
    }


    /**
     * Returns true if the given character is a lowercase letter (a-z).
     *
     * @param ch the character to test
     *
     * @return true or false
     */
    private static boolean isLowercaseLetter(char ch) {
        return ('a' <= ch && ch <= 'z');
    }


    /**
     * Converts this object to its String representation
     *
     * @return the string representation of this object
     */
    public String toString() {
        return "TokenToWords";
    }
}