001/**
002 * Portions Copyright 2001-2003 Sun Microsystems, Inc.
003 * Portions Copyright 1999-2001 Language Technologies Institute, 
004 * Carnegie Mellon University.
005 * All Rights Reserved.  Use is subject to license terms.
006 *
007 * See the file "license.terms" for information on usage and
008 * redistribution of this file, and for a DISCLAIMER OF ALL 
009 * WARRANTIES.
010 */
011package com.sun.speech.freetts.en.us;
012
013import java.util.Hashtable;
014import java.util.regex.Matcher;
015import java.util.regex.Pattern;
016
017import com.sun.speech.freetts.FeatureSet;
018import com.sun.speech.freetts.Item;
019import com.sun.speech.freetts.ProcessException;
020import com.sun.speech.freetts.Relation;
021import com.sun.speech.freetts.Utterance;
022import com.sun.speech.freetts.UtteranceProcessor;
023import com.sun.speech.freetts.cart.CART;
024import com.sun.speech.freetts.util.Utilities;
025
026
027/**
028 * Converts the Tokens (in US English words) in an 
029 * Utterance into a list of words. It puts the produced list back
 * into the Utterance. Usually, the tokens that get expanded are numbers
 * like "23" (to "twenty" "three").
 * <p>It translates the following code from flite:
033 * <br>
034 * <code>
035 * lang/usenglish/us_text.c
036 * </code>
037 */
038public class TokenToWords implements UtteranceProcessor {
039
040    /** Regular expression for something that has a vowel */
041    private static final String RX_HAS_VOWEL = ".*[aeiouAEIOU].*";    
042                            
043    // Patterns for regular expression matching
044    private static final Pattern alphabetPattern;
045    private static final Pattern commaIntPattern;
046    private static final Pattern digits2DashPattern;
047    private static final Pattern digitsPattern;
048    private static final Pattern digitsSlashDigitsPattern;
049    private static final Pattern dottedAbbrevPattern;
050    private static final Pattern doublePattern;
051    private static final Pattern drStPattern;
052    private static final Pattern fourDigitsPattern;
053    private static final Pattern hasVowelPattern;
054    private static final Pattern illionPattern;
055    private static final Pattern numberTimePattern;
056    private static final Pattern numessPattern;
057    private static final Pattern ordinalPattern;
058    private static final Pattern romanNumbersPattern;
059    private static final Pattern sevenPhoneNumberPattern;
060    private static final Pattern threeDigitsPattern;
061    private static final Pattern usMoneyPattern;
062    
063    static {
064        alphabetPattern = Pattern.compile(USEnglish.RX_ALPHABET);
065        commaIntPattern = Pattern.compile(USEnglish.RX_COMMAINT);
066        digits2DashPattern = Pattern.compile(USEnglish.RX_DIGITS2DASH);
067        digitsPattern = Pattern.compile(USEnglish.RX_DIGITS);
068        digitsSlashDigitsPattern = Pattern.compile(USEnglish.RX_DIGITSSLASHDIGITS);
069        dottedAbbrevPattern = Pattern.compile(USEnglish.RX_DOTTED_ABBREV);
070        doublePattern = Pattern.compile(USEnglish.RX_DOUBLE);
071        drStPattern = Pattern.compile(USEnglish.RX_DRST);
072        fourDigitsPattern = Pattern.compile(USEnglish.RX_FOUR_DIGIT);
073        hasVowelPattern = Pattern.compile(USEnglish.RX_HAS_VOWEL);
074        illionPattern = Pattern.compile(USEnglish.RX_ILLION);
075        numberTimePattern = Pattern.compile(USEnglish.RX_NUMBER_TIME);
076        numessPattern = Pattern.compile(USEnglish.RX_NUMESS);
077        ordinalPattern = Pattern.compile(USEnglish.RX_ORDINAL_NUMBER);
078        romanNumbersPattern = Pattern.compile(USEnglish.RX_ROMAN_NUMBER);
079        sevenPhoneNumberPattern = Pattern.compile(USEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER);
080        threeDigitsPattern = Pattern.compile(USEnglish.RX_THREE_DIGIT);
081        usMoneyPattern = Pattern.compile(USEnglish.RX_US_MONEY);
082    }
083
084    // King-like words 
085    private static final String[] kingNames = {
086        "louis", "henry", "charles", "philip", "george",
087        "edward", "pius", "william", "richard", "ptolemy",
088        "john", "paul", "peter", "nicholas", "frederick",
089        "james", "alfonso", "ivan", "napoleon", "leo",
090        "gregory", "catherine", "alexandria", "pierre", "elizabeth",
091        "mary" };
092    
093    private static final String[] kingTitles = {
094        "king", "queen", "pope", "duke", "tsar",
095        "emperor", "shah", "caesar", "duchess", "tsarina",
096        "empress", "baron", "baroness", "sultan", "count",
097        "countess" };
098
099    // Section-like words
100    private static final String[] sectionTypes = {
101        "section", "chapter", "part", "phrase", "verse",
102        "scene", "act", "book", "volume", "chap",
103        "war", "apollo", "trek", "fortran" };
104    
105    /**
106     * Here we use a hashtable for constant time matching, instead of using
107     * if (A.equals(B) || A.equals(C) || ...) to match Strings
108     */
109    private static Hashtable kingSectionLikeHash = new Hashtable();
110
111    private static final String KING_NAMES = "kingNames";
112    private static final String KING_TITLES = "kingTitles";
113    private static final String SECTION_TYPES = "sectionTypes";
114
115    // Hashtable initialization
116    static {
117        for (int i = 0; i < kingNames.length; i++) {
118            kingSectionLikeHash.put(kingNames[i], KING_NAMES);
119        }
120        for (int i = 0; i < kingTitles.length; i++) {
121            kingSectionLikeHash.put(kingTitles[i], KING_TITLES);
122        }
123        for (int i = 0; i < sectionTypes.length; i++) {
124            kingSectionLikeHash.put(sectionTypes[i], SECTION_TYPES);
125        }
126    }
127
128    private static final String[] postrophes = {
129        "'s", "'ll", "'ve", "'d" };
130
131    // Finite state machines to check if a Token is pronounceable
132    private PronounceableFSM prefixFSM = null;
133    private PronounceableFSM suffixFSM = null;
134
135    // List of US states abbreviations and their full names
136    private static final String[][] usStates =
137    {
138        { "AL", "ambiguous", "alabama"  },
139        { "Al", "ambiguous", "alabama"  },
140        { "Ala", "", "alabama"  },
141        { "AK", "", "alaska"  },
142        { "Ak", "", "alaska"  },
143        { "AZ", "", "arizona"  },
144        { "Az", "", "arizona"  },
145        { "CA", "", "california"  },
146        { "Ca", "", "california"  },
147        { "Cal", "ambiguous", "california"  },
148        { "Calif", "", "california"  },
149        { "CO", "ambiguous", "colorado"  },
150        { "Co", "ambiguous", "colorado"  },
151        { "Colo", "", "colorado"  },
152        { "DC", "", "d" , "c" },
153        { "DE", "", "delaware"  },
154        { "De", "ambiguous", "delaware"  },
155        { "Del", "ambiguous", "delaware"  },
156        { "FL", "", "florida"  },
157        { "Fl", "ambiguous", "florida"  },
158        { "Fla", "", "florida"  },
159        { "GA", "", "georgia"  },
160        { "Ga", "", "georgia"  },
161        { "HI", "ambiguous", "hawaii"  },
162        { "Hi", "ambiguous", "hawaii"  },
163        { "IA", "", "iowa"  },
164        { "Ia", "ambiguous", "iowa"  },
165        { "IN", "ambiguous", "indiana"  },
166        { "In", "ambiguous", "indiana"  },
167        { "Ind", "ambiguous", "indiana"  },
168        { "ID", "ambiguous", "idaho"  },
169        { "IL", "ambiguous", "illinois"  },
170        { "Il", "ambiguous", "illinois"  },
171        { "ILL", "ambiguous", "illinois"  },
172        { "KS", "", "kansas"  },
173        { "Ks", "", "kansas"  },
174        { "Kans", "", "kansas"  },
175        { "KY", "ambiguous", "kentucky"  },
176        { "Ky", "ambiguous", "kentucky"  },
177        { "LA", "ambiguous", "louisiana"  },
178        { "La", "ambiguous", "louisiana"  },
179        { "Lou", "ambiguous", "louisiana"  },
180        { "Lous", "ambiguous", "louisiana"  },
181        { "MA", "ambiguous", "massachusetts"  },
182        { "Mass", "ambiguous", "massachusetts"  },
183        { "Ma", "ambiguous", "massachusetts"  },
184        { "MD", "ambiguous", "maryland"  },
185        { "Md", "ambiguous", "maryland"  },
186        { "ME", "ambiguous", "maine"  },
187        { "Me", "ambiguous", "maine"  },
188        { "MI", "", "michigan"  },
189        { "Mi", "ambiguous", "michigan"  },
190        { "Mich", "ambiguous", "michigan"  },
191        { "MN", "ambiguous", "minnestota"  },
192        { "Minn", "ambiguous", "minnestota"  },
193        { "MS", "ambiguous", "mississippi"  },
194        { "Miss", "ambiguous", "mississippi"  },
195        { "MT", "ambiguous", "montanna"  },
196        { "Mt", "ambiguous", "montanna"  },
197        { "MO", "ambiguous", "missouri"  },
198        { "Mo", "ambiguous", "missouri"  },
199        { "NC", "ambiguous", "north" , "carolina" },
200        { "ND", "ambiguous", "north" , "dakota" },
201        { "NE", "ambiguous", "nebraska"  },
202        { "Ne", "ambiguous", "nebraska"  },
203        { "Neb", "ambiguous", "nebraska"  },
204        { "NH", "ambiguous", "new" , "hampshire" },
205        { "NV", "", "nevada"  },
206        { "Nev", "", "nevada"  },
207        { "NY", "", "new" , "york" },
208        { "OH", "ambiguous", "ohio"  },
209        { "OK", "ambiguous", "oklahoma"  },
210        { "Okla", "", "oklahoma"  },
211        { "OR", "ambiguous", "oregon"  },
212        { "Or", "ambiguous", "oregon"  },
213        { "Ore", "ambiguous", "oregon"  },
214        { "PA", "ambiguous", "pennsylvania"  },
215        { "Pa", "ambiguous", "pennsylvania"  },
216        { "Penn", "ambiguous", "pennsylvania"  },
217        { "RI", "ambiguous", "rhode" , "island" },
218        { "SC", "ambiguous", "south" , "carlolina" },
219        { "SD", "ambiguous", "south" , "dakota" },
220        { "TN", "ambiguous", "tennesee"  },
221        { "Tn", "ambiguous", "tennesee"  },
222        { "Tenn", "ambiguous", "tennesee"  },
223        { "TX", "ambiguous", "texas"  },
224        { "Tx", "ambiguous", "texas"  },
225        { "Tex", "ambiguous", "texas"  },
226        { "UT", "ambiguous", "utah"  },
227        { "VA", "ambiguous", "virginia"  },
228        { "WA", "ambiguous", "washington"  },
229        { "Wa", "ambiguous", "washington"  },
230        { "Wash", "ambiguous", "washington"  },
231        { "WI", "ambiguous", "wisconsin"  },
232        { "Wi", "ambiguous", "wisconsin"  },
233        { "WV", "ambiguous", "west" , "virginia" },
234        { "WY", "ambiguous", "wyoming"  },
235        { "Wy", "ambiguous", "wyoming"  },
236        { "Wyo", "", "wyoming"  },
237        { "PR", "ambiguous", "puerto" , "rico" }
238    };
239
240    // Again hashtable for constant time searching
241    private static Hashtable usStatesHash = new Hashtable();
242    
243    // initialize the Hashtable for usStates
244    static {
245        for (int i = 0; i < usStates.length; i++) {
246            usStatesHash.put(usStates[i][0], usStates[i]);
247        }
248    };
249
250
    // Per-instance working state.  wordRelation and tokenItem are
    // (re)assigned by processUtterance(); cart is set by the constructor.

    // the word relation that we are building for the current utterance
    private WordRelation wordRelation;

    // the token Item currently being converted; also used as the loop
    // cursor in processUtterance(), so it is null once that loop finishes
    private Item tokenItem;

    // a CART for classifying numbers (consulted by digitsToWords())
    private CART cart;
261
262
263    /**
264     * Constructs a default USTokenWordProcessor. It uses the USEnglish
265     * regular expression set (USEngRegExp) by default.
266     *
267     * @param usNumbersCART the cart to use to classify numbers
268     */
269    public TokenToWords(CART usNumbersCART,
270                        PronounceableFSM prefixFSM,
271                        PronounceableFSM suffixFSM) {
272        this.cart = usNumbersCART;
273        this.prefixFSM = prefixFSM;
274        this.suffixFSM = suffixFSM;
275    }
276
277
278    /**
279     * Returns the currently processing token Item.
280     *
281     * @return the current token Item; null if no item
282     */
283    public Item getTokenItem() {
284        return tokenItem;
285    }
286
287
288    /**
289     *  process the utterance
290     *
291     * @param  utterance  the utterance contain the tokens
292     *
293     * @throws ProcessException if an IOException is thrown during the
294     *         processing of the utterance
295     */
296    public void processUtterance(Utterance utterance) throws ProcessException {
297        Relation tokenRelation;
298        if ((tokenRelation = utterance.getRelation(Relation.TOKEN)) == null) {
299            throw new IllegalStateException
300                ("TokenToWords: Token relation does not exist");
301        }
302        
303        wordRelation = WordRelation.createWordRelation(utterance, this);
304        
305        for (tokenItem = tokenRelation.getHead();
306             tokenItem != null;
307             tokenItem = tokenItem.getNext()) {
308
309            FeatureSet featureSet = tokenItem.getFeatures();
310            String tokenVal = featureSet.getString("name");
311            
312            // convert the token into a list of words
313            tokenToWords(tokenVal);
314        }
315    }
316
317
318    /**
319     * Returns true if the given token matches part of a phone number
320     *
321     * @param tokenItem the token
322     * @param tokenVal the string value of the token
323     *
324     * @return true or false
325     */
326    private boolean matchesPartPhoneNumber(String tokenVal) {
327
328        String n_name = (String) tokenItem.findFeature("n.name");
329        String n_n_name = (String) tokenItem.findFeature("n.n.name");
330        String p_name = (String) tokenItem.findFeature("p.name");
331        String p_p_name = (String) tokenItem.findFeature("p.p.name");
332
333        boolean matches3DigitsP_name = matches(threeDigitsPattern, p_name);
334
335        return ((matches(threeDigitsPattern, tokenVal) &&
336                 ((!matches(digitsPattern, p_name)
337                   && matches(threeDigitsPattern, n_name)
338                   && matches(fourDigitsPattern, n_n_name)) ||
339                  (matches(sevenPhoneNumberPattern, n_name)) ||
340                  (!matches(digitsPattern, p_p_name)
341                   && matches3DigitsP_name
342                   && matches(fourDigitsPattern, n_name)))) ||
343                (matches(fourDigitsPattern, tokenVal) &&
344                 (!matches(digitsPattern, n_name)
345                  && matches3DigitsP_name
346                  && matches(threeDigitsPattern, p_p_name))));
347    }
348    
349
350    /**
351     * Returns true if the given string is in the given string array.
352     *
353     * @param value the string to check
354     * @param stringArray the array to check
355     *
356     * @return true if the string is in the array, false otherwise
357     */
358    private static boolean inStringArray(String value, String[] stringArray) {
359        for (int i = 0; i < stringArray.length; i++) {
360            if (stringArray[i].equals(value)) {
361                return true;
362            }
363        }
364        return false;
365    }
366
367
368
    /**
     * Converts the given Token into (word) Items in the WordRelation.
     * The value is tested against a fixed sequence of token shapes; the
     * first test that succeeds decides how the value is expanded, so the
     * order of the branches below is significant.  Several branches (and
     * several helpers) call this method again on a part of the value.
     *
     * @param  tokenVal the String value to expand; it is the "name"
     *                  feature of the current token on the initial call,
     *                  but only a part of it on recursive calls
     *
     */
    private void tokenToWords(String tokenVal) {

        FeatureSet tokenFeatures = tokenItem.getFeatures();
        // the complete token name; differs from tokenVal on recursive calls
        String itemName = tokenFeatures.getString("name");
        int tokenLength = tokenVal.length();

        if (tokenFeatures.isPresent("phones")) {
            // token has a "phones" feature: add the value unchanged
            wordRelation.addWord(tokenVal);

        } else if ((tokenVal.equals("a") || tokenVal.equals("A")) &&
                ((tokenItem.getNext() == null) ||
                 !(tokenVal.equals(itemName)) ||
                 !(((String) tokenItem.findFeature("punc")).equals("")))) {
            /* if A is a sub part of a token, then its ey not ah */
            // also taken when the token is the last one or carries
            // punctuation
            wordRelation.addWord("_a");

        } else if (matches(alphabetPattern, tokenVal)) {

            if (matches(romanNumbersPattern, tokenVal)) {
                
                /* XVIII */
                romanToWords(tokenVal);
                
            } else if (matches(illionPattern, tokenVal) &&
                       matches(usMoneyPattern, 
                               (String) tokenItem.findFeature("p.name"))) {
                /* $ X -illion */
                // the preceding money token did not say "dollars"
                // (see usMoneyToWords), so it is added here
                wordRelation.addWord(tokenVal);
                wordRelation.addWord("dollars");            
                
            } else if (matches(drStPattern, tokenVal)) {
                
                /* St Andrew's St, Dr King Dr */
                drStToWords(tokenVal);
                
            } else if (tokenVal.equals("Mr")) {
                
                // clear the token's punctuation and speak the title
                tokenItem.getFeatures().setString("punc", "");
                wordRelation.addWord("mister");
                
            } else if (tokenVal.equals("Mrs")) {
                
                tokenItem.getFeatures().setString("punc", "");
                wordRelation.addWord("missus");
                
            } else if (tokenLength == 1
                       && isUppercaseLetter(tokenVal.charAt(0))
                       && ((String)tokenItem.findFeature("n.whitespace")).equals(" ")
                       && isUppercaseLetter
                       (((String) tokenItem.findFeature("n.name")).charAt(0))) {
                
                // single capital letter, one space, then a token starting
                // with a capital: clear the punctuation and add the
                // letter in lowercase ("_a" for the letter a)
                tokenFeatures.setString("punc", "");
                String aaa = tokenVal.toLowerCase();
                if (aaa.equals("a")) {
                    wordRelation.addWord("_a");
                } else {
                    wordRelation.addWord(aaa);
                }
            } else if (isStateName(tokenVal)) {
                /*
                  The name of a US state
                  isStateName() has already added the full name of the
                  state, so we're all set.
                */
            } else if (tokenLength > 1 && !isPronounceable(tokenVal)) {
                /* Need common exception list */
                /* unpronouncable list of alphas */
                NumberExpander.expandLetters
                    (tokenVal, wordRelation);
                
            } else {
                /* just a word */
                wordRelation.addWord(tokenVal.toLowerCase());
            }
            
        } else if (matches(dottedAbbrevPattern, tokenVal)) {
            
            /* U.S.A. */
            // remove all dots
            String aaa = Utilities.deleteChar(tokenVal, '.'); 
            NumberExpander.expandLetters(aaa, wordRelation);
            
        } else if (matches(commaIntPattern, tokenVal)) {
            
            /* 99,999,999 */
            // remove all commas
            String aaa = Utilities.deleteChar(tokenVal, ',');
            NumberExpander.expandReal(aaa, wordRelation);
            
        } else if (matches(sevenPhoneNumberPattern, tokenVal)) {
            
            /* 234-3434  telephone numbers */
            // digits before the first dash, a break, digits after it
            int dashIndex = tokenVal.indexOf('-');
            String aaa = tokenVal.substring(0, dashIndex);
            String bbb = tokenVal.substring(dashIndex+1);
            
            NumberExpander.expandDigits(aaa, wordRelation);
            wordRelation.addBreak();
            NumberExpander.expandDigits(bbb, wordRelation);
            
        } else if (matchesPartPhoneNumber(tokenVal)) {
            
            /* part of a telephone number */
            // give the token a "," as punctuation if it has none
            String punctuation = (String) tokenItem.findFeature("punc");
            if (punctuation.equals("")) {
                tokenItem.getFeatures().setString("punc", ",");
            }
            NumberExpander.expandDigits(tokenVal, wordRelation);
            wordRelation.addBreak();
                
        } else if (matches(numberTimePattern, tokenVal)) {
            
            /* 12:35 */
            int colonIndex = tokenVal.indexOf(':');
            String aaa = tokenVal.substring(0, colonIndex);
            String bbb = tokenVal.substring(colonIndex+1);
            
            NumberExpander.expandNumber(aaa, wordRelation);
            // nothing is added for the part after the colon when it
            // is "00"
            if (!(bbb.equals("00"))) {
                NumberExpander.expandID(bbb, wordRelation);
            }
            
        } else if (matches(digits2DashPattern, tokenVal)) {
            
            /* 999-999-999 */
            digitsDashToWords(tokenVal);
            
        } else if (matches(digitsPattern, tokenVal)) {
            
            digitsToWords(tokenVal);
            
        } else if (tokenLength == 1
                   && isUppercaseLetter(tokenVal.charAt(0))
                   && ((String)tokenItem.findFeature("n.whitespace")).equals
                   (" ")
                   && isUppercaseLetter
                   (((String) tokenItem.findFeature("n.name")).charAt(0))) {
            
            // NOTE(review): same test and same action as the single
            // capital letter case inside the alphabetic branch above;
            // presumably only reachable for letters that alphabetPattern
            // does not match - confirm
            tokenFeatures.setString("punc", "");
            String aaa = tokenVal.toLowerCase();
            if (aaa.equals("a")) {
                wordRelation.addWord("_a");
            } else {
                wordRelation.addWord(aaa);
            }
        } else if (matches(doublePattern, tokenVal)) {

            NumberExpander.expandReal(tokenVal, wordRelation);

        } else if (matches(ordinalPattern, tokenVal)) {
            
            /* explicit ordinals */
            // drop the last two characters before expanding
            String aaa = tokenVal.substring(0, tokenLength - 2);
            NumberExpander.expandOrdinal(aaa, wordRelation);

        } else if (matches(usMoneyPattern, tokenVal)) {

            /* US money */
            usMoneyToWords(tokenVal);

        } else if (tokenLength > 0
                   && tokenVal.charAt(tokenLength - 1) == '%') {
            
            /* Y% */
            // expand what precedes the '%' recursively, then "per cent"
            tokenToWords(tokenVal.substring(0, tokenLength - 1));
            wordRelation.addWord("per");
            wordRelation.addWord("cent");

        } else if (matches(numessPattern, tokenVal)) {

            /* 60s and 7s and 9s */
            // expand everything but the last character recursively,
            // then add "'s"
            tokenToWords(tokenVal.substring(0, tokenLength - 1));
            wordRelation.addWord("'s");
            
        } else if (tokenVal.indexOf('\'') != -1) {
            
            postropheToWords(tokenVal);
            
        } else if (matches(digitsSlashDigitsPattern, tokenVal) &&
                   tokenVal.equals(itemName)) {

            // only when the value is the complete token, not a sub part
            digitsSlashDigitsToWords(tokenVal);

        } else if (tokenVal.indexOf('-') != -1) {
            
            dashToWords(tokenVal);
            
        } else if (tokenLength > 1 &&
                   !matches(alphabetPattern, tokenVal)) {
            
            notJustAlphasToWords(tokenVal);

        } else {
            /* just a word */
            wordRelation.addWord(tokenVal.toLowerCase());
        }   
    }
572
573        
574    /**
575     * Convert the given digit token with dashes (e.g. 999-999-999)
576     * into (word) Items in the WordRelation.
577     *
578     * @param tokenVal  the digit string
579     */
580    private void digitsDashToWords(String tokenVal) {
581        int tokenLength = tokenVal.length();
582        int a = 0;
583        for (int p = 0; p <= tokenLength; p++) {
584            if (p == tokenLength || tokenVal.charAt(p) == '-') {
585                String aaa = tokenVal.substring(a, p);
586                NumberExpander.expandDigits(aaa, wordRelation);
587                wordRelation.addBreak();
588                a = p+1;
589            }
590        }
591    }
592
593        
594    /**
595     * Convert the given digit token into (word) Items in the WordRelation.
596     *
597     * @param tokenVal  the digit string
598     */
599    private void digitsToWords(String tokenVal) {
600        FeatureSet featureSet = tokenItem.getFeatures();
601        String nsw = "";
602        if (featureSet.isPresent("nsw")) {
603            nsw = featureSet.getString("nsw");
604        }
605
606        if (nsw.equals("nide")) {
607            NumberExpander.expandID(tokenVal, wordRelation);
608        } else {
609            String rName = featureSet.getString("name");
610            String digitsType = null;
611            
612            if (tokenVal.equals(rName)) {
613                digitsType = (String) cart.interpret(tokenItem);
614            } else {
615                featureSet.setString("name", tokenVal);
616                digitsType = (String) cart.interpret(tokenItem);
617                featureSet.setString("name", rName);
618            }
619            
620            if (digitsType.equals("ordinal")) {
621                NumberExpander.expandOrdinal(tokenVal, wordRelation);
622            } else if (digitsType.equals("digits")) {
623                NumberExpander.expandDigits(tokenVal, wordRelation);
624            } else if (digitsType.equals("year")) {
625                NumberExpander.expandID(tokenVal, wordRelation);
626            } else {
627                NumberExpander.expandNumber(tokenVal, wordRelation);
628            }
629        }
630    }
631    
632    
633    /**
634     * Converts the given Roman numeral string into (word) Items in the
635     * WordRelation.
636     *
637     * @param romanString the roman numeral string
638     */
639    private void romanToWords(String romanString) {
640        String punctuation = (String) tokenItem.findFeature("p.punc");
641        
642        if (punctuation.equals("")) {
643            /* no preceeding punctuation */
644            String n = String.valueOf(NumberExpander.expandRoman(romanString));
645            
646            if (kingLike(tokenItem)) {
647                wordRelation.addWord("the");
648                NumberExpander.expandOrdinal(n, wordRelation);
649            } else if (sectionLike(tokenItem)) {
650                NumberExpander.expandNumber(n, wordRelation);
651            } else {
652                NumberExpander.expandLetters(romanString, wordRelation);
653            }
654        } else {
655            NumberExpander.expandLetters(romanString, wordRelation);
656        }
657    }
658    
659
660    /**
661     * Returns true if the given key is in the kingSectionLikeHash
662     * Hashtable, and the value is the same as the given value.
663     *
664     * @param key key to look for in the hashtable
665     * @param value the value to match
666     *
667     * @return true if it matches, or false if it does not or if
668     * the key is not mapped to any value in the hashtable.
669     */
670    private static boolean inKingSectionLikeHash(String key, String value) {
671        String hashValue = (String) kingSectionLikeHash.get(key);
672        if (hashValue != null) {
673            return (hashValue.equals(value));
674        } else {
675            return false;
676        }
677    }
678
679
680
681    /**
682     * Returns true if the given token item contains a token that is
683     * in a king-like context, e.g., "King" or "Louis".
684     *
685     * @param tokenItem the token item to check
686     *
687     * @return true or false
688     */
689    public static boolean kingLike(Item tokenItem) {
690        String kingName = 
691            ((String) tokenItem.findFeature("p.name")).toLowerCase();
692        if (inKingSectionLikeHash(kingName, KING_NAMES)) {
693            return true;
694        } else {
695            String kingTitle =
696                ((String) tokenItem.findFeature("p.p.name")).toLowerCase();
697            return inKingSectionLikeHash(kingTitle, KING_TITLES);
698        }
699    }
700
701    
702    /**
703     * Returns true if the given token item contains a token that is
704     * in a section-like context, e.g., "chapter" or "act".
705     *
706     * @param tokenItem the token item to check
707     *
708     * @return true or false
709     */
710    public static boolean sectionLike(Item tokenItem) {
711        String sectionType =
712            ((String) tokenItem.findFeature("p.name")).toLowerCase();
713        return inKingSectionLikeHash(sectionType, SECTION_TYPES);
714    }
715
716
717    /**
718     * Converts the given string containing "St" and "Dr" to (word) Items
719     * in the WordRelation.
720     *
721     * @param drStString the string with "St" and "Dr"
722     */
723    private void drStToWords(String drStString) {
724        String street = null;
725        String saint = null;
726        char c0 = drStString.charAt(0);
727
728        if (c0 == 's' || c0 == 'S') {
729            street = "street";
730            saint = "saint";
731        } else {
732            street = "drive";
733            saint = "doctor";
734        }
735        
736        FeatureSet featureSet = tokenItem.getFeatures();
737        String punctuation = featureSet.getString("punc");
738
739        String featPunctuation = (String) tokenItem.findFeature("punc");
740
741        if (tokenItem.getNext() == null ||
742            punctuation.indexOf(',') != -1) {
743            wordRelation.addWord(street);
744        } else if (featPunctuation.equals(",")) {
745            wordRelation.addWord(saint);
746        } else {
747            String pName = (String) tokenItem.findFeature("p.name");
748            String nName = (String) tokenItem.findFeature("n.name");
749
750            char p0 = pName.charAt(0);
751            char n0 = nName.charAt(0);
752
753            if (isUppercaseLetter(p0) && isLowercaseLetter(n0)) {
754                wordRelation.addWord(street);
755            } else if (NumberExpander.isDigit(p0) && isLowercaseLetter(n0)) {
756                wordRelation.addWord(street);
757            } else if (isLowercaseLetter(p0) && isUppercaseLetter(n0)) {
758                wordRelation.addWord(saint);
759            } else {
760                String whitespace = (String) tokenItem.findFeature("n.whitespace");
761                if (whitespace.equals(" ")) {
762                    wordRelation.addWord(saint);
763                } else {
764                    wordRelation.addWord(street);
765                }
766            }
767        }
768
769        if (punctuation != null && punctuation.equals(".")) {
770            featureSet.setString("punc", "");
771        }
772    }
773                
774
775    /**
776     * Converts US money string into (word) Items in the WordRelation.
777     *
778     * @param tokenVal the US money string
779     */
780    private void usMoneyToWords(String tokenVal) {
781        
782        int dotIndex = tokenVal.indexOf('.');
783
784        if (matches(illionPattern, 
785                    (String) tokenItem.findFeature("n.name"))) {
786            NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
787        } else if (dotIndex == -1) {
788
789            String aaa = tokenVal.substring(1);
790            tokenToWords(aaa);
791
792            if (aaa.equals("1")) {
793                wordRelation.addWord("dollar");
794            } else {
795                wordRelation.addWord("dollars");
796            }
797        } else if (dotIndex == (tokenVal.length() - 1) ||
798                   (tokenVal.length() - dotIndex) > 3) {
799            /* simply read as mumble point mumble */
800            NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
801            wordRelation.addWord("dollars");
802        } else {
803            String aaa = tokenVal.substring(1, dotIndex);
804            aaa = Utilities.deleteChar(aaa, ',');
805            String bbb = tokenVal.substring(dotIndex+1);
806            
807            NumberExpander.expandNumber(aaa, wordRelation);
808
809            if (aaa.equals("1")) {
810                wordRelation.addWord("dollar");
811            } else {
812                wordRelation.addWord("dollars");
813            }
814
815            if (bbb.equals("00")) {
816                // add nothing to the word list
817            } else {
818                NumberExpander.expandNumber(bbb, wordRelation);
819                if (bbb.equals("01")) {
820                    wordRelation.addWord("cent");
821                } else {
822                    wordRelation.addWord("cents");
823                }
824            }
825        }
826    }       
827
828
829    /**
830     * Convert the given apostrophed word into (word) Items in the Word
831     * Relation.
832     *
833     * @param tokenVal the apostrophed word string
834     */
835    private void postropheToWords(String tokenVal) {
836        int index = tokenVal.indexOf('\'');
837        String bbb = tokenVal.substring(index).toLowerCase();
838
839        if (inStringArray(bbb, postrophes)) {
840            String aaa = tokenVal.substring(0, index);
841            tokenToWords(aaa);
842            wordRelation.addWord(bbb);
843
844        } else if (bbb.equals("'tve")) {
845            String aaa = tokenVal.substring(0, index-2);
846            tokenToWords(aaa);
847            wordRelation.addWord("'ve");
848
849        } else {
850            /* internal single quote deleted */
851            StringBuffer buffer = new StringBuffer(tokenVal);
852            buffer.deleteCharAt(index);
853            tokenToWords(buffer.toString());
854        }
855    }
856
857
858    /**
859     * Convert the given digits/digits string into word (Items) in the
860     * WordRelation.
861     *
862     * @param tokenVal the digits/digits string
863     */
864    private void digitsSlashDigitsToWords(String tokenVal) {
865
866        /* might be fraction, or not */
867        int index = tokenVal.indexOf('/');
868        String aaa = tokenVal.substring(0, index);
869        String bbb = tokenVal.substring(index+1);
870        int a, b;
871        
872        // if the previous token is a number, add an "and"
873        if (matches(digitsPattern, (String) tokenItem.findFeature("p.name"))
874            && tokenItem.getPrevious() != null) {
875            wordRelation.addWord("and");
876        }
877
878        if (aaa.equals("1") && bbb.equals("2")) {
879            wordRelation.addWord("a");
880            wordRelation.addWord("half");
881        } else if ((a = Integer.parseInt(aaa)) < (b = Integer.parseInt(bbb))) {
882            NumberExpander.expandNumber(aaa, wordRelation);
883            NumberExpander.expandOrdinal(bbb, wordRelation);
884            if (a > 1) {
885                wordRelation.addWord("'s");
886            }
887        } else {
888            NumberExpander.expandNumber(aaa, wordRelation);
889            wordRelation.addWord("slash");
890            NumberExpander.expandNumber(bbb, wordRelation);
891        }
892    }
893
894
895    /**
896     * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items
897     * in the WordRelation.
898     *
899     * @param tokenVal the dashed string
900     */
901    private void dashToWords(String tokenVal) {
902
903        int index = tokenVal.indexOf('-');
904        String aaa = tokenVal.substring(0, index);
905        String bbb = tokenVal.substring(index+1, tokenVal.length());
906
907        if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) {
908            FeatureSet featureSet = tokenItem.getFeatures();
909            featureSet.setString("name", aaa);
910            tokenToWords(aaa);
911            wordRelation.addWord("to");
912            featureSet.setString("name", bbb);
913            tokenToWords(bbb);
914            featureSet.setString("name", "");
915        } else {            
916            tokenToWords(aaa);
917            tokenToWords(bbb);
918        }
919    }
920
921
922    /**
923     * Convert the given string (which does not only consist of alphabet)
924     * into (word) Items in the WordRelation.
925     *
926     * @param tokenVal the string
927     */
928    private void notJustAlphasToWords(String tokenVal) {
929
930        /* its not just alphas */
931        int index = 0;
932        int tokenLength = tokenVal.length();
933
934        for (; index < tokenLength; index++) {
935            if (isTextSplitable(tokenVal, index)) {
936                break;
937            }
938        }
939        
940        String aaa = tokenVal.substring(0, index+1);
941        String bbb = tokenVal.substring(index+1, tokenLength);
942        
943        FeatureSet featureSet = tokenItem.getFeatures();
944        featureSet.setString("nsw", "nide");
945        tokenToWords(aaa);
946        tokenToWords(bbb);
947    }
948
949
950    /**
951     * Returns true if the given word is pronounceable.
952     * This method is originally called us_aswd() in Flite 1.1.
953     *
954     * @param word the word to test
955     *
956     * @return true if the word is pronounceable, false otherwise
957     */
958    public boolean isPronounceable(String word) {
959        String lowerCaseWord = word.toLowerCase();
960        return (prefixFSM.accept(lowerCaseWord) &&
961                suffixFSM.accept(lowerCaseWord));
962    }
963
964
965    /**
966     * Returns true if the given token is the name of a US state.
967     * If it is, it will add the name of the state to (word) Items in the
968     * WordRelation.
969     *
970     * @param tokenVal the token string
971     */
972    private boolean isStateName(String tokenVal) {
973        String[] state = (String[]) usStatesHash.get(tokenVal);
974        if (state != null) {
975            boolean expandState = false;
976
977            // check to see if the state initials are ambiguous
978            // in the English language
979            if (state[1].equals("ambiguous")) {
980                String previous = (String) tokenItem.findFeature("p.name");
981                String next = (String) tokenItem.findFeature("n.name");
982
983                // System.out.println("previous = " + previous);
984                // System.out.println("next = " + next);
985                
986                int nextLength = next.length();
987                FeatureSet featureSet = tokenItem.getFeatures();
988                
989                // check if the previous word starts with a capital letter,
990                // is at least 3 letters long, is an alphabet sequence,
991                // and has a comma.
992                boolean previousIsCity =
993                    (isUppercaseLetter(previous.charAt(0))
994                     && previous.length() > 2
995                     && matches(alphabetPattern, previous)
996                     && tokenItem.findFeature("p.punc").equals(","));
997                
998                // check if next token starts with a lower case, or
999                // this is the end of sentence, or if next token
1000                // is a period (".") or a zip code (5 or 10 digits).
1001                boolean nextIsGood =
1002                    (isLowercaseLetter(next.charAt(0))
1003                     || tokenItem.getNext() == null
1004                     || featureSet.getString("punc").equals(".")
1005                     || ((nextLength == 5 || nextLength == 10) &&
1006                         matches(digitsPattern, next)));
1007                
1008                if (previousIsCity && nextIsGood) {
1009                    expandState = true;
1010                } else {
1011                    expandState = false;
1012                }
1013            } else {
1014                expandState = true;
1015            }
1016            if (expandState) {
1017                for (int j = 2; j < state.length; j++) {
1018                    if (state[j] != null) {
1019                        wordRelation.addWord(state[j]);
1020                    }
1021                }
1022                return true;
1023            }
1024        }
1025        return false;
1026    }
1027        
1028                   
1029    /**
1030     * Determines if the given input matches the given Pattern.
1031     *
1032     * @param pattern the pattern to match
1033     * @param input the string to test
1034     *
1035     * @return <code>true</code> if the input string matches the given Pattern;
1036     *         <code>false</code> otherwise
1037     */
1038    private static boolean matches(Pattern pattern, String input) {
1039        Matcher m = pattern.matcher(input);
1040        return m.matches();
1041    }
1042    
1043
1044    /**
1045     * Determines if the character at the given position of the given
1046     * input text is splittable. A character is splittable if:
1047     * <p>
1048     * 1) the character and the following character are not letters
1049     *    in the English alphabet (A-Z and a-z)
1050     * <p>
1051     * 2) the character and the following character are not digits (0-9)
1052     * <p>
1053     * @param text the text containing the character of interest
1054     * @param index the index of the character of interest
1055     * 
1056     * @return true if the position of the given text is splittable
1057     *         false otherwise
1058     */ 
1059    private static boolean isTextSplitable(String text, int index) {
1060
1061        char c0 = text.charAt(index);
1062        char c1 = text.charAt(index+1);
1063        
1064        if (isLetter(c0) && isLetter(c1)) {
1065            return false;
1066        } else if (NumberExpander.isDigit(c0) && NumberExpander.isDigit(c1)) {
1067            return false;
1068        } else {
1069            return true;
1070        }
1071    }
1072
1073
1074    /**
1075     * Returns true if the given character is a letter (a-z or A-Z).
1076     *
1077     * @param ch the character to test
1078     *
1079     * @return true or false
1080     */
1081    private static boolean isLetter(char ch) {
1082        return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
1083    }
1084
1085
1086    /**
1087     * Returns true if the given character is an uppercase letter (A-Z).
1088     *
1089     * @param ch the character to test
1090     *
1091     * @return true or false
1092     */
1093    private static boolean isUppercaseLetter(char ch) {
1094        return ('A' <= ch && ch <= 'Z');
1095    }
1096
1097    
1098    /**
1099     * Returns true if the given character is a lowercase letter (a-z).
1100     *
1101     * @param ch the character to test
1102     *
1103     * @return true or false
1104     */
1105    private static boolean isLowercaseLetter(char ch) {
1106        return ('a' <= ch && ch <= 'z');
1107    }
1108
1109
1110    /**
1111     * Converts this object to its String representation
1112     * 
1113     * @return the string representation of this object
1114     */
1115    public String toString() {
1116        return "TokenToWords";
1117    }
1118}