001/**
002 * Portions Copyright 2001 Sun Microsystems, Inc.
003 * Portions Copyright 1999-2001 Language Technologies Institute, 
004 * Carnegie Mellon University.
005 * All Rights Reserved.  Use is subject to license terms.
006 * 
007 * See the file "license.terms" for information on usage and
008 * redistribution of this file, and for a DISCLAIMER OF ALL 
009 * WARRANTIES.
010 */
011package com.sun.speech.freetts.en.us;
012
013
014
015/**
016 * Provides the definitions for US English whitespace, punctuations,
017 * prepunctuation, and postpunctuation symbols. It also contains a set of
018 * Regular Expressions for the US English language.
019 * With regular expressions, it specifies what are whitespace,
020 * letters in the alphabet, uppercase and lowercase letters, alphanumeric
021 * characters, identifiers, integers, doubles, digits, and 'comma and int'. 
022 *
023 * It translates the following code from flite:
024 * src/regex/cst_regex.c
025 * lang/usenglish/us_text.c
026 */
027public class USEnglish {
028
029    /** default whitespace regular expression pattern */
030    public static final String RX_DEFAULT_US_EN_WHITESPACE = "[ \n\t\r]+";
031    /** default letter regular expression pattern */
032    public static final String RX_DEFAULT_US_EN_ALPHABET = "[A-Za-z]+";
033    /** default uppercase regular expression pattern */
034    public static final String RX_DEFAULT_US_EN_UPPERCASE = "[A-Z]+";
035    /** default lowercase regular expression pattern */
036    public static final String RX_DEFAULT_US_EN_LOWERCASE = "[a-z]+";
037    /** default alpha-numeric regular expression pattern */
038    public static final String RX_DEFAULT_US_EN_ALPHANUMERIC = "[0-9A-Za-z]+";
039    /** default identifier regular expression pattern */
040    public static final String RX_DEFAULT_US_EN_IDENTIFIER = "[A-Za-z_][0-9A-Za-z_]+";
041    /** default integer regular expression pattern */
042    public static final String RX_DEFAULT_US_EN_INT = "-?[0-9]+";
043    /** default double regular expression pattern */
044    public static final String RX_DEFAULT_US_EN_DOUBLE =
045        "-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][---+]?[0-9]+)?";
046    /** default integer with commas  regular expression pattern */
047    public static final String RX_DEFAULT_US_EN_COMMAINT =
048        "[0-9][0-9]?[0-9]?,([0-9][0-9][0-9],)*[0-9][0-9][0-9](\\.[0-9]+)?";
049    /** default digits regular expression pattern */
050    public static final String RX_DEFAULT_US_EN_DIGITS = "[0-9][0-9]*";
051    /** default dotted abbreviation  regular expression pattern */
052    public static final String RX_DEFAULT_US_EN_DOTTED_ABBREV = "([A-Za-z]\\.)*[A-Za-z]";
053    /** default ordinal number regular expression pattern */
054    public static final String RX_DEFAULT_US_EN_ORDINAL_NUMBER =
055        "[0-9][0-9,]*(th|TH|st|ST|nd|ND|rd|RD)";
056    /** default has-vowel regular expression */
057    public static final String RX_DEFAULT_HAS_VOWEL = ".*[aeiouAEIOU].*";
058    /** default US money regular expression */
059    public static final String RX_DEFAULT_US_MONEY = "\\$[0-9,]+(\\.[0-9]+)?";
060    /** default -illion regular expression */
061    public static final String RX_DEFAULT_ILLION = ".*illion";
062    /** default digits2dash (e.g. 999-999-999) regular expression */
063    public static final String RX_DEFAULT_DIGITS2DASH = "[0-9]+(-[0-9]+)(-[0-9]+)+";
064    /** default digits/digits (e.g. 999/999) regular expression */
065    public static final String RX_DEFAULT_DIGITSSLASHDIGITS = "[0-9]+/[0-9]+";
066    /** default number time regular expression */
067    public static final String RX_DEFAULT_NUMBER_TIME = "((0[0-2])|(1[0-9])):([0-5][0-9])";
068    /** default Roman numerals regular expression */
069    public static final String RX_DEFAULT_ROMAN_NUMBER =
070        "(II?I?|IV|VI?I?I?|IX|X[VIX]*)";
071    /** default drst "Dr. St" regular expression */
072    public static final String RX_DEFAULT_DRST = "([dD][Rr]|[Ss][Tt])";
073    /** default numess */
074    public static final String RX_DEFAULT_NUMESS = "[0-9]+s";
075    /** default 7-digit phone number */
076    public static final String RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER =
077        "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]";
078    /** default 4-digit number */
079    public static final String RX_DEFAULT_FOUR_DIGIT =
080        "[0-9][0-9][0-9][0-9]";
081    /** default 3-digit number */
082    public static final String RX_DEFAULT_THREE_DIGIT =
083        "[0-9][0-9][0-9]";
084    
085    
086    /** whitespace regular expression pattern */
087    public static String RX_WHITESPACE = RX_DEFAULT_US_EN_WHITESPACE;
088    /** letter  regular expression pattern */
089    public static String RX_ALPHABET = RX_DEFAULT_US_EN_ALPHABET;
090    /** uppercase  regular expression pattern */
091    public static String RX_UPPERCASE = RX_DEFAULT_US_EN_UPPERCASE;
092    /** lowercase  regular expression pattern */
093    public static String RX_LOWERCASE = RX_DEFAULT_US_EN_LOWERCASE;
094    /** alphanumeric  regular expression pattern */
095    public static String RX_ALPHANUMERIC = RX_DEFAULT_US_EN_ALPHANUMERIC;
096    /** identifier  regular expression pattern */
097    public static String RX_IDENTIFIER = RX_DEFAULT_US_EN_IDENTIFIER;
098    /** integer  regular expression pattern */
099    public static String RX_INT = RX_DEFAULT_US_EN_INT;
100    /** double  regular expression pattern */
101    public static String RX_DOUBLE = RX_DEFAULT_US_EN_DOUBLE;
102    /** comma separated integer  regular expression pattern */
103    public static String RX_COMMAINT = RX_DEFAULT_US_EN_COMMAINT;
104    /** digits regular expression pattern */
105    public static String RX_DIGITS = RX_DEFAULT_US_EN_DIGITS;
106    /** dotted abbreviation  regular expression pattern */
107    public static String RX_DOTTED_ABBREV = RX_DEFAULT_US_EN_DOTTED_ABBREV;
108    /** ordinal number regular expression pattern */
109    public static String RX_ORDINAL_NUMBER = RX_DEFAULT_US_EN_ORDINAL_NUMBER;
110    /** has-vowel regular expression */
111    public static final String RX_HAS_VOWEL = RX_DEFAULT_HAS_VOWEL;
112    /** US money regular expression */
113    public static final String RX_US_MONEY = RX_DEFAULT_US_MONEY;
114    /** -illion regular expression */
115    public static final String RX_ILLION = RX_DEFAULT_ILLION;
116    /** digits2dash (e.g. 999-999-999) regular expression */
117    public static final String RX_DIGITS2DASH = RX_DEFAULT_DIGITS2DASH;
118    /** digits/digits (e.g. 999/999) regular expression */
119    public static final String RX_DIGITSSLASHDIGITS = RX_DEFAULT_DIGITSSLASHDIGITS;
120    /** number time regular expression */
121    public static final String RX_NUMBER_TIME = RX_DEFAULT_NUMBER_TIME;
122    /** Roman numerals regular expression */
123    public static final String RX_ROMAN_NUMBER = RX_DEFAULT_ROMAN_NUMBER;
124    /** drst "Dr. St" regular expression */
125    public static final String RX_DRST = RX_DEFAULT_DRST;
126    /** default numess */
127    public static final String RX_NUMESS = RX_DEFAULT_NUMESS;
128    /** 7-digit phone number */
129    public static final String RX_SEVEN_DIGIT_PHONE_NUMBER = RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER;
130    /** 4-digit number */
131    public static final String RX_FOUR_DIGIT = RX_DEFAULT_FOUR_DIGIT;
132    /** 3-digit number */
133    public static final String RX_THREE_DIGIT = RX_DEFAULT_THREE_DIGIT;
134    
135
136    // the following symbols are from lang/usenglish/us_text.c
137
138    /** punctuation regular expression pattern */
139    public static final String PUNCTUATION_SYMBOLS = "\"'`.,:;!?(){}[]";
140    /** pre-punctuation regular expression pattern */
141    public static final String PREPUNCTUATION_SYMBOLS = "\"'`({[";
142    /** single char symbols  regular expression pattern */
143    public static final String SINGLE_CHAR_SYMBOLS = "";
144    /** whitespace symbols  regular expression pattern */
145    public static final String WHITESPACE_SYMBOLS = " \t\n\r";
146
147
148    /**
149     * Not constructable
150     */
151    private USEnglish() {}
152}
153