/**
 * Portions Copyright 2001 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute,
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package com.sun.speech.freetts.en.us;

/**
 * Provides the definitions for US English whitespace, punctuations,
 * prepunctuation, and postpunctuation symbols. It also contains a set of
 * Regular Expressions for the US English language.
 * With regular expressions, it specifies what are whitespace,
 * letters in the alphabet, uppercase and lowercase letters, alphanumeric
 * characters, identifiers, integers, doubles, digits, and 'comma and int'.
 *
 * <p>Each pattern exists twice: an immutable <code>RX_DEFAULT_*</code>
 * constant holding the built-in value, and an <code>RX_*</code> field
 * initialised from it. The first twelve <code>RX_*</code> fields
 * (<code>RX_WHITESPACE</code> through <code>RX_ORDINAL_NUMBER</code>) are
 * declared non-final and can therefore be reassigned; the remaining ones
 * are final.
 *
 * <p>It translates the following code from flite:
 * src/regex/cst_regex.c
 * lang/usenglish/us_text.c
 */
public final class USEnglish {

    // ------------------------------------------------------------------
    // Built-in (default) regular expressions.
    // ------------------------------------------------------------------

    /** default whitespace regular expression pattern */
    public static final String RX_DEFAULT_US_EN_WHITESPACE = "[ \n\t\r]+";

    /** default letter regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ALPHABET = "[A-Za-z]+";

    /** default uppercase regular expression pattern */
    public static final String RX_DEFAULT_US_EN_UPPERCASE = "[A-Z]+";

    /** default lowercase regular expression pattern */
    public static final String RX_DEFAULT_US_EN_LOWERCASE = "[a-z]+";

    /** default alpha-numeric regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ALPHANUMERIC = "[0-9A-Za-z]+";

    /**
     * default identifier regular expression pattern.
     * The trailing <code>+</code> means an identifier must be at least
     * two characters long.
     */
    public static final String RX_DEFAULT_US_EN_IDENTIFIER =
        "[A-Za-z_][0-9A-Za-z_]+";

    /** default integer regular expression pattern */
    public static final String RX_DEFAULT_US_EN_INT = "-?[0-9]+";

    /**
     * default double regular expression pattern.
     * The character class <code>[---+]</code> in the exponent is the range
     * '-' to '-' followed by '+', i.e. an optional sign.
     */
    public static final String RX_DEFAULT_US_EN_DOUBLE =
        "-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][---+]?[0-9]+)?";

    /** default integer with commas regular expression pattern */
    public static final String RX_DEFAULT_US_EN_COMMAINT =
        "[0-9][0-9]?[0-9]?,([0-9][0-9][0-9],)*[0-9][0-9][0-9](\\.[0-9]+)?";

    /** default digits regular expression pattern */
    public static final String RX_DEFAULT_US_EN_DIGITS = "[0-9][0-9]*";

    /** default dotted abbreviation regular expression pattern */
    public static final String RX_DEFAULT_US_EN_DOTTED_ABBREV =
        "([A-Za-z]\\.)*[A-Za-z]";

    /** default ordinal number regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ORDINAL_NUMBER =
        "[0-9][0-9,]*(th|TH|st|ST|nd|ND|rd|RD)";

    /** default has-vowel regular expression */
    public static final String RX_DEFAULT_HAS_VOWEL = ".*[aeiouAEIOU].*";

    /** default US money regular expression */
    public static final String RX_DEFAULT_US_MONEY = "\\$[0-9,]+(\\.[0-9]+)?";

    /** default -illion regular expression */
    public static final String RX_DEFAULT_ILLION = ".*illion";

    /** default digits2dash (e.g. 999-999-999) regular expression */
    public static final String RX_DEFAULT_DIGITS2DASH =
        "[0-9]+(-[0-9]+)(-[0-9]+)+";

    /** default digits/digits (e.g. 999/999) regular expression */
    public static final String RX_DEFAULT_DIGITSSLASHDIGITS = "[0-9]+/[0-9]+";

    /**
     * default number time regular expression.
     * As written, the hour part only accepts 00-02 and 10-19.
     * NOTE(review): hours 03-09 and 20-23 are not matched; this looks
     * inherited from the flite original - confirm before changing.
     */
    public static final String RX_DEFAULT_NUMBER_TIME =
        "((0[0-2])|(1[0-9])):([0-5][0-9])";

    /** default Roman numerals regular expression */
    public static final String RX_DEFAULT_ROMAN_NUMBER =
        "(II?I?|IV|VI?I?I?|IX|X[VIX]*)";

    /** default drst "Dr. St" regular expression (letters only, any case) */
    public static final String RX_DEFAULT_DRST = "([dD][Rr]|[Ss][Tt])";

    /** default numess (digits followed by a lowercase 's') regular expression */
    public static final String RX_DEFAULT_NUMESS = "[0-9]+s";

    /** default 7-digit phone number */
    public static final String RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER =
        "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]";

    /** default 4-digit number */
    public static final String RX_DEFAULT_FOUR_DIGIT =
        "[0-9][0-9][0-9][0-9]";

    /** default 3-digit number */
    public static final String RX_DEFAULT_THREE_DIGIT =
        "[0-9][0-9][0-9]";

    // ------------------------------------------------------------------
    // Active regular expressions. The first twelve are deliberately left
    // non-final so that existing code assigning to them keeps compiling.
    // ------------------------------------------------------------------

    /** whitespace regular expression pattern */
    public static String RX_WHITESPACE = RX_DEFAULT_US_EN_WHITESPACE;

    /** letter regular expression pattern */
    public static String RX_ALPHABET = RX_DEFAULT_US_EN_ALPHABET;

    /** uppercase regular expression pattern */
    public static String RX_UPPERCASE = RX_DEFAULT_US_EN_UPPERCASE;

    /** lowercase regular expression pattern */
    public static String RX_LOWERCASE = RX_DEFAULT_US_EN_LOWERCASE;

    /** alphanumeric regular expression pattern */
    public static String RX_ALPHANUMERIC = RX_DEFAULT_US_EN_ALPHANUMERIC;

    /** identifier regular expression pattern */
    public static String RX_IDENTIFIER = RX_DEFAULT_US_EN_IDENTIFIER;

    /** integer regular expression pattern */
    public static String RX_INT = RX_DEFAULT_US_EN_INT;

    /** double regular expression pattern */
    public static String RX_DOUBLE = RX_DEFAULT_US_EN_DOUBLE;

    /** comma separated integer regular expression pattern */
    public static String RX_COMMAINT = RX_DEFAULT_US_EN_COMMAINT;

    /** digits regular expression pattern */
    public static String RX_DIGITS = RX_DEFAULT_US_EN_DIGITS;

    /** dotted abbreviation regular expression pattern */
    public static String RX_DOTTED_ABBREV = RX_DEFAULT_US_EN_DOTTED_ABBREV;

    /** ordinal number regular expression pattern */
    public static String RX_ORDINAL_NUMBER = RX_DEFAULT_US_EN_ORDINAL_NUMBER;

    /** has-vowel regular expression */
    public static final String RX_HAS_VOWEL = RX_DEFAULT_HAS_VOWEL;

    /** US money regular expression */
    public static final String RX_US_MONEY = RX_DEFAULT_US_MONEY;

    /** -illion regular expression */
    public static final String RX_ILLION = RX_DEFAULT_ILLION;

    /** digits2dash (e.g. 999-999-999) regular expression */
    public static final String RX_DIGITS2DASH = RX_DEFAULT_DIGITS2DASH;

    /** digits/digits (e.g. 999/999) regular expression */
    public static final String RX_DIGITSSLASHDIGITS =
        RX_DEFAULT_DIGITSSLASHDIGITS;

    /** number time regular expression */
    public static final String RX_NUMBER_TIME = RX_DEFAULT_NUMBER_TIME;

    /** Roman numerals regular expression */
    public static final String RX_ROMAN_NUMBER = RX_DEFAULT_ROMAN_NUMBER;

    /** drst "Dr. St" regular expression */
    public static final String RX_DRST = RX_DEFAULT_DRST;

    /** numess (digits followed by a lowercase 's') regular expression */
    public static final String RX_NUMESS = RX_DEFAULT_NUMESS;

    /** 7-digit phone number */
    public static final String RX_SEVEN_DIGIT_PHONE_NUMBER =
        RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER;

    /** 4-digit number */
    public static final String RX_FOUR_DIGIT = RX_DEFAULT_FOUR_DIGIT;

    /** 3-digit number */
    public static final String RX_THREE_DIGIT = RX_DEFAULT_THREE_DIGIT;

    // ------------------------------------------------------------------
    // The following symbols are from lang/usenglish/us_text.c.
    // They are plain sets of characters, NOT regular expressions.
    // ------------------------------------------------------------------

    /** the characters treated as punctuation symbols */
    public static final String PUNCTUATION_SYMBOLS = "\"'`.,:;!?(){}[]";

    /** the characters treated as pre-punctuation symbols */
    public static final String PREPUNCTUATION_SYMBOLS = "\"'`({[";

    /** the characters treated as single-character symbols (none) */
    public static final String SINGLE_CHAR_SYMBOLS = "";

    /** the characters treated as whitespace symbols */
    public static final String WHITESPACE_SYMBOLS = " \t\n\r";

    /**
     * Not constructable
     */
    private USEnglish() {}
}