001/**
002 * Portions Copyright 2001 Sun Microsystems, Inc.
003 * Portions Copyright 1999-2001 Language Technologies Institute, 
004 * Carnegie Mellon University.
005 * All Rights Reserved.  Use is subject to license terms.
006 * 
007 * See the file "license.terms" for information on usage and
008 * redistribution of this file, and for a DISCLAIMER OF ALL 
009 * WARRANTIES.
010 */
011package com.sun.speech.freetts.en;
012
013import com.sun.speech.freetts.Token;
014import com.sun.speech.freetts.Tokenizer;
015import java.io.Reader;
016import java.io.IOException;
017
018
019/**
020 * Implements the tokenizer interface. Breaks an input sequence of
021 * characters into a set of tokens.
022 */
023public class TokenizerImpl implements Tokenizer {
024        
025    /** A constant indicating that the end of the stream has been read. */
026    public static final int EOF = -1;
027    
028    /** A string containing the default whitespace characters. */
029    public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";
030    
031    /** A string containing the default single characters. */
032    public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";
033    
034    /** A string containing the default pre-punctuation characters. */
035    public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";
036    
037    /** A string containing the default post-punctuation characters. */
038    public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS 
039        = "\"'`.,:;!?(){}[]";
040
041
042    /** The line number. */
043    private int lineNumber;
044    
045    /** The input text (from the Utterance) to tokenize. */
046    private String inputText;
047
048    /** The file to read input text from, if using file mode. */
049    private Reader reader;
050
051    /** The token position - doesn't seem really necessary at this point. */
052    // private int tokenPosition = 0;
053
054    /** The current character, whether its from the file or the input text. */
055    private int currentChar;
056    
057    /**
058     * The current char position for the input text (not the file)
059     * this is called "file_pos" in flite
060     */
061    private int currentPosition;
062    
063    
064    /** The delimiting symbols of this tokenizer. */
065    private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
066    private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
067    private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
068    private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;
069
070    /** The error description. */
071    private String errorDescription;
072    
073    /** A place to store the current token. */
074    private Token token;
075    private Token lastToken;
076
077    /** For timing. */
078//    private long duration = 0;
079        
080
081    /**
082     * Constructs a Tokenizer.
083     */
084    public TokenizerImpl() {
085    }
086
087
088    /**
089     * Creates a tokenizer that will return tokens from
090     * the given string.
091     *
092     * @param string the string to tokenize
093     */
094    public TokenizerImpl(String string) {
095        setInputText(string);
096    }
097
098    /**
099     * Creates a tokenizer that will return tokens from
100     * the given file.
101     *
102     * @param file where to read the input from
103     */
104    public TokenizerImpl(Reader file) {
105        setInputReader(file);
106    }
107
108
109    /**
110     * Sets the whitespace symbols of this Tokenizer to the given symbols.
111     *
112     * @param symbols the whitespace symbols
113     */
114    public void setWhitespaceSymbols(String symbols) {
115        whitespaceSymbols = symbols;
116    }
117        
118
119    /**
120     * Sets the single character symbols of this Tokenizer to the given
121     * symbols.
122     *
123     * @param symbols the single character symbols
124     */
125    public void setSingleCharSymbols(String symbols) {
126        singleCharSymbols = symbols;
127    }
128        
129
130    /**
131     * Sets the prepunctuation symbols of this Tokenizer to the given
132     * symbols.
133     *
134     * @param symbols the prepunctuation symbols
135     */
136    public void setPrepunctuationSymbols(String symbols) {
137        prepunctuationSymbols = symbols;
138    }
139        
140
141    /**
142     * Sets the postpunctuation symbols of this Tokenizer to the given
143     * symbols.
144     *
145     * @param symbols the postpunctuation symbols
146     */
147    public void setPostpunctuationSymbols(String symbols) {
148        postpunctuationSymbols = symbols;
149    }
150    
151
152    /**
153     * Sets the text to tokenize. 
154     *
155     * @param  inputString  the string to tokenize
156     */
157    public void setInputText(String inputString) {
158        inputText = inputString;
159        currentPosition = 0;
160        
161        if (inputText != null) {
162            getNextChar();
163        }
164    }
165
166    /**
167     * Sets the input reader
168     *
169     * @param  reader the input source
170     */
171    public void setInputReader(Reader reader) {
172        this.reader = reader;
173        getNextChar();
174    }
175    
176
177    /**
178     * Returns the next token.
179     *
180     * @return  the next token if it exists,
181     *          <code>null</code> if no more tokens
182     */
183    public Token getNextToken() {
184        lastToken = token;
185        token = new Token();
186
187        // Skip whitespace
188        token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));
189
190        // quoted strings currently ignored
191
192        // get prepunctuation
193        token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));
194
195        // get the symbol itself
196        if (singleCharSymbols.indexOf(currentChar) != -1) {
197            token.setWord(String.valueOf((char) currentChar));
198            getNextChar();
199        } else {
200            token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
201        }
202
203        token.setPosition(currentPosition);
204        token.setLineNumber(lineNumber);
205
206        // This'll have token *plus* postpunctuation
207        // Get postpunctuation
208        removeTokenPostpunctuation();
209
210        return token;
211    }
212    
213
214    /**
215     * Returns <code>true</code> if there are more tokens,
216     *          <code>false</code> otherwise.
217     *
218     * @return <code>true</code> if there are more tokens
219     *         <code>false</code> otherwise
220     */
221    public boolean hasMoreTokens() {
222        int nextChar = currentChar;
223        return (nextChar != EOF);
224    }
225    
226
227    /**
228     * Advances the currentPosition pointer by 1 (if not exceeding
229     * length of inputText, and returns the character pointed by
230     * currentPosition.
231     *
232     * @return the next character EOF if no more characters exist
233     */
234    private int getNextChar() {
235        if (reader != null) {
236            try {
237                int readVal = reader.read();
238                if (readVal == -1) {
239                    currentChar = EOF;
240                } else {
241                    currentChar = (char) readVal;
242                }
243            } catch (IOException ioe) {
244                currentChar = EOF;
245                errorDescription = ioe.getMessage();
246            }
247        } else if (inputText != null) {
248            if (currentPosition < inputText.length()) {
249                currentChar = (int) inputText.charAt(currentPosition);
250            } else {
251                currentChar = EOF;
252            }
253        }
254        if (currentChar != EOF) {
255            currentPosition++;
256        }
257        if (currentChar == '\n') {
258            lineNumber++;
259        }
260        return currentChar;
261    }
262    
263
264    /**
265     * Starting from the current position of the input text,
266     * returns the subsequent characters of type charClass,
267     * and not of type singleCharSymbols.
268     *
269     * @param  charClass  the type of characters to look for
270     * @param  buffer  the place to append characters of type charClass
271     *
272     * @return  a string of characters starting from the current position
273     *          of the input text, until it encounters a character not
274     *          in the string charClass
275     *
276     */
277    private String getTokenOfCharClass(String charClass) {
278        return getTokenByCharClass(charClass, true);
279    }
280
281    /**
282     * Starting from the current position of the input text/file,
283     * returns the subsequent characters, not of type singleCharSymbols,
284     * and ended at characters of type endingCharClass.  E.g., if the current
285     * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass
286     * "abc". Then this method will return to "xxxx".
287     *
288     * @param  endingCharClass  the type of characters to look for
289     *
290     * @return  a string of characters from the current position until
291     *          it encounters characters in endingCharClass
292     *
293     */
294    private String getTokenNotOfCharClass(String endingCharClass) {
295        return getTokenByCharClass(endingCharClass, false);
296    }
297    
298    /**
299     * Provides a `compressed' method from getTokenOfCharClass() and 
300     * getTokenNotOfCharClass().
301     * If parameter containThisCharClass is <code>true</code>, 
302     * then a string from the
303     * current position to the last character in charClass is returned.
304     * If containThisCharClass is <code>false</code>, then a string 
305     * before the first
306     * occurrence of a character in containThisCharClass is returned.
307     *
308     * @param  charClass  the string of characters you want included or
309     *                    excluded in your return
310     * @param  containThisCharClass  determines if you want characters
311     *                in charClass in the returned string or not
312     *
313     * @return  a string of characters from the current position until
314     *          it encounters characters in endingCharClass
315     */
316    private String getTokenByCharClass(String charClass, 
317                                       boolean containThisCharClass) {  
318        final StringBuilder buffer = new StringBuilder();
319
320        // if we want the returned string to contain chars in charClass, then
321        // containThisCharClass is TRUE and
322        // (charClass.indexOf(currentChar) != 1) == containThisCharClass)
323        // returns true; if we want it to stop at characters of charClass,
324        // then containThisCharClass is FALSE, and the condition returns
325        // false.
326        while ((charClass.indexOf(currentChar) != -1) == containThisCharClass
327                && singleCharSymbols.indexOf(currentChar) == -1
328                && currentChar != EOF) {
329            buffer.append((char) currentChar);
330            getNextChar();
331        }
332        return buffer.toString();
333    }
334
335    /**
336     * Removes the postpunctuation characters from the current token.
337     * Copies those postpunctuation characters to the class
338     * variable 'postpunctuation'.
339     */
340    private void removeTokenPostpunctuation() {
341        if (token == null) {
342            return;
343        }
344        final String tokenWord = token.getWord();
345
346        int tokenLength = tokenWord.length();
347        int position = tokenLength - 1;
348
349        while (position > 0
350                && postpunctuationSymbols.indexOf((int) tokenWord
351                        .charAt(position)) != -1) {
352            position--;
353        }
354
355        if (tokenLength - 1 != position) {
356            // Copy postpunctuation from token
357            token.setPostpunctuation(tokenWord.substring(position + 1));
358
359            // truncate token at postpunctuation
360            token.setWord(tokenWord.substring(0, position + 1));
361        } else {
362            token.setPostpunctuation("");
363        }
364    }
365
366    /**
367     * Returns <code>true</code> if there were errors while reading tokens
368     *
369     * @return <code>true</code> if there were errors;
370     *          <code>false</code> otherwise
371     */
372    public boolean hasErrors() {
373        return errorDescription != null;
374    }
375
376    /**
377     * if hasErrors returns <code>true</code>, this will return a 
378     * description of the error encountered, otherwise
379     * it will return <code>null</code>
380     *
381     * @return a description of the last error that occurred.
382     */
383    public String getErrorDescription() {
384        return errorDescription;
385    }
386
387    /**
388     * Determines if the current token should start a new sentence.
389     *
390     * @return <code>true</code> if a new sentence should be started
391     */
392    public boolean isBreak() {
393        String tokenWhiteSpace = token.getWhitespace();
394        String lastTokenPostpunctuation = null;
395        if (lastToken != null) {
396            lastTokenPostpunctuation = lastToken.getPostpunctuation();
397        }
398
399        if (lastToken == null || token == null) {
400            return false;
401        } else if (tokenWhiteSpace.indexOf('\n') != tokenWhiteSpace
402                .lastIndexOf('\n')) {
403            return true;
404        } else if (lastTokenPostpunctuation.indexOf(':') != -1
405                || lastTokenPostpunctuation.indexOf('?') != -1
406                || lastTokenPostpunctuation.indexOf('!') != -1) {
407            return true;
408        } else if (lastTokenPostpunctuation.indexOf('.') != -1
409                && tokenWhiteSpace.length() > 1
410                && Character.isUpperCase(token.getWord().charAt(0))) {
411            return true;
412        } else {
413            String lastWord = lastToken.getWord();
414            int lastWordLength = lastWord.length();
415
416            if (lastTokenPostpunctuation.indexOf('.') != -1
417                    &&
418                    /* next word starts with a capital */
419                    Character.isUpperCase(token.getWord().charAt(0))
420                    &&
421                    /* last word isn't an abbreviation */
422                    !(Character.isUpperCase(lastWord.charAt(lastWordLength - 1)) 
423                        || (lastWordLength < 4
424                            && Character.isUpperCase(lastWord.charAt(0))))) {
425                return true;
426            }
427        }
428        return false;
429    }
430}
431