001/**
002 * Portions Copyright 2001 Sun Microsystems, Inc.
003 * Portions Copyright 1999-2001 Language Technologies Institute, 
004 * Carnegie Mellon University.
005 * All Rights Reserved.  Use is subject to license terms.
006 * 
007 * See the file "license.terms" for information on usage and
008 * redistribution of this file, and for a DISCLAIMER OF ALL 
009 * WARRANTIES.
010 */
011package com.sun.speech.freetts.lexicon;
012
013import java.io.BufferedInputStream;
014import java.io.BufferedOutputStream;
015import java.io.BufferedReader;
016import java.io.DataInputStream;
017import java.io.DataOutputStream;
018import java.io.FileInputStream;
019import java.io.FileNotFoundException;
020import java.io.FileOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.InputStreamReader;
024import java.net.MalformedURLException;
025import java.net.URL;
026import java.nio.ByteBuffer;
027import java.nio.MappedByteBuffer;
028import java.nio.channels.FileChannel;
029import java.util.ArrayList;
030import java.util.Collections;
031import java.util.HashMap;
032import java.util.Iterator;
033import java.util.LinkedHashMap;
034import java.util.List;
035import java.util.Map;
036import java.util.StringTokenizer;
037
038import com.sun.speech.freetts.util.BulkTimer;
039import com.sun.speech.freetts.util.Utilities;
040
041/**
042 * Provides an implementation of a Lexicon.
043 *
044 * <p>This implementation will either read from a straight ASCII file
045 * or a binary file.  When reading from an ASCII file, you can specify
046 * when the input line is tokenized:  load, lookup, or never.  If you
047 * specify 'load', the entire file will be parsed when it is loaded.
048 * If you specify 'lookup', the file will be loaded, but the parsing
049 * for each line will be delayed until it is referenced and the parsed
 * form will be saved away.  If you specify 'never', the lines will
 * be parsed each time they are referenced.  The default is 'never'.  To
052 * specify the load type, set the system property as follows:
053 *
054 * <pre>
055 *   -Dcom.sun.speech.freetts.lexicon.LexTokenize=load
056 * </pre>
057 *
058 * <p>If a binary file is used, you can also specify whether the new
059 * IO package is used.  The new IO package is new for JDK1.4, and can
060 * greatly improve the speed of loading files.  To enable new IO, use
061 * the following system property (it is enabled by default):
062 *
063 * <pre>
064 *   -Dcom.sun.speech.freetts.useNewIO=true
065 * </pre>
066 *
067 * <p>The implementation also allows users to define their own addenda
068 * that will be used in addition to the system addenda.  If the user
 * defines their own addenda, its values will be added to the system
070 * addenda, overriding any existing elements in the system addenda.
071 * To define a user addenda, the user needs to set the following
072 * property:
073 *
074 * <pre>
 *   -Dcom.sun.speech.freetts.lexicon.userAddenda=&lt;URLToUserAddenda>
076 * </pre>
077 *
078 * Where &lt;URLToUserAddenda> is a URL pointing to an ASCII file
079 * containing addenda entries.
080 *
081 * <p>[[[TODO: support multiple homographs with the same part of speech.]]] 
082 */
083abstract public class LexiconImpl implements Lexicon {
084    /**
085     * If true, the phone string is replaced with the phone array in
086     * the hashmap when the phone array is loaded.  The side effects
087     * of this are quicker lookups, but more memory usage and a longer
088     * startup time.
089     */
090    protected boolean tokenizeOnLoad = false;
091       
092    /**
093     * If true, the phone string is replaced with the phone array in
094     * the hashmap when the phone array is first looked up.  The side effects
095     * Set by cmufilelex.tokenize=lookup.
096     */
097    protected boolean tokenizeOnLookup = false;
098 
099    /**
100     * Magic number for binary Lexicon files.
101     */
102    private final static int MAGIC = 0xBABB1E;
103
104    /**
105     * Current binary file version.
106     */
107    private final static int VERSION = 1;
108
109    /**
110     * URL for the compiled form.
111     */
112    private URL compiledURL;
113
114    /**
115     * URL for the addenda.
116     */
117    private URL addendaURL;
118
119    /**
120     * URL for the letter to sound rules.
121     */
122    private URL letterToSoundURL;
123
124    /**
125     * The addenda.
126     */
127    private Map addenda;
128
129    /**
130     * The compiled lexicon.
131     */
132    private Map compiled;
133
134    /**
135     * The LetterToSound rules.
136     */
137    private LetterToSound letterToSound = null;
138
139    /**
140     * Parts of Speech.
141     */
142    private ArrayList partsOfSpeech = new ArrayList();
143
144    /**
145     * A static directory of compiledURL URL objects and associated
146     * already-loaded compiled Map objects. This is used to share
147     * the immutable compiled lexicons between lexicon instances.
148     * As the addenda can be changed using <code>addAddendum()</code>
149     * and <code>removeAddendum</code>, each lexicon instance has its
150     * own addenda.
151     */
152    private static Map loadedCompiledLexicons;
153    
154
155    
156    /**
157     * Loaded State of the lexicon
158     */
159    private boolean loaded = false;
160    
161    /**
162     * Type of lexicon to load
163     */
164    private boolean binary = false;
165
166    /**
167     * No phones for this word.
168     */
169    final static private String[] NO_PHONES = new String[0];
170
171    /**
172     * Temporary place holder.
173     */
174    private char charBuffer[] = new char[128];
175
176    /**
177     * Use the new IO package?
178     */
179    private boolean useNewIO =
180        Utilities.getProperty("com.sun.speech.freetts.useNewIO",
181                "true").equals("true");
182
183    /**
184     * Create a new LexiconImpl by reading from the given URLS.
185     *
186     * @param compiledURL a URL pointing to the compiled lexicon
187     * @param addendaURL a URL pointing to lexicon addenda
188     * @param letterToSoundURL a LetterToSound to use if a word cannot
189     *   be found in the compiled form or the addenda
190     * @param binary if <code>true</code>, the input streams are binary;
191     *   otherwise, they are text.
192     */ 
193    public LexiconImpl(URL compiledURL, URL addendaURL,
194                       URL letterToSoundURL,
195                       boolean binary) {
196        this();
197        setLexiconParameters(compiledURL, addendaURL, letterToSoundURL, binary);
198    }
199
200    /**
201     * Class constructor for an empty Lexicon.
202     */
203    public LexiconImpl() {
204        // Find out when to convert the phone string into an array.
205        //
206        String tokenize =
207            Utilities.getProperty("com.sun.speech.freetts.lexicon.LexTokenize",
208                               "never");
209        tokenizeOnLoad = tokenize.equals("load");
210        tokenizeOnLookup = tokenize.equals("lookup");
211    }
212
213    /**
214     * Sets the lexicon parameters
215     * @param compiledURL a URL pointing to the compiled lexicon
216     * @param addendaURL a URL pointing to lexicon addenda
217     * @param letterToSoundURL a URL pointing to the LetterToSound to use
218     * @param binary if <code>true</code>, the input streams are binary;
219     *   otherwise, they are text.
220     */ 
221    protected void setLexiconParameters(URL compiledURL,
222                                        URL addendaURL,
223                                        URL letterToSoundURL,
224                                        boolean binary) {
225        this.compiledURL = compiledURL;
226        this.addendaURL = addendaURL;
227        this.letterToSoundURL = letterToSoundURL;
228        this.binary = binary;
229    }
230
231    /**
232     * Determines if this lexicon is loaded.
233     *
234     * @return <code>true</code> if the lexicon is loaded
235     */
236    public boolean isLoaded() {
237        return loaded;
238    }
239
240    /**
241     * Loads the data for this lexicon.  If the 
242     *
243     * @throws IOException if errors occur during loading
244     */
245    public void load() throws IOException {
246        BulkTimer.LOAD.start("Lexicon");
247
248        if (compiledURL == null) {
249            throw new IOException("Can't load lexicon");
250        }
251
252        if (addendaURL == null) {
253            throw new IOException("Can't load lexicon addenda " );
254        }
255
256        if (loadedCompiledLexicons == null) {
257            loadedCompiledLexicons = new HashMap();
258        }
259        if (!loadedCompiledLexicons.containsKey(compiledURL)) {
260                InputStream compiledIS = Utilities.getInputStream(compiledURL);
261                if (compiledIS == null) {
262                    throw new IOException("Can't load lexicon from " + compiledURL);
263                }
264                Map newCompiled = createLexicon(compiledIS, binary, 65000);
265        loadedCompiledLexicons.put(compiledURL, newCompiled);
266        compiledIS.close();
267        }
268        compiled = Collections.unmodifiableMap((Map)loadedCompiledLexicons.get(compiledURL));
269
270        InputStream addendaIS = Utilities.getInputStream(addendaURL);
271        if (addendaIS == null) {
272            throw new IOException("Can't load lexicon addenda from " 
273                    + addendaURL);
274        }
275
276        // [[[TODO: what is the best way to derive the estimated sizes?]]]
277        //
278        addenda = createLexicon(addendaIS, binary, 50);
279        addendaIS.close();
280
281        /* Load the user-defined addenda and override any existing
282         * entries in the system addenda.
283         */
284        String userAddenda = Utilities.getProperty(
285            "com.sun.speech.freetts.lexicon.userAddenda", null);
286        if (userAddenda != null) {
287            try {
288                URL userAddendaURL = new URL(userAddenda);
289                InputStream userAddendaIS = Utilities.getInputStream(
290                    userAddendaURL);
291                if (userAddendaIS == null) {
292                    throw new IOException("Can't load user addenda from "
293                                          + userAddenda);
294                }
295                Map tmpAddenda = createLexicon(userAddendaIS, false, 50);
296                userAddendaIS.close();
297                for (Iterator keys = tmpAddenda.keySet().iterator();
298                     keys.hasNext();) {
299                    Object key = keys.next();
300                    addenda.put(key, tmpAddenda.get(key));
301                }
302            } catch (MalformedURLException e) {
303                throw new IOException("User addenda URL is malformed: " +
304                                      userAddenda);
305            }
306        }
307        
308        loaded = true;
309        BulkTimer.LOAD.stop("Lexicon");
310        letterToSound = new LetterToSoundImpl(letterToSoundURL, binary);
311    }
312
313    /**
314     * Reads the given input stream as lexicon data and returns the
315     * results in a <code>Map</code>.
316     *
317     * @param is the input stream
318     * @param binary if <code>true</code>, the data is binary
319     * @param estimatedSize the estimated size of the lexicon
320     *
321     * @throws IOException if errors are encountered while reading the data
322     */
323    protected Map createLexicon(InputStream is,
324                                boolean binary, 
325                                int estimatedSize) 
326        throws IOException {
327        if (binary) {
328            if (useNewIO && is instanceof FileInputStream) {
329                FileInputStream fis = (FileInputStream) is;
330                return loadMappedBinaryLexicon(fis, estimatedSize);
331            } else {
332                DataInputStream dis = new DataInputStream(
333                        new BufferedInputStream(is));
334                return loadBinaryLexicon(dis, estimatedSize);
335            }
336        }  else {
337            return loadTextLexicon(is, estimatedSize);
338        }
339    }
340
341    /**
342     * Reads the given input stream as text lexicon data and returns the
343     * results in a <code>Map</code>.
344     *
345     * @param is the input stream
346     * @param estimatedSize the estimated number of entries of the lexicon
347     *
348     * @throws IOException if errors are encountered while reading the data
349     */
350    protected Map loadTextLexicon(InputStream is, int estimatedSize) 
351        throws IOException {
352        Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
353        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
354        String line;
355        
356        line = reader.readLine();
357        while (line != null) {
358            if (!line.startsWith("***")) {
359                parseAndAdd(lexicon, line);
360            }
361            line = reader.readLine();
362        }
363        return lexicon;
364    }
365    
366    /**
367     * Creates a word from the given input line and add it to the lexicon.
368     *
369     * @param lexicon the lexicon
370     * @param line the input text
371     */
372    protected void parseAndAdd(Map lexicon, String line) {
373        StringTokenizer tokenizer = new StringTokenizer(line,"\t");
374        String phones = null;
375        
376        String wordAndPos = tokenizer.nextToken();
377        String pos = wordAndPos.substring(wordAndPos.length() - 1);
378        if (!partsOfSpeech.contains(pos)) {
379            partsOfSpeech.add(pos);
380        }
381        if (tokenizer.hasMoreTokens()) {
382            phones = tokenizer.nextToken();
383        }
384        if ((phones != null) && (tokenizeOnLoad)) {
385            lexicon.put(wordAndPos, getPhones(phones));
386        } else if (phones == null) {
387            lexicon.put(wordAndPos, NO_PHONES);
388        } else {
389            lexicon.put(wordAndPos, phones);
390        }
391    }
392    
393    /**
394     * Gets the phone list for a given word.  If a phone list cannot
395     * be found, returns <code>null</code>.  The format is lexicon
396     * dependent.  If the part of speech does not matter, pass in
397     * <code>null</code>.
398     *
399     * @param word the word to find
400     * @param partOfSpeech the part of speech
401     *
402     * @return the list of phones for word or <code>null</code>
403     */
404    public String[] getPhones(String word, String partOfSpeech) {
405        return getPhones(word, partOfSpeech, true);
406    }
407
408    /**
409     * Gets the phone list for a given word.  If a phone list cannot
410     * be found, <code>null</code> is returned.  The
411     * <code>partOfSpeech</code> is implementation dependent, but
412     * <code>null</code> always matches.
413     *
414     * @param word the word to find
415     * @param partOfSpeech the part of speech or <code>null</code>
416     * @param useLTS whether to use the letter-to-sound rules when
417     *        the word is not in the lexicon.
418     *
419     * @return the list of phones for word or null
420     */    
421    public String[] getPhones
422                        (String word, String partOfSpeech, boolean useLTS){
423        String[] phones = null;
424        phones = getPhones(addenda, word, partOfSpeech);
425        if (phones == null) {
426            phones = getPhones(compiled, word, partOfSpeech);
427        }
428        if(useLTS){
429            if (phones == null && letterToSound != null) {
430                phones = letterToSound.getPhones(word, partOfSpeech);
431            }
432        }
433        if(phones != null){
434        String[] copy = new String[phones.length];
435        System.arraycopy(phones, 0, copy, 0, phones.length);
436            return copy;
437        }
438        else return null;
439        
440    }
441    /**
442     * Gets a phone list for a word from a given lexicon.  If a phone
443     * list cannot be found, returns <code>null</code>.  The format is 
444     * lexicon dependent.  If the part of speech does not matter, pass
445     * in <code>null</code>.
446     *
447     * @param lexicon the lexicon
448     * @param word the word to find
449     * @param partOfSpeech the part of speech
450     *
451     * @return the list of phones for word or <code>null</code>
452     */
453    protected String[] getPhones(Map lexicon,
454                                 String word,
455                                 String partOfSpeech) {
456        String[] phones;
457        partOfSpeech = fixPartOfSpeech(partOfSpeech);
458        phones = getPhones(lexicon, word+partOfSpeech);
459        for (int i = 0;
460             (i < partsOfSpeech.size()) && (phones == null);
461             i++) {
462            if (!partOfSpeech.equals((String) partsOfSpeech.get(i))) {
463                phones = getPhones(lexicon,
464                                   word + (String) partsOfSpeech.get(i));
465            }
466        }
467        return phones;
468    }
469
470    /**
471     * Gets a phone list for a word from a given lexicon.  If a phone
472     * list cannot be found, returns <code>null</code>.
473     *
474     * @param lexicon the lexicon
475     * @param wordAndPartOfSpeech word and part of speech concatenated
476     *   together
477     *
478     * @return the list of phones for word or <code>null</code>
479     */
480    protected String[] getPhones(Map lexicon,
481                                 String wordAndPartOfSpeech) {
482        Object value = lexicon.get(wordAndPartOfSpeech);
483        if (value instanceof String[]) {
484            return (String[]) value;
485        } else if (value instanceof String) {
486            String[] phoneArray;
487            phoneArray = getPhones((String) value);
488            if (tokenizeOnLookup) {
489                lexicon.put(wordAndPartOfSpeech, phoneArray);
490            }
491            return phoneArray;
492        } else {
493            return null;
494        }
495    }
496    
497    /** 
498     * Turns the phone <code>String</code> into a <code>String[]</code>,
499     * using " " as the delimiter.
500     *
501     * @param phones the phones
502     *
503     * @return the phones split into an array
504     */
505    protected String[] getPhones(String phones) {
506        ArrayList phoneList = new ArrayList();
507        StringTokenizer tokenizer = new StringTokenizer(phones, " ");
508        while (tokenizer.hasMoreTokens()) {
509            phoneList.add(tokenizer.nextToken());
510        }
511        return (String[]) phoneList.toArray(new String[0]);
512    } 
513    
514    /**
515     * Adds a word to the addenda.
516     *
517     * @param word the word to find
518     * @param partOfSpeech the part of speech
519     * @param phones the phones for the word
520     * 
521     */
522    public void addAddendum(String word,
523                            String partOfSpeech,
524                            String[] phones) {
525        String pos = fixPartOfSpeech(partOfSpeech);
526        if (!partsOfSpeech.contains(pos)) {
527            partsOfSpeech.add(pos);
528        }
529        addenda.put(word + pos, phones);
530    }   
531
532    /**
533     * Removes a word from the addenda.
534     *
535     * @param word the word to remove
536     * @param partOfSpeech the part of speech
537     */
538    public void removeAddendum(String word, String partOfSpeech) {
539        addenda.remove(word + fixPartOfSpeech(partOfSpeech));        
540    }
541
542    /**
543     * Outputs a string to a data output stream.
544     *
545     * @param dos the data output stream
546     * @param s the string to output
547     *
548     * @throws IOException if errors occur during writing
549     */
550    private void outString(DataOutputStream dos, String s) 
551                        throws IOException {
552        dos.writeByte((byte) s.length());
553        for (int i = 0; i < s.length(); i++) {
554            dos.writeChar(s.charAt(i));
555        }
556    }
557
558    /**
559     * Inputs a string from a DataInputStream.  This method is not re-entrant.
560     *
561     * @param dis the data input stream
562     *
563     * @return the string 
564     *
565     * @throws IOException if errors occur during reading
566     */
567    private String getString(DataInputStream dis) throws IOException {
568        int size = dis.readByte();
569        for (int i = 0; i < size; i++) {
570            charBuffer[i] = dis.readChar();
571        }
572        return new String(charBuffer, 0, size);
573    }
574
575    /**
576     * Inputs a string from a DataInputStream.  This method is not re-entrant.
577     *
578     * @param bb the input byte buffer
579     *
580     * @return the string 
581     *
582     * @throws IOException if errors occur during reading
583     */
584    private String getString(ByteBuffer bb) throws IOException {
585        int size = bb.get();
586        for (int i = 0; i < size; i++) {
587            charBuffer[i] = bb.getChar();
588        }
589        return new String(charBuffer, 0, size);
590    }
591
592
593    /**
594     * Dumps a binary form of the database.  This method is not thread-safe.
595     * 
596     * <p>Binary format is:
597     * <pre>
598     * MAGIC
599     * VERSION
600     * (int) numPhonemes
601     * (String) phoneme0
602     * (String) phoneme1
603     * (String) phonemeN
604     * (int) numEntries
605     * (String) nameWithPOS 
606     * (byte) numPhonemes
607     * phoneme index 1
608     * phoneme index 2
609     * phoneme index n
610     * </pre>
611     *
612     * <p>Strings are formatted as: <code>(byte) len char0 char1 charN</code>
613     *
614     * <p>Limits: Strings: 128 chars
615     * <p>Limits: Strings: 128 phonemes per word
616     *
617     * @param lexicon the lexicon to dump 
618     * @param path the path to dump the file to
619     */
620    private void dumpBinaryLexicon(Map lexicon, String path) {
621        try {
622            FileOutputStream fos = new FileOutputStream(path);
623            DataOutputStream dos = new DataOutputStream(new
624                    BufferedOutputStream(fos));
625            List phonemeList = findPhonemes(lexicon);
626
627            dos.writeInt(MAGIC);
628            dos.writeInt(VERSION);
629            dos.writeInt(phonemeList.size());
630
631            for (int i = 0; i < phonemeList.size(); i++) {
632                outString(dos, (String) phonemeList.get(i));
633            }
634
635            dos.writeInt(lexicon.keySet().size());
636            for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
637                String key = (String) i.next();
638                outString(dos, key);
639                String[] phonemes = getPhones(lexicon, key);
640                dos.writeByte((byte) phonemes.length);
641                for (int index = 0; index < phonemes.length; index++) {
642                    int phonemeIndex = phonemeList.indexOf(phonemes[index]);
643                    if (phonemeIndex == -1) {
644                        throw new Error("Can't find phoneme index");
645                    }
646                    dos.writeByte((byte) phonemeIndex);
647                }
648            }
649            dos.close();
650        } catch (FileNotFoundException fe) {
651            throw new Error("Can't dump binary database " +
652                    fe.getMessage());
653        } catch (IOException ioe) {
654            throw new Error("Can't write binary database " +
655                    ioe.getMessage());
656        }
657    }
658
659    /**
660     * Loads the binary lexicon from the given InputStream.
661     * This method is not thread safe.
662     *
663     * @param is the InputStream to load the database from
664     * @param estimatedSize estimate of how large the database is
665     *
666     * @return a <code>Map</code> containing the lexicon
667     *
668     * @throws IOException if an IO error occurs
669     */
670    private Map loadMappedBinaryLexicon(FileInputStream is, int estimatedSize) 
671        throws IOException {
672        FileChannel fc = is.getChannel();
673
674        MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 
675                0, (int) fc.size());
676        bb.load();
677        int size = 0;
678        int numEntries = 0;
679        List phonemeList = new ArrayList();
680
681        // we get better performance for some reason if we
682        // just ignore estimated size
683        //
684        // Map lexicon = new HashMap();
685        Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
686
687        if (bb.getInt() != MAGIC) {
688            throw new Error("bad magic number in lexicon");
689        }
690
691        if (bb.getInt() != VERSION) {
692            throw new Error("bad version number in lexicon");
693        }
694
695        size = bb.getInt();
696        for (int i = 0; i < size; i++) {
697            String phoneme = getString(bb);
698            phonemeList.add(phoneme);
699        }
700        numEntries = bb.getInt();
701
702        for (int i = 0; i < numEntries; i++) {
703            String wordAndPos = getString(bb);
704            String pos = Character.toString(
705                    wordAndPos.charAt(wordAndPos.length() - 1));
706            if (!partsOfSpeech.contains(pos)) {
707                partsOfSpeech.add(pos);
708            }
709
710            int numPhonemes = bb.get();
711            String[] phonemes = new String[numPhonemes];
712
713            for (int j = 0; j < numPhonemes; j++) {
714                phonemes[j] = (String) phonemeList.get(bb.get());
715            }
716            lexicon.put(wordAndPos, phonemes);
717        }
718        fc.close();
719        return lexicon;
720    }
721
722    /**
723     * Loads the binary lexicon from the given InputStream.
724     * This method is not thread safe.
725     *
726     * @param is the InputStream to load the database from
727     * @param estimatedSize estimate of how large the database is
728     *
729     * @return a <code>Map</code> containing the lexicon
730     *
731     * @throws IOException if an IO error occurs
732     */
733    private Map loadBinaryLexicon(InputStream is, int estimatedSize) 
734        throws IOException {
735        DataInputStream dis = new DataInputStream(new
736                BufferedInputStream(is));
737        int size = 0;
738        int numEntries = 0;
739        List phonemeList = new ArrayList();
740
741        // we get better performance for some reason if we
742        // just ignore estimated size
743        //
744        Map lexicon = new LinkedHashMap();
745
746        if (dis.readInt() != MAGIC) {
747            throw new Error("bad magic number in lexicon");
748        }
749
750        if (dis.readInt() != VERSION) {
751            throw new Error("bad version number in lexicon");
752        }
753
754        size = dis.readInt();
755        for (int i = 0; i < size; i++) {
756            String phoneme = getString(dis);
757            phonemeList.add(phoneme);
758        }
759        numEntries = dis.readInt();
760
761        for (int i = 0; i < numEntries; i++) {
762            String wordAndPos = getString(dis);
763            String pos = Character.toString(
764                    wordAndPos.charAt(wordAndPos.length() - 1));
765            if (!partsOfSpeech.contains(pos)) {
766                partsOfSpeech.add(pos);
767            }
768
769            int numPhonemes = dis.readByte();
770            String[] phonemes = new String[numPhonemes];
771
772            for (int j = 0; j < numPhonemes; j++) {
773                phonemes[j] = (String) phonemeList.get(dis.readByte());
774            }
775            lexicon.put(wordAndPos, phonemes);
776        }
777        dis.close();
778        return lexicon;
779    }
780
781    /**
782     * Dumps this lexicon (just the compiled form). Lexicon will be
783     * dumped to two binary files PATH_compiled.bin and
784     * PATH_addenda.bin
785     *
786     * @param path the root path to dump it to
787     */
788    public void dumpBinary(String path) {
789        String compiledPath = path + "_compiled.bin";
790        String addendaPath = path + "_addenda.bin";
791        
792        dumpBinaryLexicon(compiled, compiledPath);
793        dumpBinaryLexicon(addenda, addendaPath);
794    }
795
796    /**
797     * Returns a list of the unique phonemes in the lexicon.
798     *
799     * @param lexicon the lexicon of interest
800     *
801     * @return list the unique set of phonemes
802     */
803    private List findPhonemes(Map lexicon) {
804        List phonemeList = new ArrayList();
805        for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
806            String key = (String) i.next();
807            String[] phonemes = getPhones(lexicon, key);
808            for (int index = 0; index < phonemes.length; index++) {
809                if (!phonemeList.contains(phonemes[index])) {
810                    phonemeList.add(phonemes[index]);
811                }
812            }
813        }
814        return phonemeList;
815    }
816
817
818    /**
819     * Tests to see if this lexicon is identical to the other for
820     * debugging purposes.
821     *
822     * @param other the other lexicon to compare to
823     *
824     * @return true if lexicons are identical
825     */
826    public boolean compare(LexiconImpl other) {
827        return compare(addenda, other.addenda) && 
828              compare(compiled, other.compiled);
829    }
830
831    /**
832     * Determines if the two lexicons are identical for debugging purposes.
833     *
834     * @param lex this lex
835     * @param other the other lexicon to chd
836     *
837     * @return true if they are identical
838     */
839    private boolean compare(Map lex, Map other) {
840        for (Iterator i = lex.keySet().iterator(); i.hasNext(); ) {
841            String key = (String) i.next();
842            String[] thisPhonemes = getPhones(lex, key);
843            String[] otherPhonemes = getPhones(other, key);
844            if (thisPhonemes == null) {
845                System.out.println(key + " not found in this.");
846                return false;
847            } else if (otherPhonemes == null) {
848                System.out.println(key + " not found in other.");
849                return false;
850            } else if (thisPhonemes.length == otherPhonemes.length) {
851                for (int j = 0; j < thisPhonemes.length; j++) {
852                    if (!thisPhonemes[j].equals(otherPhonemes[j])) {
853                        return false;
854                    }
855                }
856            } else {
857                return false;
858            }
859        }
860        return true;
861    }
862 
863    /**
864     * Fixes the part of speech if it is <code>null</code>.  The
865     * default representation of a <code>null</code> part of speech
866     * is the number "0".
867     */
868    static protected String fixPartOfSpeech(String partOfSpeech) {
869        return (partOfSpeech == null) ? "0" : partOfSpeech;
870    }
871}