/**
 * Portions Copyright 2001 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute,
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package com.sun.speech.freetts.lexicon;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import com.sun.speech.freetts.util.BulkTimer;
import com.sun.speech.freetts.util.Utilities;

/**
 * Provides an implementation of a Lexicon.
 *
 * <p>This implementation will either read from a straight ASCII file
 * or a binary file.  When reading from an ASCII file, you can specify
 * when the input line is tokenized:  load, lookup, or never.  If you
 * specify 'load', the entire file will be parsed when it is loaded.
 * If you specify 'lookup', the file will be loaded, but the parsing
 * for each line will be delayed until it is referenced and the parsed
 * form will be saved away.  If you specify 'never', the lines will
 * be parsed each time they are referenced.  The default is 'never'.
 * To specify the load type, set the system property as follows:
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.lexicon.LexTokenize=load
 * </pre>
 *
 * <p>If a binary file is used, you can also specify whether the new
 * IO package is used.  The new IO package is new for JDK1.4, and can
 * greatly improve the speed of loading files.  To enable new IO, use
 * the following system property (it is enabled by default):
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.useNewIO=true
 * </pre>
 *
 * <p>The implementation also allows users to define their own addenda
 * that will be used in addition to the system addenda.  If the user
 * defines their own addenda, its values will be added to the system
 * addenda, overriding any existing elements in the system addenda.
 * To define a user addenda, the user needs to set the following
 * property:
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.lexicon.userAddenda=&lt;URLToUserAddenda&gt;
 * </pre>
 *
 * Where &lt;URLToUserAddenda&gt; is a URL pointing to an ASCII file
 * containing addenda entries.
 *
 * <p>Instances are not thread-safe: the addenda, the list of parts of
 * speech and the scratch buffer used while reading binary files are
 * all mutated without synchronization.
 *
 * <p>[[[TODO: support multiple homographs with the same part of speech.]]]
 */
public abstract class LexiconImpl implements Lexicon {
    /**
     * If true, the phone string is replaced with the phone array in
     * the hashmap when the phone array is loaded.  The side effects
     * of this are quicker lookups, but more memory usage and a longer
     * startup time.  Set by
     * <code>-Dcom.sun.speech.freetts.lexicon.LexTokenize=load</code>.
     */
    protected boolean tokenizeOnLoad = false;

    /**
     * If true, the phone string is replaced with the phone array the
     * first time the entry is looked up, so later lookups of the same
     * entry do not have to tokenize it again.  Set by
     * <code>-Dcom.sun.speech.freetts.lexicon.LexTokenize=lookup</code>.
     */
    protected boolean tokenizeOnLookup = false;

    /**
     * Magic number for binary Lexicon files.
     */
    private static final int MAGIC = 0xBABB1E;

    /**
     * Current binary file version.
     */
    private static final int VERSION = 1;

    /**
     * No phones for this word.
     */
    private static final String[] NO_PHONES = new String[0];

    /**
     * A static directory of already-loaded compiled Map objects, keyed
     * by the external form of their compiledURL.  This is used to share
     * the immutable compiled lexicons between lexicon instances.
     * As the addenda can be changed using <code>addAddendum()</code>
     * and <code>removeAddendum</code>, each lexicon instance has its
     * own addenda.
     *
     * <p>The key is a String rather than the URL itself because
     * <code>URL.equals</code> and <code>URL.hashCode</code> may need to
     * resolve the host name.
     */
    private static final Map loadedCompiledLexicons = new HashMap();

    /**
     * URL for the compiled form.
     */
    private URL compiledURL;

    /**
     * URL for the addenda.
     */
    private URL addendaURL;

    /**
     * URL for the letter to sound rules.
     */
    private URL letterToSoundURL;

    /**
     * The addenda.
     */
    private Map addenda;

    /**
     * The compiled lexicon (a read-only view of the shared map).
     */
    private Map compiled;

    /**
     * Per-instance cache of tokenized entries of the compiled lexicon.
     * Only used when <code>tokenizeOnLookup</code> is set; the compiled
     * lexicon itself is read-only and shared, so the tokenized form
     * cannot be stored back into it.
     */
    private final Map tokenizedCompiled = new HashMap();

    /**
     * The LetterToSound rules.
     */
    private LetterToSound letterToSound = null;

    /**
     * Parts of Speech.
     */
    private ArrayList partsOfSpeech = new ArrayList();

    /**
     * Loaded State of the lexicon
     */
    private boolean loaded = false;

    /**
     * Type of lexicon to load
     */
    private boolean binary = false;

    /**
     * Temporary place holder used while reading strings from a binary
     * lexicon.  A string length is stored as a signed byte, so 128
     * chars is always enough.
     */
    private char charBuffer[] = new char[128];

    /**
     * Use the new IO package?
     */
    private boolean useNewIO =
        Utilities.getProperty("com.sun.speech.freetts.useNewIO",
                              "true").equals("true");

    /**
     * Create a new LexiconImpl by reading from the given URLS.
     *
     * @param compiledURL a URL pointing to the compiled lexicon
     * @param addendaURL a URL pointing to lexicon addenda
     * @param letterToSoundURL a URL pointing to the LetterToSound rules
     *        to use if a word cannot be found in the compiled form or
     *        the addenda
     * @param binary if <code>true</code>, the input streams are binary;
     *        otherwise, they are text.
     */
    public LexiconImpl(URL compiledURL, URL addendaURL,
                       URL letterToSoundURL,
                       boolean binary) {
        this();
        setLexiconParameters(compiledURL, addendaURL, letterToSoundURL,
                             binary);
    }

    /**
     * Class constructor for an empty Lexicon.
     */
    public LexiconImpl() {
        // Find out when to convert the phone string into an array.
        //
        String tokenize =
            Utilities.getProperty("com.sun.speech.freetts.lexicon.LexTokenize",
                                  "never");
        tokenizeOnLoad = tokenize.equals("load");
        tokenizeOnLookup = tokenize.equals("lookup");
    }

    /**
     * Sets the lexicon parameters
     *
     * @param compiledURL a URL pointing to the compiled lexicon
     * @param addendaURL a URL pointing to lexicon addenda
     * @param letterToSoundURL a URL pointing to the LetterToSound to use
     * @param binary if <code>true</code>, the input streams are binary;
     *        otherwise, they are text.
     */
    protected void setLexiconParameters(URL compiledURL,
                                        URL addendaURL,
                                        URL letterToSoundURL,
                                        boolean binary) {
        this.compiledURL = compiledURL;
        this.addendaURL = addendaURL;
        this.letterToSoundURL = letterToSoundURL;
        this.binary = binary;
    }

    /**
     * Determines if this lexicon is loaded.
     *
     * @return <code>true</code> if the lexicon is loaded
     */
    public boolean isLoaded() {
        return loaded;
    }

    /**
     * Loads the data for this lexicon: the compiled lexicon (shared
     * with other instances that use the same URL), the addenda, the
     * optional user addenda and finally the letter-to-sound rules.
     * The lexicon is only reported as loaded once all of them have
     * been read successfully.
     *
     * @throws IOException if errors occur during loading
     */
    public void load() throws IOException {
        BulkTimer.LOAD.start("Lexicon");
        try {
            if (compiledURL == null) {
                throw new IOException("Can't load lexicon");
            }
            if (addendaURL == null) {
                throw new IOException("Can't load lexicon addenda " );
            }

            tokenizedCompiled.clear();
            compiled = Collections.unmodifiableMap(loadSharedCompiled());

            // [[[TODO: what is the best way to derive the estimated sizes?]]]
            //
            addenda = readLexicon(addendaURL, binary, 50,
                    "Can't load lexicon addenda from " + addendaURL);

            mergeUserAddenda();
        } finally {
            // stop the timer on failure too, so it is never left running
            BulkTimer.LOAD.stop("Lexicon");
        }

        // The letter-to-sound rules are not part of the "Lexicon" timing.
        letterToSound = new LetterToSoundImpl(letterToSoundURL, binary);
        loaded = true;
    }

    /**
     * Returns the shared compiled lexicon for <code>compiledURL</code>,
     * reading it if no instance has loaded it yet.
     *
     * @return the shared, mutable compiled lexicon map
     *
     * @throws IOException if the lexicon cannot be read
     */
    private Map loadSharedCompiled() throws IOException {
        String cacheKey = compiledURL.toExternalForm();
        Map shared = (Map) loadedCompiledLexicons.get(cacheKey);
        if (shared == null) {
            shared = readLexicon(compiledURL, binary, 65000,
                    "Can't load lexicon from " + compiledURL);
            loadedCompiledLexicons.put(cacheKey, shared);
        } else {
            // Reading a lexicon registers its parts of speech as a side
            // effect; when the map is reused that has to be done here,
            // otherwise the "any part of speech" fallback of getPhones
            // would not see the parts of speech of the compiled lexicon.
            for (Iterator i = shared.keySet().iterator(); i.hasNext(); ) {
                registerPartOfSpeech((String) i.next());
            }
        }
        return shared;
    }

    /**
     * Opens the given URL, reads it as a lexicon and closes the stream
     * again, whether or not reading succeeded.
     *
     * @param url the location of the lexicon
     * @param isBinary if <code>true</code>, the data is binary
     * @param estimatedSize the estimated number of entries
     * @param failureMessage the message of the exception thrown when
     *        the URL cannot be opened
     *
     * @return the lexicon read from the URL
     *
     * @throws IOException if the URL cannot be opened or read
     */
    private Map readLexicon(URL url, boolean isBinary, int estimatedSize,
                            String failureMessage) throws IOException {
        InputStream is = Utilities.getInputStream(url);
        if (is == null) {
            throw new IOException(failureMessage);
        }
        try {
            return createLexicon(is, isBinary, estimatedSize);
        } finally {
            is.close();
        }
    }

    /**
     * Loads the user-defined addenda, if the
     * <code>com.sun.speech.freetts.lexicon.userAddenda</code> property
     * is set, and overrides any existing entries in the system addenda.
     * The user addenda is always read as text.
     *
     * @throws IOException if the property is not a valid URL or the
     *         addenda cannot be read
     */
    private void mergeUserAddenda() throws IOException {
        String userAddenda = Utilities.getProperty(
                "com.sun.speech.freetts.lexicon.userAddenda", null);
        if (userAddenda == null) {
            return;
        }

        URL userAddendaURL;
        try {
            userAddendaURL = new URL(userAddenda);
        } catch (MalformedURLException e) {
            IOException ioe = new IOException(
                    "User addenda URL is malformed: " + userAddenda);
            ioe.initCause(e);
            throw ioe;
        }

        addenda.putAll(readLexicon(userAddendaURL, false, 50,
                "Can't load user addenda from " + userAddenda));
    }

    /**
     * Reads the given input stream as lexicon data and returns the
     * results in a <code>Map</code>.
     *
     * @param is the input stream
     * @param binary if <code>true</code>, the data is binary
     * @param estimatedSize the estimated size of the lexicon
     *
     * @return the lexicon read from the stream
     *
     * @throws IOException if errors are encountered while reading the data
     */
    protected Map createLexicon(InputStream is,
                                boolean binary,
                                int estimatedSize)
        throws IOException {
        if (!binary) {
            return loadTextLexicon(is, estimatedSize);
        }
        if (useNewIO && is instanceof FileInputStream) {
            return loadMappedBinaryLexicon((FileInputStream) is,
                                           estimatedSize);
        }
        // loadBinaryLexicon adds the buffering and data decoding itself
        return loadBinaryLexicon(is, estimatedSize);
    }

    /**
     * Reads the given input stream as text lexicon data and returns the
     * results in a <code>Map</code>.  Lines starting with "***" are
     * skipped.
     *
     * @param is the input stream
     * @param estimatedSize the estimated number of entries of the lexicon
     *
     * @return the lexicon read from the stream
     *
     * @throws IOException if errors are encountered while reading the data
     */
    protected Map loadTextLexicon(InputStream is, int estimatedSize)
        throws IOException {
        Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(is));

        String line;
        while ((line = reader.readLine()) != null) {
            if (!line.startsWith("***")) {
                parseAndAdd(lexicon, line);
            }
        }
        return lexicon;
    }

    /**
     * Creates a word from the given input line and add it to the lexicon.
     * A line consists of the word with its part of speech appended,
     * optionally followed by a tab and the phones.  Lines without any
     * content are ignored.
     *
     * @param lexicon the lexicon
     * @param line the input text
     */
    protected void parseAndAdd(Map lexicon, String line) {
        StringTokenizer tokenizer = new StringTokenizer(line, "\t");
        if (!tokenizer.hasMoreTokens()) {
            return;     // blank line: nothing to add
        }

        String wordAndPos = tokenizer.nextToken();
        registerPartOfSpeech(wordAndPos);

        if (!tokenizer.hasMoreTokens()) {
            lexicon.put(wordAndPos, NO_PHONES);
            return;
        }

        String phones = tokenizer.nextToken();
        if (tokenizeOnLoad) {
            lexicon.put(wordAndPos, getPhones(phones));
        } else {
            lexicon.put(wordAndPos, phones);
        }
    }

    /**
     * Remembers the part of speech of a lexicon key, which is the last
     * character of the key.
     *
     * @param wordAndPos the word with its part of speech appended
     */
    private void registerPartOfSpeech(String wordAndPos) {
        if (wordAndPos.length() == 0) {
            return;
        }
        String pos = wordAndPos.substring(wordAndPos.length() - 1);
        if (!partsOfSpeech.contains(pos)) {
            partsOfSpeech.add(pos);
        }
    }

    /**
     * Gets the phone list for a given word.  If a phone list cannot
     * be found, returns <code>null</code>.  The format is lexicon
     * dependent.  If the part of speech does not matter, pass in
     * <code>null</code>.
     *
     * @param word the word to find
     * @param partOfSpeech the part of speech
     *
     * @return the list of phones for word or <code>null</code>
     */
    public String[] getPhones(String word, String partOfSpeech) {
        return getPhones(word, partOfSpeech, true);
    }

    /**
     * Gets the phone list for a given word.  If a phone list cannot
     * be found, <code>null</code> is returned.  The
     * <code>partOfSpeech</code> is implementation dependent, but
     * <code>null</code> always matches.  The addenda is consulted
     * first, then the compiled lexicon and finally, if requested, the
     * letter-to-sound rules.
     *
     * @param word the word to find
     * @param partOfSpeech the part of speech or <code>null</code>
     * @param useLTS whether to use the letter-to-sound rules when
     *        the word is not in the lexicon.
     *
     * @return a copy of the list of phones for word or <code>null</code>
     */
    public String[] getPhones(String word, String partOfSpeech,
                              boolean useLTS) {
        String[] phones = getPhones(addenda, word, partOfSpeech);
        if (phones == null) {
            phones = getPhones(compiled, word, partOfSpeech);
        }
        if (phones == null && useLTS && letterToSound != null) {
            phones = letterToSound.getPhones(word, partOfSpeech);
        }
        if (phones == null) {
            return null;
        }
        // hand out a copy so that callers cannot modify lexicon entries
        return (String[]) phones.clone();
    }

    /**
     * Gets a phone list for a word from a given lexicon.  If a phone
     * list cannot be found, returns <code>null</code>.  The format is
     * lexicon dependent.  If the part of speech does not matter, pass
     * in <code>null</code>.  When there is no entry for the requested
     * part of speech, the other known parts of speech are tried.
     *
     * @param lexicon the lexicon
     * @param word the word to find
     * @param partOfSpeech the part of speech
     *
     * @return the list of phones for word or <code>null</code>
     */
    protected String[] getPhones(Map lexicon,
                                 String word,
                                 String partOfSpeech) {
        String pos = fixPartOfSpeech(partOfSpeech);
        String[] phones = getPhones(lexicon, word + pos);
        for (int i = 0; phones == null && i < partsOfSpeech.size(); i++) {
            String candidate = (String) partsOfSpeech.get(i);
            if (!pos.equals(candidate)) {
                phones = getPhones(lexicon, word + candidate);
            }
        }
        return phones;
    }

    /**
     * Gets a phone list for a word from a given lexicon.  If a phone
     * list cannot be found, returns <code>null</code>.
     *
     * <p>If <code>tokenizeOnLookup</code> is set, an entry that is
     * still stored as a phone string is tokenized once and the result
     * is remembered: in the lexicon itself, or, for the read-only
     * compiled lexicon, in a cache private to this instance.
     *
     * @param lexicon the lexicon
     * @param wordAndPartOfSpeech word and part of speech concatenated
     *        together
     *
     * @return the list of phones for word or <code>null</code>
     */
    protected String[] getPhones(Map lexicon,
                                 String wordAndPartOfSpeech) {
        Object value = lexicon.get(wordAndPartOfSpeech);
        if (!tokenizeOnLookup || !(value instanceof String)) {
            return toPhoneArray(value);
        }

        if (lexicon == compiled) {
            // 'compiled' is an unmodifiable view, put() would throw
            String[] cached =
                (String[]) tokenizedCompiled.get(wordAndPartOfSpeech);
            if (cached == null) {
                cached = getPhones((String) value);
                tokenizedCompiled.put(wordAndPartOfSpeech, cached);
            }
            return cached;
        }

        String[] phoneArray = getPhones((String) value);
        lexicon.put(wordAndPartOfSpeech, phoneArray);
        return phoneArray;
    }

    /**
     * Converts a lexicon value into a phone array without modifying
     * any lexicon.
     *
     * @param value a lexicon value: a <code>String[]</code>, a phone
     *        <code>String</code> or anything else
     *
     * @return the phones, or <code>null</code> if the value is neither
     *         a phone array nor a phone string
     */
    private String[] toPhoneArray(Object value) {
        if (value instanceof String[]) {
            return (String[]) value;
        }
        if (value instanceof String) {
            return getPhones((String) value);
        }
        return null;
    }

    /**
     * Turns the phone <code>String</code> into a <code>String[]</code>,
     * using " " as the delimiter.
     *
     * @param phones the phones
     *
     * @return the phones split into an array
     */
    protected String[] getPhones(String phones) {
        ArrayList phoneList = new ArrayList();
        StringTokenizer tokenizer = new StringTokenizer(phones, " ");
        while (tokenizer.hasMoreTokens()) {
            phoneList.add(tokenizer.nextToken());
        }
        return (String[]) phoneList.toArray(new String[0]);
    }

    /**
     * Adds a word to the addenda.  The phones are copied, so later
     * changes to the given array do not affect the lexicon.
     *
     * @param word the word to add
     * @param partOfSpeech the part of speech
     * @param phones the phones for the word
     */
    public void addAddendum(String word,
                            String partOfSpeech,
                            String[] phones) {
        String pos = fixPartOfSpeech(partOfSpeech);
        if (!partsOfSpeech.contains(pos)) {
            partsOfSpeech.add(pos);
        }
        addenda.put(word + pos,
                    (phones == null) ? null : (String[]) phones.clone());
    }

    /**
     * Removes a word from the addenda.
     *
     * @param word the word to remove
     * @param partOfSpeech the part of speech
     */
    public void removeAddendum(String word, String partOfSpeech) {
        addenda.remove(word + fixPartOfSpeech(partOfSpeech));
    }

    /**
     * Writes a value that the binary format stores in a single signed
     * byte, refusing values that would not survive the round trip.
     *
     * @param dos the data output stream
     * @param value the value to write
     * @param what a description of the value for the error message
     *
     * @throws IOException if the value does not fit or writing fails
     */
    private static void writeByteField(DataOutputStream dos, int value,
                                       String what) throws IOException {
        if (value < 0 || value > Byte.MAX_VALUE) {
            throw new IOException(what + " " + value
                    + " exceeds the binary lexicon limit of "
                    + Byte.MAX_VALUE);
        }
        dos.writeByte(value);
    }

    /**
     * Outputs a string to a data output stream.
     *
     * @param dos the data output stream
     * @param s the string to output
     *
     * @throws IOException if the string is longer than 127 chars or
     *         errors occur during writing
     */
    private void outString(DataOutputStream dos, String s)
        throws IOException {
        writeByteField(dos, s.length(), "string length");
        for (int i = 0; i < s.length(); i++) {
            dos.writeChar(s.charAt(i));
        }
    }

    /**
     * Validates a string length read from a binary lexicon.
     *
     * @param length the length as read (a signed byte)
     *
     * @return the length
     *
     * @throws IOException if the length is negative
     */
    private static int checkedLength(int length) throws IOException {
        if (length < 0) {
            throw new IOException("bad string length in lexicon: " + length);
        }
        return length;
    }

    /**
     * Inputs a string from a DataInputStream.  This method is not
     * re-entrant.
     *
     * @param dis the data input stream
     *
     * @return the string
     *
     * @throws IOException if errors occur during reading
     */
    private String getString(DataInputStream dis) throws IOException {
        int size = checkedLength(dis.readByte());
        for (int i = 0; i < size; i++) {
            charBuffer[i] = dis.readChar();
        }
        return new String(charBuffer, 0, size);
    }

    /**
     * Inputs a string from a ByteBuffer.  This method is not
     * re-entrant.
     *
     * @param bb the input byte buffer
     *
     * @return the string
     *
     * @throws IOException if errors occur during reading
     */
    private String getString(ByteBuffer bb) throws IOException {
        int size = checkedLength(bb.get());
        for (int i = 0; i < size; i++) {
            charBuffer[i] = bb.getChar();
        }
        return new String(charBuffer, 0, size);
    }

    /**
     * Checks the header fields of a binary lexicon.
     *
     * @param magic the magic number read from the file
     * @param version the version read from the file
     */
    private static void checkHeader(int magic, int version) {
        if (magic != MAGIC) {
            throw new Error("bad magic number in lexicon");
        }
        if (version != VERSION) {
            throw new Error("bad version number in lexicon");
        }
    }

    /**
     * Dumps a binary form of the database.  This method is not thread-safe.
     *
     * <p>Binary format is:
     * <pre>
     * MAGIC
     * VERSION
     * (int) numPhonemes
     * (String) phoneme0
     * (String) phoneme1
     * (String) phonemeN
     * (int) numEntries
     * (String) nameWithPOS
     * (byte) numPhonemes
     * phoneme index 1
     * phoneme index 2
     * phoneme index n
     * </pre>
     *
     * <p>Strings are formatted as: <code>(byte) len char0 char1 charN</code>
     *
     * <p>Limits: every length, count and index stored in a byte is
     * signed, so a string has at most 127 chars, a word has at most
     * 127 phonemes and phoneme indexes range from 0 to 127.  A lexicon
     * exceeding these limits cannot be dumped.
     *
     * @param lexicon the lexicon to dump
     * @param path the path to dump the file to
     */
    private void dumpBinaryLexicon(Map lexicon, String path) {
        try {
            FileOutputStream fos = new FileOutputStream(path);
            DataOutputStream dos = new DataOutputStream(new
                    BufferedOutputStream(fos));
            try {
                List phonemeList = findPhonemes(lexicon);

                dos.writeInt(MAGIC);
                dos.writeInt(VERSION);
                dos.writeInt(phonemeList.size());
                for (int i = 0; i < phonemeList.size(); i++) {
                    outString(dos, (String) phonemeList.get(i));
                }

                dos.writeInt(lexicon.keySet().size());
                for (Iterator i = lexicon.keySet().iterator();
                     i.hasNext(); ) {
                    String key = (String) i.next();
                    outString(dos, key);
                    String[] phonemes = toPhoneArray(lexicon.get(key));
                    writeByteField(dos, phonemes.length, "phoneme count");
                    for (int index = 0; index < phonemes.length; index++) {
                        int phonemeIndex =
                            phonemeList.indexOf(phonemes[index]);
                        if (phonemeIndex == -1) {
                            throw new Error("Can't find phoneme index");
                        }
                        writeByteField(dos, phonemeIndex, "phoneme index");
                    }
                }
            } finally {
                dos.close();
            }
        } catch (FileNotFoundException fe) {
            throw new Error("Can't dump binary database " +
                    fe.getMessage(), fe);
        } catch (IOException ioe) {
            throw new Error("Can't write binary database " +
                    ioe.getMessage(), ioe);
        }
    }

    /**
     * Loads the binary lexicon from the given file by mapping it into
     * memory.  The channel of the stream is closed when done.
     * This method is not thread safe.
     *
     * @param is the InputStream to load the database from
     * @param estimatedSize estimate of how large the database is
     *
     * @return a <code>Map</code> containing the lexicon
     *
     * @throws IOException if an IO error occurs
     */
    private Map loadMappedBinaryLexicon(FileInputStream is,
                                        int estimatedSize)
        throws IOException {
        FileChannel fc = is.getChannel();
        try {
            MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY,
                                         0, fc.size());
            bb.load();

            int magic = bb.getInt();
            int version = bb.getInt();
            checkHeader(magic, version);

            List phonemeList = new ArrayList();
            int size = bb.getInt();
            for (int i = 0; i < size; i++) {
                phonemeList.add(getString(bb));
            }

            Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
            int numEntries = bb.getInt();
            for (int i = 0; i < numEntries; i++) {
                String wordAndPos = getString(bb);
                registerPartOfSpeech(wordAndPos);

                int numPhonemes = checkedLength(bb.get());
                String[] phonemes = new String[numPhonemes];
                for (int j = 0; j < numPhonemes; j++) {
                    phonemes[j] = (String) phonemeList.get(bb.get());
                }
                lexicon.put(wordAndPos, phonemes);
            }
            return lexicon;
        } finally {
            fc.close();
        }
    }

    /**
     * Loads the binary lexicon from the given InputStream.  The stream
     * is closed when done.
     * This method is not thread safe.
     *
     * @param is the InputStream to load the database from
     * @param estimatedSize estimate of how large the database is
     *        (ignored, see below)
     *
     * @return a <code>Map</code> containing the lexicon
     *
     * @throws IOException if an IO error occurs
     */
    private Map loadBinaryLexicon(InputStream is, int estimatedSize)
        throws IOException {
        DataInputStream dis = new DataInputStream(new
                BufferedInputStream(is));
        try {
            int magic = dis.readInt();
            int version = dis.readInt();
            checkHeader(magic, version);

            List phonemeList = new ArrayList();
            int size = dis.readInt();
            for (int i = 0; i < size; i++) {
                phonemeList.add(getString(dis));
            }

            // we get better performance for some reason if we
            // just ignore estimated size
            //
            Map lexicon = new LinkedHashMap();
            int numEntries = dis.readInt();
            for (int i = 0; i < numEntries; i++) {
                String wordAndPos = getString(dis);
                registerPartOfSpeech(wordAndPos);

                int numPhonemes = checkedLength(dis.readByte());
                String[] phonemes = new String[numPhonemes];
                for (int j = 0; j < numPhonemes; j++) {
                    phonemes[j] = (String) phonemeList.get(dis.readByte());
                }
                lexicon.put(wordAndPos, phonemes);
            }
            return lexicon;
        } finally {
            dis.close();
        }
    }

    /**
     * Dumps this lexicon.  Lexicon will be dumped to two binary files
     * PATH_compiled.bin and PATH_addenda.bin
     *
     * @param path the root path to dump it to
     */
    public void dumpBinary(String path) {
        String compiledPath = path + "_compiled.bin";
        String addendaPath = path + "_addenda.bin";

        dumpBinaryLexicon(compiled, compiledPath);
        dumpBinaryLexicon(addenda, addendaPath);
    }

    /**
     * Returns a list of the unique phonemes in the lexicon, in the
     * order in which they are first encountered.
     *
     * @param lexicon the lexicon of interest
     *
     * @return list the unique set of phonemes
     */
    private List findPhonemes(Map lexicon) {
        List phonemeList = new ArrayList();
        for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
            String[] phonemes = toPhoneArray(lexicon.get(i.next()));
            for (int index = 0; index < phonemes.length; index++) {
                if (!phonemeList.contains(phonemes[index])) {
                    phonemeList.add(phonemes[index]);
                }
            }
        }
        return phonemeList;
    }

    /**
     * Tests to see if this lexicon is identical to the other for
     * debugging purposes.
     *
     * @param other the other lexicon to compare to
     *
     * @return true if lexicons are identical
     */
    public boolean compare(LexiconImpl other) {
        return compare(addenda, other.addenda) &&
               compare(compiled, other.compiled);
    }

    /**
     * Determines if the two lexicons are identical for debugging
     * purposes.  Neither lexicon is modified.
     *
     * @param lex this lex
     * @param other the other lexicon to check
     *
     * @return true if they are identical
     */
    private boolean compare(Map lex, Map other) {
        for (Iterator i = lex.keySet().iterator(); i.hasNext(); ) {
            String key = (String) i.next();
            String[] thisPhonemes = toPhoneArray(lex.get(key));
            String[] otherPhonemes = toPhoneArray(other.get(key));
            if (thisPhonemes == null) {
                System.out.println(key + " not found in this.");
                return false;
            }
            if (otherPhonemes == null) {
                System.out.println(key + " not found in other.");
                return false;
            }
            if (!Arrays.equals(thisPhonemes, otherPhonemes)) {
                return false;
            }
        }
        // every entry of lex matches; other must not have extra entries
        return lex.size() == other.size();
    }

    /**
     * Fixes the part of speech if it is <code>null</code>.  The
     * default representation of a <code>null</code> part of speech
     * is the number "0".
     *
     * @param partOfSpeech the part of speech or <code>null</code>
     *
     * @return the given part of speech, or "0" if it is <code>null</code>
     */
    static protected String fixPartOfSpeech(String partOfSpeech) {
        return (partOfSpeech == null) ? "0" : partOfSpeech;
    }
}