001/* 002 * Copyright 1999-2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.itextpdf.text.pdf.hyphenation; 018 019import java.io.Serializable; 020import java.util.Enumeration; 021import java.util.Stack; 022 023/** 024 * <h2>Ternary Search Tree.</h2> 025 * 026 * <p>A ternary search tree is a hybrid between a binary tree and 027 * a digital search tree (trie). Keys are limited to strings. 028 * A data value of type char is stored in each leaf node. 029 * It can be used as an index (or pointer) to the data. 030 * Branches that only contain one key are compressed to one node 031 * by storing a pointer to the trailer substring of the key. 032 * This class is intended to serve as base class or helper class 033 * to implement Dictionary collections or the like. Ternary trees 034 * have some nice properties as the following: the tree can be 035 * traversed in sorted order, partial matches (wildcard) can be 036 * implemented, retrieval of all keys within a given distance 037 * from the target, etc. The storage requirements are higher than 038 * a binary tree but a lot less than a trie. Performance is 039 * comparable with a hash table, sometimes it outperforms a hash 040 * function (most of the time can determine a miss faster than a hash).</p> 041 * 042 * <p>The main purpose of this java port is to serve as a base for 043 * implementing TeX's hyphenation algorithm (see The TeXBook, 044 * appendix H). Each language requires from 5000 to 15000 hyphenation 045 * patterns which will be keys in this tree. The strings patterns 046 * are usually small (from 2 to 5 characters), but each char in the 047 * tree is stored in a node. Thus memory usage is the main concern. 048 * We will sacrifice 'elegance' to keep memory requirements to the 049 * minimum. Using java's char type as pointer (yes, I know pointer 050 * it is a forbidden word in java) we can keep the size of the node 051 * to be just 8 bytes (3 pointers and the data char). This gives 052 * room for about 65000 nodes. In my tests the English patterns 053 * took 7694 nodes and the German patterns 10055 nodes, 054 * so I think we are safe.</p> 055 * 056 * <p>All said, this is a map with strings as keys and char as value. 057 * Pretty limited!. It can be extended to a general map by 058 * using the string representation of an object and using the 059 * char value as an index to an array that contains the object 060 * values.</p> 061 * 062 * @author cav@uniscope.co.jp 063 */ 064 065public class TernaryTree implements Cloneable, Serializable { 066 067 /** 068 * We use 4 arrays to represent a node. I guess I should have created 069 * a proper node class, but somehow Knuth's pascal code made me forget 070 * we now have a portable language with virtual memory management and 071 * automatic garbage collection! And now is kind of late, furthermore, 072 * if it ain't broken, don't fix it. 073 */ 074 075 private static final long serialVersionUID = 5313366505322983510L; 076 077 /** 078 * Pointer to low branch and to rest of the key when it is 079 * stored directly in this node, we don't have unions in java! 080 */ 081 protected char[] lo; 082 083 /** 084 * Pointer to high branch. 085 */ 086 protected char[] hi; 087 088 /** 089 * Pointer to equal branch and to data when this node is a string terminator. 090 */ 091 protected char[] eq; 092 093 /** 094 * <P>The character stored in this node: splitchar. 095 * Two special values are reserved:</P> 096 * <ul><li>0x0000 as string terminator</li> 097 * <li>0xFFFF to indicate that the branch starting at 098 * this node is compressed</li></ul> 099 * <p>This shouldn't be a problem if we give the usual semantics to 100 * strings since 0xFFFF is guaranteed not to be an Unicode character.</p> 101 */ 102 protected char[] sc; 103 104 /** 105 * This vector holds the trailing of the keys when the branch is compressed. 106 */ 107 protected CharVector kv; 108 109 protected char root; 110 protected char freenode; 111 protected int length; // number of items in tree 112 113 protected static final int BLOCK_SIZE = 2048; // allocation size for arrays 114 115 TernaryTree() { 116 init(); 117 } 118 119 protected void init() { 120 root = 0; 121 freenode = 1; 122 length = 0; 123 lo = new char[BLOCK_SIZE]; 124 hi = new char[BLOCK_SIZE]; 125 eq = new char[BLOCK_SIZE]; 126 sc = new char[BLOCK_SIZE]; 127 kv = new CharVector(); 128 } 129 130 /** 131 * Branches are initially compressed, needing 132 * one node per key plus the size of the string 133 * key. They are decompressed as needed when 134 * another key with same prefix 135 * is inserted. This saves a lot of space, 136 * specially for long keys. 137 */ 138 public void insert(String key, char val) { 139 // make sure we have enough room in the arrays 140 int len = key.length() 141 + 1; // maximum number of nodes that may be generated 142 if (freenode + len > eq.length) { 143 redimNodeArrays(eq.length + BLOCK_SIZE); 144 } 145 char strkey[] = new char[len--]; 146 key.getChars(0, len, strkey, 0); 147 strkey[len] = 0; 148 root = insert(root, strkey, 0, val); 149 } 150 151 public void insert(char[] key, int start, char val) { 152 int len = strlen(key) + 1; 153 if (freenode + len > eq.length) { 154 redimNodeArrays(eq.length + BLOCK_SIZE); 155 } 156 root = insert(root, key, start, val); 157 } 158 159 /** 160 * The actual insertion function, recursive version. 161 */ 162 private char insert(char p, char[] key, int start, char val) { 163 int len = strlen(key, start); 164 if (p == 0) { 165 // this means there is no branch, this node will start a new branch. 166 // Instead of doing that, we store the key somewhere else and create 167 // only one node with a pointer to the key 168 p = freenode++; 169 eq[p] = val; // holds data 170 length++; 171 hi[p] = 0; 172 if (len > 0) { 173 sc[p] = 0xFFFF; // indicates branch is compressed 174 lo[p] = (char)kv.alloc(len 175 + 1); // use 'lo' to hold pointer to key 176 strcpy(kv.getArray(), lo[p], key, start); 177 } else { 178 sc[p] = 0; 179 lo[p] = 0; 180 } 181 return p; 182 } 183 184 if (sc[p] == 0xFFFF) { 185 // branch is compressed: need to decompress 186 // this will generate garbage in the external key array 187 // but we can do some garbage collection later 188 char pp = freenode++; 189 lo[pp] = lo[p]; // previous pointer to key 190 eq[pp] = eq[p]; // previous pointer to data 191 lo[p] = 0; 192 if (len > 0) { 193 sc[p] = kv.get(lo[pp]); 194 eq[p] = pp; 195 lo[pp]++; 196 if (kv.get(lo[pp]) == 0) { 197 // key completely decompressed leaving garbage in key array 198 lo[pp] = 0; 199 sc[pp] = 0; 200 hi[pp] = 0; 201 } else { 202 // we only got first char of key, rest is still there 203 sc[pp] = 0xFFFF; 204 } 205 } else { 206 // In this case we can save a node by swapping the new node 207 // with the compressed node 208 sc[pp] = 0xFFFF; 209 hi[p] = pp; 210 sc[p] = 0; 211 eq[p] = val; 212 length++; 213 return p; 214 } 215 } 216 char s = key[start]; 217 if (s < sc[p]) { 218 lo[p] = insert(lo[p], key, start, val); 219 } else if (s == sc[p]) { 220 if (s != 0) { 221 eq[p] = insert(eq[p], key, start + 1, val); 222 } else { 223 // key already in tree, overwrite data 224 eq[p] = val; 225 } 226 } else { 227 hi[p] = insert(hi[p], key, start, val); 228 } 229 return p; 230 } 231 232 /** 233 * Compares 2 null terminated char arrays 234 */ 235 public static int strcmp(char[] a, int startA, char[] b, int startB) { 236 for (; a[startA] == b[startB]; startA++, startB++) { 237 if (a[startA] == 0) { 238 return 0; 239 } 240 } 241 return a[startA] - b[startB]; 242 } 243 244 /** 245 * Compares a string with null terminated char array 246 */ 247 public static int strcmp(String str, char[] a, int start) { 248 int i, d, len = str.length(); 249 for (i = 0; i < len; i++) { 250 d = str.charAt(i) - a[start + i]; 251 if (d != 0) { 252 return d; 253 } 254 if (a[start + i] == 0) { 255 return d; 256 } 257 } 258 if (a[start + i] != 0) { 259 return -a[start + i]; 260 } 261 return 0; 262 263 } 264 265 public static void strcpy(char[] dst, int di, char[] src, int si) { 266 while (src[si] != 0) { 267 dst[di++] = src[si++]; 268 } 269 dst[di] = 0; 270 } 271 272 public static int strlen(char[] a, int start) { 273 int len = 0; 274 for (int i = start; i < a.length && a[i] != 0; i++) { 275 len++; 276 } 277 return len; 278 } 279 280 public static int strlen(char[] a) { 281 return strlen(a, 0); 282 } 283 284 public int find(String key) { 285 int len = key.length(); 286 char strkey[] = new char[len + 1]; 287 key.getChars(0, len, strkey, 0); 288 strkey[len] = 0; 289 290 return find(strkey, 0); 291 } 292 293 public int find(char[] key, int start) { 294 int d; 295 char p = root; 296 int i = start; 297 char c; 298 299 while (p != 0) { 300 if (sc[p] == 0xFFFF) { 301 if (strcmp(key, i, kv.getArray(), lo[p]) == 0) { 302 return eq[p]; 303 } else { 304 return -1; 305 } 306 } 307 c = key[i]; 308 d = c - sc[p]; 309 if (d == 0) { 310 if (c == 0) { 311 return eq[p]; 312 } 313 i++; 314 p = eq[p]; 315 } else if (d < 0) { 316 p = lo[p]; 317 } else { 318 p = hi[p]; 319 } 320 } 321 return -1; 322 } 323 324 public boolean knows(String key) { 325 return find(key) >= 0; 326 } 327 328 // redimension the arrays 329 private void redimNodeArrays(int newsize) { 330 int len = newsize < lo.length ? newsize : lo.length; 331 char[] na = new char[newsize]; 332 System.arraycopy(lo, 0, na, 0, len); 333 lo = na; 334 na = new char[newsize]; 335 System.arraycopy(hi, 0, na, 0, len); 336 hi = na; 337 na = new char[newsize]; 338 System.arraycopy(eq, 0, na, 0, len); 339 eq = na; 340 na = new char[newsize]; 341 System.arraycopy(sc, 0, na, 0, len); 342 sc = na; 343 } 344 345 public int size() { 346 return length; 347 } 348 349 @Override 350 public Object clone() { 351 TernaryTree t = new TernaryTree(); 352 t.lo = this.lo.clone(); 353 t.hi = this.hi.clone(); 354 t.eq = this.eq.clone(); 355 t.sc = this.sc.clone(); 356 t.kv = (CharVector)this.kv.clone(); 357 t.root = this.root; 358 t.freenode = this.freenode; 359 t.length = this.length; 360 361 return t; 362 } 363 364 /** 365 * Recursively insert the median first and then the median of the 366 * lower and upper halves, and so on in order to get a balanced 367 * tree. The array of keys is assumed to be sorted in ascending 368 * order. 369 */ 370 protected void insertBalanced(String[] k, char[] v, int offset, int n) { 371 int m; 372 if (n < 1) { 373 return; 374 } 375 m = n >> 1; 376 377 insert(k[m + offset], v[m + offset]); 378 insertBalanced(k, v, offset, m); 379 380 insertBalanced(k, v, offset + m + 1, n - m - 1); 381 } 382 383 384 /** 385 * Balance the tree for best search performance 386 */ 387 public void balance() { 388 // System.out.print("Before root splitchar = "); System.out.println(sc[root]); 389 390 int i = 0, n = length; 391 String[] k = new String[n]; 392 char[] v = new char[n]; 393 Iterator iter = new Iterator(); 394 while (iter.hasMoreElements()) { 395 v[i] = iter.getValue(); 396 k[i++] = iter.nextElement(); 397 } 398 init(); 399 insertBalanced(k, v, 0, n); 400 401 // With uniform letter distribution sc[root] should be around 'm' 402 // System.out.print("After root splitchar = "); System.out.println(sc[root]); 403 } 404 405 /** 406 * Each node stores a character (splitchar) which is part of 407 * some key(s). In a compressed branch (one that only contain 408 * a single string key) the trailer of the key which is not 409 * already in nodes is stored externally in the kv array. 410 * As items are inserted, key substrings decrease. 411 * Some substrings may completely disappear when the whole 412 * branch is totally decompressed. 413 * The tree is traversed to find the key substrings actually 414 * used. In addition, duplicate substrings are removed using 415 * a map (implemented with a TernaryTree!). 416 * 417 */ 418 public void trimToSize() { 419 // first balance the tree for best performance 420 balance(); 421 422 // redimension the node arrays 423 redimNodeArrays(freenode); 424 425 // ok, compact kv array 426 CharVector kx = new CharVector(); 427 kx.alloc(1); 428 TernaryTree map = new TernaryTree(); 429 compact(kx, map, root); 430 kv = kx; 431 kv.trimToSize(); 432 } 433 434 private void compact(CharVector kx, TernaryTree map, char p) { 435 int k; 436 if (p == 0) { 437 return; 438 } 439 if (sc[p] == 0xFFFF) { 440 k = map.find(kv.getArray(), lo[p]); 441 if (k < 0) { 442 k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1); 443 strcpy(kx.getArray(), k, kv.getArray(), lo[p]); 444 map.insert(kx.getArray(), k, (char)k); 445 } 446 lo[p] = (char)k; 447 } else { 448 compact(kx, map, lo[p]); 449 if (sc[p] != 0) { 450 compact(kx, map, eq[p]); 451 } 452 compact(kx, map, hi[p]); 453 } 454 } 455 456 457 public Enumeration<String> keys() { 458 return new Iterator(); 459 } 460 461 public class Iterator implements Enumeration<String> { 462 463 /** 464 * current node index 465 */ 466 int cur; 467 468 /** 469 * current key 470 */ 471 String curkey; 472 473 private class Item implements Cloneable { 474 char parent; 475 char child; 476 477 public Item() { 478 parent = 0; 479 child = 0; 480 } 481 482 public Item(char p, char c) { 483 parent = p; 484 child = c; 485 } 486 487 @Override 488 public Item clone() { 489 return new Item(parent, child); 490 } 491 492 } 493 494 /** 495 * Node stack 496 */ 497 Stack<Item> ns; 498 499 /** 500 * key stack implemented with a StringBuffer 501 */ 502 StringBuffer ks; 503 504 public Iterator() { 505 cur = -1; 506 ns = new Stack<Item>(); 507 ks = new StringBuffer(); 508 rewind(); 509 } 510 511 public void rewind() { 512 ns.removeAllElements(); 513 ks.setLength(0); 514 cur = root; 515 run(); 516 } 517 518 public String nextElement() { 519 String res = curkey; 520 cur = up(); 521 run(); 522 return res; 523 } 524 525 public char getValue() { 526 if (cur >= 0) { 527 return eq[cur]; 528 } 529 return 0; 530 } 531 532 public boolean hasMoreElements() { 533 return cur != -1; 534 } 535 536 /** 537 * traverse upwards 538 */ 539 private int up() { 540 Item i = new Item(); 541 int res = 0; 542 543 if (ns.empty()) { 544 return -1; 545 } 546 547 if (cur != 0 && sc[cur] == 0) { 548 return lo[cur]; 549 } 550 551 boolean climb = true; 552 553 while (climb) { 554 i = ns.pop(); 555 i.child++; 556 switch (i.child) { 557 case 1: 558 if (sc[i.parent] != 0) { 559 res = eq[i.parent]; 560 ns.push(i.clone()); 561 ks.append(sc[i.parent]); 562 } else { 563 i.child++; 564 ns.push(i.clone()); 565 res = hi[i.parent]; 566 } 567 climb = false; 568 break; 569 570 case 2: 571 res = hi[i.parent]; 572 ns.push(i.clone()); 573 if (ks.length() > 0) { 574 ks.setLength(ks.length() - 1); // pop 575 } 576 climb = false; 577 break; 578 579 default: 580 if (ns.empty()) { 581 return -1; 582 } 583 climb = true; 584 break; 585 } 586 } 587 return res; 588 } 589 590 /** 591 * traverse the tree to find next key 592 */ 593 private int run() { 594 if (cur == -1) { 595 return -1; 596 } 597 598 boolean leaf = false; 599 while (true) { 600 // first go down on low branch until leaf or compressed branch 601 while (cur != 0) { 602 if (sc[cur] == 0xFFFF) { 603 leaf = true; 604 break; 605 } 606 ns.push(new Item((char)cur, '\u0000')); 607 if (sc[cur] == 0) { 608 leaf = true; 609 break; 610 } 611 cur = lo[cur]; 612 } 613 if (leaf) { 614 break; 615 } 616 // nothing found, go up one node and try again 617 cur = up(); 618 if (cur == -1) { 619 return -1; 620 } 621 } 622 // The current node should be a data node and 623 // the key should be in the key stack (at least partially) 624 StringBuffer buf = new StringBuffer(ks.toString()); 625 if (sc[cur] == 0xFFFF) { 626 int p = lo[cur]; 627 while (kv.get(p) != 0) { 628 buf.append(kv.get(p++)); 629 } 630 } 631 curkey = buf.toString(); 632 return 0; 633 } 634 635 } 636 637 public void printStats() { 638 System.out.println("Number of keys = " + Integer.toString(length)); 639 System.out.println("Node count = " + Integer.toString(freenode)); 640 // System.out.println("Array length = " + Integer.toString(eq.length)); 641 System.out.println("Key Array length = " 642 + Integer.toString(kv.length())); 643 644 /* 645 * for(int i=0; i<kv.length(); i++) 646 * if ( kv.get(i) != 0 ) 647 * System.out.print(kv.get(i)); 648 * else 649 * System.out.println(""); 650 * System.out.println("Keys:"); 651 * for(Enumeration enum = keys(); enum.hasMoreElements(); ) 652 * System.out.println(enum.nextElement()); 653 */ 654 655 } 656 657/* public static void main(String[] args) throws Exception { 658 TernaryTree tt = new TernaryTree(); 659 tt.insert("Carlos", 'C'); 660 tt.insert("Car", 'r'); 661 tt.insert("palos", 'l'); 662 tt.insert("pa", 'p'); 663 tt.trimToSize(); 664 System.out.println((char)tt.find("Car")); 665 System.out.println((char)tt.find("Carlos")); 666 System.out.println((char)tt.find("alto")); 667 tt.printStats(); 668 }*/ 669 670} 671