001/*
002 * Copyright 1999-2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.itextpdf.text.pdf.hyphenation;
018
019import java.io.Serializable;
020import java.util.Enumeration;
021import java.util.Stack;
022
023/**
024 * <h2>Ternary Search Tree.</h2>
025 *
026 * <p>A ternary search tree is a hybrid between a binary tree and
027 * a digital search tree (trie). Keys are limited to strings.
028 * A data value of type char is stored in each leaf node.
029 * It can be used as an index (or pointer) to the data.
030 * Branches that only contain one key are compressed to one node
031 * by storing a pointer to the trailer substring of the key.
032 * This class is intended to serve as base class or helper class
033 * to implement Dictionary collections or the like. Ternary trees
034 * have some nice properties as the following: the tree can be
035 * traversed in sorted order, partial matches (wildcard) can be
036 * implemented, retrieval of all keys within a given distance
037 * from the target, etc. The storage requirements are higher than
038 * a binary tree but a lot less than a trie. Performance is
039 * comparable with a hash table, sometimes it outperforms a hash
040 * function (most of the time can determine a miss faster than a hash).</p>
041 *
042 * <p>The main purpose of this java port is to serve as a base for
043 * implementing TeX's hyphenation algorithm (see The TeXBook,
044 * appendix H). Each language requires from 5000 to 15000 hyphenation
045 * patterns which will be keys in this tree. The strings patterns
046 * are usually small (from 2 to 5 characters), but each char in the
047 * tree is stored in a node. Thus memory usage is the main concern.
048 * We will sacrifice 'elegance' to keep memory requirements to the
049 * minimum. Using java's char type as pointer (yes, I know pointer
050 * it is a forbidden word in java) we can keep the size of the node
051 * to be just 8 bytes (3 pointers and the data char). This gives
052 * room for about 65000 nodes. In my tests the English patterns
053 * took 7694 nodes and the German patterns 10055 nodes,
054 * so I think we are safe.</p>
055 *
056 * <p>All said, this is a map with strings as keys and char as value.
057 * Pretty limited!. It can be extended to a general map by
058 * using the string representation of an object and using the
059 * char value as an index to an array that contains the object
060 * values.</p>
061 *
062 * @author cav@uniscope.co.jp
063 */
064
065public class TernaryTree implements Cloneable, Serializable {
066
067    /**
068     * We use 4 arrays to represent a node. I guess I should have created
069     * a proper node class, but somehow Knuth's pascal code made me forget
070     * we now have a portable language with virtual memory management and
071     * automatic garbage collection! And now is kind of late, furthermore,
072     * if it ain't broken, don't fix it.
073     */
074
075    private static final long serialVersionUID = 5313366505322983510L;
076
077        /**
078     * Pointer to low branch and to rest of the key when it is
079     * stored directly in this node, we don't have unions in java!
080     */
081    protected char[] lo;
082
083    /**
084     * Pointer to high branch.
085     */
086    protected char[] hi;
087
088    /**
089     * Pointer to equal branch and to data when this node is a string terminator.
090     */
091    protected char[] eq;
092
093    /**
094     * <P>The character stored in this node: splitchar.
095     * Two special values are reserved:</P>
096     * <ul><li>0x0000 as string terminator</li>
097     * <li>0xFFFF to indicate that the branch starting at
098     * this node is compressed</li></ul>
099     * <p>This shouldn't be a problem if we give the usual semantics to
100     * strings since 0xFFFF is guaranteed not to be an Unicode character.</p>
101     */
102    protected char[] sc;
103
104    /**
105     * This vector holds the trailing of the keys when the branch is compressed.
106     */
107    protected CharVector kv;
108
109    protected char root;
110    protected char freenode;
111    protected int length;    // number of items in tree
112
113    protected static final int BLOCK_SIZE = 2048;    // allocation size for arrays
114
115    TernaryTree() {
116        init();
117    }
118
119    protected void init() {
120        root = 0;
121        freenode = 1;
122        length = 0;
123        lo = new char[BLOCK_SIZE];
124        hi = new char[BLOCK_SIZE];
125        eq = new char[BLOCK_SIZE];
126        sc = new char[BLOCK_SIZE];
127        kv = new CharVector();
128    }
129
130    /**
131     * Branches are initially compressed, needing
132     * one node per key plus the size of the string
133     * key. They are decompressed as needed when
134     * another key with same prefix
135     * is inserted. This saves a lot of space,
136     * specially for long keys.
137     */
138    public void insert(String key, char val) {
139        // make sure we have enough room in the arrays
140        int len = key.length()
141                  + 1;    // maximum number of nodes that may be generated
142        if (freenode + len > eq.length) {
143            redimNodeArrays(eq.length + BLOCK_SIZE);
144        }
145        char strkey[] = new char[len--];
146        key.getChars(0, len, strkey, 0);
147        strkey[len] = 0;
148        root = insert(root, strkey, 0, val);
149    }
150
151    public void insert(char[] key, int start, char val) {
152        int len = strlen(key) + 1;
153        if (freenode + len > eq.length) {
154            redimNodeArrays(eq.length + BLOCK_SIZE);
155        }
156        root = insert(root, key, start, val);
157    }
158
159    /**
160     * The actual insertion function, recursive version.
161     */
162    private char insert(char p, char[] key, int start, char val) {
163        int len = strlen(key, start);
164        if (p == 0) {
165            // this means there is no branch, this node will start a new branch.
166            // Instead of doing that, we store the key somewhere else and create
167            // only one node with a pointer to the key
168            p = freenode++;
169            eq[p] = val;           // holds data
170            length++;
171            hi[p] = 0;
172            if (len > 0) {
173                sc[p] = 0xFFFF;    // indicates branch is compressed
174                lo[p] = (char)kv.alloc(len
175                                       + 1);    // use 'lo' to hold pointer to key
176                strcpy(kv.getArray(), lo[p], key, start);
177            } else {
178                sc[p] = 0;
179                lo[p] = 0;
180            }
181            return p;
182        }
183
184        if (sc[p] == 0xFFFF) {
185            // branch is compressed: need to decompress
186            // this will generate garbage in the external key array
187            // but we can do some garbage collection later
188            char pp = freenode++;
189            lo[pp] = lo[p];    // previous pointer to key
190            eq[pp] = eq[p];    // previous pointer to data
191            lo[p] = 0;
192            if (len > 0) {
193                sc[p] = kv.get(lo[pp]);
194                eq[p] = pp;
195                lo[pp]++;
196                if (kv.get(lo[pp]) == 0) {
197                    // key completely decompressed leaving garbage in key array
198                    lo[pp] = 0;
199                    sc[pp] = 0;
200                    hi[pp] = 0;
201                } else {
202                    // we only got first char of key, rest is still there
203                    sc[pp] = 0xFFFF;
204                }
205            } else {
206                // In this case we can save a node by swapping the new node
207                // with the compressed node
208                sc[pp] = 0xFFFF;
209                hi[p] = pp;
210                sc[p] = 0;
211                eq[p] = val;
212                length++;
213                return p;
214            }
215        }
216        char s = key[start];
217        if (s < sc[p]) {
218            lo[p] = insert(lo[p], key, start, val);
219        } else if (s == sc[p]) {
220            if (s != 0) {
221                eq[p] = insert(eq[p], key, start + 1, val);
222            } else {
223                // key already in tree, overwrite data
224                eq[p] = val;
225            }
226        } else {
227            hi[p] = insert(hi[p], key, start, val);
228        }
229        return p;
230    }
231
232    /**
233     * Compares 2 null terminated char arrays
234     */
235    public static int strcmp(char[] a, int startA, char[] b, int startB) {
236        for (; a[startA] == b[startB]; startA++, startB++) {
237            if (a[startA] == 0) {
238                return 0;
239            }
240        }
241        return a[startA] - b[startB];
242    }
243
244    /**
245     * Compares a string with null terminated char array
246     */
247    public static int strcmp(String str, char[] a, int start) {
248        int i, d, len = str.length();
249        for (i = 0; i < len; i++) {
250            d = str.charAt(i) - a[start + i];
251            if (d != 0) {
252                return d;
253            }
254            if (a[start + i] == 0) {
255                return d;
256        }
257        }
258        if (a[start + i] != 0) {
259            return -a[start + i];
260        }
261        return 0;
262
263    }
264
265    public static void strcpy(char[] dst, int di, char[] src, int si) {
266        while (src[si] != 0) {
267            dst[di++] = src[si++];
268        }
269        dst[di] = 0;
270    }
271
272    public static int strlen(char[] a, int start) {
273        int len = 0;
274        for (int i = start; i < a.length && a[i] != 0; i++) {
275            len++;
276        }
277        return len;
278    }
279
280    public static int strlen(char[] a) {
281        return strlen(a, 0);
282    }
283
284    public int find(String key) {
285        int len = key.length();
286        char strkey[] = new char[len + 1];
287        key.getChars(0, len, strkey, 0);
288        strkey[len] = 0;
289
290        return find(strkey, 0);
291    }
292
293    public int find(char[] key, int start) {
294        int d;
295        char p = root;
296        int i = start;
297        char c;
298
299        while (p != 0) {
300            if (sc[p] == 0xFFFF) {
301                if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
302                    return eq[p];
303                } else {
304                    return -1;
305            }
306            }
307            c = key[i];
308            d = c - sc[p];
309            if (d == 0) {
310                if (c == 0) {
311                    return eq[p];
312                }
313                i++;
314                p = eq[p];
315            } else if (d < 0) {
316                p = lo[p];
317            } else {
318                p = hi[p];
319        }
320        }
321        return -1;
322    }
323
324    public boolean knows(String key) {
325        return find(key) >= 0;
326    }
327
328    // redimension the arrays
329    private void redimNodeArrays(int newsize) {
330        int len = newsize < lo.length ? newsize : lo.length;
331        char[] na = new char[newsize];
332        System.arraycopy(lo, 0, na, 0, len);
333        lo = na;
334        na = new char[newsize];
335        System.arraycopy(hi, 0, na, 0, len);
336        hi = na;
337        na = new char[newsize];
338        System.arraycopy(eq, 0, na, 0, len);
339        eq = na;
340        na = new char[newsize];
341        System.arraycopy(sc, 0, na, 0, len);
342        sc = na;
343    }
344
345    public int size() {
346        return length;
347    }
348
349    @Override
350    public Object clone() {
351        TernaryTree t = new TernaryTree();
352        t.lo = this.lo.clone();
353        t.hi = this.hi.clone();
354        t.eq = this.eq.clone();
355        t.sc = this.sc.clone();
356        t.kv = (CharVector)this.kv.clone();
357        t.root = this.root;
358        t.freenode = this.freenode;
359        t.length = this.length;
360
361        return t;
362    }
363
364    /**
365     * Recursively insert the median first and then the median of the
366     * lower and upper halves, and so on in order to get a balanced
367     * tree. The array of keys is assumed to be sorted in ascending
368     * order.
369     */
370    protected void insertBalanced(String[] k, char[] v, int offset, int n) {
371        int m;
372        if (n < 1) {
373            return;
374        }
375        m = n >> 1;
376
377        insert(k[m + offset], v[m + offset]);
378        insertBalanced(k, v, offset, m);
379
380        insertBalanced(k, v, offset + m + 1, n - m - 1);
381    }
382
383
384    /**
385     * Balance the tree for best search performance
386     */
387    public void balance() {
388        // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
389
390        int i = 0, n = length;
391        String[] k = new String[n];
392        char[] v = new char[n];
393        Iterator iter = new Iterator();
394        while (iter.hasMoreElements()) {
395            v[i] = iter.getValue();
396            k[i++] = iter.nextElement();
397        }
398        init();
399        insertBalanced(k, v, 0, n);
400
401        // With uniform letter distribution sc[root] should be around 'm'
402        // System.out.print("After root splitchar = "); System.out.println(sc[root]);
403    }
404
405    /**
406     * Each node stores a character (splitchar) which is part of
407     * some key(s). In a compressed branch (one that only contain
408     * a single string key) the trailer of the key which is not
409     * already in nodes is stored  externally in the kv array.
410     * As items are inserted, key substrings decrease.
411     * Some substrings may completely  disappear when the whole
412     * branch is totally decompressed.
413     * The tree is traversed to find the key substrings actually
414     * used. In addition, duplicate substrings are removed using
415     * a map (implemented with a TernaryTree!).
416     *
417     */
418    public void trimToSize() {
419        // first balance the tree for best performance
420        balance();
421
422        // redimension the node arrays
423        redimNodeArrays(freenode);
424
425        // ok, compact kv array
426        CharVector kx = new CharVector();
427        kx.alloc(1);
428        TernaryTree map = new TernaryTree();
429        compact(kx, map, root);
430        kv = kx;
431        kv.trimToSize();
432    }
433
434    private void compact(CharVector kx, TernaryTree map, char p) {
435        int k;
436        if (p == 0) {
437            return;
438        }
439        if (sc[p] == 0xFFFF) {
440            k = map.find(kv.getArray(), lo[p]);
441            if (k < 0) {
442                k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
443                strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
444                map.insert(kx.getArray(), k, (char)k);
445            }
446            lo[p] = (char)k;
447        } else {
448            compact(kx, map, lo[p]);
449            if (sc[p] != 0) {
450                compact(kx, map, eq[p]);
451            }
452            compact(kx, map, hi[p]);
453        }
454    }
455
456
457    public Enumeration<String> keys() {
458        return new Iterator();
459    }
460
461    public class Iterator implements Enumeration<String> {
462
463        /**
464         * current node index
465         */
466        int cur;
467
468        /**
469         * current key
470         */
471        String curkey;
472
473        private class Item implements Cloneable {
474            char parent;
475            char child;
476
477            public Item() {
478                parent = 0;
479                child = 0;
480            }
481
482            public Item(char p, char c) {
483                parent = p;
484                child = c;
485            }
486
487            @Override
488            public Item clone() {
489                return new Item(parent, child);
490            }
491
492        }
493
494        /**
495         * Node stack
496         */
497        Stack<Item> ns;
498
499        /**
500         * key stack implemented with a StringBuffer
501         */
502        StringBuffer ks;
503
504        public Iterator() {
505            cur = -1;
506            ns = new Stack<Item>();
507            ks = new StringBuffer();
508            rewind();
509        }
510
511        public void rewind() {
512            ns.removeAllElements();
513            ks.setLength(0);
514            cur = root;
515            run();
516        }
517
518        public String nextElement() {
519            String res = curkey;
520            cur = up();
521            run();
522            return res;
523        }
524
525        public char getValue() {
526            if (cur >= 0) {
527                return eq[cur];
528            }
529            return 0;
530        }
531
532        public boolean hasMoreElements() {
533            return cur != -1;
534        }
535
536        /**
537         * traverse upwards
538         */
539        private int up() {
540            Item i = new Item();
541            int res = 0;
542
543            if (ns.empty()) {
544                return -1;
545            }
546
547            if (cur != 0 && sc[cur] == 0) {
548                return lo[cur];
549            }
550
551            boolean climb = true;
552
553            while (climb) {
554                i = ns.pop();
555                i.child++;
556                switch (i.child) {
557                case 1:
558                    if (sc[i.parent] != 0) {
559                        res = eq[i.parent];
560                        ns.push(i.clone());
561                        ks.append(sc[i.parent]);
562                    } else {
563                        i.child++;
564                        ns.push(i.clone());
565                        res = hi[i.parent];
566                    }
567                    climb = false;
568                    break;
569
570                case 2:
571                    res = hi[i.parent];
572                    ns.push(i.clone());
573                    if (ks.length() > 0) {
574                        ks.setLength(ks.length() - 1);    // pop
575                    }
576                    climb = false;
577                    break;
578
579                default:
580                    if (ns.empty()) {
581                        return -1;
582                    }
583                    climb = true;
584                    break;
585                }
586            }
587            return res;
588        }
589
590        /**
591         * traverse the tree to find next key
592         */
593        private int run() {
594            if (cur == -1) {
595                return -1;
596            }
597
598            boolean leaf = false;
599            while (true) {
600                // first go down on low branch until leaf or compressed branch
601                while (cur != 0) {
602                    if (sc[cur] == 0xFFFF) {
603                        leaf = true;
604                        break;
605                    }
606                    ns.push(new Item((char)cur, '\u0000'));
607                    if (sc[cur] == 0) {
608                        leaf = true;
609                        break;
610                    }
611                    cur = lo[cur];
612                }
613                if (leaf) {
614                    break;
615                }
616                    // nothing found, go up one node and try again
617                cur = up();
618                if (cur == -1) {
619                    return -1;
620                }
621            }
622            // The current node should be a data node and
623            // the key should be in the key stack (at least partially)
624            StringBuffer buf = new StringBuffer(ks.toString());
625            if (sc[cur] == 0xFFFF) {
626                int p = lo[cur];
627                while (kv.get(p) != 0) {
628                    buf.append(kv.get(p++));
629            }
630            }
631            curkey = buf.toString();
632            return 0;
633        }
634
635    }
636
637    public void printStats() {
638        System.out.println("Number of keys = " + Integer.toString(length));
639        System.out.println("Node count = " + Integer.toString(freenode));
640        // System.out.println("Array length = " + Integer.toString(eq.length));
641        System.out.println("Key Array length = "
642                           + Integer.toString(kv.length()));
643
644        /*
645         * for(int i=0; i<kv.length(); i++)
646         * if ( kv.get(i) != 0 )
647         * System.out.print(kv.get(i));
648         * else
649         * System.out.println("");
650         * System.out.println("Keys:");
651         * for(Enumeration enum = keys(); enum.hasMoreElements(); )
652         * System.out.println(enum.nextElement());
653         */
654
655    }
656
657/*    public static void main(String[] args) throws Exception {
658        TernaryTree tt = new TernaryTree();
659        tt.insert("Carlos", 'C');
660        tt.insert("Car", 'r');
661        tt.insert("palos", 'l');
662        tt.insert("pa", 'p');
663        tt.trimToSize();
664        System.out.println((char)tt.find("Car"));
665        System.out.println((char)tt.find("Carlos"));
666        System.out.println((char)tt.find("alto"));
667        tt.printStats();
668    }*/
669
670}
671