001/* 002 * Copyright 1999-2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017/* $Id: HyphenationTree.java 4749 2011-02-14 11:25:32Z blowagie $ */ 018 019package com.itextpdf.text.pdf.hyphenation; 020 021import java.io.InputStream; 022import java.util.ArrayList; 023import java.util.HashMap; 024 025/** 026 * This tree structure stores the hyphenation patterns in an efficient 027 * way for fast lookup. It provides the provides the method to 028 * hyphenate a word. 029 * 030 * @author Carlos Villegas <cav@uniscope.co.jp> 031 */ 032public class HyphenationTree extends TernaryTree 033 implements PatternConsumer { 034 035 private static final long serialVersionUID = -7763254239309429432L; 036 037 /** 038 * value space: stores the interletter values 039 */ 040 protected ByteVector vspace; 041 042 /** 043 * This map stores hyphenation exceptions 044 */ 045 protected HashMap<String, ArrayList<Object>> stoplist; 046 047 /** 048 * This map stores the character classes 049 */ 050 protected TernaryTree classmap; 051 052 /** 053 * Temporary map to store interletter values on pattern loading. 054 */ 055 private transient TernaryTree ivalues; 056 057 public HyphenationTree() { 058 stoplist = new HashMap<String, ArrayList<Object>>(23); // usually a small table 059 classmap = new TernaryTree(); 060 vspace = new ByteVector(); 061 vspace.alloc(1); // this reserves index 0, which we don't use 062 } 063 064 /** 065 * Packs the values by storing them in 4 bits, two values into a byte 066 * Values range is from 0 to 9. We use zero as terminator, 067 * so we'll add 1 to the value. 068 * @param values a string of digits from '0' to '9' representing the 069 * interletter values. 070 * @return the index into the vspace array where the packed values 071 * are stored. 072 */ 073 protected int packValues(String values) { 074 int i, n = values.length(); 075 int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; 076 int offset = vspace.alloc(m); 077 byte[] va = vspace.getArray(); 078 for (i = 0; i < n; i++) { 079 int j = i >> 1; 080 byte v = (byte)(values.charAt(i) - '0' + 1 & 0x0f); 081 if ((i & 1) == 1) { 082 va[j + offset] = (byte)(va[j + offset] | v); 083 } else { 084 va[j + offset] = (byte)(v << 4); // big endian 085 } 086 } 087 va[m - 1 + offset] = 0; // terminator 088 return offset; 089 } 090 091 protected String unpackValues(int k) { 092 StringBuffer buf = new StringBuffer(); 093 byte v = vspace.get(k++); 094 while (v != 0) { 095 char c = (char)((v >>> 4) - 1 + '0'); 096 buf.append(c); 097 c = (char)(v & 0x0f); 098 if (c == 0) { 099 break; 100 } 101 c = (char)(c - 1 + '0'); 102 buf.append(c); 103 v = vspace.get(k++); 104 } 105 return buf.toString(); 106 } 107 108 public void loadSimplePatterns(InputStream stream) { 109 SimplePatternParser pp = new SimplePatternParser(); 110 ivalues = new TernaryTree(); 111 112 pp.parse(stream, this); 113 114 // patterns/values should be now in the tree 115 // let's optimize a bit 116 trimToSize(); 117 vspace.trimToSize(); 118 classmap.trimToSize(); 119 120 // get rid of the auxiliary map 121 ivalues = null; 122 } 123 124 125 public String findPattern(String pat) { 126 int k = super.find(pat); 127 if (k >= 0) { 128 return unpackValues(k); 129 } 130 return ""; 131 } 132 133 /** 134 * String compare, returns 0 if equal or 135 * t is a substring of s 136 */ 137 protected int hstrcmp(char[] s, int si, char[] t, int ti) { 138 for (; s[si] == t[ti]; si++, ti++) { 139 if (s[si] == 0) { 140 return 0; 141 } 142 } 143 if (t[ti] == 0) { 144 return 0; 145 } 146 return s[si] - t[ti]; 147 } 148 149 protected byte[] getValues(int k) { 150 StringBuffer buf = new StringBuffer(); 151 byte v = vspace.get(k++); 152 while (v != 0) { 153 char c = (char)((v >>> 4) - 1); 154 buf.append(c); 155 c = (char)(v & 0x0f); 156 if (c == 0) { 157 break; 158 } 159 c = (char)(c - 1); 160 buf.append(c); 161 v = vspace.get(k++); 162 } 163 byte[] res = new byte[buf.length()]; 164 for (int i = 0; i < res.length; i++) { 165 res[i] = (byte)buf.charAt(i); 166 } 167 return res; 168 } 169 170 /** 171 * <p>Search for all possible partial matches of word starting 172 * at index an update interletter values. In other words, it 173 * does something like:</p> 174 * <code> 175 * for(i=0; i < patterns.length; i++) { 176 * if ( word.substring(index).startsWidth(patterns[i]) ) 177 * update_interletter_values(patterns[i]); 178 * } 179 * </code> 180 * <p>But it is done in an efficient way since the patterns are 181 * stored in a ternary tree. In fact, this is the whole purpose 182 * of having the tree: doing this search without having to test 183 * every single pattern. The number of patterns for languages 184 * such as English range from 4000 to 10000. Thus, doing thousands 185 * of string comparisons for each word to hyphenate would be 186 * really slow without the tree. The tradeoff is memory, but 187 * using a ternary tree instead of a trie, almost halves the 188 * the memory used by Lout or TeX. It's also faster than using 189 * a hash table</p> 190 * @param word null terminated word to match 191 * @param index start index from word 192 * @param il interletter values array to update 193 */ 194 protected void searchPatterns(char[] word, int index, byte[] il) { 195 byte[] values; 196 int i = index; 197 char p, q; 198 char sp = word[i]; 199 p = root; 200 201 while (p > 0 && p < sc.length) { 202 if (sc[p] == 0xFFFF) { 203 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { 204 values = getValues(eq[p]); // data pointer is in eq[] 205 int j = index; 206 for (byte value : values) { 207 if (j < il.length && value > il[j]) { 208 il[j] = value; 209 } 210 j++; 211 } 212 } 213 return; 214 } 215 int d = sp - sc[p]; 216 if (d == 0) { 217 if (sp == 0) { 218 break; 219 } 220 sp = word[++i]; 221 p = eq[p]; 222 q = p; 223 224 // look for a pattern ending at this position by searching for 225 // the null char ( splitchar == 0 ) 226 while (q > 0 && q < sc.length) { 227 if (sc[q] == 0xFFFF) { // stop at compressed branch 228 break; 229 } 230 if (sc[q] == 0) { 231 values = getValues(eq[q]); 232 int j = index; 233 for (byte value : values) { 234 if (j < il.length && value > il[j]) { 235 il[j] = value; 236 } 237 j++; 238 } 239 break; 240 } else { 241 q = lo[q]; 242 243 /** 244 * actually the code should be: 245 * q = sc[q] < 0 ? hi[q] : lo[q]; 246 * but java chars are unsigned 247 */ 248 } 249 } 250 } else { 251 p = d < 0 ? lo[p] : hi[p]; 252 } 253 } 254 } 255 256 /** 257 * Hyphenate word and return a Hyphenation object. 258 * @param word the word to be hyphenated 259 * @param remainCharCount Minimum number of characters allowed 260 * before the hyphenation point. 261 * @param pushCharCount Minimum number of characters allowed after 262 * the hyphenation point. 263 * @return a {@link Hyphenation Hyphenation} object representing 264 * the hyphenated word or null if word is not hyphenated. 265 */ 266 public Hyphenation hyphenate(String word, int remainCharCount, 267 int pushCharCount) { 268 char[] w = word.toCharArray(); 269 return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); 270 } 271 272 /** 273 * w = "****nnllllllnnn*****", 274 * where n is a non-letter, l is a letter, 275 * all n may be absent, the first n is at offset, 276 * the first l is at offset + iIgnoreAtBeginning; 277 * word = ".llllll.'\0'***", 278 * where all l in w are copied into word. 279 * In the first part of the routine len = w.length, 280 * in the second part of the routine len = word.length. 281 * Three indices are used: 282 * index(w), the index in w, 283 * index(word), the index in word, 284 * letterindex(word), the index in the letter part of word. 285 * The following relations exist: 286 * index(w) = offset + i - 1 287 * index(word) = i - iIgnoreAtBeginning 288 * letterindex(word) = index(word) - 1 289 * (see first loop). 290 * It follows that: 291 * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning 292 * index(w) = letterindex(word) + offset + iIgnoreAtBeginning 293 */ 294 295 /** 296 * Hyphenate word and return an array of hyphenation points. 297 * @param w char array that contains the word 298 * @param offset Offset to first character in word 299 * @param len Length of word 300 * @param remainCharCount Minimum number of characters allowed 301 * before the hyphenation point. 302 * @param pushCharCount Minimum number of characters allowed after 303 * the hyphenation point. 304 * @return a {@link Hyphenation Hyphenation} object representing 305 * the hyphenated word or null if word is not hyphenated. 306 */ 307 public Hyphenation hyphenate(char[] w, int offset, int len, 308 int remainCharCount, int pushCharCount) { 309 int i; 310 char[] word = new char[len + 3]; 311 312 // normalize word 313 char[] c = new char[2]; 314 int iIgnoreAtBeginning = 0; 315 int iLength = len; 316 boolean bEndOfLetters = false; 317 for (i = 1; i <= len; i++) { 318 c[0] = w[offset + i - 1]; 319 int nc = classmap.find(c, 0); 320 if (nc < 0) { // found a non-letter character ... 321 if (i == 1 + iIgnoreAtBeginning) { 322 // ... before any letter character 323 iIgnoreAtBeginning ++; 324 } else { 325 // ... after a letter character 326 bEndOfLetters = true; 327 } 328 iLength --; 329 } else { 330 if (!bEndOfLetters) { 331 word[i - iIgnoreAtBeginning] = (char)nc; 332 } else { 333 return null; 334 } 335 } 336 } 337 len = iLength; 338 if (len < remainCharCount + pushCharCount) { 339 // word is too short to be hyphenated 340 return null; 341 } 342 int[] result = new int[len + 1]; 343 int k = 0; 344 345 // check exception list first 346 String sw = new String(word, 1, len); 347 if (stoplist.containsKey(sw)) { 348 // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null) 349 ArrayList<Object> hw = stoplist.get(sw); 350 int j = 0; 351 for (i = 0; i < hw.size(); i++) { 352 Object o = hw.get(i); 353 // j = index(sw) = letterindex(word)? 354 // result[k] = corresponding index(w) 355 if (o instanceof String) { 356 j += ((String)o).length(); 357 if (j >= remainCharCount && j < len - pushCharCount) { 358 result[k++] = j + iIgnoreAtBeginning; 359 } 360 } 361 } 362 } else { 363 // use algorithm to get hyphenation points 364 word[0] = '.'; // word start marker 365 word[len + 1] = '.'; // word end marker 366 word[len + 2] = 0; // null terminated 367 byte[] il = new byte[len + 3]; // initialized to zero 368 for (i = 0; i < len + 1; i++) { 369 searchPatterns(word, i, il); 370 } 371 372 // hyphenation points are located where interletter value is odd 373 // i is letterindex(word), 374 // i + 1 is index(word), 375 // result[k] = corresponding index(w) 376 for (i = 0; i < len; i++) { 377 if ((il[i + 1] & 1) == 1 && i >= remainCharCount 378 && i <= len - pushCharCount) { 379 result[k++] = i + iIgnoreAtBeginning; 380 } 381 } 382 } 383 384 385 if (k > 0) { 386 // trim result array 387 int[] res = new int[k]; 388 System.arraycopy(result, 0, res, 0, k); 389 return new Hyphenation(new String(w, offset, len), res); 390 } else { 391 return null; 392 } 393 } 394 395 /** 396 * Add a character class to the tree. It is used by 397 * {@link SimplePatternParser SimplePatternParser} as callback to 398 * add character classes. Character classes define the 399 * valid word characters for hyphenation. If a word contains 400 * a character not defined in any of the classes, it is not hyphenated. 401 * It also defines a way to normalize the characters in order 402 * to compare them with the stored patterns. Usually pattern 403 * files use only lower case characters, in this case a class 404 * for letter 'a', for example, should be defined as "aA", the first 405 * character being the normalization char. 406 */ 407 public void addClass(String chargroup) { 408 if (chargroup.length() > 0) { 409 char equivChar = chargroup.charAt(0); 410 char[] key = new char[2]; 411 key[1] = 0; 412 for (int i = 0; i < chargroup.length(); i++) { 413 key[0] = chargroup.charAt(i); 414 classmap.insert(key, 0, equivChar); 415 } 416 } 417 } 418 419 /** 420 * Add an exception to the tree. It is used by 421 * {@link SimplePatternParser SimplePatternParser} class as callback to 422 * store the hyphenation exceptions. 423 * @param word normalized word 424 * @param hyphenatedword a vector of alternating strings and 425 * {@link Hyphen hyphen} objects. 426 */ 427 public void addException(String word, ArrayList<Object> hyphenatedword) { 428 stoplist.put(word, hyphenatedword); 429 } 430 431 /** 432 * Add a pattern to the tree. Mainly, to be used by 433 * {@link SimplePatternParser SimplePatternParser} class as callback to 434 * add a pattern to the tree. 435 * @param pattern the hyphenation pattern 436 * @param ivalue interletter weight values indicating the 437 * desirability and priority of hyphenating at a given point 438 * within the pattern. It should contain only digit characters. 439 * (i.e. '0' to '9'). 440 */ 441 public void addPattern(String pattern, String ivalue) { 442 int k = ivalues.find(ivalue); 443 if (k <= 0) { 444 k = packValues(ivalue); 445 ivalues.insert(ivalue, (char)k); 446 } 447 insert(pattern, (char)k); 448 } 449 450 @Override 451 public void printStats() { 452 System.out.println("Value space size = " 453 + Integer.toString(vspace.length())); 454 super.printStats(); 455 } 456}