/*
 * $Id: PRTokeniser.java 4883 2011-05-24 19:17:29Z blowagie $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2011 1T3XT BVBA
 * Authors: Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf;

import java.io.IOException;
import com.itextpdf.text.exceptions.InvalidPdfException;
import com.itextpdf.text.error_messages.MessageLocalization;
/**
 * Low-level tokeniser that reads PDF syntax tokens from a
 * {@link RandomAccessFileOrArray}. After each successful call to
 * {@link #nextToken()} or {@link #nextValidToken()} the token kind is
 * available through {@link #getTokenType()} and its text through
 * {@link #getStringValue()}.
 *
 * @author Paulo Soares
 */
public class PRTokeniser {

    /**
     * Enum representing the possible token types
     * @since 5.0.1
     */
    public enum TokenType {
        NUMBER,
        STRING,
        NAME,
        COMMENT,
        START_ARRAY,
        END_ARRAY,
        START_DIC,
        END_DIC,
        REF,
        OTHER,
        ENDOFFILE
    }

    /**
     * Lookup table indexed by {@code ch + 1}, where {@code ch} is a value in
     * the range -1..255 (so that the end-of-file marker -1 maps to index 0).
     * An entry is {@code true} when the value ends a name/keyword token:
     * end of file, the whitespace values accepted by
     * {@link #isWhitespace(int)} and the characters accepted by
     * {@link #isDelimiter(int)}.
     */
    public static final boolean delims[] = {
        true,  true,  false, false, false, false, false, false, false, false,
        true,  true,  false, true,  true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, true,  false, false, false, false, true,  false,
        false, true,  true,  false, false, false, false, false, true,  false,
        false, false, false, false, false, false, false, false, false, false,
        false, true,  false, true,  false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, true,  false, true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false};

    static final String EMPTY = "";


    protected RandomAccessFileOrArray file;
    protected TokenType type;
    protected String stringValue;
    protected int reference;
    protected int generation;
    protected boolean hexString;

    /**
     * Creates a tokeniser reading from the named file.
     *
     * @param filename name handed to {@link RandomAccessFileOrArray}
     * @throws IOException if the underlying source cannot be opened
     */
    public PRTokeniser(String filename) throws IOException {
        file = new RandomAccessFileOrArray(filename);
    }

    /**
     * Creates a tokeniser reading from an in-memory byte array.
     *
     * @param pdfIn the bytes to tokenise
     */
    public PRTokeniser(byte pdfIn[]) {
        file = new RandomAccessFileOrArray(pdfIn);
    }

    /**
     * Creates a tokeniser reading from an existing source. The source is
     * used directly, not copied.
     *
     * @param file the source to read from
     */
    public PRTokeniser(RandomAccessFileOrArray file) {
        this.file = file;
    }

    /**
     * Moves the read position of the underlying source.
     *
     * @param pos the new position
     * @throws IOException on an I/O error
     */
    public void seek(int pos) throws IOException {
        file.seek(pos);
    }

    /**
     * @return the current read position of the underlying source
     * @throws IOException on an I/O error
     */
    public int getFilePointer() throws IOException {
        return file.getFilePointer();
    }

    /**
     * Closes the underlying source.
     *
     * @throws IOException on an I/O error
     */
    public void close() throws IOException {
        file.close();
    }

    /**
     * @return the length reported by the underlying source
     * @throws IOException on an I/O error
     */
    public int length() throws IOException {
        return file.length();
    }

    /**
     * Reads a single value from the underlying source.
     *
     * @return the value read, or -1 at the end of the source
     * @throws IOException on an I/O error
     */
    public int read() throws IOException {
        return file.read();
    }

    /**
     * @return a new {@link RandomAccessFileOrArray} constructed from the
     *         source used by this tokeniser
     */
    public RandomAccessFileOrArray getSafeFile() {
        return new RandomAccessFileOrArray(file);
    }

    /**
     * @return the source used by this tokeniser (the same instance, not a copy)
     */
    public RandomAccessFileOrArray getFile() {
        return file;
    }

    /**
     * Reads up to {@code size} values from the source and returns them as a
     * string, one {@code char} per value read. Reading stops early at the
     * end of the source.
     *
     * @param size maximum number of values to read; nothing is read when
     *             it is zero or negative
     * @return the characters read, possibly fewer than {@code size}
     * @throws IOException on an I/O error
     */
    public String readString(int size) throws IOException {
        // Local, single-threaded buffer: StringBuilder is sufficient.
        StringBuilder buf = new StringBuilder();
        int ch;
        while ((size--) > 0) {
            ch = file.read();
            if (ch == -1)
                break;
            buf.append((char)ch);
        }
        return buf.toString();
    }

    /**
     * @param ch the value to test
     * @return {@code true} if {@code ch} is 0, 9, 10, 12, 13 or 32
     */
    public static final boolean isWhitespace(int ch) {
        return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32);
    }

    /**
     * @param ch the value to test
     * @return {@code true} if {@code ch} is one of
     *         {@code ( ) < > [ ] / %}
     */
    public static final boolean isDelimiter(int ch) {
        return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%');
    }

    /**
     * Looks {@code ch} up in {@link #delims}.
     *
     * @param ch a value in the range -1..255; values outside that range
     *           index past the table and raise
     *           {@link ArrayIndexOutOfBoundsException}
     * @return the table entry for {@code ch}
     */
    public static final boolean isDelimiterWhitespace(int ch) {
        return delims[ch + 1];
    }

    /**
     * @return the type of the most recently read token
     */
    public TokenType getTokenType() {
        return type;
    }

    /**
     * @return the text of the most recently read token
     */
    public String getStringValue() {
        return stringValue;
    }

    /**
     * @return the object number stored by {@link #nextValidToken()} when it
     *         last produced a {@link TokenType#REF} token
     */
    public int getReference() {
        return reference;
    }

    /**
     * @return the generation number stored by {@link #nextValidToken()} when
     *         it last produced a {@link TokenType#REF} token
     */
    public int getGeneration() {
        return generation;
    }

    /**
     * Pushes one value back onto the source so that it is read again,
     * unless the value is the end-of-file marker -1.
     *
     * @param ch the value to push back
     */
    public void backOnePosition(int ch) {
        if (ch != -1)
            file.pushBack((byte)ch);
    }

    /**
     * Always throws an {@link InvalidPdfException} whose message combines
     * {@code error} with the current file pointer.
     *
     * @param error the error description
     * @throws IOException always (as {@link InvalidPdfException})
     */
    public void throwError(String error) throws IOException {
        throw new InvalidPdfException(MessageLocalization.getComposedMessage("1.at.file.pointer.2", error, String.valueOf(file.getFilePointer())));
    }

    /**
     * Searches the first 1024 characters of the source for the marker
     * {@code %PDF-}, makes the position of the marker the new start offset
     * of the source and returns the character located 7 positions after the
     * start of the marker (for a header such as {@code %PDF-1.4} this is
     * {@code '4'}).
     *
     * @return the character 7 positions after the start of {@code %PDF-}
     * @throws IOException an {@link InvalidPdfException} if the marker is
     *         not found or the data ends before the returned character;
     *         any I/O error raised while reading
     */
    public char checkPdfHeader() throws IOException {
        file.setStartOffset(0);
        String str = readString(1024);
        int idx = str.indexOf("%PDF-");
        // A header cut off right after the marker used to surface as an
        // unchecked StringIndexOutOfBoundsException from charAt(); report it
        // as an invalid header instead, before touching the start offset.
        if (idx < 0 || idx + 7 >= str.length())
            throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found"));
        file.setStartOffset(idx);
        return str.charAt(idx + 7);
    }

    /**
     * Searches the first 1024 characters of the source for the marker
     * {@code %FDF-} and makes its position the new start offset of the
     * source.
     *
     * @throws IOException an {@link InvalidPdfException} if the marker is
     *         not found; any I/O error raised while reading
     */
    public void checkFdfHeader() throws IOException {
        file.setStartOffset(0);
        String str = readString(1024);
        int idx = str.indexOf("%FDF-");
        if (idx < 0)
            throw new InvalidPdfException(MessageLocalization.getComposedMessage("fdf.header.not.found"));
        file.setStartOffset(idx);
    }

    /**
     * Locates the last occurrence of the keyword {@code startxref} by
     * searching a window at the end of the source. The window starts at
     * {@code arrLength} values and grows by 1024 each time the keyword is
     * not found, until it covers the whole source.
     *
     * @param arrLength initial size of the window searched at the end of
     *                  the source
     * @return the position of the keyword within the source
     * @throws IOException an {@link InvalidPdfException} if the keyword is
     *         absent from the whole source; any I/O error raised while
     *         reading
     */
    public int getStartxref(int arrLength) throws IOException {
        int fileLength = file.length();
        int window = arrLength;
        // Iterative widening: the earlier recursive form added one stack
        // frame per extra 1024 bytes and could overflow the stack on a large
        // source that has no startxref keyword.
        while (true) {
            int size = Math.min(window, fileLength);
            int pos = fileLength - size;
            file.seek(pos);
            String str = readString(window);
            int idx = str.lastIndexOf("startxref");
            if (idx >= 0)
                return pos + idx;
            if (size == fileLength)
                throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.startxref.not.found"));
            window += 1024;
        }
    }

    /**
     * Converts a hexadecimal digit to its numeric value.
     *
     * @param v the character code of the digit
     * @return 0..15 for {@code 0-9}, {@code A-F} and {@code a-f};
     *         -1 for any other value
     */
    public static int getHex(int v) {
        if (v >= '0' && v <= '9')
            return v - '0';
        if (v >= 'A' && v <= 'F')
            return v - 'A' + 10;
        if (v >= 'a' && v <= 'f')
            return v - 'a' + 10;
        return -1;
    }

    /**
     * Reads the next token that is not a comment and recognises the
     * three-token sequence <i>number number</i> {@code R} as a single
     * {@link TokenType#REF} token, storing the two numbers in
     * {@code reference} and {@code generation}.
     * <p>
     * When a number is not followed by the rest of such a sequence, the
     * source is repositioned just after that first number and the number
     * itself is reported as the current token.
     *
     * @throws IOException on an I/O error or malformed token
     */
    public void nextValidToken() throws IOException {
        int level = 0;
        String n1 = null;
        String n2 = null;
        int ptr = 0;
        while (nextToken()) {
            if (type == TokenType.COMMENT)
                continue;
            switch (level) {
                case 0:
                {
                    if (type != TokenType.NUMBER)
                        return;
                    // remember where the first number ended so we can rewind
                    ptr = file.getFilePointer();
                    n1 = stringValue;
                    ++level;
                    break;
                }
                case 1:
                {
                    if (type != TokenType.NUMBER) {
                        file.seek(ptr);
                        type = TokenType.NUMBER;
                        stringValue = n1;
                        return;
                    }
                    n2 = stringValue;
                    ++level;
                    break;
                }
                default:
                {
                    if (type != TokenType.OTHER || !stringValue.equals("R")) {
                        file.seek(ptr);
                        type = TokenType.NUMBER;
                        stringValue = n1;
                        return;
                    }
                    type = TokenType.REF;
                    reference = Integer.parseInt(n1);
                    generation = Integer.parseInt(n2);
                    return;
                }
            }
        }
        // if we hit here, the file is either corrupt (stream ended unexpectedly),
        // or the last token ended exactly at the end of a stream.  This last
        // case can occur inside an Object Stream.
    }

    /**
     * Reads the next raw token from the source, skipping leading
     * whitespace. On success the token kind is stored in {@code type} and
     * its text (empty for structural tokens and comments) in
     * {@code stringValue}.
     *
     * @return {@code false} if the end of the source was reached before any
     *         token started (the type is then {@link TokenType#ENDOFFILE}),
     *         {@code true} otherwise
     * @throws IOException on an I/O error, or an {@link InvalidPdfException}
     *         for a lone {@code >} or an unterminated/invalid string
     */
    public boolean nextToken() throws IOException {
        int ch = 0;
        do {
            ch = file.read();
        } while (ch != -1 && isWhitespace(ch));
        if (ch == -1){
            type = TokenType.ENDOFFILE;
            return false;
        }

        // Note:  We have to initialize stringValue here, after we've looked for the end of the stream,
        // to ensure that we don't lose the value of a token that might end exactly at the end
        // of the stream
        StringBuilder outBuf = null;
        stringValue = EMPTY;

        switch (ch) {
            case '[':
                type = TokenType.START_ARRAY;
                break;
            case ']':
                type = TokenType.END_ARRAY;
                break;
            case '/':
            {
                outBuf = new StringBuilder();
                type = TokenType.NAME;
                while (true) {
                    ch = file.read();
                    if (delims[ch + 1])
                        break;
                    if (ch == '#') {
                        // '#' introduces a two-digit hexadecimal character code
                        ch = (getHex(file.read()) << 4) + getHex(file.read());
                    }
                    outBuf.append((char)ch);
                }
                backOnePosition(ch);
                break;
            }
            case '>':
                ch = file.read();
                if (ch != '>')
                    throwError(MessageLocalization.getComposedMessage("greaterthan.not.expected"));
                type = TokenType.END_DIC;
                break;
            case '<':
            {
                int v1 = file.read();
                if (v1 == '<') {
                    type = TokenType.START_DIC;
                    break;
                }
                // anything else after '<' starts a hexadecimal string
                outBuf = new StringBuilder();
                type = TokenType.STRING;
                hexString = true;
                int v2 = 0;
                while (true) {
                    while (isWhitespace(v1))
                        v1 = file.read();
                    if (v1 == '>')
                        break;
                    v1 = getHex(v1);
                    if (v1 < 0)
                        break;
                    v2 = file.read();
                    while (isWhitespace(v2))
                        v2 = file.read();
                    if (v2 == '>') {
                        // odd number of digits: the missing low nibble counts as 0
                        ch = v1 << 4;
                        outBuf.append((char)ch);
                        break;
                    }
                    v2 = getHex(v2);
                    if (v2 < 0)
                        break;
                    ch = (v1 << 4) + v2;
                    outBuf.append((char)ch);
                    v1 = file.read();
                }
                // a negative nibble means a non-hex value (or end of source) was met
                if (v1 < 0 || v2 < 0)
                    throwError(MessageLocalization.getComposedMessage("error.reading.string"));
                break;
            }
            case '%':
                type = TokenType.COMMENT;
                do {
                    ch = file.read();
                } while (ch != -1 && ch != '\r' && ch != '\n');
                break;
            case '(':
            {
                outBuf = new StringBuilder();
                type = TokenType.STRING;
                hexString = false;
                int nesting = 0;
                while (true) {
                    ch = file.read();
                    if (ch == -1)
                        break;
                    if (ch == '(') {
                        ++nesting;
                    }
                    else if (ch == ')') {
                        --nesting;
                    }
                    else if (ch == '\\') {
                        boolean lineBreak = false;
                        ch = file.read();
                        switch (ch) {
                            case 'n':
                                ch = '\n';
                                break;
                            case 'r':
                                ch = '\r';
                                break;
                            case 't':
                                ch = '\t';
                                break;
                            case 'b':
                                ch = '\b';
                                break;
                            case 'f':
                                ch = '\f';
                                break;
                            case '(':
                            case ')':
                            case '\\':
                                break;
                            case '\r':
                                // backslash + end of line: nothing is added to the string
                                lineBreak = true;
                                ch = file.read();
                                if (ch != '\n')
                                    backOnePosition(ch);
                                break;
                            case '\n':
                                lineBreak = true;
                                break;
                            default:
                            {
                                if (ch < '0' || ch > '7') {
                                    break;
                                }
                                // octal escape of one to three digits
                                int octal = ch - '0';
                                ch = file.read();
                                if (ch < '0' || ch > '7') {
                                    backOnePosition(ch);
                                    ch = octal;
                                    break;
                                }
                                octal = (octal << 3) + ch - '0';
                                ch = file.read();
                                if (ch < '0' || ch > '7') {
                                    backOnePosition(ch);
                                    ch = octal;
                                    break;
                                }
                                octal = (octal << 3) + ch - '0';
                                ch = octal & 0xff;
                                break;
                            }
                        }
                        if (lineBreak)
                            continue;
                        if (ch < 0)
                            break;
                    }
                    else if (ch == '\r') {
                        // an unescaped CR or CR LF is stored as a single LF
                        ch = file.read();
                        if (ch < 0)
                            break;
                        if (ch != '\n') {
                            backOnePosition(ch);
                            ch = '\n';
                        }
                    }
                    // the ')' matching the opening '(' ends the string
                    if (nesting == -1)
                        break;
                    outBuf.append((char)ch);
                }
                if (ch == -1)
                    throwError(MessageLocalization.getComposedMessage("error.reading.string"));
                break;
            }
            default:
            {
                outBuf = new StringBuilder();
                if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
                    type = TokenType.NUMBER;
                    do {
                        outBuf.append((char)ch);
                        ch = file.read();
                    } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'));
                }
                else {
                    type = TokenType.OTHER;
                    do {
                        outBuf.append((char)ch);
                        ch = file.read();
                    } while (!delims[ch + 1]);
                }
                backOnePosition(ch);
                break;
            }
        }
        if (outBuf != null)
            stringValue = outBuf.toString();
        return true;
    }

    /**
     * @return the text of the current token parsed with
     *         {@link Integer#parseInt(String)}
     * @throws NumberFormatException if the text is not a valid integer
     */
    public int intValue() {
        return Integer.parseInt(stringValue);
    }

    /**
     * Reads one line from the source into {@code input}, skipping leading
     * whitespace first. The line ends at LF, CR, CR LF or the end of the
     * source. When the line is longer than {@code input}, the excess is
     * read and discarded. If at least two positions remain free after the
     * data, the bytes {@code ' '} and {@code 'X'} are stored behind it.
     *
     * @param input the buffer receiving the line
     * @return {@code false} if the end of the source was reached without
     *         storing any byte, {@code true} otherwise
     * @throws IOException on an I/O error
     */
    public boolean readLineSegment(byte input[]) throws IOException {
        int c = -1;
        boolean eol = false;
        int ptr = 0;
        int len = input.length;
        // ssteward, pdftk-1.10, 040922:
        // skip initial whitespace; added this because PdfReader.rebuildXref()
        // assumes that line provided by readLineSegment does not have init. whitespace;
        if ( ptr < len ) {
            while ( isWhitespace( (c = read()) ) );
        }
        while ( !eol && ptr < len ) {
            switch (c) {
                case -1:
                case '\n':
                    eol = true;
                    break;
                case '\r':
                    eol = true;
                    int cur = getFilePointer();
                    if ((read()) != '\n') {
                        seek(cur);
                    }
                    break;
                default:
                    input[ptr++] = (byte)c;
                    break;
            }

            // break loop? do it before we read() again
            if( eol || len <= ptr ) {
                break;
            }
            else {
                c = read();
            }
        }
        if (ptr >= len) {
            // buffer is full: consume the remainder of the line
            eol = false;
            while (!eol) {
                switch (c = read()) {
                    case -1:
                    case '\n':
                        eol = true;
                        break;
                    case '\r':
                        eol = true;
                        int cur = getFilePointer();
                        if ((read()) != '\n') {
                            seek(cur);
                        }
                        break;
                }
            }
        }

        if ((c == -1) && (ptr == 0)) {
            return false;
        }
        if (ptr + 2 <= len) {
            input[ptr++] = (byte)' ';
            input[ptr] = (byte)'X';
        }
        return true;
    }

    /**
     * Checks whether {@code line} starts with
     * <i>number number</i> {@code obj}.
     *
     * @param line the bytes to inspect
     * @return a two-element array holding the two numbers, or {@code null}
     *         if the line does not have that form or cannot be tokenised
     */
    public static int[] checkObjectStart(byte line[]) {
        try {
            PRTokeniser tk = new PRTokeniser(line);
            int num = 0;
            int gen = 0;
            if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER)
                return null;
            num = tk.intValue();
            if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER)
                return null;
            gen = tk.intValue();
            if (!tk.nextToken())
                return null;
            if (!tk.getStringValue().equals("obj"))
                return null;
            return new int[]{num, gen};
        }
        catch (Exception ignored) {
            // empty on purpose: any failure simply means "not an object start"
        }
        return null;
    }

    /**
     * @return the value of the flag set by {@link #nextToken()} when it last
     *         read a string token: {@code true} for the {@code <...>} form,
     *         {@code false} for the {@code (...)} form
     */
    public boolean isHexString() {
        return this.hexString;
    }

}