001/*
002 * $Id: PRTokeniser.java 4883 2011-05-24 19:17:29Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf;
045
046import java.io.IOException;
047import com.itextpdf.text.exceptions.InvalidPdfException;
048import com.itextpdf.text.error_messages.MessageLocalization;
049/**
050 *
051 * @author  Paulo Soares
052 */
053public class PRTokeniser {
054
055    /**
056     * Enum representing the possible token types
057     * @since 5.0.1
058     */ 
059    public enum TokenType {
060        NUMBER,
061        STRING,
062        NAME,
063        COMMENT,
064        START_ARRAY,
065        END_ARRAY,
066        START_DIC,
067        END_DIC,
068        REF,
069        OTHER,
070        ENDOFFILE
071    }
072    
073    public static final boolean delims[] = {
074        true,  true,  false, false, false, false, false, false, false, false,
075        true,  true,  false, true,  true,  false, false, false, false, false,
076        false, false, false, false, false, false, false, false, false, false,
077        false, false, false, true,  false, false, false, false, true,  false,
078        false, true,  true,  false, false, false, false, false, true,  false,
079        false, false, false, false, false, false, false, false, false, false,
080        false, true,  false, true,  false, false, false, false, false, false,
081        false, false, false, false, false, false, false, false, false, false,
082        false, false, false, false, false, false, false, false, false, false,
083        false, false, true,  false, true,  false, false, false, false, false,
084        false, false, false, false, false, false, false, false, false, false,
085        false, false, false, false, false, false, false, false, false, false,
086        false, false, false, false, false, false, false, false, false, false,
087        false, false, false, false, false, false, false, false, false, false,
088        false, false, false, false, false, false, false, false, false, false,
089        false, false, false, false, false, false, false, false, false, false,
090        false, false, false, false, false, false, false, false, false, false,
091        false, false, false, false, false, false, false, false, false, false,
092        false, false, false, false, false, false, false, false, false, false,
093        false, false, false, false, false, false, false, false, false, false,
094        false, false, false, false, false, false, false, false, false, false,
095        false, false, false, false, false, false, false, false, false, false,
096        false, false, false, false, false, false, false, false, false, false,
097        false, false, false, false, false, false, false, false, false, false,
098        false, false, false, false, false, false, false, false, false, false,
099        false, false, false, false, false, false, false};
100    
101    static final String EMPTY = "";
102
103    
104    protected RandomAccessFileOrArray file;
105    protected TokenType type;
106    protected String stringValue;
107    protected int reference;
108    protected int generation;
109    protected boolean hexString;
110       
111    public PRTokeniser(String filename) throws IOException {
112        file = new RandomAccessFileOrArray(filename);
113    }
114
115    public PRTokeniser(byte pdfIn[]) {
116        file = new RandomAccessFileOrArray(pdfIn);
117    }
118    
119    public PRTokeniser(RandomAccessFileOrArray file) {
120        this.file = file;
121    }
122    
123    public void seek(int pos) throws IOException {
124        file.seek(pos);
125    }
126    
127    public int getFilePointer() throws IOException {
128        return file.getFilePointer();
129    }
130
131    public void close() throws IOException {
132        file.close();
133    }
134    
135    public int length() throws IOException {
136        return file.length();
137    }
138
139    public int read() throws IOException {
140        return file.read();
141    }
142    
143    public RandomAccessFileOrArray getSafeFile() {
144        return new RandomAccessFileOrArray(file);
145    }
146    
147    public RandomAccessFileOrArray getFile() {
148        return file;
149    }
150    
151    public String readString(int size) throws IOException {
152        StringBuffer buf = new StringBuffer();
153        int ch;
154        while ((size--) > 0) {
155            ch = file.read();
156            if (ch == -1)
157                break;
158            buf.append((char)ch);
159        }
160        return buf.toString();
161    }
162
163    public static final boolean isWhitespace(int ch) {
164        return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32);
165    }
166    
167    public static final boolean isDelimiter(int ch) {
168        return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%');
169    }
170
171    public static final boolean isDelimiterWhitespace(int ch) {
172        return delims[ch + 1];
173    }
174
175    public TokenType getTokenType() {
176        return type;
177    }
178    
179    public String getStringValue() {
180        return stringValue;
181    }
182    
183    public int getReference() {
184        return reference;
185    }
186    
187    public int getGeneration() {
188        return generation;
189    }
190    
191    public void backOnePosition(int ch) {
192        if (ch != -1)
193            file.pushBack((byte)ch);
194    }
195    
196    public void throwError(String error) throws IOException {
197        throw new InvalidPdfException(MessageLocalization.getComposedMessage("1.at.file.pointer.2", error, String.valueOf(file.getFilePointer())));
198    }
199    
200    public char checkPdfHeader() throws IOException {
201        file.setStartOffset(0);
202        String str = readString(1024);
203        int idx = str.indexOf("%PDF-");
204        if (idx < 0)
205            throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found"));
206        file.setStartOffset(idx);
207        return str.charAt(idx + 7);
208    }
209    
210    public void checkFdfHeader() throws IOException {
211        file.setStartOffset(0);
212        String str = readString(1024);
213        int idx = str.indexOf("%FDF-");
214        if (idx < 0)
215            throw new InvalidPdfException(MessageLocalization.getComposedMessage("fdf.header.not.found"));
216        file.setStartOffset(idx);
217    }
218
219    public int getStartxref(int arrLength) throws IOException {
220        int fileLength = file.length();
221        int size = Math.min(arrLength, fileLength);
222        int pos = file.length() - size;
223        file.seek(pos);
224        String str = readString(arrLength);
225        int idx = str.lastIndexOf("startxref");
226        if (idx < 0 && size == fileLength)
227            throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.startxref.not.found"));
228        if (idx < 0)
229                return getStartxref(arrLength + 1024);
230        return pos + idx;
231    }
232
233    public static int getHex(int v) {
234        if (v >= '0' && v <= '9')
235            return v - '0';
236        if (v >= 'A' && v <= 'F')
237            return v - 'A' + 10;
238        if (v >= 'a' && v <= 'f')
239            return v - 'a' + 10;
240        return -1;
241    }
242    
243    public void nextValidToken() throws IOException {
244        int level = 0;
245        String n1 = null;
246        String n2 = null;
247        int ptr = 0;
248        while (nextToken()) {
249            if (type == TokenType.COMMENT)
250                continue;
251            switch (level) {
252                case 0:
253                {
254                    if (type != TokenType.NUMBER)
255                        return;
256                    ptr = file.getFilePointer();
257                    n1 = stringValue;
258                    ++level;
259                    break;
260                }
261                case 1:
262                {
263                    if (type != TokenType.NUMBER) {
264                        file.seek(ptr);
265                        type = TokenType.NUMBER;
266                        stringValue = n1;
267                        return;
268                    }
269                    n2 = stringValue;
270                    ++level;
271                    break;
272                }
273                default:
274                {
275                    if (type != TokenType.OTHER || !stringValue.equals("R")) {
276                        file.seek(ptr);
277                        type = TokenType.NUMBER;
278                        stringValue = n1;
279                        return;
280                    }
281                    type = TokenType.REF;
282                    reference = Integer.parseInt(n1);
283                    generation = Integer.parseInt(n2);
284                    return;
285                }
286            }
287        }
288        // if we hit here, the file is either corrupt (stream ended unexpectedly),
289        // or the last token ended exactly at the end of a stream.  This last
290        // case can occur inside an Object Stream.
291    }
292    
293    public boolean nextToken() throws IOException {
294        int ch = 0;
295        do {
296            ch = file.read();
297        } while (ch != -1 && isWhitespace(ch));
298        if (ch == -1){
299            type = TokenType.ENDOFFILE;
300            return false;
301        }
302
303        // Note:  We have to initialize stringValue here, after we've looked for the end of the stream,
304        // to ensure that we don't lose the value of a token that might end exactly at the end
305        // of the stream
306        StringBuffer outBuf = null;
307        stringValue = EMPTY;
308
309        switch (ch) {
310            case '[':
311                type = TokenType.START_ARRAY;
312                break;
313            case ']':
314                type = TokenType.END_ARRAY;
315                break;
316            case '/':
317            {
318                outBuf = new StringBuffer();
319                type = TokenType.NAME;
320                while (true) {
321                    ch = file.read();
322                    if (delims[ch + 1])
323                        break;
324                    if (ch == '#') {
325                        ch = (getHex(file.read()) << 4) + getHex(file.read());
326                    }
327                    outBuf.append((char)ch);
328                }
329                backOnePosition(ch);
330                break;
331            }
332            case '>':
333                ch = file.read();
334                if (ch != '>')
335                    throwError(MessageLocalization.getComposedMessage("greaterthan.not.expected"));
336                type = TokenType.END_DIC;
337                break;
338            case '<':
339            {
340                int v1 = file.read();
341                if (v1 == '<') {
342                    type = TokenType.START_DIC;
343                    break;
344                }
345                outBuf = new StringBuffer();
346                type = TokenType.STRING;
347                hexString = true;
348                int v2 = 0;
349                while (true) {
350                    while (isWhitespace(v1))
351                        v1 = file.read();
352                    if (v1 == '>')
353                        break;
354                    v1 = getHex(v1);
355                    if (v1 < 0)
356                        break;
357                    v2 = file.read();
358                    while (isWhitespace(v2))
359                        v2 = file.read();
360                    if (v2 == '>') {
361                        ch = v1 << 4;
362                        outBuf.append((char)ch);
363                        break;
364                    }
365                    v2 = getHex(v2);
366                    if (v2 < 0)
367                        break;
368                    ch = (v1 << 4) + v2;
369                    outBuf.append((char)ch);
370                    v1 = file.read();
371                }
372                if (v1 < 0 || v2 < 0)
373                    throwError(MessageLocalization.getComposedMessage("error.reading.string"));
374                break;
375            }
376            case '%':
377                type = TokenType.COMMENT;
378                do {
379                    ch = file.read();
380                } while (ch != -1 && ch != '\r' && ch != '\n');
381                break;
382            case '(':
383            {
384                outBuf = new StringBuffer();
385                type = TokenType.STRING;
386                hexString = false;
387                int nesting = 0;
388                while (true) {
389                    ch = file.read();
390                    if (ch == -1)
391                        break;
392                    if (ch == '(') {
393                        ++nesting;
394                    }
395                    else if (ch == ')') {
396                        --nesting;
397                    }
398                    else if (ch == '\\') {
399                        boolean lineBreak = false;
400                        ch = file.read();
401                        switch (ch) {
402                            case 'n':
403                                ch = '\n';
404                                break;
405                            case 'r':
406                                ch = '\r';
407                                break;
408                            case 't':
409                                ch = '\t';
410                                break;
411                            case 'b':
412                                ch = '\b';
413                                break;
414                            case 'f':
415                                ch = '\f';
416                                break;
417                            case '(':
418                            case ')':
419                            case '\\':
420                                break;
421                            case '\r':
422                                lineBreak = true;
423                                ch = file.read();
424                                if (ch != '\n')
425                                    backOnePosition(ch);
426                                break;
427                            case '\n':
428                                lineBreak = true;
429                                break;
430                            default:
431                            {
432                                if (ch < '0' || ch > '7') {
433                                    break;
434                                }
435                                int octal = ch - '0';
436                                ch = file.read();
437                                if (ch < '0' || ch > '7') {
438                                    backOnePosition(ch);
439                                    ch = octal;
440                                    break;
441                                }
442                                octal = (octal << 3) + ch - '0';
443                                ch = file.read();
444                                if (ch < '0' || ch > '7') {
445                                    backOnePosition(ch);
446                                    ch = octal;
447                                    break;
448                                }
449                                octal = (octal << 3) + ch - '0';
450                                ch = octal & 0xff;
451                                break;
452                            }
453                        }
454                        if (lineBreak)
455                            continue;
456                        if (ch < 0)
457                            break;
458                    }
459                    else if (ch == '\r') {
460                        ch = file.read();
461                        if (ch < 0)
462                            break;
463                        if (ch != '\n') {
464                            backOnePosition(ch);
465                            ch = '\n';
466                        }
467                    }
468                    if (nesting == -1)
469                        break;
470                    outBuf.append((char)ch);
471                }
472                if (ch == -1)
473                    throwError(MessageLocalization.getComposedMessage("error.reading.string"));
474                break;
475            }
476            default:
477            {
478                outBuf = new StringBuffer();
479                if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
480                    type = TokenType.NUMBER;
481                    do {
482                        outBuf.append((char)ch);
483                        ch = file.read();
484                    } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'));
485                }
486                else {
487                    type = TokenType.OTHER;
488                    do {
489                        outBuf.append((char)ch);
490                        ch = file.read();
491                    } while (!delims[ch + 1]);
492                }
493                backOnePosition(ch);
494                break;
495            }
496        }
497        if (outBuf != null)
498            stringValue = outBuf.toString();
499        return true;
500    }
501    
502    public int intValue() {
503        return Integer.parseInt(stringValue);
504    }
505    
506    public boolean readLineSegment(byte input[]) throws IOException {
507        int c = -1;
508        boolean eol = false;
509        int ptr = 0;
510        int len = input.length;
511        // ssteward, pdftk-1.10, 040922: 
512        // skip initial whitespace; added this because PdfReader.rebuildXref()
513        // assumes that line provided by readLineSegment does not have init. whitespace;
514        if ( ptr < len ) {
515            while ( isWhitespace( (c = read()) ) );
516        }
517        while ( !eol && ptr < len ) {
518            switch (c) {
519                case -1:
520                case '\n':
521                    eol = true;
522                    break;
523                case '\r':
524                    eol = true;
525                    int cur = getFilePointer();
526                    if ((read()) != '\n') {
527                        seek(cur);
528                    }
529                    break;
530                default:
531                    input[ptr++] = (byte)c;
532                    break;
533            }
534
535            // break loop? do it before we read() again
536            if( eol || len <= ptr ) {
537                break;
538            }
539            else {
540                c = read();
541            }
542        }
543        if (ptr >= len) {
544            eol = false;
545            while (!eol) {
546                switch (c = read()) {
547                    case -1:
548                    case '\n':
549                        eol = true;
550                        break;
551                    case '\r':
552                        eol = true;
553                        int cur = getFilePointer();
554                        if ((read()) != '\n') {
555                            seek(cur);
556                        }
557                        break;
558                }
559            }
560        }
561        
562        if ((c == -1) && (ptr == 0)) {
563            return false;
564        }
565        if (ptr + 2 <= len) {
566            input[ptr++] = (byte)' ';
567            input[ptr] = (byte)'X';
568        }
569        return true;
570    }
571    
572    public static int[] checkObjectStart(byte line[]) {
573        try {
574            PRTokeniser tk = new PRTokeniser(line);
575            int num = 0;
576            int gen = 0;
577            if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER)
578                return null;
579            num = tk.intValue();
580            if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER)
581                return null;
582            gen = tk.intValue();
583            if (!tk.nextToken())
584                return null;
585            if (!tk.getStringValue().equals("obj"))
586                return null;
587            return new int[]{num, gen};
588        }
589        catch (Exception ioe) {
590            // empty on purpose
591        }
592        return null;
593    }
594    
595    public boolean isHexString() {
596        return this.hexString;
597    }
598    
599}