001/*
002 * $Id: PdfContentReaderTool.java 4784 2011-03-15 08:33:00Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf.parser;
045
046import java.io.ByteArrayInputStream;
047import java.io.File;
048import java.io.FileOutputStream;
049import java.io.IOException;
050import java.io.InputStream;
051import java.io.PrintWriter;
052import java.util.ArrayList;
053import java.util.List;
054
055import com.itextpdf.text.pdf.PdfDictionary;
056import com.itextpdf.text.pdf.PdfName;
057import com.itextpdf.text.pdf.PdfObject;
058import com.itextpdf.text.pdf.PdfReader;
059import com.itextpdf.text.pdf.PdfStream;
060import com.itextpdf.text.pdf.RandomAccessFileOrArray;
061
062/**
063 * Tool that parses the content of a PDF document.
064 * @since       2.1.4
065 */
066public class PdfContentReaderTool {
067
068        /**
069         * Shows the detail of a dictionary.
070         * This is similar to the PdfLister functionality.
071         * @param dic   the dictionary of which you want the detail
072         * @return      a String representation of the dictionary
073         */
074    static public String getDictionaryDetail(PdfDictionary dic){
075        return getDictionaryDetail(dic, 0);
076    }
077
078    /**
079     * Shows the detail of a dictionary.
080     * @param dic       the dictionary of which you want the detail
081     * @param depth     the depth of the current dictionary (for nested dictionaries)
082     * @return  a String representation of the dictionary
083     */
084    static public String getDictionaryDetail(PdfDictionary dic, int depth){
085        StringBuffer builder = new StringBuffer();
086        builder.append('(');
087        List<PdfName> subDictionaries = new ArrayList<PdfName>();
088        for (PdfName key: dic.getKeys()) {
089            PdfObject val = dic.getDirectObject(key);
090            if (val.isDictionary())
091                subDictionaries.add(key);
092            builder.append(key);
093            builder.append('=');
094            builder.append(val);
095            builder.append(", ");
096        }
097        builder.setLength(builder.length()-2);
098        builder.append(')');
099        for (PdfName pdfSubDictionaryName: subDictionaries) {
100            builder.append('\n');
101            for(int i = 0; i < depth+1; i++){
102                builder.append('\t');
103            }
104            builder.append("Subdictionary ");
105            builder.append(pdfSubDictionaryName);
106            builder.append(" = ");
107            builder.append(getDictionaryDetail(dic.getAsDict(pdfSubDictionaryName), depth+1));
108        }
109        return builder.toString();
110    }
111
112    /**
113     * Displays a summary of the entries in the XObject dictionary for the stream
114     * @param resourceDic the resource dictionary for the stream
115     * @return a string with the summary of the entries
116     * @throws IOException
117     * @since 5.0.2
118     */
119    static public String getXObjectDetail(PdfDictionary resourceDic) throws IOException {
120        StringBuilder sb = new StringBuilder();
121        
122        PdfDictionary xobjects = resourceDic.getAsDict(PdfName.XOBJECT);
123        if (xobjects == null)
124                return "No XObjects";
125        for (PdfName entryName : xobjects.getKeys()) {
126            PdfStream xobjectStream = xobjects.getAsStream(entryName);
127            
128            sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + " = " + xobjectStream.getAsNumber(PdfName.LENGTH) + " bytes ------\n");
129            
130            if (!xobjectStream.get(PdfName.SUBTYPE).equals(PdfName.IMAGE)){
131            
132                byte[] contentBytes = ContentByteUtils.getContentBytesFromContentObject(xobjectStream);
133                
134                InputStream is = new ByteArrayInputStream(contentBytes);
135                int ch;
136                while ((ch = is.read()) != -1){
137                    sb.append((char)ch);
138                }
139    
140                sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + "End of Content" + "------\n");
141            }
142        }
143       
144        return sb.toString();
145    }
146    
147    /**
148     * Writes information about a specific page from PdfReader to the specified output stream.
149     * @since 2.1.5
150     * @param reader    the PdfReader to read the page content from
151     * @param pageNum   the page number to read
152     * @param out       the output stream to send the content to
153     * @throws IOException
154     */
155    static public void listContentStreamForPage(PdfReader reader, int pageNum, PrintWriter out) throws IOException {
156        out.println("==============Page " + pageNum + "====================");
157        out.println("- - - - - Dictionary - - - - - -");
158        PdfDictionary pageDictionary = reader.getPageN(pageNum);
159        out.println(getDictionaryDetail(pageDictionary));
160
161        out.println("- - - - - XObject Summary - - - - - -");
162        out.println(getXObjectDetail(pageDictionary.getAsDict(PdfName.RESOURCES)));
163        
164        out.println("- - - - - Content Stream - - - - - -");
165        RandomAccessFileOrArray f = reader.getSafeFile();
166
167        byte[] contentBytes = reader.getPageContent(pageNum, f);
168        f.close();
169
170        out.flush();
171
172        InputStream is = new ByteArrayInputStream(contentBytes);
173        int ch;
174        while ((ch = is.read()) != -1){
175            out.print((char)ch);
176        }
177
178        out.flush();
179        
180        out.println("- - - - - Text Extraction - - - - - -");
181        String extractedText = PdfTextExtractor.getTextFromPage(reader, pageNum, new LocationTextExtractionStrategy());
182        if (extractedText.length() != 0)
183            out.println(extractedText);
184        else
185            out.println("No text found on page " + pageNum);
186
187        out.println();
188
189    }
190
191    /**
192     * Writes information about each page in a PDF file to the specified output stream.
193     * @since 2.1.5
194     * @param pdfFile   a File instance referring to a PDF file
195     * @param out       the output stream to send the content to
196     * @throws IOException
197     */
198    static public void listContentStream(File pdfFile, PrintWriter out) throws IOException {
199        PdfReader reader = new PdfReader(pdfFile.getCanonicalPath());
200
201        int maxPageNum = reader.getNumberOfPages();
202
203        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++){
204            listContentStreamForPage(reader, pageNum, out);
205        }
206
207    }
208
209    /**
210     * Writes information about the specified page in a PDF file to the specified output stream.
211     * @since 2.1.5
212     * @param pdfFile   a File instance referring to a PDF file
213     * @param pageNum   the page number to read
214     * @param out       the output stream to send the content to
215     * @throws IOException
216     */
217    static public void listContentStream(File pdfFile, int pageNum, PrintWriter out) throws IOException {
218        PdfReader reader = new PdfReader(pdfFile.getCanonicalPath());
219
220        listContentStreamForPage(reader, pageNum, out);
221    }
222
223    /**
224     * Writes information about each page in a PDF file to the specified file, or System.out.
225     * @param args
226     */
227    public static void main(String[] args) {
228        try{
229            if (args.length < 1 || args.length > 3){
230                System.out.println("Usage:  PdfContentReaderTool <pdf file> [<output file>|stdout] [<page num>]");
231                return;
232            }
233
234            PrintWriter writer = new PrintWriter(System.out);
235            if (args.length >= 2){
236                if (args[1].compareToIgnoreCase("stdout") != 0){
237                    System.out.println("Writing PDF content to " + args[1]);
238                    writer = new PrintWriter(new FileOutputStream(new File(args[1])));
239                }
240            }
241
242            int pageNum = -1;
243            if (args.length >= 3){
244                pageNum = Integer.parseInt(args[2]);
245            }
246
247            if (pageNum == -1){
248                listContentStream(new File(args[0]), writer);
249            } else {
250                listContentStream(new File(args[0]), pageNum, writer);
251            }
252            writer.flush();
253
254            if (args.length >= 2){
255                writer.close();
256                System.out.println("Finished writing content to " + args[1]);
257            }
258        } catch (Exception e){
259            e.printStackTrace(System.err);
260        }
261    }
262
263}