001/* 002 * $Id: PdfContentReaderTool.java 4784 2011-03-15 08:33:00Z blowagie $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.pdf.parser; 045 046import java.io.ByteArrayInputStream; 047import java.io.File; 048import java.io.FileOutputStream; 049import java.io.IOException; 050import java.io.InputStream; 051import java.io.PrintWriter; 052import java.util.ArrayList; 053import java.util.List; 054 055import com.itextpdf.text.pdf.PdfDictionary; 056import com.itextpdf.text.pdf.PdfName; 057import com.itextpdf.text.pdf.PdfObject; 058import com.itextpdf.text.pdf.PdfReader; 059import com.itextpdf.text.pdf.PdfStream; 060import com.itextpdf.text.pdf.RandomAccessFileOrArray; 061 062/** 063 * Tool that parses the content of a PDF document. 064 * @since 2.1.4 065 */ 066public class PdfContentReaderTool { 067 068 /** 069 * Shows the detail of a dictionary. 070 * This is similar to the PdfLister functionality. 071 * @param dic the dictionary of which you want the detail 072 * @return a String representation of the dictionary 073 */ 074 static public String getDictionaryDetail(PdfDictionary dic){ 075 return getDictionaryDetail(dic, 0); 076 } 077 078 /** 079 * Shows the detail of a dictionary. 080 * @param dic the dictionary of which you want the detail 081 * @param depth the depth of the current dictionary (for nested dictionaries) 082 * @return a String representation of the dictionary 083 */ 084 static public String getDictionaryDetail(PdfDictionary dic, int depth){ 085 StringBuffer builder = new StringBuffer(); 086 builder.append('('); 087 List<PdfName> subDictionaries = new ArrayList<PdfName>(); 088 for (PdfName key: dic.getKeys()) { 089 PdfObject val = dic.getDirectObject(key); 090 if (val.isDictionary()) 091 subDictionaries.add(key); 092 builder.append(key); 093 builder.append('='); 094 builder.append(val); 095 builder.append(", "); 096 } 097 builder.setLength(builder.length()-2); 098 builder.append(')'); 099 for (PdfName pdfSubDictionaryName: subDictionaries) { 100 builder.append('\n'); 101 for(int i = 0; i < depth+1; i++){ 102 builder.append('\t'); 103 } 104 builder.append("Subdictionary "); 105 builder.append(pdfSubDictionaryName); 106 builder.append(" = "); 107 builder.append(getDictionaryDetail(dic.getAsDict(pdfSubDictionaryName), depth+1)); 108 } 109 return builder.toString(); 110 } 111 112 /** 113 * Displays a summary of the entries in the XObject dictionary for the stream 114 * @param resourceDic the resource dictionary for the stream 115 * @return a string with the summary of the entries 116 * @throws IOException 117 * @since 5.0.2 118 */ 119 static public String getXObjectDetail(PdfDictionary resourceDic) throws IOException { 120 StringBuilder sb = new StringBuilder(); 121 122 PdfDictionary xobjects = resourceDic.getAsDict(PdfName.XOBJECT); 123 if (xobjects == null) 124 return "No XObjects"; 125 for (PdfName entryName : xobjects.getKeys()) { 126 PdfStream xobjectStream = xobjects.getAsStream(entryName); 127 128 sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + " = " + xobjectStream.getAsNumber(PdfName.LENGTH) + " bytes ------\n"); 129 130 if (!xobjectStream.get(PdfName.SUBTYPE).equals(PdfName.IMAGE)){ 131 132 byte[] contentBytes = ContentByteUtils.getContentBytesFromContentObject(xobjectStream); 133 134 InputStream is = new ByteArrayInputStream(contentBytes); 135 int ch; 136 while ((ch = is.read()) != -1){ 137 sb.append((char)ch); 138 } 139 140 sb.append("------ " + entryName + " - subtype = " + xobjectStream.get(PdfName.SUBTYPE) + "End of Content" + "------\n"); 141 } 142 } 143 144 return sb.toString(); 145 } 146 147 /** 148 * Writes information about a specific page from PdfReader to the specified output stream. 149 * @since 2.1.5 150 * @param reader the PdfReader to read the page content from 151 * @param pageNum the page number to read 152 * @param out the output stream to send the content to 153 * @throws IOException 154 */ 155 static public void listContentStreamForPage(PdfReader reader, int pageNum, PrintWriter out) throws IOException { 156 out.println("==============Page " + pageNum + "===================="); 157 out.println("- - - - - Dictionary - - - - - -"); 158 PdfDictionary pageDictionary = reader.getPageN(pageNum); 159 out.println(getDictionaryDetail(pageDictionary)); 160 161 out.println("- - - - - XObject Summary - - - - - -"); 162 out.println(getXObjectDetail(pageDictionary.getAsDict(PdfName.RESOURCES))); 163 164 out.println("- - - - - Content Stream - - - - - -"); 165 RandomAccessFileOrArray f = reader.getSafeFile(); 166 167 byte[] contentBytes = reader.getPageContent(pageNum, f); 168 f.close(); 169 170 out.flush(); 171 172 InputStream is = new ByteArrayInputStream(contentBytes); 173 int ch; 174 while ((ch = is.read()) != -1){ 175 out.print((char)ch); 176 } 177 178 out.flush(); 179 180 out.println("- - - - - Text Extraction - - - - - -"); 181 String extractedText = PdfTextExtractor.getTextFromPage(reader, pageNum, new LocationTextExtractionStrategy()); 182 if (extractedText.length() != 0) 183 out.println(extractedText); 184 else 185 out.println("No text found on page " + pageNum); 186 187 out.println(); 188 189 } 190 191 /** 192 * Writes information about each page in a PDF file to the specified output stream. 193 * @since 2.1.5 194 * @param pdfFile a File instance referring to a PDF file 195 * @param out the output stream to send the content to 196 * @throws IOException 197 */ 198 static public void listContentStream(File pdfFile, PrintWriter out) throws IOException { 199 PdfReader reader = new PdfReader(pdfFile.getCanonicalPath()); 200 201 int maxPageNum = reader.getNumberOfPages(); 202 203 for (int pageNum = 1; pageNum <= maxPageNum; pageNum++){ 204 listContentStreamForPage(reader, pageNum, out); 205 } 206 207 } 208 209 /** 210 * Writes information about the specified page in a PDF file to the specified output stream. 211 * @since 2.1.5 212 * @param pdfFile a File instance referring to a PDF file 213 * @param pageNum the page number to read 214 * @param out the output stream to send the content to 215 * @throws IOException 216 */ 217 static public void listContentStream(File pdfFile, int pageNum, PrintWriter out) throws IOException { 218 PdfReader reader = new PdfReader(pdfFile.getCanonicalPath()); 219 220 listContentStreamForPage(reader, pageNum, out); 221 } 222 223 /** 224 * Writes information about each page in a PDF file to the specified file, or System.out. 225 * @param args 226 */ 227 public static void main(String[] args) { 228 try{ 229 if (args.length < 1 || args.length > 3){ 230 System.out.println("Usage: PdfContentReaderTool <pdf file> [<output file>|stdout] [<page num>]"); 231 return; 232 } 233 234 PrintWriter writer = new PrintWriter(System.out); 235 if (args.length >= 2){ 236 if (args[1].compareToIgnoreCase("stdout") != 0){ 237 System.out.println("Writing PDF content to " + args[1]); 238 writer = new PrintWriter(new FileOutputStream(new File(args[1]))); 239 } 240 } 241 242 int pageNum = -1; 243 if (args.length >= 3){ 244 pageNum = Integer.parseInt(args[2]); 245 } 246 247 if (pageNum == -1){ 248 listContentStream(new File(args[0]), writer); 249 } else { 250 listContentStream(new File(args[0]), pageNum, writer); 251 } 252 writer.flush(); 253 254 if (args.length >= 2){ 255 writer.close(); 256 System.out.println("Finished writing content to " + args[1]); 257 } 258 } catch (Exception e){ 259 e.printStackTrace(System.err); 260 } 261 } 262 263}