001/* 002 * $Id: TaggedPdfReaderTool.java 4813 2011-04-26 10:35:49Z blowagie $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Bruno Lowagie, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.pdf.parser; 045 046import java.io.IOException; 047import java.io.OutputStream; 048import java.io.PrintWriter; 049 050import com.itextpdf.text.error_messages.MessageLocalization; 051import com.itextpdf.text.pdf.PdfArray; 052import com.itextpdf.text.pdf.PdfDictionary; 053import com.itextpdf.text.pdf.PdfName; 054import com.itextpdf.text.pdf.PdfNumber; 055import com.itextpdf.text.pdf.PdfObject; 056import com.itextpdf.text.pdf.PdfReader; 057import com.itextpdf.text.xml.XMLUtil; 058import java.io.OutputStreamWriter; 059import java.nio.charset.Charset; 060 061/** 062 * Converts a tagged PDF document into an XML file. 063 * 064 * @since 5.0.2 065 */ 066public class TaggedPdfReaderTool { 067 068 /** The reader object from which the content streams are read. */ 069 PdfReader reader; 070 /** The writer object to which the XML will be written */ 071 PrintWriter out; 072 073 /** 074 * Parses a string with structured content. 075 * 076 * @param reader 077 * the PdfReader that has access to the PDF file 078 * @param os 079 * the OutputStream to which the resulting xml will be written 080 * @param charset 081 * the charset to encode the data 082 * @since 5.0.5 083 */ 084 public void convertToXml(PdfReader reader, OutputStream os, String charset) 085 throws IOException { 086 this.reader = reader; 087 OutputStreamWriter outs = new OutputStreamWriter(os, charset); 088 out = new PrintWriter(outs); 089 // get the StructTreeRoot from the root object 090 PdfDictionary catalog = reader.getCatalog(); 091 PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT); 092 if (struct == null) 093 throw new IOException(MessageLocalization.getComposedMessage("no.structtreeroot.found")); 094 // Inspect the child or children of the StructTreeRoot 095 inspectChild(struct.getDirectObject(PdfName.K)); 096 out.flush(); 097 out.close(); 098 } 099 100 /** 101 * Parses a string with structured content. The output is done using the 102 * current charset. 103 * 104 * @param reader 105 * the PdfReader that has access to the PDF file 106 * @param os 107 * the OutputStream to which the resulting xml will be written 108 */ 109 public void convertToXml(PdfReader reader, OutputStream os) 110 throws IOException { 111 convertToXml(reader, os, Charset.defaultCharset().name()); 112 } 113 114 /** 115 * Inspects a child of a structured element. This can be an array or a 116 * dictionary. 117 * 118 * @param k 119 * the child to inspect 120 * @throws IOException 121 */ 122 public void inspectChild(PdfObject k) throws IOException { 123 if (k == null) 124 return; 125 if (k instanceof PdfArray) 126 inspectChildArray((PdfArray) k); 127 else if (k instanceof PdfDictionary) 128 inspectChildDictionary((PdfDictionary) k); 129 } 130 131 /** 132 * If the child of a structured element is an array, we need to loop over 133 * the elements. 134 * 135 * @param k 136 * the child array to inspect 137 */ 138 public void inspectChildArray(PdfArray k) throws IOException { 139 if (k == null) 140 return; 141 for (int i = 0; i < k.size(); i++) { 142 inspectChild(k.getDirectObject(i)); 143 } 144 } 145 146 /** 147 * If the child of a structured element is a dictionary, we inspect the 148 * child; we may also draw a tag. 149 * 150 * @param k 151 * the child dictionary to inspect 152 */ 153 public void inspectChildDictionary(PdfDictionary k) throws IOException { 154 if (k == null) 155 return; 156 PdfName s = k.getAsName(PdfName.S); 157 if (s != null) { 158 String tagN = PdfName.decodeName(s.toString()); 159 String tag = fixTagName(tagN); 160 out.print("<"); 161 out.print(tag); 162 out.print(">"); 163 PdfDictionary dict = k.getAsDict(PdfName.PG); 164 if (dict != null) 165 parseTag(tagN, k.getDirectObject(PdfName.K), dict); 166 inspectChild(k.getDirectObject(PdfName.K)); 167 out.print("</"); 168 out.print(tag); 169 out.println(">"); 170 } else 171 inspectChild(k.getDirectObject(PdfName.K)); 172 } 173 174 private static String fixTagName(String tag) { 175 StringBuilder sb = new StringBuilder(); 176 for (int k = 0; k < tag.length(); ++k) { 177 char c = tag.charAt(k); 178 boolean nameStart = 179 c == ':' 180 || (c >= 'A' && c <= 'Z') 181 || c == '_' 182 || (c >= 'a' && c <= 'z') 183 || (c >= '\u00c0' && c <= '\u00d6') 184 || (c >= '\u00d8' && c <= '\u00f6') 185 || (c >= '\u00f8' && c <= '\u02ff') 186 || (c >= '\u0370' && c <= '\u037d') 187 || (c >= '\u037f' && c <= '\u1fff') 188 || (c >= '\u200c' && c <= '\u200d') 189 || (c >= '\u2070' && c <= '\u218f') 190 || (c >= '\u2c00' && c <= '\u2fef') 191 || (c >= '\u3001' && c <= '\ud7ff') 192 || (c >= '\uf900' && c <= '\ufdcf') 193 || (c >= '\ufdf0' && c <= '\ufffd'); 194 boolean nameMiddle = 195 c == '-' 196 || c == '.' 197 || (c >= '0' && c <= '9') 198 || c == '\u00b7' 199 || (c >= '\u0300' && c <= '\u036f') 200 || (c >= '\u203f' && c <= '\u2040') 201 || nameStart; 202 if (k == 0) { 203 if (!nameStart) 204 c = '_'; 205 } 206 else { 207 if (!nameMiddle) 208 c = '-'; 209 } 210 sb.append(c); 211 } 212 return sb.toString(); 213 } 214 215 /** 216 * Searches for a tag in a page. 217 * 218 * @param tag 219 * the name of the tag 220 * @param object 221 * an identifier to find the marked content 222 * @param page 223 * a page dictionary 224 * @throws IOException 225 */ 226 public void parseTag(String tag, PdfObject object, PdfDictionary page) 227 throws IOException { 228 // if the identifier is a number, we can extract the content right away 229 if (object instanceof PdfNumber) { 230 PdfNumber mcid = (PdfNumber) object; 231 RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue()); 232 TextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); 233 FilteredTextRenderListener listener = new FilteredTextRenderListener( 234 strategy, filter); 235 PdfContentStreamProcessor processor = new PdfContentStreamProcessor( 236 listener); 237 processor.processContent(PdfReader.getPageContent(page), page 238 .getAsDict(PdfName.RESOURCES)); 239 out.print(XMLUtil.escapeXML(listener.getResultantText(), true)); 240 } 241 // if the identifier is an array, we call the parseTag method 242 // recursively 243 else if (object instanceof PdfArray) { 244 PdfArray arr = (PdfArray) object; 245 int n = arr.size(); 246 for (int i = 0; i < n; i++) { 247 parseTag(tag, arr.getPdfObject(i), page); 248 if (i < n - 1) 249 out.println(); 250 } 251 } 252 // if the identifier is a dictionary, we get the resources from the 253 // dictionary 254 else if (object instanceof PdfDictionary) { 255 PdfDictionary mcr = (PdfDictionary) object; 256 parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcr 257 .getAsDict(PdfName.PG)); 258 } 259 } 260 261}