001/*
002 * $Id: TaggedPdfReaderTool.java 4813 2011-04-26 10:35:49Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Bruno Lowagie, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044package com.itextpdf.text.pdf.parser;
045
046import java.io.IOException;
047import java.io.OutputStream;
048import java.io.PrintWriter;
049
050import com.itextpdf.text.error_messages.MessageLocalization;
051import com.itextpdf.text.pdf.PdfArray;
052import com.itextpdf.text.pdf.PdfDictionary;
053import com.itextpdf.text.pdf.PdfName;
054import com.itextpdf.text.pdf.PdfNumber;
055import com.itextpdf.text.pdf.PdfObject;
056import com.itextpdf.text.pdf.PdfReader;
057import com.itextpdf.text.xml.XMLUtil;
058import java.io.OutputStreamWriter;
059import java.nio.charset.Charset;
060
061/**
062 * Converts a tagged PDF document into an XML file.
063 * 
064 * @since 5.0.2
065 */
066public class TaggedPdfReaderTool {
067
068        /** The reader object from which the content streams are read. */
069        PdfReader reader;
070        /** The writer object to which the XML will be written */
071        PrintWriter out;
072
073        /**
074         * Parses a string with structured content.
075         * 
076         * @param reader
077         *            the PdfReader that has access to the PDF file
078         * @param os
079         *            the OutputStream to which the resulting xml will be written
080         * @param charset
081         *            the charset to encode the data
082     * @since 5.0.5
083         */
084        public void convertToXml(PdfReader reader, OutputStream os, String charset)
085                        throws IOException {
086                this.reader = reader;
087        OutputStreamWriter outs = new OutputStreamWriter(os, charset);
088                out = new PrintWriter(outs);
089                // get the StructTreeRoot from the root object
090                PdfDictionary catalog = reader.getCatalog();
091                PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);
092                if (struct == null)
093                        throw new IOException(MessageLocalization.getComposedMessage("no.structtreeroot.found"));
094                // Inspect the child or children of the StructTreeRoot
095                inspectChild(struct.getDirectObject(PdfName.K));
096                out.flush();
097                out.close();
098        }
099
100        /**
101         * Parses a string with structured content. The output is done using the
102     * current charset.
103         *
104         * @param reader
105         *            the PdfReader that has access to the PDF file
106         * @param os
107         *            the OutputStream to which the resulting xml will be written
108         */
109        public void convertToXml(PdfReader reader, OutputStream os)
110                        throws IOException {
111        convertToXml(reader, os, Charset.defaultCharset().name());
112    }
113
114    /**
115         * Inspects a child of a structured element. This can be an array or a
116         * dictionary.
117         * 
118         * @param k
119         *            the child to inspect
120         * @throws IOException
121         */
122        public void inspectChild(PdfObject k) throws IOException {
123                if (k == null)
124                        return;
125                if (k instanceof PdfArray)
126                        inspectChildArray((PdfArray) k);
127                else if (k instanceof PdfDictionary)
128                        inspectChildDictionary((PdfDictionary) k);
129        }
130
131        /**
132         * If the child of a structured element is an array, we need to loop over
133         * the elements.
134         * 
135         * @param k
136         *            the child array to inspect
137         */
138        public void inspectChildArray(PdfArray k) throws IOException {
139                if (k == null)
140                        return;
141                for (int i = 0; i < k.size(); i++) {
142                        inspectChild(k.getDirectObject(i));
143                }
144        }
145
146        /**
147         * If the child of a structured element is a dictionary, we inspect the
148         * child; we may also draw a tag.
149         * 
150         * @param k
151         *            the child dictionary to inspect
152         */
153        public void inspectChildDictionary(PdfDictionary k) throws IOException {
154                if (k == null)
155                        return;
156                PdfName s = k.getAsName(PdfName.S);
157                if (s != null) {
158            String tagN = PdfName.decodeName(s.toString());
159                        String tag = fixTagName(tagN);
160                        out.print("<");
161                        out.print(tag);
162                        out.print(">");
163                        PdfDictionary dict = k.getAsDict(PdfName.PG);
164                        if (dict != null)
165                                parseTag(tagN, k.getDirectObject(PdfName.K), dict);
166                        inspectChild(k.getDirectObject(PdfName.K));
167                        out.print("</");
168                        out.print(tag);
169                        out.println(">");
170                } else
171                        inspectChild(k.getDirectObject(PdfName.K));
172        }
173
174    private static String fixTagName(String tag) {
175        StringBuilder sb = new StringBuilder();
176        for (int k = 0; k < tag.length(); ++k) {
177            char c = tag.charAt(k);
178            boolean nameStart =
179                c == ':'
180                || (c >= 'A' && c <= 'Z')
181                || c == '_'
182                || (c >= 'a' && c <= 'z')
183                || (c >= '\u00c0' && c <= '\u00d6')
184                || (c >= '\u00d8' && c <= '\u00f6')
185                || (c >= '\u00f8' && c <= '\u02ff')
186                || (c >= '\u0370' && c <= '\u037d')
187                || (c >= '\u037f' && c <= '\u1fff')
188                || (c >= '\u200c' && c <= '\u200d')
189                || (c >= '\u2070' && c <= '\u218f')
190                || (c >= '\u2c00' && c <= '\u2fef')
191                || (c >= '\u3001' && c <= '\ud7ff')
192                || (c >= '\uf900' && c <= '\ufdcf')
193                || (c >= '\ufdf0' && c <= '\ufffd');
194            boolean nameMiddle =
195                c == '-'
196                || c == '.'
197                || (c >= '0' && c <= '9')
198                || c == '\u00b7'
199                || (c >= '\u0300' && c <= '\u036f')
200                || (c >= '\u203f' && c <= '\u2040')
201                || nameStart;
202            if (k == 0) {
203                if (!nameStart)
204                    c = '_';
205            }
206            else {
207                if (!nameMiddle)
208                    c = '-';
209            }
210            sb.append(c);
211        }
212        return sb.toString();
213    }
214
215        /**
216         * Searches for a tag in a page.
217         * 
218         * @param tag
219         *            the name of the tag
220         * @param object
221         *            an identifier to find the marked content
222         * @param page
223         *            a page dictionary
224         * @throws IOException
225         */
226        public void parseTag(String tag, PdfObject object, PdfDictionary page)
227                        throws IOException {
228                // if the identifier is a number, we can extract the content right away
229                if (object instanceof PdfNumber) {
230                        PdfNumber mcid = (PdfNumber) object;
231                        RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
232                        TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
233                        FilteredTextRenderListener listener = new FilteredTextRenderListener(
234                                        strategy, filter);
235                        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
236                                        listener);
237                        processor.processContent(PdfReader.getPageContent(page), page
238                                        .getAsDict(PdfName.RESOURCES));
239                        out.print(XMLUtil.escapeXML(listener.getResultantText(), true));
240                }
241                // if the identifier is an array, we call the parseTag method
242                // recursively
243                else if (object instanceof PdfArray) {
244                        PdfArray arr = (PdfArray) object;
245                        int n = arr.size();
246                        for (int i = 0; i < n; i++) {
247                                parseTag(tag, arr.getPdfObject(i), page);
248                                if (i < n - 1)
249                                        out.println();
250                        }
251                }
252                // if the identifier is a dictionary, we get the resources from the
253                // dictionary
254                else if (object instanceof PdfDictionary) {
255                        PdfDictionary mcr = (PdfDictionary) object;
256                        parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcr
257                                        .getAsDict(PdfName.PG));
258                }
259        }
260
261}