001/*
002 * $Id: SimpleXMLParser.java 4784 2011-03-15 08:33:00Z blowagie $
003 *
004 * This file is part of the iText (R) project.
005 * Copyright (c) 1998-2011 1T3XT BVBA
006 * Authors: Bruno Lowagie, Paulo Soares, et al.
007 *
008 * This program is free software; you can redistribute it and/or modify
009 * it under the terms of the GNU Affero General Public License version 3
010 * as published by the Free Software Foundation with the addition of the
011 * following permission added to Section 15 as permitted in Section 7(a):
012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
014 *
015 * This program is distributed in the hope that it will be useful, but
016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
017 * or FITNESS FOR A PARTICULAR PURPOSE.
018 * See the GNU Affero General Public License for more details.
019 * You should have received a copy of the GNU Affero General Public License
020 * along with this program; if not, see http://www.gnu.org/licenses or write to
021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
022 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
023 * http://itextpdf.com/terms-of-use/
024 *
025 * The interactive user interfaces in modified source and object code versions
026 * of this program must display Appropriate Legal Notices, as required under
027 * Section 5 of the GNU Affero General Public License.
028 *
029 * In accordance with Section 7(b) of the GNU Affero General Public License,
030 * a covered work must retain the producer line in every PDF that is created
031 * or manipulated using iText.
032 *
033 * You can be released from the requirements of the license by purchasing
034 * a commercial license. Buying such a license is mandatory as soon as you
035 * develop commercial activities involving the iText software without
036 * disclosing the source code of your own applications.
037 * These activities include: offering paid services to customers as an ASP,
038 * serving PDFs on the fly in a web application, shipping iText with a closed
039 * source product.
040 *
041 * For more information, please contact iText Software Corp. at this
042 * address: sales@itextpdf.com
043 */
044
045/* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
046 * Licensed to the Apache Software Foundation (ASF) under one or more
047 * contributor license agreements.  See the NOTICE file distributed with
048 * this work for additional information regarding copyright ownership.
049 * The ASF licenses this file to You under the Apache License, Version 2.0
050 * (the "License"); you may not use this file except in compliance with
051 * the License.  You may obtain a copy of the License at
052 *
053 *      http://www.apache.org/licenses/LICENSE-2.0
054 *
055 * Unless required by applicable law or agreed to in writing, software
056 * distributed under the License is distributed on an "AS IS" BASIS,
057 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
058 * See the License for the specific language governing permissions and
059 * limitations under the License.
060 *
061 * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
062 * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
063 * Steven Brandt and JavaWorld gave permission to use the code for free.
064 * (Bruno Lowagie and Paulo Soares chose to use it under the AGPL in conformance
065 * with the rest of the code).
066 * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
067 * It was substantially refactored by Bruno Lowagie.
068 *
069 * The method 'private static String getEncodingName(byte[] b4)' was found
 * in org.apache.xerces.impl.XMLEntityManager, originally published by the
071 * Apache Software Foundation under the Apache Software License; now being
072 * used in iText under the MPL.
073 */
074package com.itextpdf.text.xml.simpleparser;
075
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.SequenceInputStream;
import java.util.HashMap;
import java.util.Locale;
import java.util.Stack;

import com.itextpdf.text.error_messages.MessageLocalization;
import com.itextpdf.text.xml.XMLUtil;
import com.itextpdf.text.xml.simpleparser.handler.HTMLNewLineHandler;
import com.itextpdf.text.xml.simpleparser.handler.NeverNewLineHandler;
089
/**
 * A simple XML parser.  This parser is, like the SAX parser,
 * an event based parser, but with much less functionality.
 * <p>
 * The parser:
 * <ul>
 * <li>recognizes the encoding used
 * <li>recognizes all the elements' start tags and end tags
 * <li>lists attributes, where attribute values can be enclosed in single or double quotes
 * <li>recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
 * <li>recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
 * <li>maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
 * </ul>
 */
106public final class SimpleXMLParser {
107    /** possible states */
108        private final static int UNKNOWN = 0;
109        private final static int TEXT = 1;
110        private final static int TAG_ENCOUNTERED = 2;
111        private final static int EXAMIN_TAG = 3;
112        private final static int TAG_EXAMINED = 4;
113        private final static int IN_CLOSETAG = 5;
114        private final static int SINGLE_TAG = 6;
115        private final static int CDATA = 7;
116        private final static int COMMENT = 8;
117        private final static int PI = 9;
118        private final static int ENTITY = 10;
119        private final static int QUOTE = 11;
120        private final static int ATTRIBUTE_KEY = 12;
121        private final static int ATTRIBUTE_EQUAL = 13;
122        private final static int ATTRIBUTE_VALUE = 14;
123
124        /** the state stack */
125        private final Stack<Integer> stack;
126        /** The current character. */
127        private int character = 0;
128        /** The previous character. */
129        private int previousCharacter = -1;
130        /** the line we are currently reading */
131        private int lines = 1;
132        /** the column where the current character occurs */
133        private int columns = 0;
134        /** was the last character equivalent to a newline? */
135        private boolean eol = false;
136        /**
137         * A boolean indicating if the next character should be taken into account
138         * if it's a space character. When nospace is false, the previous character
139         * wasn't whitespace.
140         * @since 2.1.5
141         */
142        private boolean nowhite = false;
143        /** the current state */
144        private int state;
145        /** Are we parsing HTML? */
146        private final boolean html;
147        /** current text (whatever is encountered between tags) */
148        private final StringBuffer text = new StringBuffer();
149        /** current entity (whatever is encountered between & and ;) */
150        private final StringBuffer entity = new StringBuffer();
151        /** current tagname */
152        private String tag = null;
153        /** current attributes */
154        private HashMap<String, String> attributes = null;
155        /** The handler to which we are going to forward document content */
156        private final  SimpleXMLDocHandler doc;
157        /** The handler to which we are going to forward comments. */
158        private final  SimpleXMLDocHandlerComment comment;
159        /** Keeps track of the number of tags that are open. */
160        private  int nested = 0;
161        /** the quote character that was used to open the quote. */
162        private  int quoteCharacter = '"';
163        /** the attribute key. */
164        private  String attributekey = null;
165        /** the attribute value. */
166        private  String attributevalue = null;
167        private  NewLineHandler newLineHandler;
168        /**
169         * Creates a Simple XML parser object.
170         * Call go(BufferedReader) immediately after creation.
171         */
172    private SimpleXMLParser(final SimpleXMLDocHandler doc, final SimpleXMLDocHandlerComment comment, final boolean html) {
173        this.doc = doc;
174        this.comment = comment;
175        this.html = html;
176        if (html) {
177                this.newLineHandler = new HTMLNewLineHandler();
178        } else {
179                this.newLineHandler = new NeverNewLineHandler();
180        }
181        stack = new Stack<Integer>();
182        state = html ? TEXT : UNKNOWN;
183    }
184
185    /**
186     * Does the actual parsing. Perform this immediately
187     * after creating the parser object.
188     */
189    private void go(final Reader r) throws IOException {
190        BufferedReader reader;
191        if (r instanceof BufferedReader)
192            reader = (BufferedReader)r;
193        else
194            reader = new BufferedReader(r);
195        doc.startDocument();
196        while(true) {
197                        // read a new character
198                        if (previousCharacter == -1) {
199                                character = reader.read();
200                        }
201                        // or re-examine the previous character
202                        else {
203                                character = previousCharacter;
204                                previousCharacter = -1;
205                        }
206
207                        // the end of the file was reached
208                        if (character == -1) {
209                                if (html) {
210                                        if (html && state == TEXT)
211                                                flush();
212                                        doc.endDocument();
213                                } else {
214                                        throwException(MessageLocalization.getComposedMessage("missing.end.tag"));
215                                }
216                                return;
217                        }
218
219                        // dealing with  \n and \r
220                        if (character == '\n' && eol) {
221                                eol = false;
222                                continue;
223                        } else if (eol) {
224                                eol = false;
225                        } else if (character == '\n') {
226                                lines++;
227                                columns = 0;
228                        } else if (character == '\r') {
229                                eol = true;
230                                character = '\n';
231                                lines++;
232                                columns = 0;
233                        } else {
234                                columns++;
235                        }
236
237                        switch(state) {
238            // we are in an unknown state before there's actual content
239                        case UNKNOWN:
240                if(character == '<') {
241                    saveState(TEXT);
242                    state = TAG_ENCOUNTERED;
243                }
244                break;
245            // we can encounter any content
246                        case TEXT:
247                if(character == '<') {
248                    flush();
249                    saveState(state);
250                    state = TAG_ENCOUNTERED;
251                } else if(character == '&') {
252                    saveState(state);
253                    entity.setLength(0);
254                    state = ENTITY;
255                    nowhite = true;
256                } else if (character == ' ') {
257                        if (html && nowhite) {
258                                text.append(' ');
259                                nowhite = false;
260                        } else {
261                                if (nowhite){
262                                        text.append((char)character);
263                                }
264                                nowhite = false;
265                        }
266                } else if (Character.isWhitespace((char)character)) {
267                        if (html) {
268                                // totally ignore other whitespace
269                        } else {
270                                if (nowhite){
271                                        text.append((char)character);
272                                }
273                                nowhite = false;
274                        }
275                } else {
276                    text.append((char)character);
277                    nowhite = true;
278                }
279                break;
280            // we have just seen a < and are wondering what we are looking at
281            // <foo>, </foo>, <!-- ... --->, etc.
282                        case TAG_ENCOUNTERED:
283                initTag();
284                if(character == '/') {
285                    state = IN_CLOSETAG;
286                } else if (character == '?') {
287                    restoreState();
288                    state = PI;
289                } else {
290                    text.append((char)character);
291                    state = EXAMIN_TAG;
292                }
293                break;
294            // we are processing something like this <foo ... >.
295            // It could still be a <!-- ... --> or something.
296                        case EXAMIN_TAG:
297                if(character == '>') {
298                    doTag();
299                    processTag(true);
300                    initTag();
301                    state = restoreState();
302                } else if(character == '/') {
303                    state = SINGLE_TAG;
304                } else if(character == '-' && text.toString().equals("!-")) {
305                    flush();
306                    state = COMMENT;
307                } else if(character == '[' && text.toString().equals("![CDATA")) {
308                    flush();
309                    state = CDATA;
310                } else if(character == 'E' && text.toString().equals("!DOCTYP")) {
311                    flush();
312                    state = PI;
313                } else if(Character.isWhitespace((char)character)) {
314                    doTag();
315                    state = TAG_EXAMINED;
316                } else {
317                    text.append((char)character);
318                }
319                break;
320            // we know the name of the tag now.
321                        case TAG_EXAMINED:
322                if(character == '>') {
323                    processTag(true);
324                    initTag();
325                    state = restoreState();
326                } else if(character == '/') {
327                    state = SINGLE_TAG;
328                } else if(Character.isWhitespace((char)character)) {
329                    // empty
330                } else {
331                    text.append((char)character);
332                    state = ATTRIBUTE_KEY;
333                }
334                break;
335
336                // we are processing a closing tag: e.g. </foo>
337                        case IN_CLOSETAG:
338                if(character == '>') {
339                    doTag();
340                    processTag(false);
341                    if(!html && nested==0) return;
342                    state = restoreState();
343                } else {
344                    if (!Character.isWhitespace((char)character))
345                        text.append((char)character);
346                }
347                break;
348
349            // we have just seen something like this: <foo a="b"/
350            // and are looking for the final >.
351                        case SINGLE_TAG:
352                if(character != '>')
353                    throwException(MessageLocalization.getComposedMessage("expected.gt.for.tag.lt.1.gt", tag));
354                                doTag();
355                processTag(true);
356                processTag(false);
357                initTag();
358                if(!html && nested==0) {
359                    doc.endDocument();
360                    return;
361                }
362                state = restoreState();
363                break;
364
365            // we are processing CDATA
366                        case CDATA:
367                if(character == '>'
368                && text.toString().endsWith("]]")) {
369                    text.setLength(text.length()-2);
370                    flush();
371                    state = restoreState();
372                } else
373                    text.append((char)character);
374                break;
375
376            // we are processing a comment.  We are inside
377            // the <!-- .... --> looking for the -->.
378                        case COMMENT:
379                if(character == '>'
380                && text.toString().endsWith("--")) {
381                    text.setLength(text.length() - 2);
382                    flush();
383                    state = restoreState();
384                } else
385                    text.append((char)character);
386                break;
387
388            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
389                        case PI:
390                if(character == '>') {
391                    state = restoreState();
392                    if(state == TEXT) state = UNKNOWN;
393                }
394                break;
395
396            // we are processing an entity, e.g. &lt;, &#187;, etc.
397                        case ENTITY:
398                if(character == ';') {
399                    state = restoreState();
400                    String cent = entity.toString();
401                    entity.setLength(0);
402                    char ce = EntitiesToUnicode.decodeEntity(cent);
403                    if (ce == '\0')
404                        text.append('&').append(cent).append(';');
405                    else
406                        text.append(ce);
407                } else if (character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
408                    && (character < 'A' || character > 'Z') || entity.length() >= 7) {
409                    state = restoreState();
410                    previousCharacter = character;
411                    text.append('&').append(entity.toString());
412                    entity.setLength(0);
413                }
414                else {
415                    entity.append((char)character);
416                }
417                break;
418            // We are processing the quoted right-hand side of an element's attribute.
419                        case QUOTE:
420                if (html && quoteCharacter == ' ' && character == '>') {
421                    flush();
422                    processTag(true);
423                    initTag();
424                    state = restoreState();
425                }
426                else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
427                        flush();
428                    state = TAG_EXAMINED;
429                }
430                else if (html && quoteCharacter == ' ') {
431                    text.append((char)character);
432                }
433                else if(character == quoteCharacter) {
434                        flush();
435                    state = TAG_EXAMINED;
436                } else if(" \r\n\u0009".indexOf(character)>=0) {
437                    text.append(' ');
438                } else if(character == '&') {
439                    saveState(state);
440                    state = ENTITY;
441                    entity.setLength(0);
442                } else {
443                    text.append((char)character);
444                }
445                break;
446
447                        case ATTRIBUTE_KEY:
448                if(Character.isWhitespace((char)character)) {
449                    flush();
450                    state = ATTRIBUTE_EQUAL;
451                } else if(character == '=') {
452                        flush();
453                    state = ATTRIBUTE_VALUE;
454                } else if (html && character == '>') {
455                    text.setLength(0);
456                    processTag(true);
457                    initTag();
458                    state = restoreState();
459                } else {
460                    text.append((char)character);
461                }
462                break;
463
464                        case ATTRIBUTE_EQUAL:
465                if(character == '=') {
466                    state = ATTRIBUTE_VALUE;
467                } else if(Character.isWhitespace((char)character)) {
468                    // empty
469                } else if (html && character == '>') {
470                    text.setLength(0);
471                    processTag(true);
472                    initTag();
473                    state = restoreState();
474                } else if (html && character == '/') {
475                    flush();
476                    state = SINGLE_TAG;
477                } else if (html) {
478                    flush();
479                    text.append((char)character);
480                    state = ATTRIBUTE_KEY;
481                } else {
482                    throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing"));
483                }
484                break;
485
486                        case ATTRIBUTE_VALUE:
487                if(character == '"' || character == '\'') {
488                    quoteCharacter = character;
489                    state = QUOTE;
490                } else if(Character.isWhitespace((char)character)) {
491                    // empty
492                } else if (html && character == '>') {
493                    flush();
494                    processTag(true);
495                    initTag();
496                    state = restoreState();
497                } else if (html) {
498                    text.append((char)character);
499                    quoteCharacter = ' ';
500                    state = QUOTE;
501                } else {
502                    throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing"));
503                }
504                break;
505            }
506        }
507    }
508
509    /**
510     * Gets a state from the stack
511     * @return the previous state
512     */
513    private int restoreState() {
514        if(!stack.empty())
515            return stack.pop().intValue();
516        else
517            return UNKNOWN;
518    }
519    /**
520     * Adds a state to the stack.
521     * @param   s       a state to add to the stack
522     */
523    private void saveState(final int s) {
524        stack.push(Integer.valueOf(s));
525    }
526    /**
527     * Flushes the text that is currently in the buffer.
528     * The text can be ignored, added to the document
529     * as content or as comment,... depending on the current state.
530     */
531    private void flush() {
532        switch(state){
533        case TEXT:
534        case CDATA:
535            if(text.length() > 0) {
536                doc.text(text.toString());
537            }
538            break;
539        case COMMENT:
540                if (comment != null) {
541                comment.comment(text.toString());
542            }
543                break;
544        case ATTRIBUTE_KEY:
545            attributekey = text.toString();
546            if (html)
547                attributekey = attributekey.toLowerCase();
548                break;
549        case QUOTE:
550        case ATTRIBUTE_VALUE:
551                attributevalue = text.toString();
552            attributes.put(attributekey,attributevalue);
553            break;
554        default:
555                // do nothing
556        }
557        text.setLength(0);
558    }
559    /**
560     * Initialized the tag name and attributes.
561     */
562    private void initTag() {
563        tag = null;
564        attributes = new HashMap<String, String>();
565    }
566    /** Sets the name of the tag. */
567    private void doTag() {
568        if(tag == null)
569                tag = text.toString();
570        if (html)
571                tag = tag.toLowerCase();
572        text.setLength(0);
573    }
574    /**
575     * processes the tag.
576     * @param start     if true we are dealing with a tag that has just been opened; if false we are closing a tag.
577     */
578    private void processTag(final boolean start) {
579        if (start) {
580                nested++;
581                doc.startElement(tag,attributes);
582        }
583        else {
584                        // White spaces following new lines need to be ignored in HTML
585                        if(newLineHandler.isNewLineTag(tag)) {
586                                nowhite = false;
587                        }
588            nested--;
589            doc.endElement(tag);
590        }
591    }
592    /** Throws an exception */
593    private void throwException(final String s) throws IOException {
594        throw new IOException(MessageLocalization.getComposedMessage("1.near.line.2.column.3", s, String.valueOf(lines), String.valueOf(columns)));
595    }
596
597    /**
598     * Parses the XML document firing the events to the handler.
599     * @param doc the document handler
600     * @param comment the comment handler
601     * @param r the document. The encoding is already resolved. The reader is not closed
602     * @param html
603     * @throws IOException on error
604     */
605    public static void parse(final SimpleXMLDocHandler doc, final SimpleXMLDocHandlerComment comment, final Reader r, final boolean html) throws IOException {
606        SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
607        parser.go(r);
608    }
609
610    /**
611     * Parses the XML document firing the events to the handler.
612     * @param doc the document handler
613     * @param in the document. The encoding is deduced from the stream. The stream is not closed
614     * @throws IOException on error
615     */
616    public static void parse(final SimpleXMLDocHandler doc, final InputStream in) throws IOException {
617        byte b4[] = new byte[4];
618        int count = in.read(b4);
619        if (count != 4)
620            throw new IOException(MessageLocalization.getComposedMessage("insufficient.length"));
621        String encoding = XMLUtil.getEncodingName(b4);
622        String decl = null;
623        if (encoding.equals("UTF-8")) {
624            StringBuffer sb = new StringBuffer();
625            int c;
626            while ((c = in.read()) != -1) {
627                if (c == '>')
628                    break;
629                sb.append((char)c);
630            }
631            decl = sb.toString();
632        }
633        else if (encoding.equals("CP037")) {
634            ByteArrayOutputStream bi = new ByteArrayOutputStream();
635            int c;
636            while ((c = in.read()) != -1) {
637                if (c == 0x6e) // that's '>' in ebcdic
638                    break;
639                bi.write(c);
640            }
641            decl = new String(bi.toByteArray(), "CP037");
642        }
643        if (decl != null) {
644            decl = getDeclaredEncoding(decl);
645            if (decl != null)
646                encoding = decl;
647        }
648        parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding)));
649    }
650
651    private static String getDeclaredEncoding(final String decl) {
652        if (decl == null)
653            return null;
654        int idx = decl.indexOf("encoding");
655        if (idx < 0)
656            return null;
657        int idx1 = decl.indexOf('"', idx);
658        int idx2 = decl.indexOf('\'', idx);
659        if (idx1 == idx2)
660            return null;
661        if (idx1 < 0 && idx2 > 0 || idx2 > 0 && idx2 < idx1) {
662            int idx3 = decl.indexOf('\'', idx2 + 1);
663            if (idx3 < 0)
664                return null;
665            return decl.substring(idx2 + 1, idx3);
666        }
667        if (idx2 < 0 && idx1 > 0 || idx1 > 0 && idx1 < idx2) {
668            int idx3 = decl.indexOf('"', idx1 + 1);
669            if (idx3 < 0)
670                return null;
671            return decl.substring(idx1 + 1, idx3);
672        }
673        return null;
674    }
675
676    /**
677     * @param doc
678     * @param r
679     * @throws IOException
680     */
681    public static void parse(final SimpleXMLDocHandler doc,final Reader r) throws IOException {
682        parse(doc, null, r, false);
683    }
684
685        /**
686         * Escapes a string with the appropriated XML codes.
687         *
688         * @param s
689         *            the string to be escaped
690         * @param onlyASCII
691         *            codes above 127 will always be escaped with &amp;#nn; if
692         *            <CODE>true</CODE>
693         * @return the escaped string
694         * @deprecated moved to {@link XMLUtil#escapeXML(String, boolean)}, left
695         *             here for the sake of backwards compatibility
696         */
697        @Deprecated
698        public static String escapeXML(final String s, final boolean onlyASCII) {
699                return XMLUtil.escapeXML(s, onlyASCII);
700        }
701
702}