/*
 * $Id: SimpleXMLParser.java 4784 2011-03-15 08:33:00Z blowagie $
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2011 1T3XT BVBA
 * Authors: Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */

/* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
 * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
 * Steven Brandt and JavaWorld gave permission to use the code for free.
 * (Bruno Lowagie and Paulo Soares chose to use it under the AGPL in conformance
 * with the rest of the code).
 * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
 * It was substantially refactored by Bruno Lowagie.
 *
 * The method 'private static String getEncodingName(byte[] b4)' was found
 * in org.apache.xerces.impl.XMLEntityManager, originaly published by the
 * Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL.
 */
package com.itextpdf.text.xml.simpleparser;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Stack;

import com.itextpdf.text.error_messages.MessageLocalization;
import com.itextpdf.text.xml.XMLUtil;
import com.itextpdf.text.xml.simpleparser.handler.HTMLNewLineHandler;
import com.itextpdf.text.xml.simpleparser.handler.NeverNewLineHandler;

/**
 * A simple XML parser. This parser is, like the SAX parser,
 * an event based parser, but with much less functionality.
 * <p>
 * The parser can:
 * <p>
 * <ul>
 * <li>It recognizes the encoding used
 * <li>It recognizes all the elements' start tags and end tags
 * <li>It lists attributes, where attribute values can be enclosed in single or double quotes
 * <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
 * <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
 * <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
 * </ul>
 * <p>
 * Instances are single-use: the static <code>parse</code> methods create a
 * parser, run it over the input and discard it.
 */
public final class SimpleXMLParser {
    /** possible states */
    private final static int UNKNOWN = 0;
    private final static int TEXT = 1;
    private final static int TAG_ENCOUNTERED = 2;
    private final static int EXAMIN_TAG = 3;
    private final static int TAG_EXAMINED = 4;
    private final static int IN_CLOSETAG = 5;
    private final static int SINGLE_TAG = 6;
    private final static int CDATA = 7;
    private final static int COMMENT = 8;
    private final static int PI = 9;
    private final static int ENTITY = 10;
    private final static int QUOTE = 11;
    private final static int ATTRIBUTE_KEY = 12;
    private final static int ATTRIBUTE_EQUAL = 13;
    private final static int ATTRIBUTE_VALUE = 14;

    /** the state stack */
    private final Stack<Integer> stack;
    /** The current character. */
    private int character = 0;
    /** A character that has to be re-examined in the next iteration, or -1 if there is none. */
    private int previousCharacter = -1;
    /** the line we are currently reading */
    private int lines = 1;
    /** the column where the current character occurs */
    private int columns = 0;
    /** was the last character read a carriage return (so that a directly following line feed must be skipped)? */
    private boolean eol = false;
    /**
     * A boolean indicating if the next character should be taken into account
     * if it's a space character. When nowhite is false, the previous character
     * was whitespace (or nothing was read yet) and further whitespace is dropped.
     * @since 2.1.5
     */
    private boolean nowhite = false;
    /** the current state */
    private int state;
    /** Are we parsing HTML? */
    private final boolean html;
    /** current text (whatever is encountered between tags) */
    private final StringBuilder text = new StringBuilder();
    /** current entity (whatever is encountered between &amp; and ;) */
    private final StringBuilder entity = new StringBuilder();
    /** current tagname */
    private String tag = null;
    /** current attributes */
    private HashMap<String, String> attributes = null;
    /** The handler to which we are going to forward document content */
    private final SimpleXMLDocHandler doc;
    /** The handler to which we are going to forward comments. */
    private final SimpleXMLDocHandlerComment comment;
    /** Keeps track of the number of tags that are open. */
    private int nested = 0;
    /** the quote character that was used to open the quote; a space means an unquoted HTML attribute value. */
    private int quoteCharacter = '"';
    /** the attribute key. */
    private String attributekey = null;
    /** the attribute value. */
    private String attributevalue = null;
    /** Decides which closing tags cause the whitespace that follows them to be dropped. */
    private final NewLineHandler newLineHandler;

    /**
     * Creates a Simple XML parser object.
     * Call go(Reader) immediately after creation.
     * @param doc the handler that receives the document events
     * @param comment the handler that receives comments; may be <code>null</code>
     * @param html <code>true</code> to parse the input as HTML
     */
    private SimpleXMLParser(final SimpleXMLDocHandler doc, final SimpleXMLDocHandlerComment comment, final boolean html) {
        this.doc = doc;
        this.comment = comment;
        this.html = html;
        if (html) {
            this.newLineHandler = new HTMLNewLineHandler();
        } else {
            this.newLineHandler = new NeverNewLineHandler();
        }
        stack = new Stack<Integer>();
        // HTML may start with plain text; XML content only starts at the first '<'.
        state = html ? TEXT : UNKNOWN;
    }

    /**
     * Does the actual parsing. Perform this immediately
     * after creating the parser object.
     * @param r the characters to parse; wrapped in a BufferedReader if it isn't one already
     * @throws IOException on a read error, or on malformed input (the message contains line and column)
     */
    private void go(final Reader r) throws IOException {
        BufferedReader reader;
        if (r instanceof BufferedReader)
            reader = (BufferedReader)r;
        else
            reader = new BufferedReader(r);
        doc.startDocument();
        while(true) {
            final boolean fresh = previousCharacter == -1;
            // read a new character
            if (fresh) {
                character = reader.read();
            }
            // or re-examine the previous character
            else {
                character = previousCharacter;
                previousCharacter = -1;
            }

            // the end of the file was reached
            if (character == -1) {
                if (html) {
                    if (state == TEXT)
                        flush();
                    doc.endDocument();
                } else {
                    throwException(MessageLocalization.getComposedMessage("missing.end.tag"));
                }
                return;
            }

            // dealing with \n and \r
            // Only freshly read characters are normalized and counted: a
            // re-examined character already went through this step once.
            if (fresh) {
                if (eol) {
                    eol = false;
                    if (character == '\n') {
                        // second half of a \r\n pair; the \r was already delivered as \n
                        continue;
                    }
                }
                if (character == '\n') {
                    lines++;
                    columns = 0;
                } else if (character == '\r') {
                    eol = true;
                    character = '\n';
                    lines++;
                    columns = 0;
                } else {
                    columns++;
                }
            }

            switch(state) {
            // we are in an unknown state before there's actual content
            case UNKNOWN:
                if(character == '<') {
                    saveState(TEXT);
                    state = TAG_ENCOUNTERED;
                }
                break;
            // we can encounter any content
            case TEXT:
                if(character == '<') {
                    flush();
                    saveState(state);
                    state = TAG_ENCOUNTERED;
                } else if(character == '&') {
                    saveState(state);
                    entity.setLength(0);
                    state = ENTITY;
                    nowhite = true;
                } else if (character == ' ') {
                    // a run of spaces collapses to a single space
                    if (nowhite) {
                        text.append(' ');
                    }
                    nowhite = false;
                } else if (Character.isWhitespace((char)character)) {
                    if (html) {
                        // totally ignore other whitespace
                    } else {
                        if (nowhite){
                            text.append((char)character);
                        }
                        nowhite = false;
                    }
                } else {
                    text.append((char)character);
                    nowhite = true;
                }
                break;
            // we have just seen a < and are wondering what we are looking at
            // <foo>, </foo>, <!-- ... --->, etc.
            case TAG_ENCOUNTERED:
                initTag();
                if(character == '/') {
                    state = IN_CLOSETAG;
                } else if (character == '?') {
                    // drop the state that was saved when the '<' was seen
                    restoreState();
                    state = PI;
                } else {
                    text.append((char)character);
                    state = EXAMIN_TAG;
                }
                break;
            // we are processing something like this <foo ... >.
            // It could still be a <!-- ... --> or something.
            case EXAMIN_TAG:
                if(character == '>') {
                    doTag();
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if(character == '/') {
                    state = SINGLE_TAG;
                } else if(character == '-' && text.toString().equals("!-")) {
                    // flush() only clears the buffer in this state
                    flush();
                    state = COMMENT;
                } else if(character == '[' && text.toString().equals("![CDATA")) {
                    flush();
                    state = CDATA;
                } else if(character == 'E' && text.toString().equals("!DOCTYP")) {
                    flush();
                    state = PI;
                } else if(Character.isWhitespace((char)character)) {
                    doTag();
                    state = TAG_EXAMINED;
                } else {
                    text.append((char)character);
                }
                break;
            // we know the name of the tag now.
            case TAG_EXAMINED:
                if(character == '>') {
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if(character == '/') {
                    state = SINGLE_TAG;
                } else if(Character.isWhitespace((char)character)) {
                    // empty
                } else {
                    text.append((char)character);
                    state = ATTRIBUTE_KEY;
                }
                break;

            // we are processing a closing tag: e.g. </foo>
            case IN_CLOSETAG:
                if(character == '>') {
                    doTag();
                    processTag(false);
                    // NOTE(review): unlike the SINGLE_TAG case below, doc.endDocument()
                    // is not called when the root element closes here. Left unchanged
                    // because existing handlers may rely on it - confirm.
                    if(!html && nested==0) return;
                    state = restoreState();
                } else {
                    if (!Character.isWhitespace((char)character))
                        text.append((char)character);
                }
                break;

            // we have just seen something like this: <foo a="b"/
            // and are looking for the final >.
            case SINGLE_TAG:
                if(character != '>')
                    throwException(MessageLocalization.getComposedMessage("expected.gt.for.tag.lt.1.gt", tag));
                doTag();
                processTag(true);
                processTag(false);
                initTag();
                if(!html && nested==0) {
                    doc.endDocument();
                    return;
                }
                state = restoreState();
                break;

            // we are processing CDATA
            case CDATA:
                if(character == '>'
                && text.toString().endsWith("]]")) {
                    text.setLength(text.length()-2);
                    flush();
                    state = restoreState();
                } else
                    text.append((char)character);
                break;

            // we are processing a comment. We are inside
            // the <!-- .... --> looking for the -->.
            case COMMENT:
                if(character == '>'
                && text.toString().endsWith("--")) {
                    text.setLength(text.length() - 2);
                    flush();
                    state = restoreState();
                } else
                    text.append((char)character);
                break;

            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
            case PI:
                if(character == '>') {
                    state = restoreState();
                    if(state == TEXT) state = UNKNOWN;
                }
                break;

            // we are processing an entity, e.g. &lt;, &raquo;, etc.
            case ENTITY:
                if(character == ';') {
                    state = restoreState();
                    String cent = entity.toString();
                    entity.setLength(0);
                    char ce = EntitiesToUnicode.decodeEntity(cent);
                    if (ce == '\0')
                        // unknown entity: keep it literally
                        text.append('&').append(cent).append(';');
                    else
                        text.append(ce);
                } else if (character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
                    && (character < 'A' || character > 'Z') || entity.length() >= 7) {
                    // not an entity after all: emit what was collected as plain
                    // text and look at the current character again
                    state = restoreState();
                    previousCharacter = character;
                    text.append('&').append(entity.toString());
                    entity.setLength(0);
                }
                else {
                    entity.append((char)character);
                }
                break;
            // We are processing the quoted right-hand side of an element's attribute.
            case QUOTE:
                if (html && quoteCharacter == ' ' && character == '>') {
                    flush();
                    processTag(true);
                    initTag();
                    state = restoreState();
                }
                else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
                    flush();
                    state = TAG_EXAMINED;
                }
                else if (html && quoteCharacter == ' ') {
                    text.append((char)character);
                }
                else if(character == quoteCharacter) {
                    flush();
                    state = TAG_EXAMINED;
                } else if(" \r\n\u0009".indexOf(character)>=0) {
                    text.append(' ');
                } else if(character == '&') {
                    saveState(state);
                    state = ENTITY;
                    entity.setLength(0);
                } else {
                    text.append((char)character);
                }
                break;

            case ATTRIBUTE_KEY:
                if(Character.isWhitespace((char)character)) {
                    flush();
                    state = ATTRIBUTE_EQUAL;
                } else if(character == '=') {
                    flush();
                    state = ATTRIBUTE_VALUE;
                } else if (html && character == '>') {
                    text.setLength(0);
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else {
                    text.append((char)character);
                }
                break;

            case ATTRIBUTE_EQUAL:
                if(character == '=') {
                    state = ATTRIBUTE_VALUE;
                } else if(Character.isWhitespace((char)character)) {
                    // empty
                } else if (html && character == '>') {
                    text.setLength(0);
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if (html && character == '/') {
                    flush();
                    state = SINGLE_TAG;
                } else if (html) {
                    flush();
                    text.append((char)character);
                    state = ATTRIBUTE_KEY;
                } else {
                    throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing"));
                }
                break;

            case ATTRIBUTE_VALUE:
                if(character == '"' || character == '\'') {
                    quoteCharacter = character;
                    state = QUOTE;
                } else if(Character.isWhitespace((char)character)) {
                    // empty
                } else if (html && character == '>') {
                    flush();
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if (html) {
                    // unquoted HTML attribute value; a space marks the "quote"
                    text.append((char)character);
                    quoteCharacter = ' ';
                    state = QUOTE;
                } else {
                    throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing"));
                }
                break;
            }
        }
    }

    /**
     * Gets a state from the stack
     * @return the previous state, or UNKNOWN if the stack is empty
     */
    private int restoreState() {
        if(!stack.empty())
            return stack.pop().intValue();
        else
            return UNKNOWN;
    }
    /**
     * Adds a state to the stack.
     * @param s a state to add to the stack
     */
    private void saveState(final int s) {
        stack.push(Integer.valueOf(s));
    }
    /**
     * Flushes the text that is currently in the buffer.
     * The text can be ignored, added to the document
     * as content or as comment,... depending on the current state.
     * The buffer is always empty afterwards.
     */
    private void flush() {
        switch(state){
        case TEXT:
        case CDATA:
            if(text.length() > 0) {
                doc.text(text.toString());
            }
            break;
        case COMMENT:
            if (comment != null) {
                comment.comment(text.toString());
            }
            break;
        case ATTRIBUTE_KEY:
            attributekey = text.toString();
            if (html)
                // fixed locale: the default one may map 'I' to a dotless i
                attributekey = attributekey.toLowerCase(Locale.ENGLISH);
            break;
        case QUOTE:
        case ATTRIBUTE_VALUE:
            attributevalue = text.toString();
            attributes.put(attributekey,attributevalue);
            break;
        default:
            // do nothing
        }
        text.setLength(0);
    }
    /**
     * Initializes the tag name and attributes.
     */
    private void initTag() {
        tag = null;
        attributes = new HashMap<String, String>();
    }
    /**
     * Sets the name of the tag from the text buffer (unless a name is
     * already known) and clears the buffer. HTML tag names are lower-cased.
     */
    private void doTag() {
        if(tag == null)
            tag = text.toString();
        if (html)
            // fixed locale: the default one may map 'I' to a dotless i
            tag = tag.toLowerCase(Locale.ENGLISH);
        text.setLength(0);
    }
    /**
     * processes the tag.
     * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag.
     */
    private void processTag(final boolean start) {
        if (start) {
            nested++;
            doc.startElement(tag,attributes);
        }
        else {
            // White spaces following new lines need to be ignored in HTML
            if(newLineHandler.isNewLineTag(tag)) {
                nowhite = false;
            }
            nested--;
            doc.endElement(tag);
        }
    }
    /**
     * Throws an exception that reports the current line and column.
     * @param s the description of the problem
     * @throws IOException always
     */
    private void throwException(final String s) throws IOException {
        throw new IOException(MessageLocalization.getComposedMessage("1.near.line.2.column.3", s, String.valueOf(lines), String.valueOf(columns)));
    }

    /**
     * Parses the XML document firing the events to the handler.
     * @param doc the document handler
     * @param comment the comment handler; may be <code>null</code>, in which case comments are dropped
     * @param r the document. The encoding is already resolved. The reader is not closed
     * @param html <code>true</code> to parse the input as HTML: tag and attribute names are
     * lower-cased, unquoted attribute values are accepted and the end of the input ends the document
     * @throws IOException on error
     */
    public static void parse(final SimpleXMLDocHandler doc, final SimpleXMLDocHandlerComment comment, final Reader r, final boolean html) throws IOException {
        SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
        parser.go(r);
    }

    /**
     * Parses the XML document firing the events to the handler.
     * @param doc the document handler
     * @param in the document. The encoding is deduced from the stream. The stream is not closed
     * @throws IOException on error
     */
    public static void parse(final SimpleXMLDocHandler doc, final InputStream in) throws IOException {
        byte[] b4 = new byte[4];
        // A single read() may return fewer bytes than requested even though
        // more are coming, so keep reading until the prefix is complete.
        int count = 0;
        while (count < b4.length) {
            int n = in.read(b4, count, b4.length - count);
            if (n < 0)
                break;
            count += n;
        }
        if (count != b4.length)
            throw new IOException(MessageLocalization.getComposedMessage("insufficient.length"));
        String encoding = XMLUtil.getEncodingName(b4);
        String decl = null;
        // NOTE(review): the four prefix bytes and everything up to and including
        // the first '>' are consumed here and never handed to the parser. That is
        // harmless for an XML declaration, but a document without one loses its
        // first tag - confirm whether callers always supply a declaration.
        if (encoding.equals("UTF-8")) {
            StringBuilder sb = new StringBuilder();
            int c;
            while ((c = in.read()) != -1) {
                if (c == '>')
                    break;
                sb.append((char)c);
            }
            decl = sb.toString();
        }
        else if (encoding.equals("CP037")) {
            ByteArrayOutputStream bi = new ByteArrayOutputStream();
            int c;
            while ((c = in.read()) != -1) {
                if (c == 0x6e) // that's '>' in ebcdic
                    break;
                bi.write(c);
            }
            decl = new String(bi.toByteArray(), "CP037");
        }
        if (decl != null) {
            decl = getDeclaredEncoding(decl);
            if (decl != null)
                encoding = decl;
        }
        parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding)));
    }

    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute from
     * the text of an XML declaration.
     * @param decl the text of the declaration; may be <code>null</code>
     * @return the quoted value that follows the word "encoding", or
     * <code>null</code> if there is none
     */
    private static String getDeclaredEncoding(final String decl) {
        if (decl == null)
            return null;
        int idx = decl.indexOf("encoding");
        if (idx < 0)
            return null;
        int idx1 = decl.indexOf('"', idx);
        int idx2 = decl.indexOf('\'', idx);
        // equal positions can only mean that neither quote was found
        if (idx1 == idx2)
            return null;
        // the single quote comes first
        if (idx1 < 0 && idx2 > 0 || idx2 > 0 && idx2 < idx1) {
            int idx3 = decl.indexOf('\'', idx2 + 1);
            if (idx3 < 0)
                return null;
            return decl.substring(idx2 + 1, idx3);
        }
        // the double quote comes first
        if (idx2 < 0 && idx1 > 0 || idx1 > 0 && idx1 < idx2) {
            int idx3 = decl.indexOf('"', idx1 + 1);
            if (idx3 < 0)
                return null;
            return decl.substring(idx1 + 1, idx3);
        }
        return null;
    }

    /**
     * Parses the XML document firing the events to the handler.
     * Comments are dropped and the input is not treated as HTML.
     * @param doc the document handler
     * @param r the document. The encoding is already resolved. The reader is not closed
     * @throws IOException on error
     */
    public static void parse(final SimpleXMLDocHandler doc,final Reader r) throws IOException {
        parse(doc, null, r, false);
    }

    /**
     * Escapes a string with the appropriated XML codes.
     *
     * @param s
     *            the string to be escaped
     * @param onlyASCII
     *            codes above 127 will always be escaped with &amp;#nn; if
     *            <CODE>true</CODE>
     * @return the escaped string
     * @deprecated moved to {@link XMLUtil#escapeXML(String, boolean)}, left
     *             here for the sake of backwards compatibility
     */
    @Deprecated
    public static String escapeXML(final String s, final boolean onlyASCII) {
        return XMLUtil.escapeXML(s, onlyASCII);
    }

}