001/* 002 * Copyright (c) 1999-2000 by David Brownell. All Rights Reserved. 003 * 004 * This program is open source software; you may use, copy, modify, and 005 * redistribute it under the terms of the LICENSE with which it was 006 * originally distributed. 007 * 008 * This program is distributed in the hope that it will be useful, 009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 011 * LICENSE for more details. 012 */ 013 014// 015// Copyright (c) 1997, 1998 by Microstar Software Ltd. 016// From Microstar's README (the entire original license): 017// 018// AElfred is free for both commercial and non-commercial use and 019// redistribution, provided that Microstar's copyright and disclaimer are 020// retained intact. You are free to modify AElfred for your own use and 021// to redistribute AElfred with your modifications, provided that the 022// modifications are clearly documented. 023// 024// This program is distributed in the hope that it will be useful, but 025// WITHOUT ANY WARRANTY; without even the implied warranty of 026// merchantability or fitness for a particular purpose. Please use it AT 027// YOUR OWN RISK. 028// 029 030 031package org.dom4j.io.aelfred; 032 033import java.io.BufferedInputStream; 034import java.io.CharConversionException; 035import java.io.EOFException; 036import java.io.InputStream; 037import java.io.InputStreamReader; 038import java.io.IOException; 039import java.io.Reader; 040import java.net.URL; 041import java.net.URLConnection; 042import java.util.ArrayList; 043import java.util.HashMap; 044import java.util.Iterator; 045 046import org.xml.sax.SAXException; 047 048 049// $Id: XmlParser.java,v 1.4 2002/02/01 10:55:25 jstrachan Exp $ 050 051/** 052 * Parse XML documents and return parse events through call-backs. 053 * Use the <code>SAXDriver</code> class as your entry point, as the 054 * internal parser interfaces are subject to change. 
055 * 056 * @author Written by David Megginson <dmeggins@microstar.com> 057 * (version 1.2a with bugfixes) 058 * @author Updated by David Brownell <david-b@pacbell.net> 059 * @version $Date: 2002/02/01 10:55:25 $ 060 * @see SAXDriver 061 */ 062final class XmlParser 063{ 064 // 065 // Use special cheats that speed up the code by 066 // avoiding per-character readCh () method calls. 067 // 068 private final static boolean USE_CHEATS = true; 069 070 071 ////////////////////////////////////////////////////////////////////// 072 // Constructors. 073 //////////////////////////////////////////////////////////////////////// 074 075 076 /** 077 * Construct a new parser with no associated handler. 078 * @see #setHandler 079 * @see #parse 080 */ 081 // package private 082 XmlParser () 083 { 084 cleanupVariables (); 085 } 086 087 088 /** 089 * Set the handler that will receive parsing events. 090 * @param handler The handler to receive callback events. 091 * @see #parse 092 */ 093 // package private 094 void setHandler (SAXDriver handler) 095 { 096 this.handler = handler; 097 } 098 099 100 /** 101 * Parse an XML document from the character stream, byte stream, or URI 102 * that you provide (in that order of preference). Any URI that you 103 * supply will become the base URI for resolving relative URI, and may 104 * be used to acquire a reader or byte stream. 105 * 106 * <p>You may parse more than one document, but that must be done 107 * sequentially. Only one thread at a time may use this parser. 108 * 109 * @param systemId The URI of the document; should never be null, 110 * but may be so iff a reader <em>or</em> a stream is provided. 111 * @param publicId The public identifier of the document, or null. 112 * @param reader A character stream; must be null if stream isn't. 113 * @param stream A byte input stream; must be null if reader isn't. 114 * @param encoding The suggested encoding, or null if unknown. 
115 * @exception java.lang.Exception Basically SAXException or IOException 116 */ 117 // package private 118 void doParse ( 119 String systemId, 120 String publicId, 121 Reader reader, 122 InputStream stream, 123 String encoding 124 ) throws Exception 125 { 126 if (handler == null) 127 throw new IllegalStateException ("no callback handler"); 128 129 basePublicId = publicId; 130 baseURI = systemId; 131 baseReader = reader; 132 baseInputStream = stream; 133 134 initializeVariables (); 135 136 // predeclare the built-in entities here (replacement texts) 137 // we don't need to intern(), since we're guaranteed literals 138 // are always (globally) interned. 139 setInternalEntity ("amp", "&"); 140 setInternalEntity ("lt", "<"); 141 setInternalEntity ("gt", ">"); 142 setInternalEntity ("apos", "'"); 143 setInternalEntity ("quot", """); 144 145 handler.startDocument (); 146 147 pushURL ("[document]", basePublicId, baseURI, 148 baseReader, baseInputStream, encoding); 149 150 try { 151 parseDocument (); 152 handler.endDocument (); 153 } finally { 154 if (baseReader != null) 155 try { baseReader.close (); 156 } catch (IOException e) { /* ignore */ } 157 if (baseInputStream != null) 158 try { baseInputStream.close (); 159 } catch (IOException e) { /* ignore */ } 160 if (is != null) 161 try { is.close (); 162 } catch (IOException e) { /* ignore */ } 163 if (reader != null) 164 try { 165 reader.close (); 166 } catch (IOException e) { /* ignore */ 167 } 168 cleanupVariables (); 169 } 170 } 171 172 173 //////////////////////////////////////////////////////////////////////// 174 // Constants. 175 //////////////////////////////////////////////////////////////////////// 176 177 // 178 // Constants for element content type. 179 // 180 181 /** 182 * Constant: an element has not been declared. 183 * @see #getElementContentType 184 */ 185 public final static int CONTENT_UNDECLARED = 0; 186 187 /** 188 * Constant: the element has a content model of ANY. 
189 * @see #getElementContentType 190 */ 191 public final static int CONTENT_ANY = 1; 192 193 /** 194 * Constant: the element has declared content of EMPTY. 195 * @see #getElementContentType 196 */ 197 public final static int CONTENT_EMPTY = 2; 198 199 /** 200 * Constant: the element has mixed content. 201 * @see #getElementContentType 202 */ 203 public final static int CONTENT_MIXED = 3; 204 205 /** 206 * Constant: the element has element content. 207 * @see #getElementContentType 208 */ 209 public final static int CONTENT_ELEMENTS = 4; 210 211 212 // 213 // Constants for the entity type. 214 // 215 216 /** 217 * Constant: the entity has not been declared. 218 * @see #getEntityType 219 */ 220 public final static int ENTITY_UNDECLARED = 0; 221 222 /** 223 * Constant: the entity is internal. 224 * @see #getEntityType 225 */ 226 public final static int ENTITY_INTERNAL = 1; 227 228 /** 229 * Constant: the entity is external, non-XML data. 230 * @see #getEntityType 231 */ 232 public final static int ENTITY_NDATA = 2; 233 234 /** 235 * Constant: the entity is external XML data. 236 * @see #getEntityType 237 */ 238 public final static int ENTITY_TEXT = 3; 239 240 241 // 242 // Constants for attribute type. 243 // 244 245 /** 246 * Constant: the attribute has not been declared for this element type. 247 * @see #getAttributeType 248 */ 249 public final static int ATTRIBUTE_UNDECLARED = 0; 250 251 /** 252 * Constant: the attribute value is a string value. 253 * @see #getAttributeType 254 */ 255 public final static int ATTRIBUTE_CDATA = 1; 256 257 /** 258 * Constant: the attribute value is a unique identifier. 259 * @see #getAttributeType 260 */ 261 public final static int ATTRIBUTE_ID = 2; 262 263 /** 264 * Constant: the attribute value is a reference to a unique identifier. 265 * @see #getAttributeType 266 */ 267 public final static int ATTRIBUTE_IDREF = 3; 268 269 /** 270 * Constant: the attribute value is a list of ID references. 
271 * @see #getAttributeType 272 */ 273 public final static int ATTRIBUTE_IDREFS = 4; 274 275 /** 276 * Constant: the attribute value is the name of an entity. 277 * @see #getAttributeType 278 */ 279 public final static int ATTRIBUTE_ENTITY = 5; 280 281 /** 282 * Constant: the attribute value is a list of entity names. 283 * @see #getAttributeType 284 */ 285 public final static int ATTRIBUTE_ENTITIES = 6; 286 287 /** 288 * Constant: the attribute value is a name token. 289 * @see #getAttributeType 290 */ 291 public final static int ATTRIBUTE_NMTOKEN = 7; 292 293 /** 294 * Constant: the attribute value is a list of name tokens. 295 * @see #getAttributeType 296 */ 297 public final static int ATTRIBUTE_NMTOKENS = 8; 298 299 /** 300 * Constant: the attribute value is a token from an enumeration. 301 * @see #getAttributeType 302 */ 303 public final static int ATTRIBUTE_ENUMERATED = 9; 304 305 /** 306 * Constant: the attribute is the name of a notation. 307 * @see #getAttributeType 308 */ 309 public final static int ATTRIBUTE_NOTATION = 10; 310 311 312 // 313 // When the class is loaded, populate the hash table of 314 // attribute types. 315 // 316 317 /** 318 * Hash table of attribute types. 
319 */ 320 private static HashMap attributeTypeHash; 321 static { 322 attributeTypeHash = new HashMap (13); 323 attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA)); 324 attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID)); 325 attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF)); 326 attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS)); 327 attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY)); 328 attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES)); 329 attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN)); 330 attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS)); 331 attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION)); 332 } 333 334 335 // 336 // Constants for supported encodings. "external" is just a flag. 337 // 338 private final static int ENCODING_EXTERNAL = 0; 339 private final static int ENCODING_UTF_8 = 1; 340 private final static int ENCODING_ISO_8859_1 = 2; 341 private final static int ENCODING_UCS_2_12 = 3; 342 private final static int ENCODING_UCS_2_21 = 4; 343 private final static int ENCODING_UCS_4_1234 = 5; 344 private final static int ENCODING_UCS_4_4321 = 6; 345 private final static int ENCODING_UCS_4_2143 = 7; 346 private final static int ENCODING_UCS_4_3412 = 8; 347 private final static int ENCODING_ASCII = 9; 348 349 350 // 351 // Constants for attribute default value. 352 // 353 354 /** 355 * Constant: the attribute is not declared. 356 * @see #getAttributeDefaultValueType 357 */ 358 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; 359 360 /** 361 * Constant: the attribute has a literal default value specified. 362 * @see #getAttributeDefaultValueType 363 * @see #getAttributeDefaultValue 364 */ 365 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; 366 367 /** 368 * Constant: the attribute was declared #IMPLIED. 
369 * @see #getAttributeDefaultValueType 370 */ 371 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; 372 373 /** 374 * Constant: the attribute was declared #REQUIRED. 375 * @see #getAttributeDefaultValueType 376 */ 377 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; 378 379 /** 380 * Constant: the attribute was declared #FIXED. 381 * @see #getAttributeDefaultValueType 382 * @see #getAttributeDefaultValue 383 */ 384 public final static int ATTRIBUTE_DEFAULT_FIXED = 34; 385 386 387 // 388 // Constants for input. 389 // 390 private final static int INPUT_NONE = 0; 391 private final static int INPUT_INTERNAL = 1; 392 private final static int INPUT_EXTERNAL = 2; 393 private final static int INPUT_STREAM = 3; 394 private final static int INPUT_BUFFER = 4; 395 private final static int INPUT_READER = 5; 396 397 398 // 399 // Flags for reading literals. 400 // 401 // expand general entity refs (attribute values in dtd and content) 402 private final static int LIT_ENTITY_REF = 2; 403 // normalize this value (whitespace etc) (attributes, public ids) 404 private final static int LIT_NORMALIZE = 4; 405 // literal is an attribute value 406 private final static int LIT_ATTRIBUTE = 8; 407 // don't expand parameter entities 408 private final static int LIT_DISABLE_PE = 16; 409 // don't expand [or parse] character refs 410 private final static int LIT_DISABLE_CREF = 32; 411 // don't parse general entity refs 412 private final static int LIT_DISABLE_EREF = 64; 413 // don't expand general entities, but make sure we _could_ 414 private final static int LIT_ENTITY_CHECK = 128; 415 416 417 // 418 // Flags affecting PE handling in DTDs (if expandPE is true). 419 // PEs expand with space padding, except inside literals. 420 // 421 private final static int CONTEXT_NORMAL = 0; 422 private final static int CONTEXT_LITERAL = 1; 423 424 425 ////////////////////////////////////////////////////////////////////// 426 // Error reporting. 
427 ////////////////////////////////////////////////////////////////////// 428 429 430 /** 431 * Report an error. 432 * @param message The error message. 433 * @param textFound The text that caused the error (or null). 434 * @see SAXDriver#error 435 * @see #line 436 */ 437 private void error (String message, String textFound, String textExpected) 438 throws SAXException 439 { 440 if (textFound != null) { 441 message = message + " (found \"" + textFound + "\")"; 442 } 443 if (textExpected != null) { 444 message = message + " (expected \"" + textExpected + "\")"; 445 } 446 String uri = null; 447 448 if (externalEntity != null) { 449 uri = externalEntity.getURL ().toString (); 450 } 451 handler.error (message, uri, line, column); 452 453 // "can't happen" 454 throw new SAXException (message); 455 } 456 457 458 /** 459 * Report a serious error. 460 * @param message The error message. 461 * @param textFound The text that caused the error (or null). 462 */ 463 private void error (String message, char textFound, String textExpected) 464 throws SAXException 465 { 466 error (message, new Character (textFound).toString (), textExpected); 467 } 468 469 /** Report typical case fatal errors. */ 470 private void error (String message) 471 throws SAXException 472 { 473 error (message, null, null); 474 } 475 476 477 ////////////////////////////////////////////////////////////////////// 478 // Major syntactic productions. 479 ////////////////////////////////////////////////////////////////////// 480 481 482 /** 483 * Parse an XML document. 484 * <pre> 485 * [1] document ::= prolog element Misc* 486 * </pre> 487 * <p>This is the top-level parsing function for a single XML 488 * document. As a minimum, a well-formed document must have 489 * a document element, and a valid document must have a prolog 490 * (one with doctype) as well. 
491 */ 492 private void parseDocument () 493 throws Exception 494 { 495 char c; 496 try { // added by MHK 497 parseProlog (); 498 require ('<'); 499 parseElement (); 500 } catch (EOFException ee) { // added by MHK 501 error("premature end of file", "[EOF]", null); 502 } 503 504 try { 505 parseMisc (); //skip all white, PIs, and comments 506 c = readCh (); //if this doesn't throw an exception... 507 error ("unexpected characters after document end", c, null); 508 } catch (EOFException e) { 509 return; 510 } 511 } 512 513 514 /** 515 * Skip a comment. 516 * <pre> 517 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" 518 * </pre> 519 * <p> (The <code><!--</code> has already been read.) 520 */ 521 private void parseComment () 522 throws Exception 523 { 524 char c; 525 boolean saved = expandPE; 526 527 expandPE = false; 528 parseUntil ("--"); 529 require ('>'); 530 expandPE = saved; 531 handler.comment (dataBuffer, 0, dataBufferPos); 532 dataBufferPos = 0; 533 } 534 535 536 /** 537 * Parse a processing instruction and do a call-back. 538 * <pre> 539 * [16] PI ::= '<?' PITarget 540 * (S (Char* - (Char* '?>' Char*)))? 541 * '?>' 542 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) 543 * </pre> 544 * <p> (The <code><?</code> has already been read.) 545 */ 546 private void parsePI () 547 throws SAXException, IOException 548 { 549 String name; 550 boolean saved = expandPE; 551 552 expandPE = false; 553 name = readNmtoken (true); 554 if ("xml".equalsIgnoreCase (name)) 555 error ("Illegal processing instruction target", name, null); 556 if (!tryRead ("?>")) { 557 requireWhitespace (); 558 parseUntil ("?>"); 559 } 560 expandPE = saved; 561 handler.processingInstruction (name, dataBufferToString ()); 562 } 563 564 565 /** 566 * Parse a CDATA section. 
567 * <pre> 568 * [18] CDSect ::= CDStart CData CDEnd 569 * [19] CDStart ::= '<![CDATA[' 570 * [20] CData ::= (Char* - (Char* ']]>' Char*)) 571 * [21] CDEnd ::= ']]>' 572 * </pre> 573 * <p> (The '<![CDATA[' has already been read.) 574 */ 575 private void parseCDSect () 576 throws Exception 577 { 578 parseUntil ("]]>"); 579 dataBufferFlush (); 580 } 581 582 583 /** 584 * Parse the prolog of an XML document. 585 * <pre> 586 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? 587 * </pre> 588 * <p>There are a couple of tricks here. First, it is necessary to 589 * declare the XML default attributes after the DTD (if present) 590 * has been read. [??] Second, it is not possible to expand general 591 * references in attribute value literals until after the entire 592 * DTD (if present) has been parsed. 593 * <p>We do not look for the XML declaration here, because it was 594 * handled by pushURL (). 595 * @see pushURL 596 */ 597 private void parseProlog () 598 throws Exception 599 { 600 parseMisc (); 601 602 if (tryRead ("<!DOCTYPE")) { 603 parseDoctypedecl (); 604 parseMisc (); 605 } 606 } 607 608 609 /** 610 * Parse the XML declaration. 611 * <pre> 612 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 613 * [24] VersionInfo ::= S 'version' Eq 614 * ("'" VersionNum "'" | '"' VersionNum '"' ) 615 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* 616 * [32] SDDecl ::= S 'standalone' Eq 617 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) 618 * [80] EncodingDecl ::= S 'encoding' Eq 619 * ( "'" EncName "'" | "'" EncName "'" ) 620 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 621 * </pre> 622 * <p> (The <code><?xml</code> and whitespace have already been read.) 
623 * @return the encoding in the declaration, uppercased; or null 624 * @see #parseTextDecl 625 * @see #setupDecoding 626 */ 627 private String parseXMLDecl (boolean ignoreEncoding) 628 throws SAXException, IOException 629 { 630 String version; 631 String encodingName = null; 632 String standalone = null; 633 boolean white; 634 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 635 636 // Read the version. 637 require ("version"); 638 parseEq (); 639 version = readLiteral (flags); 640 if (!version.equals ("1.0")) { 641 error ("unsupported XML version", version, "1.0"); 642 } 643 644 // Try reading an encoding declaration. 645 white = tryWhitespace (); 646 if (tryRead ("encoding")) { 647 if (!white) 648 error ("whitespace required before 'encoding='"); 649 parseEq (); 650 encodingName = readLiteral (flags); 651 if (!ignoreEncoding) 652 setupDecoding (encodingName); 653 } 654 655 // Try reading a standalone declaration 656 if (encodingName != null) 657 white = tryWhitespace (); 658 if (tryRead ("standalone")) { 659 if (!white) 660 error ("whitespace required before 'standalone='"); 661 parseEq (); 662 standalone = readLiteral (flags); 663 if (! ("yes".equals (standalone) || "no".equals (standalone))) 664 error ("standalone flag must be 'yes' or 'no'"); 665 } 666 667 skipWhitespace (); 668 require ("?>"); 669 670 return encodingName; 671 } 672 673 674 /** 675 * Parse a text declaration. 676 * <pre> 677 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' 678 * [80] EncodingDecl ::= S 'encoding' Eq 679 * ( '"' EncName '"' | "'" EncName "'" ) 680 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 681 * </pre> 682 * <p> (The <code><?xml</code>' and whitespace have already been read.) 
683 * @return the encoding in the declaration, uppercased; or null 684 * @see #parseXMLDecl 685 * @see #setupDecoding 686 */ 687 private String parseTextDecl (boolean ignoreEncoding) 688 throws SAXException, IOException 689 { 690 String encodingName = null; 691 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 692 693 // Read an optional version. 694 if (tryRead ("version")) { 695 String version; 696 parseEq (); 697 version = readLiteral (flags); 698 if (!version.equals ("1.0")) { 699 error ("unsupported XML version", version, "1.0"); 700 } 701 requireWhitespace (); 702 } 703 704 705 // Read the encoding. 706 require ("encoding"); 707 parseEq (); 708 encodingName = readLiteral (flags); 709 if (!ignoreEncoding) 710 setupDecoding (encodingName); 711 712 skipWhitespace (); 713 require ("?>"); 714 715 return encodingName; 716 } 717 718 719 /** 720 * Sets up internal state so that we can decode an entity using the 721 * specified encoding. This is used when we start to read an entity 722 * and we have been given knowledge of its encoding before we start to 723 * read any data (e.g. from a SAX input source or from a MIME type). 724 * 725 * <p> It is also used after autodetection, at which point only very 726 * limited adjustments to the encoding may be used (switching between 727 * related builtin decoders). 728 * 729 * @param encodingName The name of the encoding specified by the user. 730 * @exception IOException if the encoding isn't supported either 731 * internally to this parser, or by the hosting JVM. 732 * @see #parseXMLDecl 733 * @see #parseTextDecl 734 */ 735 private void setupDecoding (String encodingName) 736 throws SAXException, IOException 737 { 738 encodingName = encodingName.toUpperCase (); 739 740 // ENCODING_EXTERNAL indicates an encoding that wasn't 741 // autodetected ... we can use builtin decoders, or 742 // ones from the JVM (InputStreamReader). 
743 744 // Otherwise we can only tweak what was autodetected, and 745 // only for single byte (ASCII derived) builtin encodings. 746 747 // ASCII-derived encodings 748 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { 749 if (encodingName.equals ("ISO-8859-1") 750 || encodingName.equals ("8859_1") 751 || encodingName.equals ("ISO8859_1") 752 ) { 753 encoding = ENCODING_ISO_8859_1; 754 return; 755 } else if (encodingName.equals ("US-ASCII") 756 || encodingName.equals ("ASCII")) { 757 encoding = ENCODING_ASCII; 758 return; 759 } else if (encodingName.equals ("UTF-8") 760 || encodingName.equals ("UTF8")) { 761 encoding = ENCODING_UTF_8; 762 return; 763 } else if (encoding != ENCODING_EXTERNAL) { 764 // fatal error 765 error ("unsupported ASCII-derived encoding", 766 encodingName, 767 "UTF-8, US-ASCII, or ISO-8859-1"); 768 } 769 // else fallthrough ... 770 // it's ASCII-ish and something other than a builtin 771 } 772 773 // Unicode and such 774 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { 775 if (!(encodingName.equals ("ISO-10646-UCS-2") 776 || encodingName.equals ("UTF-16") 777 || encodingName.equals ("UTF-16BE") 778 || encodingName.equals ("UTF-16LE"))) 779 error ("unsupported Unicode encoding", 780 encodingName, 781 "UTF-16"); 782 return; 783 } 784 785 // four byte encodings 786 if (encoding == ENCODING_UCS_4_1234 787 || encoding == ENCODING_UCS_4_4321 788 || encoding == ENCODING_UCS_4_2143 789 || encoding == ENCODING_UCS_4_3412) { 790 if (!encodingName.equals ("ISO-10646-UCS-4")) 791 error ("unsupported 32-bit encoding", 792 encodingName, 793 "ISO-10646-UCS-4"); 794 return; 795 } 796 797 // assert encoding == ENCODING_EXTERNAL 798 // if (encoding != ENCODING_EXTERNAL) 799 // throw new RuntimeException ("encoding = " + encoding); 800 801 if (encodingName.equals ("UTF-16BE")) { 802 encoding = ENCODING_UCS_2_12; 803 return; 804 } 805 if (encodingName.equals ("UTF-16LE")) { 806 encoding = ENCODING_UCS_2_21; 807 return; 808 } 
809 810 // We couldn't use the builtin decoders at all. But we can try to 811 // create a reader, since we haven't messed up buffering. Tweak 812 // the encoding name if necessary. 813 814 if (encodingName.equals ("UTF-16") 815 || encodingName.equals ("ISO-10646-UCS-2")) 816 encodingName = "Unicode"; 817 // Ignoring all the EBCDIC aliases here 818 819 reader = new InputStreamReader (is, encodingName); 820 sourceType = INPUT_READER; 821 is = null; 822 } 823 824 825 /** 826 * Parse miscellaneous markup outside the document element and DOCTYPE 827 * declaration. 828 * <pre> 829 * [27] Misc ::= Comment | PI | S 830 * </pre> 831 */ 832 private void parseMisc () 833 throws Exception 834 { 835 while (true) { 836 skipWhitespace (); 837 if (tryRead ("<?")) { 838 parsePI (); 839 } else if (tryRead ("<!--")) { 840 parseComment (); 841 } else { 842 return; 843 } 844 } 845 } 846 847 848 /** 849 * Parse a document type declaration. 850 * <pre> 851 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 852 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 853 * </pre> 854 * <p> (The <code><!DOCTYPE</code> has already been read.) 855 */ 856 private void parseDoctypedecl () 857 throws Exception 858 { 859 char c; 860 String doctypeName, ids[]; 861 862 // Read the document type name. 
863 requireWhitespace (); 864 doctypeName = readNmtoken (true); 865 866 // Read the External subset's IDs 867 skipWhitespace (); 868 ids = readExternalIds (false); 869 870 // report (a) declaration of name, (b) lexical info (ids) 871 handler.doctypeDecl (doctypeName, ids [0], ids [1]); 872 873 // Internal subset is parsed first, if present 874 skipWhitespace (); 875 if (tryRead ('[')) { 876 877 // loop until the subset ends 878 while (true) { 879 expandPE = true; 880 skipWhitespace (); 881 expandPE = false; 882 if (tryRead (']')) { 883 break; // end of subset 884 } else { 885 // WFC, PEs in internal subset (only between decls) 886 peIsError = expandPE = true; 887 parseMarkupdecl (); 888 peIsError = expandPE = false; 889 } 890 } 891 } 892 893 // Read the external subset, if any 894 if (ids [1] != null) { 895 pushURL ("[external subset]", ids [0], ids [1], null, null, null); 896 897 // Loop until we end up back at '>' 898 while (true) { 899 expandPE = true; 900 skipWhitespace (); 901 expandPE = false; 902 if (tryRead ('>')) { 903 break; 904 } else { 905 expandPE = true; 906 parseMarkupdecl (); 907 expandPE = false; 908 } 909 } 910 } else { 911 // No external subset. 912 skipWhitespace (); 913 require ('>'); 914 } 915 916 // done dtd 917 handler.endDoctype (); 918 expandPE = false; 919 } 920 921 922 /** 923 * Parse a markup declaration in the internal or external DTD subset. 924 * <pre> 925 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl 926 * | NotationDecl | PI | Comment 927 * [30] extSubsetDecl ::= (markupdecl | conditionalSect 928 * | PEReference | S) * 929 * </pre> 930 * <p> Reading toplevel PE references is handled as a lexical issue 931 * by the caller, as is whitespace. 
932 */ 933 private void parseMarkupdecl () 934 throws Exception 935 { 936 if (tryRead ("<!ELEMENT")) { 937 parseElementdecl (); 938 } else if (tryRead ("<!ATTLIST")) { 939 parseAttlistDecl (); 940 } else if (tryRead ("<!ENTITY")) { 941 parseEntityDecl (); 942 } else if (tryRead ("<!NOTATION")) { 943 parseNotationDecl (); 944 } else if (tryRead ("<?")) { 945 parsePI (); 946 } else if (tryRead ("<!--")) { 947 parseComment (); 948 } else if (tryRead ("<![")) { 949 if (inputStack.size () > 0) 950 parseConditionalSect (); 951 else 952 error ("conditional sections illegal in internal subset"); 953 } else { 954 error ("expected markup declaration"); 955 } 956 } 957 958 959 /** 960 * Parse an element, with its tags. 961 * <pre> 962 * [39] element ::= EmptyElementTag | STag content ETag 963 * [40] STag ::= '<' Name (S Attribute)* S? '>' 964 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' 965 * </pre> 966 * <p> (The '<' has already been read.) 967 * <p>NOTE: this method actually chains onto parseContent (), if necessary, 968 * and parseContent () will take care of calling parseETag (). 969 */ 970 private void parseElement () 971 throws Exception 972 { 973 String gi; 974 char c; 975 int oldElementContent = currentElementContent; 976 String oldElement = currentElement; 977 Object element []; 978 979 // This is the (global) counter for the 980 // array of specified attributes. 981 tagAttributePos = 0; 982 983 // Read the element type name. 984 gi = readNmtoken (true); 985 986 // Determine the current content type. 987 currentElement = gi; 988 element = (Object []) elementInfo.get (gi); 989 currentElementContent = getContentType (element, CONTENT_ANY); 990 991 // Read the attributes, if any. 992 // After this loop, "c" is the closing delimiter. 
993 boolean white = tryWhitespace (); 994 c = readCh (); 995 while (c != '/' && c != '>') { 996 unread (c); 997 if (!white) 998 error ("need whitespace between attributes"); 999 parseAttribute (gi); 1000 white = tryWhitespace (); 1001 c = readCh (); 1002 } 1003 1004 // Supply any defaulted attributes. 1005 Iterator atts = declaredAttributes (element); 1006 if (atts != null) { 1007 String aname; 1008loop: 1009 while (atts.hasNext ()) { 1010 aname = (String) atts.next (); 1011 // See if it was specified. 1012 for (int i = 0; i < tagAttributePos; i++) { 1013 if (tagAttributes [i] == aname) { 1014 continue loop; 1015 } 1016 } 1017 // I guess not... 1018 handler.attribute (aname, 1019 getAttributeExpandedValue (gi, aname), 1020 false); 1021 } 1022 } 1023 1024 // Figure out if this is a start tag 1025 // or an empty element, and dispatch an 1026 // event accordingly. 1027 switch (c) { 1028 case '>': 1029 handler.startElement (gi); 1030 parseContent (); 1031 break; 1032 case '/': 1033 require ('>'); 1034 handler.startElement (gi); 1035 handler.endElement (gi); 1036 break; 1037 } 1038 1039 // Restore the previous state. 1040 currentElement = oldElement; 1041 currentElementContent = oldElementContent; 1042 } 1043 1044 1045 /** 1046 * Parse an attribute assignment. 1047 * <pre> 1048 * [41] Attribute ::= Name Eq AttValue 1049 * </pre> 1050 * @param name The name of the attribute's element. 1051 * @see SAXDriver#attribute 1052 */ 1053 private void parseAttribute (String name) 1054 throws Exception 1055 { 1056 String aname; 1057 int type; 1058 String value; 1059 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; 1060 1061 // Read the attribute name. 1062 aname = readNmtoken (true); 1063 type = getAttributeType (name, aname); 1064 1065 // Parse '=' 1066 parseEq (); 1067 1068 // Read the value, normalizing whitespace 1069 // unless it is CDATA. 
1070 if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { 1071 value = readLiteral (flags); 1072 } else { 1073 value = readLiteral (flags | LIT_NORMALIZE); 1074 } 1075 1076 // WFC: no duplicate attributes 1077 for (int i = 0; i < tagAttributePos; i++) 1078 if (aname.equals (tagAttributes [i])) 1079 error ("duplicate attribute", aname, null); 1080 1081 // Inform the handler about the 1082 // attribute. 1083 handler.attribute (aname, value, true); 1084 dataBufferPos = 0; 1085 1086 // Note that the attribute has been 1087 // specified. 1088 if (tagAttributePos == tagAttributes.length) { 1089 String newAttrib[] = new String [tagAttributes.length * 2]; 1090 System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos); 1091 tagAttributes = newAttrib; 1092 } 1093 tagAttributes [tagAttributePos++] = aname; 1094 } 1095 1096 1097 /** 1098 * Parse an equals sign surrounded by optional whitespace. 1099 * <pre> 1100 * [25] Eq ::= S? '=' S? 1101 * </pre> 1102 */ 1103 private void parseEq () 1104 throws SAXException, IOException 1105 { 1106 skipWhitespace (); 1107 require ('='); 1108 skipWhitespace (); 1109 } 1110 1111 1112 /** 1113 * Parse an end tag. 1114 * <pre> 1115 * [42] ETag ::= '</' Name S? '>' 1116 * </pre> 1117 * <p>NOTE: parseContent () chains to here, we already read the 1118 * "</". 1119 */ 1120 private void parseETag () 1121 throws Exception 1122 { 1123 require (currentElement); 1124 skipWhitespace (); 1125 require ('>'); 1126 handler.endElement (currentElement); 1127 // not re-reporting any SAXException re bogus end tags, 1128 // even though that diagnostic might be clearer ... 1129 } 1130 1131 1132 /** 1133 * Parse the content of an element. 1134 * <pre> 1135 * [43] content ::= (element | CharData | Reference 1136 * | CDSect | PI | Comment)* 1137 * [67] Reference ::= EntityRef | CharRef 1138 * </pre> 1139 * <p> NOTE: consumes ETtag. 
1140 */ 1141 private void parseContent () 1142 throws Exception 1143 { 1144 String data; 1145 char c; 1146 1147 while (true) { 1148 switch (currentElementContent) { 1149 case CONTENT_ANY: 1150 case CONTENT_MIXED: 1151 case CONTENT_UNDECLARED: // this line added by MHK 24 May 2000 1152 case CONTENT_EMPTY: // this line added by MHK 8 Sept 2000 1153 parseCharData (); 1154 break; 1155 case CONTENT_ELEMENTS: 1156 parseWhitespace (); 1157 break; 1158 } 1159 1160 // Handle delimiters 1161 c = readCh (); 1162 switch (c) { 1163 1164 case '&': // Found "&" 1165 1166 c = readCh (); 1167 if (c == '#') { 1168 parseCharRef (); 1169 } else { 1170 unread (c); 1171 parseEntityRef (true); 1172 } 1173 break; 1174 1175 case '<': // Found "<" 1176 dataBufferFlush (); 1177 c = readCh (); 1178 switch (c) { 1179 case '!': // Found "<!" 1180 c = readCh (); 1181 switch (c) { 1182 case '-': // Found "<!-" 1183 require ('-'); 1184 parseComment (); 1185 break; 1186 case '[': // Found "<![" 1187 require ("CDATA["); 1188 handler.startCDATA (); 1189 inCDATA = true; 1190 parseCDSect (); 1191 inCDATA = false; 1192 handler.endCDATA (); 1193 break; 1194 default: 1195 error ("expected comment or CDATA section", c, null); 1196 break; 1197 } 1198 break; 1199 1200 case '?': // Found "<?" 1201 parsePI (); 1202 break; 1203 1204 case '/': // Found "</" 1205 parseETag (); 1206 return; 1207 1208 default: // Found "<" followed by something else 1209 unread (c); 1210 parseElement (); 1211 break; 1212 } 1213 } 1214 } 1215 } 1216 1217 1218 /** 1219 * Parse an element type declaration. 1220 * <pre> 1221 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' 1222 * </pre> 1223 * <p> NOTE: the '<!ELEMENT' has already been read. 1224 */ 1225 private void parseElementdecl () 1226 throws Exception 1227 { 1228 String name; 1229 1230 requireWhitespace (); 1231 // Read the element type name. 1232 name = readNmtoken (true); 1233 1234 requireWhitespace (); 1235 // Read the content model. 
1236 parseContentspec (name); 1237 1238 skipWhitespace (); 1239 require ('>'); 1240 } 1241 1242 1243 /** 1244 * Content specification. 1245 * <pre> 1246 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements 1247 * </pre> 1248 */ 1249 private void parseContentspec (String name) 1250 throws Exception 1251 { 1252 if (tryRead ("EMPTY")) { 1253 setElement (name, CONTENT_EMPTY, null, null); 1254 return; 1255 } else if (tryRead ("ANY")) { 1256 setElement (name, CONTENT_ANY, null, null); 1257 return; 1258 } else { 1259 require ('('); 1260 dataBufferAppend ('('); 1261 skipWhitespace (); 1262 if (tryRead ("#PCDATA")) { 1263 dataBufferAppend ("#PCDATA"); 1264 parseMixed (); 1265 setElement (name, CONTENT_MIXED, dataBufferToString (), null); 1266 } else { 1267 parseElements (); 1268 setElement (name, CONTENT_ELEMENTS, 1269 dataBufferToString (), null); 1270 } 1271 } 1272 } 1273 1274 1275 /** 1276 * Parse an element-content model. 1277 * <pre> 1278 * [47] elements ::= (choice | seq) ('?' | '*' | '+')? 1279 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' 1280 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' 1281 * </pre> 1282 * 1283 * <p> NOTE: the opening '(' and S have already been read. 1284 */ 1285 private void parseElements () 1286 throws Exception 1287 { 1288 char c; 1289 char sep; 1290 1291 // Parse the first content particle 1292 skipWhitespace (); 1293 parseCp (); 1294 1295 // Check for end or for a separator. 1296 skipWhitespace (); 1297 c = readCh (); 1298 switch (c) { 1299 case ')': 1300 dataBufferAppend (')'); 1301 c = readCh (); 1302 switch (c) { 1303 case '*': 1304 case '+': 1305 case '?': 1306 dataBufferAppend (c); 1307 break; 1308 default: 1309 unread (c); 1310 } 1311 return; 1312 case ',': // Register the separator. 1313 case '|': 1314 sep = c; 1315 dataBufferAppend (c); 1316 break; 1317 default: 1318 error ("bad separator in content model", c, null); 1319 return; 1320 } 1321 1322 // Parse the rest of the content model. 
1323 while (true) { 1324 skipWhitespace (); 1325 parseCp (); 1326 skipWhitespace (); 1327 c = readCh (); 1328 if (c == ')') { 1329 dataBufferAppend (')'); 1330 break; 1331 } else if (c != sep) { 1332 error ("bad separator in content model", c, null); 1333 return; 1334 } else { 1335 dataBufferAppend (c); 1336 } 1337 } 1338 1339 // Check for the occurrence indicator. 1340 c = readCh (); 1341 switch (c) { 1342 case '?': 1343 case '*': 1344 case '+': 1345 dataBufferAppend (c); 1346 return; 1347 default: 1348 unread (c); 1349 return; 1350 } 1351 } 1352 1353 1354 /** 1355 * Parse a content particle. 1356 * <pre> 1357 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? 1358 * </pre> 1359 */ 1360 private void parseCp () 1361 throws Exception 1362 { 1363 char c; 1364 1365 if (tryRead ('(')) { 1366 dataBufferAppend ('('); 1367 parseElements (); 1368 } else { 1369 dataBufferAppend (readNmtoken (true)); 1370 c = readCh (); 1371 switch (c) { 1372 case '?': 1373 case '*': 1374 case '+': 1375 dataBufferAppend (c); 1376 break; 1377 default: 1378 unread (c); 1379 break; 1380 } 1381 } 1382 } 1383 1384 1385 /** 1386 * Parse mixed content. 1387 * <pre> 1388 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' 1389 * | '(' S? ('#PCDATA') S? ')' 1390 * </pre> 1391 */ 1392 private void parseMixed () 1393 throws Exception 1394 { 1395 char c; 1396 1397 // Check for PCDATA alone. 1398 skipWhitespace (); 1399 if (tryRead (')')) { 1400 dataBufferAppend (")*"); 1401 tryRead ('*'); 1402 return; 1403 } 1404 1405 // Parse mixed content. 1406 skipWhitespace (); 1407 while (!tryRead (")*")) { 1408 require ('|'); 1409 dataBufferAppend ('|'); 1410 skipWhitespace (); 1411 dataBufferAppend (readNmtoken (true)); 1412 skipWhitespace (); 1413 } 1414 dataBufferAppend (")*"); 1415 } 1416 1417 1418 /** 1419 * Parse an attribute list declaration. 1420 * <pre> 1421 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' 1422 * </pre> 1423 * <p>NOTE: the '<!ATTLIST' has already been read. 
1424 */ 1425 private void parseAttlistDecl () 1426 throws Exception 1427 { 1428 String elementName; 1429 1430 requireWhitespace (); 1431 elementName = readNmtoken (true); 1432 boolean white = tryWhitespace (); 1433 while (!tryRead ('>')) { 1434 if (!white) 1435 error ("whitespace required before attribute definition"); 1436 parseAttDef (elementName); 1437 white = tryWhitespace (); 1438 } 1439 } 1440 1441 1442 /** 1443 * Parse a single attribute definition. 1444 * <pre> 1445 * [53] AttDef ::= S Name S AttType S DefaultDecl 1446 * </pre> 1447 */ 1448 private void parseAttDef (String elementName) 1449 throws Exception 1450 { 1451 String name; 1452 int type; 1453 String myEnum = null; 1454 1455 // Read the attribute name. 1456 name = readNmtoken (true); 1457 1458 // Read the attribute type. 1459 requireWhitespace (); 1460 type = readAttType (); 1461 1462 // Get the string of enumerated values 1463 // if necessary. 1464 if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { 1465 myEnum = dataBufferToString (); 1466 } 1467 1468 // Read the default value. 1469 requireWhitespace (); 1470 parseDefault (elementName, name, type, myEnum); 1471 } 1472 1473 1474 /** 1475 * Parse the attribute type. 
1476 * <pre> 1477 * [54] AttType ::= StringType | TokenizedType | EnumeratedType 1478 * [55] StringType ::= 'CDATA' 1479 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' 1480 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' 1481 * [57] EnumeratedType ::= NotationType | Enumeration 1482 * </pre> 1483 */ 1484 private int readAttType () 1485 throws Exception 1486 { 1487 String typeString; 1488 Integer type; 1489 1490 if (tryRead ('(')) { 1491 parseEnumeration (false); 1492 return ATTRIBUTE_ENUMERATED; 1493 } else { 1494 typeString = readNmtoken (true); 1495 if (typeString.equals ("NOTATION")) { 1496 parseNotationType (); 1497 } 1498 type = (Integer) attributeTypeHash.get (typeString); 1499 if (type == null) { 1500 error ("illegal attribute type", typeString, null); 1501 return ATTRIBUTE_UNDECLARED; 1502 } else { 1503 return type.intValue (); 1504 } 1505 } 1506 } 1507 1508 1509 /** 1510 * Parse an enumeration. 1511 * <pre> 1512 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 1513 * </pre> 1514 * <p>NOTE: the '(' has already been read. 1515 */ 1516 private void parseEnumeration (boolean isNames) 1517 throws Exception 1518 { 1519 char c; 1520 1521 dataBufferAppend ('('); 1522 1523 // Read the first token. 1524 skipWhitespace (); 1525 dataBufferAppend (readNmtoken (isNames)); 1526 // Read the remaining tokens. 1527 skipWhitespace (); 1528 while (!tryRead (')')) { 1529 require ('|'); 1530 dataBufferAppend ('|'); 1531 skipWhitespace (); 1532 dataBufferAppend (readNmtoken (isNames)); 1533 skipWhitespace (); 1534 } 1535 dataBufferAppend (')'); 1536 } 1537 1538 1539 /** 1540 * Parse a notation type for an attribute. 1541 * <pre> 1542 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks 1543 * (S? '|' S? name)* S? 
')' 1544 * </pre> 1545 * <p>NOTE: the 'NOTATION' has already been read 1546 */ 1547 private void parseNotationType () 1548 throws Exception 1549 { 1550 requireWhitespace (); 1551 require ('('); 1552 1553 parseEnumeration (true); 1554 } 1555 1556 1557 /** 1558 * Parse the default value for an attribute. 1559 * <pre> 1560 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' 1561 * | (('#FIXED' S)? AttValue) 1562 * </pre> 1563 */ 1564 private void parseDefault ( 1565 String elementName, 1566 String name, 1567 int type, 1568 String myEnum 1569 ) throws Exception 1570 { 1571 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; 1572 String value = null; 1573 int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK; 1574 1575 // Note: char refs not checked here, and input not normalized, 1576 // since it's done correctly later when we actually expand any 1577 // entity refs. We ought to report char ref syntax errors now, 1578 // but don't. Cost: unused defaults mean unreported WF errs. 1579 1580 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace 1581 // chars to spaces (doesn't matter when that's done if it doesn't 1582 // interfere with char refs expanding to whitespace). 1583 1584 if (tryRead ('#')) { 1585 if (tryRead ("FIXED")) { 1586 valueType = ATTRIBUTE_DEFAULT_FIXED; 1587 requireWhitespace (); 1588 value = readLiteral (flags); 1589 } else if (tryRead ("REQUIRED")) { 1590 valueType = ATTRIBUTE_DEFAULT_REQUIRED; 1591 } else if (tryRead ("IMPLIED")) { 1592 valueType = ATTRIBUTE_DEFAULT_IMPLIED; 1593 } else { 1594 error ("illegal keyword for attribute default value"); 1595 } 1596 } else 1597 value = readLiteral (flags); 1598 setAttribute (elementName, name, type, myEnum, value, valueType); 1599 } 1600 1601 1602 /** 1603 * Parse a conditional section. 1604 * <pre> 1605 * [61] conditionalSect ::= includeSect || ignoreSect 1606 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' 1607 * extSubsetDecl ']]>' 1608 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? 
'[' 1609 * ignoreSectContents* ']]>' 1610 * [64] ignoreSectContents ::= Ignore 1611 * ('<![' ignoreSectContents* ']]>' Ignore )* 1612 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) 1613 * </pre> 1614 * <p> NOTE: the '>![' has already been read. 1615 */ 1616 private void parseConditionalSect () 1617 throws Exception 1618 { 1619 skipWhitespace (); 1620 if (tryRead ("INCLUDE")) { 1621 skipWhitespace (); 1622 require ('['); 1623 skipWhitespace (); 1624 while (!tryRead ("]]>")) { 1625 parseMarkupdecl (); 1626 skipWhitespace (); 1627 } 1628 } else if (tryRead ("IGNORE")) { 1629 skipWhitespace (); 1630 require ('['); 1631 int nesting = 1; 1632 char c; 1633 expandPE = false; 1634 for (int nest = 1; nest > 0;) { 1635 c = readCh (); 1636 switch (c) { 1637 case '<': 1638 if (tryRead ("![")) { 1639 nest++; 1640 } 1641 case ']': 1642 if (tryRead ("]>")) { 1643 nest--; 1644 } 1645 } 1646 } 1647 expandPE = true; 1648 } else { 1649 error ("conditional section must begin with INCLUDE or IGNORE"); 1650 } 1651 } 1652 1653 1654 /** 1655 * Read and interpret a character reference. 1656 * <pre> 1657 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 1658 * </pre> 1659 * <p>NOTE: the '&#' has already been read. 
1660 */ 1661 private void parseCharRef () 1662 throws SAXException, IOException 1663 { 1664 int value = 0; 1665 char c; 1666 1667 if (tryRead ('x')) { 1668loop1: 1669 while (true) { 1670 c = readCh (); 1671 switch (c) { 1672 case '0': 1673 case '1': 1674 case '2': 1675 case '3': 1676 case '4': 1677 case '5': 1678 case '6': 1679 case '7': 1680 case '8': 1681 case '9': 1682 case 'a': 1683 case 'A': 1684 case 'b': 1685 case 'B': 1686 case 'c': 1687 case 'C': 1688 case 'd': 1689 case 'D': 1690 case 'e': 1691 case 'E': 1692 case 'f': 1693 case 'F': 1694 value *= 16; 1695 value += Integer.parseInt (new Character (c).toString (), 1696 16); 1697 break; 1698 case ';': 1699 break loop1; 1700 default: 1701 error ("illegal character in character reference", c, null); 1702 break loop1; 1703 } 1704 } 1705 } else { 1706loop2: 1707 while (true) { 1708 c = readCh (); 1709 switch (c) { 1710 case '0': 1711 case '1': 1712 case '2': 1713 case '3': 1714 case '4': 1715 case '5': 1716 case '6': 1717 case '7': 1718 case '8': 1719 case '9': 1720 value *= 10; 1721 value += Integer.parseInt (new Character (c).toString (), 1722 10); 1723 break; 1724 case ';': 1725 break loop2; 1726 default: 1727 error ("illegal character in character reference", c, null); 1728 break loop2; 1729 } 1730 } 1731 } 1732 1733 // check for character refs being legal XML 1734 if ((value < 0x0020 1735 && ! 
(value == '\n' || value == '\t' || value == '\r')) 1736 || (value >= 0xD800 && value <= 0xDFFF) 1737 || value == 0xFFFE || value == 0xFFFF 1738 || value > 0x0010ffff) 1739 error ("illegal XML character reference U+" 1740 + Integer.toHexString (value)); 1741 1742 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 1743 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 1744 if (value <= 0x0000ffff) { 1745 // no surrogates needed 1746 dataBufferAppend ((char) value); 1747 } else if (value <= 0x0010ffff) { 1748 value -= 0x10000; 1749 // > 16 bits, surrogate needed 1750 dataBufferAppend ((char) (0xd800 | (value >> 10))); 1751 dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff))); 1752 } else { 1753 // too big for surrogate 1754 error ("character reference " + value + " is too large for UTF-16", 1755 new Integer (value).toString (), null); 1756 } 1757 } 1758 1759 1760 /** 1761 * Parse and expand an entity reference. 1762 * <pre> 1763 * [68] EntityRef ::= '&' Name ';' 1764 * </pre> 1765 * <p>NOTE: the '&' has already been read. 1766 * @param externalAllowed External entities are allowed here. 
1767 */ 1768 private void parseEntityRef (boolean externalAllowed) 1769 throws SAXException, IOException 1770 { 1771 String name; 1772 1773 name = readNmtoken (true); 1774 require (';'); 1775 switch (getEntityType (name)) { 1776 case ENTITY_UNDECLARED: 1777 error ("reference to undeclared entity", name, null); 1778 break; 1779 case ENTITY_INTERNAL: 1780 pushString (name, getEntityValue (name)); 1781 break; 1782 case ENTITY_TEXT: 1783 if (externalAllowed) { 1784 pushURL (name, getEntityPublicId (name), 1785 getEntitySystemId (name), 1786 null, null, null); 1787 } else { 1788 error ("reference to external entity in attribute value.", 1789 name, null); 1790 } 1791 break; 1792 case ENTITY_NDATA: 1793 if (externalAllowed) { 1794 error ("unparsed entity reference in content", name, null); 1795 } else { 1796 error ("reference to external entity in attribute value.", 1797 name, null); 1798 } 1799 break; 1800 } 1801 } 1802 1803 1804 /** 1805 * Parse and expand a parameter entity reference. 1806 * <pre> 1807 * [69] PEReference ::= '%' Name ';' 1808 * </pre> 1809 * <p>NOTE: the '%' has already been read. 1810 */ 1811 private void parsePEReference () 1812 throws SAXException, IOException 1813 { 1814 String name; 1815 1816 name = "%" + readNmtoken (true); 1817 require (';'); 1818 switch (getEntityType (name)) { 1819 case ENTITY_UNDECLARED: 1820 // this is a validity problem, not a WFC violation ... 
but 1821 // we should disable handling of all subsequent declarations 1822 // unless this is a standalone document 1823 // warn ("reference to undeclared parameter entity", name, null); 1824 1825 break; 1826 case ENTITY_INTERNAL: 1827 if (inLiteral) 1828 pushString (name, getEntityValue (name)); 1829 else 1830 pushString (name, " " + getEntityValue (name) + ' '); 1831 break; 1832 case ENTITY_TEXT: 1833 if (!inLiteral) 1834 pushString (null, " "); 1835 pushURL (name, getEntityPublicId (name), 1836 getEntitySystemId (name), 1837 null, null, null); 1838 if (!inLiteral) 1839 pushString (null, " "); 1840 break; 1841 } 1842 } 1843 1844 /** 1845 * Parse an entity declaration. 1846 * <pre> 1847 * [70] EntityDecl ::= GEDecl | PEDecl 1848 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' 1849 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' 1850 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) 1851 * [74] PEDef ::= EntityValue | ExternalID 1852 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 1853 * | 'PUBLIC' S PubidLiteral S SystemLiteral 1854 * [76] NDataDecl ::= S 'NDATA' S Name 1855 * </pre> 1856 * <p>NOTE: the '<!ENTITY' has already been read. 1857 */ 1858 private void parseEntityDecl () 1859 throws Exception 1860 { 1861 char c; 1862 boolean peFlag = false; 1863 String name, value, notationName, ids[]; 1864 1865 // Check for a parameter entity. 1866 expandPE = false; 1867 requireWhitespace (); 1868 if (tryRead ('%')) { 1869 peFlag = true; 1870 requireWhitespace (); 1871 } 1872 expandPE = true; 1873 1874 // Read the entity name, and prepend 1875 // '%' if necessary. 1876 name = readNmtoken (true); 1877 if (peFlag) { 1878 name = "%" + name; 1879 } 1880 1881 // Read the entity value. 1882 requireWhitespace (); 1883 c = readCh (); 1884 unread (c); 1885 if (c == '"' || c == '\'') { 1886 // Internal entity ... 
replacement text has expanded refs 1887 // to characters and PEs, but not to general entities 1888 value = readLiteral (0); 1889 setInternalEntity (name, value); 1890 } else { 1891 // Read the external IDs 1892 ids = readExternalIds (false); 1893 if (ids [1] == null) { 1894 error ("system identifer missing", name, null); 1895 } 1896 1897 // Check for NDATA declaration. 1898 boolean white = tryWhitespace (); 1899 if (!peFlag && tryRead ("NDATA")) { 1900 if (!white) 1901 error ("whitespace required before NDATA"); 1902 requireWhitespace (); 1903 notationName = readNmtoken (true); 1904 setExternalDataEntity (name, ids [0], ids [1], notationName); 1905 } else { 1906 setExternalTextEntity (name, ids [0], ids [1]); 1907 } 1908 } 1909 1910 // Finish the declaration. 1911 skipWhitespace (); 1912 require ('>'); 1913 } 1914 1915 1916 /** 1917 * Parse a notation declaration. 1918 * <pre> 1919 * [82] NotationDecl ::= '<!NOTATION' S Name S 1920 * (ExternalID | PublicID) S? '>' 1921 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 1922 * </pre> 1923 * <P>NOTE: the '<!NOTATION' has already been read. 1924 */ 1925 private void parseNotationDecl () 1926 throws Exception 1927 { 1928 String nname, ids[]; 1929 1930 1931 requireWhitespace (); 1932 nname = readNmtoken (true); 1933 1934 requireWhitespace (); 1935 1936 // Read the external identifiers. 1937 ids = readExternalIds (true); 1938 if (ids [0] == null && ids [1] == null) { 1939 error ("external identifer missing", nname, null); 1940 } 1941 1942 // Register the notation. 1943 setNotation (nname, ids [0], ids [1]); 1944 1945 skipWhitespace (); 1946 require ('>'); 1947 } 1948 1949 1950 /** 1951 * Parse character data. 
1952 * <pre> 1953 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 1954 * </pre> 1955 */ 1956 private void parseCharData () 1957 throws Exception 1958 { 1959 char c; 1960 1961 // Start with a little cheat -- in most 1962 // cases, the entire sequence of 1963 // character data will already be in 1964 // the readBuffer; if not, fall through to 1965 // the normal approach. 1966 if (USE_CHEATS) { 1967 int lineAugment = 0; 1968 int columnAugment = 0; 1969 1970loop: 1971 for (int i = readBufferPos; i < readBufferLength; i++) { 1972 switch (c = readBuffer [i]) { 1973 case '\n': 1974 lineAugment++; 1975 columnAugment = 0; 1976 break; 1977 case '&': 1978 case '<': 1979 int start = readBufferPos; 1980 columnAugment++; 1981 readBufferPos = i; 1982 if (lineAugment > 0) { 1983 line += lineAugment; 1984 column = columnAugment; 1985 } else { 1986 column += columnAugment; 1987 } 1988 dataBufferAppend (readBuffer, start, i - start); 1989 return; 1990 case ']': 1991 // XXX missing two end-of-buffer cases 1992 if ((i + 2) < readBufferLength) { 1993 if (readBuffer [i + 1] == ']' 1994 && readBuffer [i + 2] == '>') { 1995 error ("character data may not contain ']]>'"); 1996 } 1997 } 1998 columnAugment++; 1999 break; 2000 default: 2001 if (c < 0x0020 || c > 0xFFFD) 2002 error ("illegal XML character U+" 2003 + Integer.toHexString (c)); 2004 // FALLTHROUGH 2005 case '\r': 2006 case '\t': 2007 columnAugment++; 2008 } 2009 } 2010 } 2011 2012 // OK, the cheat didn't work; start over 2013 // and do it by the book. 2014 while (true) { 2015 c = readCh (); 2016 switch (c) { 2017 case '<': 2018 case '&': 2019 unread (c); 2020 return; 2021 // XXX "]]>" precluded ... 2022 default: 2023 dataBufferAppend (c); 2024 break; 2025 } 2026 } 2027 } 2028 2029 2030 ////////////////////////////////////////////////////////////////////// 2031 // High-level reading and scanning methods. 
2032 ////////////////////////////////////////////////////////////////////// 2033 2034 /** 2035 * Require whitespace characters. 2036 */ 2037 private void requireWhitespace () 2038 throws SAXException, IOException 2039 { 2040 char c = readCh (); 2041 if (isWhitespace (c)) { 2042 skipWhitespace (); 2043 } else { 2044 error ("whitespace required", c, null); 2045 } 2046 } 2047 2048 2049 /** 2050 * Parse whitespace characters, and leave them in the data buffer. 2051 */ 2052 private void parseWhitespace () 2053 throws Exception 2054 { 2055 char c = readCh (); 2056 while (isWhitespace (c)) { 2057 dataBufferAppend (c); 2058 c = readCh (); 2059 } 2060 unread (c); 2061 } 2062 2063 2064 /** 2065 * Skip whitespace characters. 2066 * <pre> 2067 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 2068 * </pre> 2069 */ 2070 private void skipWhitespace () 2071 throws SAXException, IOException 2072 { 2073 // Start with a little cheat. Most of 2074 // the time, the white space will fall 2075 // within the current read buffer; if 2076 // not, then fall through. 2077 if (USE_CHEATS) { 2078 int lineAugment = 0; 2079 int columnAugment = 0; 2080 2081loop: 2082 for (int i = readBufferPos; i < readBufferLength; i++) { 2083 switch (readBuffer [i]) { 2084 case ' ': 2085 case '\t': 2086 case '\r': 2087 columnAugment++; 2088 break; 2089 case '\n': 2090 lineAugment++; 2091 columnAugment = 0; 2092 break; 2093 case '%': 2094 if (expandPE) 2095 break loop; 2096 // else fall through... 2097 default: 2098 readBufferPos = i; 2099 if (lineAugment > 0) { 2100 line += lineAugment; 2101 column = columnAugment; 2102 } else { 2103 column += columnAugment; 2104 } 2105 return; 2106 } 2107 } 2108 } 2109 2110 // OK, do it by the book. 2111 char c = readCh (); 2112 while (isWhitespace (c)) { 2113 c = readCh (); 2114 } 2115 unread (c); 2116 } 2117 2118 2119 /** 2120 * Read a name or (when parsing an enumeration) name token. 
2121 * <pre> 2122 * [5] Name ::= (Letter | '_' | ':') (NameChar)* 2123 * [7] Nmtoken ::= (NameChar)+ 2124 * </pre> 2125 */ 2126 private String readNmtoken (boolean isName) 2127 throws SAXException, IOException 2128 { 2129 char c; 2130 2131 if (USE_CHEATS) { 2132loop: 2133 for (int i = readBufferPos; i < readBufferLength; i++) { 2134 c = readBuffer [i]; 2135 switch (c) { 2136 case '%': 2137 if (expandPE) 2138 break loop; 2139 // else fall through... 2140 2141 // What may legitimately come AFTER a name/nmtoken? 2142 case '<': case '>': case '&': 2143 case ',': case '|': case '*': case '+': case '?': 2144 case ')': 2145 case '=': 2146 case '\'': case '"': 2147 case '[': 2148 case ' ': case '\t': case '\r': case '\n': 2149 case ';': 2150 case '/': 2151 int start = readBufferPos; 2152 if (i == start) 2153 error ("name expected", readBuffer [i], null); 2154 readBufferPos = i; 2155 return intern (readBuffer, start, i - start); 2156 2157 default: 2158 // punt on exact tests from Appendix A; approximate 2159 // them using the Unicode ID start/part rules 2160 if (i == readBufferPos && isName) { 2161 if (!Character.isUnicodeIdentifierStart (c) 2162 && c != ':' && c != '_') 2163 error ("Not a name start character, U+" 2164 + Integer.toHexString (c)); 2165 } else if (!Character.isUnicodeIdentifierPart (c) 2166 && c != '-' && c != ':' && c != '_' && c != '.' 2167 && !isExtender (c)) 2168 error ("Not a name character, U+" 2169 + Integer.toHexString (c)); 2170 } 2171 } 2172 } 2173 2174 nameBufferPos = 0; 2175 2176 // Read the first character. 
2177loop: 2178 while (true) { 2179 c = readCh (); 2180 switch (c) { 2181 case '%': 2182 case '<': case '>': case '&': 2183 case ',': case '|': case '*': case '+': case '?': 2184 case ')': 2185 case '=': 2186 case '\'': case '"': 2187 case '[': 2188 case ' ': case '\t': case '\n': case '\r': 2189 case ';': 2190 case '/': 2191 unread (c); 2192 if (nameBufferPos == 0) { 2193 error ("name expected"); 2194 } 2195 // punt on exact tests from Appendix A, but approximate them 2196 if (isName 2197 && !Character.isUnicodeIdentifierStart ( 2198 nameBuffer [0]) 2199 && ":_".indexOf (nameBuffer [0]) == -1) 2200 error ("Not a name start character, U+" 2201 + Integer.toHexString (nameBuffer [0])); 2202 String s = intern (nameBuffer, 0, nameBufferPos); 2203 nameBufferPos = 0; 2204 return s; 2205 default: 2206 // punt on exact tests from Appendix A, but approximate them 2207 2208 if ((nameBufferPos != 0 || !isName) 2209 && !Character.isUnicodeIdentifierPart (c) 2210 && ":-_.".indexOf (c) == -1 2211 && !isExtender (c)) 2212 error ("Not a name character, U+" 2213 + Integer.toHexString (c)); 2214 if (nameBufferPos >= nameBuffer.length) 2215 nameBuffer = 2216 (char[]) extendArray (nameBuffer, 2217 nameBuffer.length, nameBufferPos); 2218 nameBuffer [nameBufferPos++] = c; 2219 } 2220 } 2221 } 2222 2223 private static boolean isExtender (char c) 2224 { 2225 // [88] Extender ::= ... 2226 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 2227 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 2228 || (c >= 0x3031 && c <= 0x3035) 2229 || (c >= 0x309d && c <= 0x309e) 2230 || (c >= 0x30fc && c <= 0x30fe); 2231 } 2232 2233 2234 /** 2235 * Read a literal. With matching single or double quotes as 2236 * delimiters (and not embedded!) this is used to parse: 2237 * <pre> 2238 * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ... 2239 * [10] AttValue ::= ... ([^<&] | Reference)* ... 2240 * [11] SystemLiteral ::= ... (URLchar - "'")* ... 
2241 * [12] PubidLiteral ::= ... (PubidChar - "'")* ... 2242 * </pre> 2243 * as well as the quoted strings in XML and text declarations 2244 * (for version, encoding, and standalone) which have their 2245 * own constraints. 2246 */ 2247 private String readLiteral (int flags) 2248 throws SAXException, IOException 2249 { 2250 char delim, c; 2251 int startLine = line; 2252 boolean saved = expandPE; 2253 2254 // Find the first delimiter. 2255 delim = readCh (); 2256 if (delim != '"' && delim != '\'' && delim != (char) 0) { 2257 error ("expected '\"' or \"'\"", delim, null); 2258 return null; 2259 } 2260 inLiteral = true; 2261 if ((flags & LIT_DISABLE_PE) != 0) 2262 expandPE = false; 2263 2264 // Each level of input source has its own buffer; remember 2265 // ours, so we won't read the ending delimiter from any 2266 // other input source, regardless of entity processing. 2267 char ourBuf [] = readBuffer; 2268 2269 // Read the literal. 2270 try { 2271 c = readCh (); 2272loop: 2273 while (! (c == delim && readBuffer == ourBuf)) { 2274 switch (c) { 2275 // Can't escape this normalization for attributes 2276 case '\n': 2277 case '\r': 2278 case '\t': 2279 if ((flags & LIT_ATTRIBUTE) != 0) 2280 c = ' '; 2281 break; 2282 case '&': 2283 c = readCh (); 2284 // Char refs are expanded immediately, except for 2285 // all the cases where it's deferred. 2286 if (c == '#') { 2287 if ((flags & LIT_DISABLE_CREF) != 0) { 2288 dataBufferAppend ('&'); 2289 dataBufferAppend ('#'); 2290 continue; 2291 } 2292 parseCharRef (); 2293 2294 // It looks like an entity ref ... 2295 } else { 2296 unread (c); 2297 // Expand it? 2298 if ((flags & LIT_ENTITY_REF) > 0) { 2299 parseEntityRef (false); 2300 2301 // Is it just data? 2302 } else if ((flags & LIT_DISABLE_EREF) != 0) { 2303 dataBufferAppend ('&'); 2304 2305 // OK, it will be an entity ref -- expanded later. 
2306 } else { 2307 String name = readNmtoken (true); 2308 require (';'); 2309 if ((flags & LIT_ENTITY_CHECK) != 0 2310 && getEntityType (name) == 2311 ENTITY_UNDECLARED) { 2312 error ("General entity '" + name 2313 + "' must be declared before use"); 2314 } 2315 dataBufferAppend ('&'); 2316 dataBufferAppend (name); 2317 dataBufferAppend (';'); 2318 } 2319 } 2320 c = readCh (); 2321 continue loop; 2322 2323 case '<': 2324 // and why? Perhaps so "&foo;" expands the same 2325 // inside and outside an attribute? 2326 if ((flags & LIT_ATTRIBUTE) != 0) 2327 error ("attribute values may not contain '<'"); 2328 break; 2329 2330 // We don't worry about case '%' and PE refs, readCh does. 2331 2332 default: 2333 break; 2334 } 2335 dataBufferAppend (c); 2336 c = readCh (); 2337 } 2338 } catch (EOFException e) { 2339 error ("end of input while looking for delimiter (started on line " 2340 + startLine + ')', null, new Character (delim).toString ()); 2341 } 2342 inLiteral = false; 2343 expandPE = saved; 2344 2345 // Normalise whitespace if necessary. 2346 if ((flags & LIT_NORMALIZE) > 0) { 2347 dataBufferNormalize (); 2348 } 2349 2350 // Return the value. 2351 return dataBufferToString (); 2352 } 2353 2354 2355 /** 2356 * Try reading external identifiers. 2357 * A system identifier is not required for notations. 2358 * @param inNotation Are we in a notation? 2359 * @return A two-member String array containing the identifiers. 
2360 */ 2361 private String[] readExternalIds (boolean inNotation) 2362 throws Exception 2363 { 2364 char c; 2365 String ids[] = new String [2]; 2366 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 2367 2368 if (tryRead ("PUBLIC")) { 2369 requireWhitespace (); 2370 ids [0] = readLiteral (LIT_NORMALIZE | flags); 2371 if (inNotation) { 2372 skipWhitespace (); 2373 c = readCh (); 2374 unread (c); 2375 if (c == '"' || c == '\'') { 2376 ids [1] = readLiteral (flags); 2377 } 2378 } else { 2379 requireWhitespace (); 2380 ids [1] = readLiteral (flags); 2381 } 2382 2383 for (int i = 0; i < ids [0].length (); i++) { 2384 c = ids [0].charAt (i); 2385 if (c >= 'a' && c <= 'z') 2386 continue; 2387 if (c >= 'A' && c <= 'Z') 2388 continue; 2389 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1) 2390 continue; 2391 error ("illegal PUBLIC id character U+" 2392 + Integer.toHexString (c)); 2393 } 2394 } else if (tryRead ("SYSTEM")) { 2395 requireWhitespace (); 2396 ids [1] = readLiteral (flags); 2397 } 2398 2399 // XXX should normalize system IDs as follows: 2400 // - Convert to UTF-8 2401 // - Map reserved and non-ASCII characters to %HH 2402 2403 return ids; 2404 } 2405 2406 2407 /** 2408 * Test if a character is whitespace. 2409 * <pre> 2410 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 2411 * </pre> 2412 * @param c The character to test. 2413 * @return true if the character is whitespace. 2414 */ 2415 private final boolean isWhitespace (char c) 2416 { 2417 if (c > 0x20) 2418 return false; 2419 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) 2420 return true; 2421 return false; // illegal ... 2422 } 2423 2424 2425 ////////////////////////////////////////////////////////////////////// 2426 // Utility routines. 2427 ////////////////////////////////////////////////////////////////////// 2428 2429 2430 /** 2431 * Add a character to the data buffer. 2432 */ 2433 private void dataBufferAppend (char c) 2434 { 2435 // Expand buffer if necessary. 
2436 if (dataBufferPos >= dataBuffer.length) 2437 dataBuffer = 2438 (char[]) extendArray (dataBuffer, 2439 dataBuffer.length, dataBufferPos); 2440 dataBuffer [dataBufferPos++] = c; 2441 } 2442 2443 2444 /** 2445 * Add a string to the data buffer. 2446 */ 2447 private void dataBufferAppend (String s) 2448 { 2449 dataBufferAppend (s.toCharArray (), 0, s.length ()); 2450 } 2451 2452 2453 /** 2454 * Append (part of) a character array to the data buffer. 2455 */ 2456 private void dataBufferAppend (char ch[], int start, int length) 2457 { 2458 dataBuffer = (char[]) 2459 extendArray (dataBuffer, dataBuffer.length, 2460 dataBufferPos + length); 2461 2462 System.arraycopy (ch, start, dataBuffer, dataBufferPos, length); 2463 dataBufferPos += length; 2464 } 2465 2466 2467 /** 2468 * Normalise whitespace in the data buffer. 2469 */ 2470 private void dataBufferNormalize () 2471 { 2472 int i = 0; 2473 int j = 0; 2474 int end = dataBufferPos; 2475 2476 // Skip whitespace at the start. 2477 while (j < end && isWhitespace (dataBuffer [j])) { 2478 j++; 2479 } 2480 2481 // Skip whitespace at the end. 2482 while (end > j && isWhitespace (dataBuffer [end - 1])) { 2483 end --; 2484 } 2485 2486 // Start copying to the left. 2487 while (j < end) { 2488 2489 char c = dataBuffer [j++]; 2490 2491 // Normalise all other whitespace to 2492 // a single space. 2493 if (isWhitespace (c)) { 2494 while (j < end && isWhitespace (dataBuffer [j++])) {} 2495 2496 dataBuffer [i++] = ' '; 2497 dataBuffer [i++] = dataBuffer [j - 1]; 2498 } else { 2499 dataBuffer [i++] = c; 2500 } 2501 } 2502 2503 // The new length is <= the old one. 2504 dataBufferPos = i; 2505 } 2506 2507 2508 /** 2509 * Convert the data buffer to a string. 
2510 */ 2511 private String dataBufferToString () 2512 { 2513 String s = new String (dataBuffer, 0, dataBufferPos); 2514 dataBufferPos = 0; 2515 return s; 2516 } 2517 2518 2519 /** 2520 * Flush the contents of the data buffer to the handler, as 2521 * appropriate, and reset the buffer for new input. 2522 */ 2523 private void dataBufferFlush () 2524 throws SAXException 2525 { 2526 if (currentElementContent == CONTENT_ELEMENTS 2527 && dataBufferPos > 0 2528 && !inCDATA 2529 ) { 2530 // We can't just trust the buffer to be whitespace, there 2531 // are cases when it isn't 2532 for (int i = 0; i < dataBufferPos; i++) { 2533 if (!isWhitespace (dataBuffer [i])) { 2534 handler.charData (dataBuffer, 0, dataBufferPos); 2535 dataBufferPos = 0; 2536 } 2537 } 2538 if (dataBufferPos > 0) { 2539 handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos); 2540 dataBufferPos = 0; 2541 } 2542 } else if (dataBufferPos > 0) { 2543 handler.charData (dataBuffer, 0, dataBufferPos); 2544 dataBufferPos = 0; 2545 } 2546 } 2547 2548 2549 /** 2550 * Require a string to appear, or throw an exception. 2551 * <p><em>Precondition:</em> Entity expansion is not required. 2552 * <p><em>Precondition:</em> data buffer has no characters that 2553 * will get sent to the application. 
2554 */ 2555 private void require (String delim) 2556 throws SAXException, IOException 2557 { 2558 int length = delim.length (); 2559 char ch []; 2560 2561 if (length < dataBuffer.length) { 2562 ch = dataBuffer; 2563 delim.getChars (0, length, ch, 0); 2564 } else 2565 ch = delim.toCharArray (); 2566 2567 if (USE_CHEATS 2568 && length <= (readBufferLength - readBufferPos)) { 2569 int offset = readBufferPos; 2570 2571 for (int i = 0; i < length; i++, offset++) 2572 if (ch [i] != readBuffer [offset]) 2573 error ("required string", null, delim); 2574 readBufferPos = offset; 2575 2576 } else { 2577 for (int i = 0; i < length; i++) 2578 require (ch [i]); 2579 } 2580 } 2581 2582 2583 /** 2584 * Require a character to appear, or throw an exception. 2585 */ 2586 private void require (char delim) 2587 throws SAXException, IOException 2588 { 2589 char c = readCh (); 2590 2591 if (c != delim) { 2592 error ("required character", c, new Character (delim).toString ()); 2593 } 2594 } 2595 2596 2597 /** 2598 * Create an interned string from a character array. 2599 * Ælfred uses this method to create an interned version 2600 * of all names and name tokens, so that it can test equality 2601 * with <code>==</code> instead of <code>String.equals ()</code>. 2602 * 2603 * <p>This is much more efficient than constructing a non-interned 2604 * string first, and then interning it. 2605 * 2606 * @param ch an array of characters for building the string. 2607 * @param start the starting position in the array. 2608 * @param length the number of characters to place in the string. 2609 * @return an interned string. 2610 * @see #intern (String) 2611 * @see java.lang.String#intern 2612 */ 2613 public String intern (char ch[], int start, int length) 2614 { 2615 int index = 0; 2616 int hash = 0; 2617 Object bucket []; 2618 2619 // Generate a hash code. 
2620 for (int i = start; i < start + length; i++) 2621 hash = 31 * hash + ch [i]; 2622 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; 2623 2624 // Get the bucket -- consists of {array,String} pairs 2625 if ((bucket = symbolTable [hash]) == null) { 2626 // first string in this bucket 2627 bucket = new Object [8]; 2628 2629 // Search for a matching tuple, and 2630 // return the string if we find one. 2631 } else { 2632 while (index < bucket.length) { 2633 char chFound [] = (char []) bucket [index]; 2634 2635 // Stop when we hit a null index. 2636 if (chFound == null) 2637 break; 2638 2639 // If they're the same length, check for a match. 2640 if (chFound.length == length) { 2641 for (int i = 0; i < chFound.length; i++) { 2642 // continue search on failure 2643 if (ch [start + i] != chFound [i]) { 2644 break; 2645 } else if (i == length - 1) { 2646 // That's it, we have a match! 2647 return (String) bucket [index + 1]; 2648 } 2649 } 2650 } 2651 index += 2; 2652 } 2653 // Not found -- we'll have to add it. 2654 2655 // Do we have to grow the bucket? 2656 bucket = (Object []) extendArray (bucket, bucket.length, index); 2657 } 2658 symbolTable [hash] = bucket; 2659 2660 // OK, add it to the end of the bucket -- "local" interning. 2661 // Intern "globally" to let applications share interning benefits. 2662 String s = new String (ch, start, length).intern (); 2663 bucket [index] = s.toCharArray (); 2664 bucket [index + 1] = s; 2665 return s; 2666 } 2667 2668 2669 /** 2670 * Ensure the capacity of an array, allocating a new one if 2671 * necessary. Usually called only a handful of times. 
2672 */ 2673 private Object extendArray (Object array, int currentSize, int requiredSize) 2674 { 2675 if (requiredSize < currentSize) { 2676 return array; 2677 } else { 2678 Object newArray = null; 2679 int newSize = currentSize * 2; 2680 2681 if (newSize <= requiredSize) 2682 newSize = requiredSize + 1; 2683 2684 if (array instanceof char[]) 2685 newArray = new char [newSize]; 2686 else if (array instanceof Object[]) 2687 newArray = new Object [newSize]; 2688 else 2689 throw new RuntimeException (); 2690 2691 System.arraycopy (array, 0, newArray, 0, currentSize); 2692 return newArray; 2693 } 2694 } 2695 2696 2697 ////////////////////////////////////////////////////////////////////// 2698 // XML query routines. 2699 ////////////////////////////////////////////////////////////////////// 2700 2701 2702 // 2703 // Elements 2704 // 2705 2706 /** 2707 * Get the declared elements for an XML document. 2708 * <p>The results will be valid only after the DTD (if any) has been 2709 * parsed. 2710 * @return An enumeration of all element types declared for this 2711 * document (as Strings). 2712 * @see #getElementContentType 2713 * @see #getElementContentModel 2714 */ 2715 public Iterator declaredElements () 2716 { 2717 return elementInfo.keySet().iterator(); 2718 } 2719 2720 2721 /** 2722 * Look up the content type of an element. 2723 * @param element element info vector 2724 * @param defaultType value for null vector 2725 * @return An integer constant representing the content type. 2726 * @see #CONTENT_UNDECLARED 2727 * @see #CONTENT_ANY 2728 * @see #CONTENT_EMPTY 2729 * @see #CONTENT_MIXED 2730 * @see #CONTENT_ELEMENTS 2731 */ 2732 private int getContentType (Object element [], int defaultType) 2733 { 2734 if (element == null) 2735 return defaultType; 2736 else 2737 return ((Integer) element [0]).intValue (); 2738 } 2739 2740 2741 /** 2742 * Look up the content type of an element. 2743 * @param name The element type name. 
2744 * @return An integer constant representing the content type. 2745 * @see #getElementContentModel 2746 * @see #CONTENT_UNDECLARED 2747 * @see #CONTENT_ANY 2748 * @see #CONTENT_EMPTY 2749 * @see #CONTENT_MIXED 2750 * @see #CONTENT_ELEMENTS 2751 */ 2752 public int getElementContentType (String name) 2753 { 2754 Object element [] = (Object []) elementInfo.get (name); 2755 return getContentType (element, CONTENT_UNDECLARED); 2756 } 2757 2758 2759 /** 2760 * Look up the content model of an element. 2761 * <p>The result will always be null unless the content type is 2762 * CONTENT_ELEMENTS or CONTENT_MIXED. 2763 * @param name The element type name. 2764 * @return The normalised content model, as a string. 2765 * @see #getElementContentType 2766 */ 2767 public String getElementContentModel (String name) 2768 { 2769 Object element[] = (Object[]) elementInfo.get (name); 2770 if (element == null) { 2771 return null; 2772 } else { 2773 return (String) element [1]; 2774 } 2775 } 2776 2777 2778 /** 2779 * Register an element. 2780 * Array format: 2781 * element type 2782 * attribute hash table 2783 */ 2784 private void setElement (String name, int contentType, 2785 String contentModel, HashMap attributes) 2786 throws Exception 2787 { 2788 Object element[]; 2789 2790 // Try looking up the element 2791 element = (Object[]) elementInfo.get (name); 2792 2793 // Make a new one if necessary. 2794 if (element == null) { 2795 element = new Object [3]; 2796 element [0] = new Integer (CONTENT_UNDECLARED); 2797 element [1] = null; 2798 element [2] = null; 2799 } else if (contentType != CONTENT_UNDECLARED 2800 && ((Integer) element [0]).intValue () != CONTENT_UNDECLARED 2801 ) { 2802 // warn ("multiple declarations for element type", name, null); 2803 return; 2804 } 2805 2806 // Insert the content type, if any. 2807 if (contentType != CONTENT_UNDECLARED) { 2808 element [0] = new Integer (contentType); 2809 } 2810 2811 // Insert the content model, if any. 
2812 if (contentModel != null) { 2813 element [1] = contentModel; 2814 } 2815 2816 // Insert the attributes, if any. 2817 if (attributes != null) { 2818 element [2] = attributes; 2819 } 2820 2821 // Save the element info. 2822 elementInfo.put (name, element); 2823 } 2824 2825 2826 /** 2827 * Look up the attribute hash table for an element. 2828 * The hash table is the second item in the element array. 2829 */ 2830 private HashMap getElementAttributes (String name) 2831 { 2832 Object element[] = (Object[]) elementInfo.get (name); 2833 if (element == null) { 2834 return null; 2835 } else { 2836 return (HashMap) element [2]; 2837 } 2838 } 2839 2840 2841 2842 // 2843 // Attributes 2844 // 2845 2846 /** 2847 * Get the declared attributes for an element type. 2848 * @param elname The name of the element type. 2849 * @return An Iterator of all the attributes declared for 2850 * a specific element type. The results will be valid only 2851 * after the DTD (if any) has been parsed. 2852 * @see #getAttributeType 2853 * @see #getAttributeIterator 2854 * @see #getAttributeDefaultValueType 2855 * @see #getAttributeDefaultValue 2856 * @see #getAttributeExpandedValue 2857 */ 2858 private Iterator declaredAttributes (Object element []) 2859 { 2860 HashMap attlist; 2861 2862 if (element == null) 2863 return null; 2864 if ((attlist = (HashMap) element [2]) == null) 2865 return null; 2866 return attlist.keySet().iterator(); 2867 } 2868 2869 /** 2870 * Get the declared attributes for an element type. 2871 * @param elname The name of the element type. 2872 * @return An Iterator of all the attributes declared for 2873 * a specific element type. The results will be valid only 2874 * after the DTD (if any) has been parsed. 
2875 * @see #getAttributeType 2876 * @see #getAttributeIterator 2877 * @see #getAttributeDefaultValueType 2878 * @see #getAttributeDefaultValue 2879 * @see #getAttributeExpandedValue 2880 */ 2881 public Iterator declaredAttributes (String elname) 2882 { 2883 return declaredAttributes ((Object []) elementInfo.get (elname)); 2884 } 2885 2886 2887 /** 2888 * Retrieve the declared type of an attribute. 2889 * @param name The name of the associated element. 2890 * @param aname The name of the attribute. 2891 * @return An integer constant representing the attribute type. 2892 * @see #ATTRIBUTE_UNDECLARED 2893 * @see #ATTRIBUTE_CDATA 2894 * @see #ATTRIBUTE_ID 2895 * @see #ATTRIBUTE_IDREF 2896 * @see #ATTRIBUTE_IDREFS 2897 * @see #ATTRIBUTE_ENTITY 2898 * @see #ATTRIBUTE_ENTITIES 2899 * @see #ATTRIBUTE_NMTOKEN 2900 * @see #ATTRIBUTE_NMTOKENS 2901 * @see #ATTRIBUTE_ENUMERATED 2902 * @see #ATTRIBUTE_NOTATION 2903 */ 2904 public int getAttributeType (String name, String aname) 2905 { 2906 Object attribute[] = getAttribute (name, aname); 2907 if (attribute == null) { 2908 return ATTRIBUTE_UNDECLARED; 2909 } else { 2910 return ((Integer) attribute [0]).intValue (); 2911 } 2912 } 2913 2914 2915 /** 2916 * Retrieve the allowed values for an enumerated attribute type. 2917 * @param name The name of the associated element. 2918 * @param aname The name of the attribute. 2919 * @return A string containing the token list. 2920 * @see #ATTRIBUTE_ENUMERATED 2921 * @see #ATTRIBUTE_NOTATION 2922 */ 2923 public String getAttributeIterator (String name, String aname) 2924 { 2925 Object attribute[] = getAttribute (name, aname); 2926 if (attribute == null) { 2927 return null; 2928 } else { 2929 return (String) attribute [3]; 2930 } 2931 } 2932 2933 2934 /** 2935 * Retrieve the default value of a declared attribute. 2936 * @param name The name of the associated element. 2937 * @param aname The name of the attribute. 
2938 * @return The default value, or null if the attribute was 2939 * #IMPLIED or simply undeclared and unspecified. 2940 * @see #getAttributeExpandedValue 2941 */ 2942 public String getAttributeDefaultValue (String name, String aname) 2943 { 2944 Object attribute[] = getAttribute (name, aname); 2945 if (attribute == null) { 2946 return null; 2947 } else { 2948 return (String) attribute [1]; 2949 } 2950 } 2951 2952 2953 /** 2954 * Retrieve the expanded value of a declared attribute. 2955 * <p>General entities will be expanded (once). 2956 * @param name The name of the associated element. 2957 * @param aname The name of the attribute. 2958 * @return The expanded default value, or null if the attribute was 2959 * #IMPLIED or simply undeclared 2960 * @see #getAttributeDefaultValue 2961 */ 2962 public String getAttributeExpandedValue (String name, String aname) 2963 throws Exception 2964 { 2965 Object attribute[] = getAttribute (name, aname); 2966 2967 if (attribute == null) { 2968 return null; 2969 } else if (attribute [4] == null && attribute [1] != null) { 2970 // we MUST use the same buf for both quotes else the literal 2971 // can't be properly terminated 2972 char buf [] = new char [1]; 2973 int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; 2974 int type = getAttributeType (name, aname); 2975 2976 if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED) 2977 flags |= LIT_NORMALIZE; 2978 buf [0] = '"'; 2979 pushCharArray (null, buf, 0, 1); 2980 pushString (null, (String) attribute [1]); 2981 pushCharArray (null, buf, 0, 1); 2982 attribute [4] = readLiteral (flags); 2983 } 2984 return (String) attribute [4]; 2985 } 2986 2987 2988 /** 2989 * Retrieve the default value type of a declared attribute. 
2990 * @see #ATTRIBUTE_DEFAULT_SPECIFIED 2991 * @see #ATTRIBUTE_DEFAULT_IMPLIED 2992 * @see #ATTRIBUTE_DEFAULT_REQUIRED 2993 * @see #ATTRIBUTE_DEFAULT_FIXED 2994 */ 2995 public int getAttributeDefaultValueType (String name, String aname) 2996 { 2997 Object attribute[] = getAttribute (name, aname); 2998 if (attribute == null) { 2999 return ATTRIBUTE_DEFAULT_UNDECLARED; 3000 } else { 3001 return ((Integer) attribute [2]).intValue (); 3002 } 3003 } 3004 3005 3006 /** 3007 * Register an attribute declaration for later retrieval. 3008 * Format: 3009 * - String type 3010 * - String default value 3011 * - int value type 3012 */ 3013 private void setAttribute (String elName, String name, int type, 3014 String myEnum, 3015 String value, int valueType) 3016 throws Exception 3017 { 3018 HashMap attlist; 3019 Object attribute[]; 3020 3021 // Create a new hashtable if necessary. 3022 attlist = getElementAttributes (elName); 3023 if (attlist == null) { 3024 attlist = new HashMap (); 3025 } 3026 3027 // ignore multiple attribute declarations! 3028 if (attlist.get (name) != null) { 3029 return; 3030 } else { 3031 attribute = new Object [5]; 3032 attribute [0] = new Integer (type); 3033 attribute [1] = value; 3034 attribute [2] = new Integer (valueType); 3035 attribute [3] = myEnum; 3036 attribute [4] = null; 3037 attlist.put (name, attribute); 3038 3039 // Use CONTENT_UNDECLARED to avoid overwriting 3040 // existing element declaration. 3041 setElement (elName, CONTENT_UNDECLARED, null, attlist); 3042 } 3043 } 3044 3045 3046 /** 3047 * Retrieve the three-member array representing an 3048 * attribute declaration. 
3049 */ 3050 private Object[] getAttribute (String elName, String name) 3051 { 3052 HashMap attlist; 3053 Object attribute[]; 3054 3055 attlist = getElementAttributes (elName); 3056 if (attlist == null) { 3057 return null; 3058 } 3059 3060 attribute = (Object[]) attlist.get (name); 3061 return attribute; 3062 } 3063 3064 3065 // 3066 // Entities 3067 // 3068 3069 /** 3070 * Get declared entities. 3071 * @return An Iterator of all the entities declared for 3072 * this XML document. The results will be valid only 3073 * after the DTD (if any) has been parsed. 3074 * @see #getEntityType 3075 * @see #getEntityPublicId 3076 * @see #getEntitySystemId 3077 * @see #getEntityValue 3078 * @see #getEntityNotationName 3079 */ 3080 public Iterator declaredEntities () 3081 { 3082 return entityInfo.keySet().iterator(); 3083 } 3084 3085 3086 /** 3087 * Find the type of an entity. 3088 * @returns An integer constant representing the entity type. 3089 * @see #ENTITY_UNDECLARED 3090 * @see #ENTITY_INTERNAL 3091 * @see #ENTITY_NDATA 3092 * @see #ENTITY_TEXT 3093 */ 3094 public int getEntityType (String ename) 3095 { 3096 Object entity[] = (Object[]) entityInfo.get (ename); 3097 if (entity == null) { 3098 return ENTITY_UNDECLARED; 3099 } else { 3100 return ((Integer) entity [0]).intValue (); 3101 } 3102 } 3103 3104 3105 /** 3106 * Return an external entity's public identifier, if any. 3107 * @param ename The name of the external entity. 3108 * @return The entity's system identifier, or null if the 3109 * entity was not declared, if it is not an 3110 * external entity, or if no public identifier was 3111 * provided. 3112 * @see #getEntityType 3113 */ 3114 public String getEntityPublicId (String ename) 3115 { 3116 Object entity[] = (Object[]) entityInfo.get (ename); 3117 if (entity == null) { 3118 return null; 3119 } else { 3120 return (String) entity [1]; 3121 } 3122 } 3123 3124 3125 /** 3126 * Return an external entity's system identifier. 
3127 * @param ename The name of the external entity. 3128 * @return The entity's system identifier, or null if the 3129 * entity was not declared, or if it is not an 3130 * external entity. 3131 * @see #getEntityType 3132 */ 3133 public String getEntitySystemId (String ename) 3134 { 3135 Object entity[] = (Object[]) entityInfo.get (ename); 3136 if (entity == null) { 3137 return null; 3138 } else { 3139 return (String) entity [2]; 3140 } 3141 } 3142 3143 3144 /** 3145 * Return the value of an internal entity. 3146 * @param ename The name of the internal entity. 3147 * @return The entity's value, or null if the entity was 3148 * not declared, or if it is not an internal entity. 3149 * @see #getEntityType 3150 */ 3151 public String getEntityValue (String ename) 3152 { 3153 Object entity[] = (Object[]) entityInfo.get (ename); 3154 if (entity == null) { 3155 return null; 3156 } else { 3157 return (String) entity [3]; 3158 } 3159 } 3160 3161 3162 /** 3163 * Get the notation name associated with an NDATA entity. 3164 * @param ename The NDATA entity name. 3165 * @return The associated notation name, or null if the 3166 * entity was not declared, or if it is not an 3167 * NDATA entity. 3168 * @see #getEntityType 3169 */ 3170 public String getEntityNotationName (String eName) 3171 { 3172 Object entity[] = (Object[]) entityInfo.get (eName); 3173 if (entity == null) { 3174 return null; 3175 } else { 3176 return (String) entity [4]; 3177 } 3178 } 3179 3180 3181 /** 3182 * Register an entity declaration for later retrieval. 3183 */ 3184 private void setInternalEntity (String eName, String value) 3185 { 3186 setEntity (eName, ENTITY_INTERNAL, null, null, value, null); 3187 } 3188 3189 3190 /** 3191 * Register an external data entity. 3192 */ 3193 private void setExternalDataEntity (String eName, String pubid, 3194 String sysid, String nName) 3195 { 3196 setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName); 3197 } 3198 3199 3200 /** 3201 * Register an external text entity. 
3202 */ 3203 private void setExternalTextEntity (String eName, 3204 String pubid, String sysid) 3205 { 3206 setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null); 3207 } 3208 3209 3210 /** 3211 * Register an entity declaration for later retrieval. 3212 */ 3213 private void setEntity (String eName, int eClass, 3214 String pubid, String sysid, 3215 String value, String nName) 3216 { 3217 Object entity[]; 3218 3219 if (entityInfo.get (eName) == null) { 3220 entity = new Object [5]; 3221 entity [0] = new Integer (eClass); 3222 entity [1] = pubid; 3223 entity [2] = sysid; 3224 entity [3] = value; 3225 entity [4] = nName; 3226 3227 entityInfo.put (eName, entity); 3228 } 3229 } 3230 3231 3232 // 3233 // Notations. 3234 // 3235 3236 /** 3237 * Get declared notations. 3238 * @return An Iterator of all the notations declared for 3239 * this XML document. The results will be valid only 3240 * after the DTD (if any) has been parsed. 3241 * @see #getNotationPublicId 3242 * @see #getNotationSystemId 3243 */ 3244 public Iterator declaredNotations () 3245 { 3246 return notationInfo.keySet().iterator(); 3247 } 3248 3249 3250 /** 3251 * Look up the public identifier for a notation. 3252 * You will normally use this method to look up a notation 3253 * that was provided as an attribute value or for an NDATA entity. 3254 * @param nname The name of the notation. 3255 * @return A string containing the public identifier, or null 3256 * if none was provided or if no such notation was 3257 * declared. 3258 * @see #getNotationSystemId 3259 */ 3260 public String getNotationPublicId (String nname) 3261 { 3262 Object notation[] = (Object[]) notationInfo.get (nname); 3263 if (notation == null) { 3264 return null; 3265 } else { 3266 return (String) notation [0]; 3267 } 3268 } 3269 3270 3271 /** 3272 * Look up the system identifier for a notation. 3273 * You will normally use this method to look up a notation 3274 * that was provided as an attribute value or for an NDATA entity. 
3275 * @param nname The name of the notation. 3276 * @return A string containing the system identifier, or null 3277 * if no such notation was declared. 3278 * @see #getNotationPublicId 3279 */ 3280 public String getNotationSystemId (String nname) 3281 { 3282 Object notation[] = (Object[]) notationInfo.get (nname); 3283 if (notation == null) { 3284 return null; 3285 } else { 3286 return (String) notation [1]; 3287 } 3288 } 3289 3290 3291 /** 3292 * Register a notation declaration for later retrieval. 3293 * Format: 3294 * - public id 3295 * - system id 3296 */ 3297 private void setNotation (String nname, String pubid, String sysid) 3298 throws Exception 3299 { 3300 Object notation[]; 3301 3302 if (notationInfo.get (nname) == null) { 3303 notation = new Object [2]; 3304 notation [0] = pubid; 3305 notation [1] = sysid; 3306 notationInfo.put (nname, notation); 3307 } else { 3308 // VC: Unique Notation Name 3309 // (it's not fatal) 3310 } 3311 } 3312 3313 3314 // 3315 // Location. 3316 // 3317 3318 3319 /** 3320 * Return the current line number. 3321 */ 3322 public int getLineNumber () 3323 { 3324 return line; 3325 } 3326 3327 3328 /** 3329 * Return the current column number. 3330 */ 3331 public int getColumnNumber () 3332 { 3333 return column; 3334 } 3335 3336 3337 ////////////////////////////////////////////////////////////////////// 3338 // High-level I/O. 3339 ////////////////////////////////////////////////////////////////////// 3340 3341 3342 /** 3343 * Read a single character from the readBuffer. 3344 * <p>The readDataChunk () method maintains the buffer. 3345 * <p>If we hit the end of an entity, try to pop the stack and 3346 * keep going. 3347 * <p> (This approach doesn't really enforce XML's rules about 3348 * entity boundaries, but this is not currently a validating 3349 * parser). 3350 * <p>This routine also attempts to keep track of the current 3351 * position in external entities, but it's not entirely accurate. 
3352 * @return The next available input character. 3353 * @see #unread (char) 3354 * @see #unread (String) 3355 * @see #readDataChunk 3356 * @see #readBuffer 3357 * @see #line 3358 * @return The next character from the current input source. 3359 */ 3360 private char readCh () 3361 throws SAXException, IOException 3362 { 3363 char c; 3364 3365 // As long as there's nothing in the 3366 // read buffer, try reading more data 3367 // (for an external entity) or popping 3368 // the entity stack (for either). 3369 while (readBufferPos >= readBufferLength) { 3370 switch (sourceType) { 3371 case INPUT_READER: 3372 case INPUT_EXTERNAL: 3373 case INPUT_STREAM: 3374 readDataChunk (); 3375 while (readBufferLength < 1) { 3376 popInput (); 3377 if (readBufferLength < 1) { 3378 readDataChunk (); 3379 } 3380 } 3381 break; 3382 3383 default: 3384 3385 popInput (); 3386 break; 3387 } 3388 } 3389 3390 c = readBuffer [readBufferPos++]; 3391 3392 if (c == '\n') { 3393 line++; 3394 column = 0; 3395 } else { 3396 if (c == '<') 3397 /* favorite return to parseContent () .. NOP */ ; 3398 else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) 3399 error ("illegal XML character U+" 3400 + Integer.toHexString (c)); 3401 3402 // If we're in the DTD and in a context where PEs get expanded, 3403 // do so ... 1/14/2000 errata identify those contexts. There 3404 // are also spots in the internal subset where PE refs are fatal 3405 // errors, hence yet another flag. 3406 else if (c == '%' && expandPE) { 3407 if (peIsError) 3408 error ("PE reference within decl in internal subset."); 3409 parsePEReference (); 3410 return readCh (); 3411 } 3412 column++; 3413 } 3414 3415 return c; 3416 } 3417 3418 3419 /** 3420 * Push a single character back onto the current input stream. 3421 * <p>This method usually pushes the character back onto 3422 * the readBuffer, while the unread (String) method treats the 3423 * string as a new internal entity. 
3424 * <p>I don't think that this would ever be called with 3425 * readBufferPos = 0, because the methods always reads a character 3426 * before unreading it, but just in case, I've added a boundary 3427 * condition. 3428 * @param c The character to push back. 3429 * @see #readCh 3430 * @see #unread (String) 3431 * @see #unread (char[]) 3432 * @see #readBuffer 3433 */ 3434 private void unread (char c) 3435 throws SAXException 3436 { 3437 // Normal condition. 3438 if (c == '\n') { 3439 line--; 3440 column = -1; 3441 } 3442 if (readBufferPos > 0) { 3443 readBuffer [--readBufferPos] = c; 3444 } else { 3445 pushString (null, new Character (c).toString ()); 3446 } 3447 } 3448 3449 3450 /** 3451 * Push a char array back onto the current input stream. 3452 * <p>NOTE: you must <em>never</em> push back characters that you 3453 * haven't actually read: use pushString () instead. 3454 * @see #readCh 3455 * @see #unread (char) 3456 * @see #unread (String) 3457 * @see #readBuffer 3458 * @see #pushString 3459 */ 3460 private void unread (char ch[], int length) 3461 throws SAXException 3462 { 3463 for (int i = 0; i < length; i++) { 3464 if (ch [i] == '\n') { 3465 line--; 3466 column = -1; 3467 } 3468 } 3469 if (length < readBufferPos) { 3470 readBufferPos -= length; 3471 } else { 3472 pushCharArray (null, ch, 0, length); 3473 sourceType = INPUT_BUFFER; 3474 } 3475 } 3476 3477 3478 /** 3479 * Push a new external input source. 3480 * The source will be some kind of parsed entity, such as a PE 3481 * (including the external DTD subset) or content for the body. 3482 * <p>TODO: Right now, this method always attempts to autodetect 3483 * the encoding; in the future, it should allow the caller to 3484 * request an encoding explicitly, and it should also look at the 3485 * headers with an HTTP connection. 3486 * @param url The java.net.URL object for the entity. 
3487 * @see SAXDriver#resolveEntity 3488 * @see #pushString 3489 * @see #sourceType 3490 * @see #pushInput 3491 * @see #detectEncoding 3492 * @see #sourceType 3493 * @see #readBuffer 3494 */ 3495 private void pushURL ( 3496 String ename, 3497 String publicId, 3498 String systemId, 3499 Reader reader, 3500 InputStream stream, 3501 String encoding 3502 ) throws SAXException, IOException 3503 { 3504 URL url; 3505 boolean ignoreEncoding = false; 3506 3507 // Push the existing status. 3508 pushInput (ename); 3509 3510 // Create a new read buffer. 3511 // (Note the four-character margin) 3512 readBuffer = new char [READ_BUFFER_MAX + 4]; 3513 readBufferPos = 0; 3514 readBufferLength = 0; 3515 readBufferOverflow = -1; 3516 is = null; 3517 line = 1; 3518 3519 currentByteCount = 0; 3520 3521 // Make any system ID (URI/URL) absolute. There's one case 3522 // where it may be null: parser was invoked without providing 3523 // one, e.g. since the XML data came from a memory buffer. 3524 3525 if (systemId != null && externalEntity != null) { 3526 systemId = new URL (externalEntity.getURL (), systemId).toString (); 3527 } else if (baseURI != null) { 3528 systemId = new URL (new URL (baseURI), systemId).toString (); 3529 // throws IOException if couldn't create new URL 3530 } 3531 3532 // See if the application wants to 3533 // redirect the system ID and/or 3534 // supply its own character stream. 3535 if (reader == null && stream == null && systemId != null) { 3536 Object input = handler.resolveEntity (publicId, systemId); 3537 if (input != null) { 3538 if (input instanceof String) { 3539 systemId = (String) input; 3540 } else if (input instanceof InputStream) { 3541 stream = (InputStream) input; 3542 } else if (input instanceof Reader) { 3543 reader = (Reader) input; 3544 } 3545 } 3546 } 3547 3548 // Start the entity. 
3549 if (systemId != null) { 3550 handler.startExternalEntity (systemId); 3551 } else { 3552 handler.startExternalEntity ("[unidentified data stream]"); 3553 } 3554 3555 // If there's an explicit character stream, just 3556 // ignore encoding declarations. 3557 if (reader != null) { 3558 sourceType = INPUT_READER; 3559 this.reader = reader; 3560 tryEncodingDecl (true); 3561 return; 3562 } 3563 3564 // Else we handle the conversion, and need to ensure 3565 // it's done right. 3566 if (stream != null) { 3567 sourceType = INPUT_STREAM; 3568 is = stream; 3569 url = null; 3570 } else { 3571 // We have to open our own stream to the URL. 3572 3573 // Set the new status 3574 sourceType = INPUT_EXTERNAL; 3575 url = new URL (systemId); 3576 3577 externalEntity = url.openConnection (); 3578 externalEntity.connect (); 3579 is = externalEntity.getInputStream (); 3580 } 3581 3582 // If we get to here, there must be 3583 // an InputStream available. 3584 if (!is.markSupported ()) { 3585 is = new BufferedInputStream (is); 3586 } 3587 3588 // Get any external encoding label. 3589 if (encoding == null && externalEntity != null) { 3590 // External labels can be untrustworthy; filesystems in 3591 // particular often have the wrong default for content 3592 // that wasn't locally originated. Those we autodetect. 3593 if (!"file".equals (externalEntity.getURL ().getProtocol ())) { 3594 int temp; 3595 3596 // application/xml;charset=something;otherAttr=... 3597 // ... with many variants on 'something' 3598 encoding = externalEntity.getContentType (); 3599 3600 // MHK code (fix for Saxon 5.5.1/007): protect against encoding==null 3601 if (encoding==null) { 3602 temp = -1; 3603 } else { 3604 temp = encoding.indexOf ("charset"); 3605 } 3606 3607 // RFC 2376 sez MIME text defaults to ASCII, but since the 3608 // JDK will create a MIME type out of thin air, we always 3609 // autodetect when there's no explicit charset attribute. 
3610 if (temp < 0) 3611 encoding = null; // autodetect 3612 else { 3613 temp = encoding.indexOf ('=', temp + 7); 3614 encoding = encoding.substring (temp); 3615 if ((temp = encoding.indexOf (';')) > 0) 3616 encoding = encoding.substring (0, temp); 3617 3618 // attributes can have comment fields (RFC 822) 3619 if ((temp = encoding.indexOf ('(')) > 0) 3620 encoding = encoding.substring (0, temp); 3621 // ... and values may be quoted 3622 if ((temp = encoding.indexOf ('"')) > 0) 3623 encoding = encoding.substring (temp + 1, 3624 encoding.indexOf ('"', temp + 2)); 3625 encoding.trim (); 3626 } 3627 } 3628 } 3629 3630 // if we got an external encoding label, use it ... 3631 if (encoding != null) { 3632 this.encoding = ENCODING_EXTERNAL; 3633 setupDecoding (encoding); 3634 ignoreEncoding = true; 3635 3636 // ... else autodetect 3637 } else { 3638 detectEncoding (); 3639 ignoreEncoding = false; 3640 } 3641 3642 // Read any XML or text declaration. 3643 tryEncodingDecl (ignoreEncoding); 3644 } 3645 3646 3647 /** 3648 * Check for an encoding declaration. This is the second part of the 3649 * XML encoding autodetection algorithm, relying on detectEncoding to 3650 * get to the point that this part can read any encoding declaration 3651 * in the document (using only US-ASCII characters). 3652 * 3653 * <p> Because this part starts to fill parser buffers with this data, 3654 * it's tricky to to a reader so that Java's built-in decoders can be 3655 * used for the character encodings that aren't built in to this parser 3656 * (such as EUC-JP, KOI8-R, Big5, etc). 3657 * 3658 * @return any encoding in the declaration, uppercased; or null 3659 * @see detectEncoding 3660 */ 3661 private String tryEncodingDecl (boolean ignoreEncoding) 3662 throws SAXException, IOException 3663 { 3664 // Read the XML/text declaration. 
3665 if (tryRead ("<?xml")) { 3666 dataBufferFlush (); 3667 if (tryWhitespace ()) { 3668 if (inputStack.size () > 0) { 3669 return parseTextDecl (ignoreEncoding); 3670 } else { 3671 return parseXMLDecl (ignoreEncoding); 3672 } 3673 } else { 3674 unread ("xml".toCharArray (), 3); 3675 parsePI (); 3676 } 3677 } 3678 return null; 3679 } 3680 3681 3682 /** 3683 * Attempt to detect the encoding of an entity. 3684 * <p>The trick here (as suggested in the XML standard) is that 3685 * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 3686 * <b>must</b> begin with an XML declaration or an encoding 3687 * declaration; we simply have to look for "<?xml" in various 3688 * encodings. 3689 * <p>This method has no way to distinguish among 8-bit encodings. 3690 * Instead, it sets up for UTF-8, then (possibly) revises its assumption 3691 * later in setupDecoding (). Any ASCII-derived 8-bit encoding 3692 * should work, but most will be rejected later by setupDecoding (). 3693 * <p>I don't currently detect EBCDIC, since I'm concerned that it 3694 * could also be a valid UTF-8 sequence; I'll have to do more checking 3695 * later. 3696 * @see #tryEncoding (byte[], byte, byte, byte, byte) 3697 * @see #tryEncoding (byte[], byte, byte) 3698 * @see #setupDecoding 3699 * @see #read8bitEncodingDeclaration 3700 */ 3701 private void detectEncoding () 3702 throws SAXException, IOException 3703 { 3704 byte signature[] = new byte [4]; 3705 3706 // Read the first four bytes for 3707 // autodetection. 3708 is.mark (4); 3709 is.read (signature); 3710 is.reset (); 3711 3712 // 3713 // FIRST: four byte encodings (who uses these?) 
3714 // 3715 if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, 3716 (byte) 0x00, (byte) 0x3c)) { 3717 // UCS-4 must begin with "<?xml" 3718 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) 3719 encoding = ENCODING_UCS_4_1234; 3720 3721 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00, 3722 (byte) 0x00, (byte) 0x00)) { 3723 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) 3724 encoding = ENCODING_UCS_4_4321; 3725 3726 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00, 3727 (byte) 0x3c, (byte) 0x00)) { 3728 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) 3729 encoding = ENCODING_UCS_4_2143; 3730 3731 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c, 3732 (byte) 0x00, (byte) 0x00)) { 3733 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) 3734 encoding = ENCODING_UCS_4_3412; 3735 3736 // 00 00 fe ff UCS_4_1234 (with BOM) 3737 // ff fe 00 00 UCS_4_4321 (with BOM) 3738 } 3739 3740 // 3741 // SECOND: two byte encodings 3742 // note ... with 1/14/2000 errata the XML spec identifies some 3743 // more "broken UTF-16" autodetection cases, with no XML decl, 3744 // which we don't handle here (that's legal too). 3745 // 3746 else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) { 3747 // UCS-2 with a byte-order marker. (UTF-16) 3748 // 0xfe 0xff: UCS-2, big-endian (12) 3749 encoding = ENCODING_UCS_2_12; 3750 is.read (); is.read (); 3751 3752 } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) { 3753 // UCS-2 with a byte-order marker. 
(UTF-16) 3754 // 0xff 0xfe: UCS-2, little-endian (21) 3755 encoding = ENCODING_UCS_2_21; 3756 is.read (); is.read (); 3757 3758 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c, 3759 (byte) 0x00, (byte) 0x3f)) { 3760 // UTF-16-BE (otherwise, malformed UTF-16) 3761 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark 3762 encoding = ENCODING_UCS_2_12; 3763 error ("no byte-order mark for UCS-2 entity"); 3764 3765 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00, 3766 (byte) 0x3f, (byte) 0x00)) { 3767 // UTF-16-LE (otherwise, malformed UTF-16) 3768 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark 3769 encoding = ENCODING_UCS_2_21; 3770 error ("no byte-order mark for UCS-2 entity"); 3771 } 3772 3773 // 3774 // THIRD: ASCII-derived encodings, fixed and variable lengths 3775 // 3776 else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f, 3777 (byte) 0x78, (byte) 0x6d)) { 3778 // ASCII derived 3779 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) 3780 encoding = ENCODING_UTF_8; 3781 read8bitEncodingDeclaration (); 3782 3783 } else { 3784 // 4c 6f a7 94 ... we don't understand EBCDIC flavors 3785 // ... but we COULD at least kick in some fixed code page 3786 3787 // (default) UTF-8 without encoding/XML declaration 3788 encoding = ENCODING_UTF_8; 3789 } 3790 } 3791 3792 3793 /** 3794 * Check for a four-byte signature. 3795 * <p>Utility routine for detectEncoding (). 3796 * <p>Always looks for some part of "<?XML" in a specific encoding. 3797 * @param sig The first four bytes read. 
3798 * @param b1 The first byte of the signature 3799 * @param b2 The second byte of the signature 3800 * @param b3 The third byte of the signature 3801 * @param b4 The fourth byte of the signature 3802 * @see #detectEncoding 3803 */ 3804 private static boolean tryEncoding ( 3805 byte sig[], byte b1, byte b2, byte b3, byte b4) 3806 { 3807 return (sig [0] == b1 && sig [1] == b2 3808 && sig [2] == b3 && sig [3] == b4); 3809 } 3810 3811 3812 /** 3813 * Check for a two-byte signature. 3814 * <p>Looks for a UCS-2 byte-order mark. 3815 * <p>Utility routine for detectEncoding (). 3816 * @param sig The first four bytes read. 3817 * @param b1 The first byte of the signature 3818 * @param b2 The second byte of the signature 3819 * @see #detectEncoding 3820 */ 3821 private static boolean tryEncoding (byte sig[], byte b1, byte b2) 3822 { 3823 return ((sig [0] == b1) && (sig [1] == b2)); 3824 } 3825 3826 3827 /** 3828 * This method pushes a string back onto input. 3829 * <p>It is useful either as the expansion of an internal entity, 3830 * or for backtracking during the parse. 3831 * <p>Call pushCharArray () to do the actual work. 3832 * @param s The string to push back onto input. 3833 * @see #pushCharArray 3834 */ 3835 private void pushString (String ename, String s) 3836 throws SAXException 3837 { 3838 char ch[] = s.toCharArray (); 3839 pushCharArray (ename, ch, 0, ch.length); 3840 } 3841 3842 3843 /** 3844 * Push a new internal input source. 3845 * <p>This method is useful for expanding an internal entity, 3846 * or for unreading a string of characters. It creates a new 3847 * readBuffer containing the characters in the array, instead 3848 * of characters converted from an input byte stream. 3849 * @param ch The char array to push. 
3850 * @see #pushString 3851 * @see #pushURL 3852 * @see #readBuffer 3853 * @see #sourceType 3854 * @see #pushInput 3855 */ 3856 private void pushCharArray (String ename, char ch[], int start, int length) 3857 throws SAXException 3858 { 3859 // Push the existing status 3860 pushInput (ename); 3861 sourceType = INPUT_INTERNAL; 3862 readBuffer = ch; 3863 readBufferPos = start; 3864 readBufferLength = length; 3865 readBufferOverflow = -1; 3866 } 3867 3868 3869 /** 3870 * Save the current input source onto the stack. 3871 * <p>This method saves all of the global variables associated with 3872 * the current input source, so that they can be restored when a new 3873 * input source has finished. It also tests for entity recursion. 3874 * <p>The method saves the following global variables onto a stack 3875 * using a fixed-length array: 3876 * <ol> 3877 * <li>sourceType 3878 * <li>externalEntity 3879 * <li>readBuffer 3880 * <li>readBufferPos 3881 * <li>readBufferLength 3882 * <li>line 3883 * <li>encoding 3884 * </ol> 3885 * @param ename The name of the entity (if any) causing the new input. 3886 * @see #popInput 3887 * @see #sourceType 3888 * @see #externalEntity 3889 * @see #readBuffer 3890 * @see #readBufferPos 3891 * @see #readBufferLength 3892 * @see #line 3893 * @see #encoding 3894 */ 3895 private void pushInput (String ename) 3896 throws SAXException 3897 { 3898 Object input[] = new Object [12]; 3899 3900 // Check for entity recursion. 3901 if (ename != null) { 3902 Iterator entities = entityStack.iterator (); 3903 while (entities.hasNext ()) { 3904 String e = (String) entities.next (); 3905 if (e == ename) { 3906 error ("recursive reference to entity", ename, null); 3907 } 3908 } 3909 } 3910 entityStack.add (ename); 3911 3912 // Don't bother if there is no current input. 3913 if (sourceType == INPUT_NONE) { 3914 return; 3915 } 3916 3917 // Set up a snapshot of the current 3918 // input source. 
3919 input [0] = new Integer (sourceType); 3920 input [1] = externalEntity; 3921 input [2] = readBuffer; 3922 input [3] = new Integer (readBufferPos); 3923 input [4] = new Integer (readBufferLength); 3924 input [5] = new Integer (line); 3925 input [6] = new Integer (encoding); 3926 input [7] = new Integer (readBufferOverflow); 3927 input [8] = is; 3928 input [9] = new Integer (currentByteCount); 3929 input [10] = new Integer (column); 3930 input [11] = reader; 3931 3932 // Push it onto the stack. 3933 inputStack.add (input); 3934 } 3935 3936 3937 /** 3938 * Restore a previous input source. 3939 * <p>This method restores all of the global variables associated with 3940 * the current input source. 3941 * @exception java.io.EOFException 3942 * If there are no more entries on the input stack. 3943 * @see #pushInput 3944 * @see #sourceType 3945 * @see #externalEntity 3946 * @see #readBuffer 3947 * @see #readBufferPos 3948 * @see #readBufferLength 3949 * @see #line 3950 * @see #encoding 3951 */ 3952 private void popInput () 3953 throws SAXException, IOException 3954 { 3955 Object input[]; 3956 3957 3958 switch (sourceType) { 3959 3960 case INPUT_EXTERNAL: 3961 if (externalEntity != null) { 3962 handler.endExternalEntity ( 3963 externalEntity.getURL ().toString ()); 3964 } 3965 break; 3966 case INPUT_STREAM: 3967 if (baseURI != null) { 3968 handler.endExternalEntity (baseURI); 3969 } 3970 is.close (); 3971 break; 3972 case INPUT_READER: 3973 if (baseURI != null) { 3974 handler.endExternalEntity (baseURI); 3975 } 3976 reader.close (); 3977 break; 3978 } 3979 3980 // Throw an EOFException if there 3981 // is nothing else to pop. 
3982 if (inputStack.isEmpty ()) { 3983 throw new EOFException ("no more input"); 3984 } else { 3985 String s; 3986 input = (Object[]) inputStack.remove ( inputStack.size() - 1 ); 3987 s = (String) entityStack.remove ( entityStack.size() - 1 ); 3988 } 3989 3990 sourceType = ((Integer) input [0]).intValue (); 3991 externalEntity = (URLConnection) input [1]; 3992 readBuffer = (char[]) input [2]; 3993 readBufferPos = ((Integer) input [3]).intValue (); 3994 readBufferLength = ((Integer) input [4]).intValue (); 3995 line = ((Integer) input [5]).intValue (); 3996 encoding = ((Integer) input [6]).intValue (); 3997 readBufferOverflow = ((Integer) input [7]).intValue (); 3998 is = (InputStream) input [8]; 3999 currentByteCount = ((Integer) input [9]).intValue (); 4000 column = ((Integer) input [10]).intValue (); 4001 reader = (Reader) input [11]; 4002 } 4003 4004 4005 /** 4006 * Return true if we can read the expected character. 4007 * <p>Note that the character will be removed from the input stream 4008 * on success, but will be put back on failure. Do not attempt to 4009 * read the character again if the method succeeds. 4010 * @param delim The character that should appear next. For a 4011 * insensitive match, you must supply this in upper-case. 4012 * @return true if the character was successfully read, or false if 4013 * it was not. 4014 * @see #tryRead (String) 4015 */ 4016 private boolean tryRead (char delim) 4017 throws SAXException, IOException 4018 { 4019 char c; 4020 4021 // Read the character 4022 c = readCh (); 4023 4024 // Test for a match, and push the character 4025 // back if the match fails. 4026 if (c == delim) { 4027 return true; 4028 } else { 4029 unread (c); 4030 return false; 4031 } 4032 } 4033 4034 4035 /** 4036 * Return true if we can read the expected string. 4037 * <p>This is simply a convenience method. 4038 * <p>Note that the string will be removed from the input stream 4039 * on success, but will be put back on failure. 
Do not attempt to 4040 * read the string again if the method succeeds. 4041 * <p>This method will push back a character rather than an 4042 * array whenever possible (probably the majority of cases). 4043 * <p><b>NOTE:</b> This method currently has a hard-coded limit 4044 * of 100 characters for the delimiter. 4045 * @param delim The string that should appear next. 4046 * @return true if the string was successfully read, or false if 4047 * it was not. 4048 * @see #tryRead (char) 4049 */ 4050 private boolean tryRead (String delim) 4051 throws SAXException, IOException 4052 { 4053 char ch[] = delim.toCharArray (); 4054 char c; 4055 4056 // Compare the input, character- 4057 // by character. 4058 4059 for (int i = 0; i < ch.length; i++) { 4060 c = readCh (); 4061 if (c != ch [i]) { 4062 unread (c); 4063 if (i != 0) { 4064 unread (ch, i); 4065 } 4066 return false; 4067 } 4068 } 4069 return true; 4070 } 4071 4072 4073 4074 /** 4075 * Return true if we can read some whitespace. 4076 * <p>This is simply a convenience method. 4077 * <p>This method will push back a character rather than an 4078 * array whenever possible (probably the majority of cases). 4079 * @return true if whitespace was found. 4080 */ 4081 private boolean tryWhitespace () 4082 throws SAXException, IOException 4083 { 4084 char c; 4085 c = readCh (); 4086 if (isWhitespace (c)) { 4087 skipWhitespace (); 4088 return true; 4089 } else { 4090 unread (c); 4091 return false; 4092 } 4093 } 4094 4095 4096 /** 4097 * Read all data until we find the specified string. 4098 * This is useful for scanning CDATA sections and PIs. 4099 * <p>This is inefficient right now, since it calls tryRead () 4100 * for every character. 
4101 * @param delim The string delimiter 4102 * @see #tryRead (String, boolean) 4103 * @see #readCh 4104 */ 4105 private void parseUntil (String delim) 4106 throws SAXException, IOException 4107 { 4108 char c; 4109 int startLine = line; 4110 4111 try { 4112 while (!tryRead (delim)) { 4113 c = readCh (); 4114 dataBufferAppend (c); 4115 } 4116 } catch (EOFException e) { 4117 error ("end of input while looking for delimiter " 4118 + "(started on line " + startLine 4119 + ')', null, delim); 4120 } 4121 } 4122 4123 4124 /** 4125 * Read just the encoding declaration (or XML declaration) at the 4126 * start of an external entity. 4127 * When this method is called, we know that the declaration is 4128 * present (or appears to be). We also know that the entity is 4129 * in some sort of ASCII-derived 8-bit encoding. 4130 * The idea of this is to let us read what the 8-bit encoding is 4131 * before we've committed to converting any more of the file; the 4132 * XML or encoding declaration must be in 7-bit ASCII, so we're 4133 * safe as long as we don't go past it. 4134 */ 4135 private void read8bitEncodingDeclaration () 4136 throws SAXException, IOException 4137 { 4138 int ch; 4139 readBufferPos = readBufferLength = 0; 4140 4141 while (true) { 4142 ch = is.read (); 4143 readBuffer [readBufferLength++] = (char) ch; 4144 switch (ch) { 4145 case (int) '>': 4146 return; 4147 case - 1: 4148 error ("end of file before end of XML or encoding declaration.", 4149 null, "?>"); 4150 } 4151 if (readBuffer.length == readBufferLength) 4152 error ("unfinished XML or encoding declaration"); 4153 } 4154 } 4155 4156 4157 ////////////////////////////////////////////////////////////////////// 4158 // Low-level I/O. 4159 ////////////////////////////////////////////////////////////////////// 4160 4161 4162 /** 4163 * Read a chunk of data from an external input source. 4164 * <p>This is simply a front-end that fills the rawReadBuffer 4165 * with bytes, then calls the appropriate encoding handler. 
4166 * @see #encoding 4167 * @see #rawReadBuffer 4168 * @see #readBuffer 4169 * @see #filterCR 4170 * @see #copyUtf8ReadBuffer 4171 * @see #copyIso8859_1ReadBuffer 4172 * @see #copyUcs_2ReadBuffer 4173 * @see #copyUcs_4ReadBuffer 4174 */ 4175 private void readDataChunk () 4176 throws SAXException, IOException 4177 { 4178 int count, i, j; 4179 4180 // See if we have any overflow (filterCR sets for CR at end) 4181 if (readBufferOverflow > -1) { 4182 readBuffer [0] = (char) readBufferOverflow; 4183 readBufferOverflow = -1; 4184 readBufferPos = 1; 4185 sawCR = true; 4186 } else { 4187 readBufferPos = 0; 4188 sawCR = false; 4189 } 4190 4191 // input from a character stream. 4192 if (sourceType == INPUT_READER) { 4193 count = reader.read (readBuffer, 4194 readBufferPos, READ_BUFFER_MAX - readBufferPos); 4195 if (count < 0) 4196 readBufferLength = readBufferPos; 4197 else 4198 readBufferLength = readBufferPos + count; 4199 if (readBufferLength > 0) 4200 filterCR (count >= 0); 4201 sawCR = false; 4202 return; 4203 } 4204 4205 // Read as many bytes as possible into the raw buffer. 4206 count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX); 4207 4208 // Dispatch to an encoding-specific reader method to populate 4209 // the readBuffer. In most parser speed profiles, these routines 4210 // show up at the top of the CPU usage chart. 
4211 if (count > 0) { 4212 switch (encoding) { 4213 // one byte builtins 4214 case ENCODING_ASCII: 4215 copyIso8859_1ReadBuffer (count, (char) 0x0080); 4216 break; 4217 case ENCODING_UTF_8: 4218 copyUtf8ReadBuffer (count); 4219 break; 4220 case ENCODING_ISO_8859_1: 4221 copyIso8859_1ReadBuffer (count, (char) 0); 4222 break; 4223 4224 // two byte builtins 4225 case ENCODING_UCS_2_12: 4226 copyUcs2ReadBuffer (count, 8, 0); 4227 break; 4228 case ENCODING_UCS_2_21: 4229 copyUcs2ReadBuffer (count, 0, 8); 4230 break; 4231 4232 // four byte builtins 4233 case ENCODING_UCS_4_1234: 4234 copyUcs4ReadBuffer (count, 24, 16, 8, 0); 4235 break; 4236 case ENCODING_UCS_4_4321: 4237 copyUcs4ReadBuffer (count, 0, 8, 16, 24); 4238 break; 4239 case ENCODING_UCS_4_2143: 4240 copyUcs4ReadBuffer (count, 16, 24, 0, 8); 4241 break; 4242 case ENCODING_UCS_4_3412: 4243 copyUcs4ReadBuffer (count, 8, 0, 24, 16); 4244 break; 4245 } 4246 } else 4247 readBufferLength = readBufferPos; 4248 4249 readBufferPos = 0; 4250 4251 // Filter out all carriage returns if we've seen any 4252 // (including any saved from a previous read) 4253 if (sawCR) { 4254 filterCR (count >= 0); 4255 sawCR = false; 4256 4257 // must actively report EOF, lest some CRs get lost. 4258 if (readBufferLength == 0 && count >= 0) 4259 readDataChunk (); 4260 } 4261 4262 if (count > 0) 4263 currentByteCount += count; 4264 } 4265 4266 4267 /** 4268 * Filter carriage returns in the read buffer. 4269 * CRLF becomes LF; CR becomes LF. 
     * @param moreData true iff more data might come from the same source
     * @see #readDataChunk
     * @see #readBuffer
     * @see #readBufferOverflow
     */
    private void filterCR (boolean moreData)
    {
        // i is the write position, j the read position; the buffer is
        // compacted in place, so i never runs ahead of j.
        int i, j;

        readBufferOverflow = -1;

loop:
        for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
            switch (readBuffer [j]) {
            case '\r':
                if (j == readBufferLength - 1) {
                    // CR is the last character: we can't yet tell
                    // whether an LF follows in the next chunk.
                    if (moreData) {
                        // Hold it back; readDataChunk () re-inserts it
                        // at the front of the next chunk.
                        readBufferOverflow = '\r';
                        readBufferLength--;
                    } else	// CR at end of buffer
                        readBuffer [i++] = '\n';
                    // Leave without the loop's i++, so that i is the
                    // final length in both cases.
                    break loop;
                } else if (readBuffer [j + 1] == '\n') {
                    // CRLF: skip the CR, the LF is written below
                    j++;
                }
                readBuffer [i] = '\n';
                break;

            case '\n':
            default:
                readBuffer [i] = readBuffer [j];
                break;
            }
        }
        // This assignment determines the final length (it supersedes the
        // decrement made in the held-back-CR case above).
        readBufferLength = i;
    }

    /**
     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
     * <p>When readDataChunk () calls this method, the raw bytes are in
     * rawReadBuffer, and the final characters will appear in
     * readBuffer, starting at readBufferPos.  readBufferLength is set
     * to the index after the last character written, and sawCR is set
     * if any decoded character is a carriage return.
     * @param count The number of bytes to convert.
     * @see #readDataChunk
     * @see #rawReadBuffer
     * @see #readBuffer
     * @see #getNextUtf8Byte
     */
    private void copyUtf8ReadBuffer (int count)
    throws SAXException, IOException
    {
        int i = 0;			// next byte to take from rawReadBuffer
        int j = readBufferPos;		// next slot to fill in readBuffer
        int b1;				// lead byte (sign-extended)
        char c = 0;

        /*
        // check once, so the runtime won't (if it's smart enough)
        if (count < 0 || count > rawReadBuffer.length)
            throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
        */

        while (i < count) {
            b1 = rawReadBuffer [i++];

            // Determine whether we are dealing
            // with a one-, two-, three-, or four-
            // byte sequence.  (A negative byte has its high bit set.)
            if (b1 < 0) {
                if ((b1 & 0xe0) == 0xc0) {
                    // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
                    c = (char) (((b1 & 0x1f) << 6)
                                | getNextUtf8Byte (i++, count));
                } else if ((b1 & 0xf0) == 0xe0) {
                    // 3-byte sequence:
                    // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
                    // most CJKV characters
                    c = (char) (((b1 & 0x0f) << 12) |
                                   (getNextUtf8Byte (i++, count) << 6) |
                                   getNextUtf8Byte (i++, count));
                } else if ((b1 & 0xf8) == 0xf0) {
                    // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
                    //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                    // (uuuuu = wwww + 1)
                    // "Surrogate Pairs" ... from the "Astral Planes"
                    int iso646 = b1 & 07;
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);

                    if (iso646 <= 0xffff) {
                        // fits in one char after all
                        c = (char) iso646;
                    } else {
                        if (iso646 > 0x0010ffff)
                            encodingError (
                                "UTF-8 value out of range for Unicode",
                                iso646, 0);
                        // Emit a high/low surrogate pair and skip the
                        // single-char store (and CR check) below.
                        iso646 -= 0x010000;
                        readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
                        readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
                        continue;
                    }
                } else {
                    // The five and six byte encodings aren't supported;
                    // they exceed the Unicode (and XML) range.
                    // This branch is also taken for a continuation byte
                    // (10xxxxxx) found where a lead byte was expected.
                    encodingError (
                            "unsupported five or six byte UTF-8 sequence",
                            0xff & b1, i);
                    // NOTREACHED (provided encodingError () throws)
                    c = 0;
                }
            } else {
                // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
                // (US-ASCII character, "common" case, one branch to here)
                c = (char) b1;
            }
            readBuffer [j++] = c;
            if (c == '\r')
                sawCR = true;
        }
        // How many characters have we read?
        readBufferLength = j;
    }


    /**
     * Return the next byte value in a UTF-8 sequence.
     * If it is not possible to get a byte from the current
     * entity, throw an exception.
4399 * @param pos The current position in the rawReadBuffer. 4400 * @param count The number of bytes in the rawReadBuffer 4401 * @return The significant six bits of a non-initial byte in 4402 * a UTF-8 sequence. 4403 * @exception EOFException If the sequence is incomplete. 4404 */ 4405 private int getNextUtf8Byte (int pos, int count) 4406 throws SAXException, IOException 4407 { 4408 int val; 4409 4410 // Take a character from the buffer 4411 // or from the actual input stream. 4412 if (pos < count) { 4413 val = rawReadBuffer [pos]; 4414 } else { 4415 val = is.read (); 4416 if (val == -1) { 4417 encodingError ("unfinished multi-byte UTF-8 sequence at EOF", 4418 -1, pos); 4419 } 4420 } 4421 4422 // Check for the correct bits at the start. 4423 if ((val & 0xc0) != 0x80) { 4424 encodingError ("bad continuation of multi-byte UTF-8 sequence", 4425 val, pos + 1); 4426 } 4427 4428 // Return the significant bits. 4429 return (val & 0x3f); 4430 } 4431 4432 4433 /** 4434 * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into 4435 * UTF-16 characters. 4436 * 4437 * <p>When readDataChunk () calls this method, the raw bytes are in 4438 * rawReadBuffer, and the final characters will appear in 4439 * readBuffer. 4440 * 4441 * @param count The number of bytes to convert. 4442 * @param mask For ASCII conversion, 0x7f; else, 0xff. 
4443 * @see #readDataChunk 4444 * @see #rawReadBuffer 4445 * @see #readBuffer 4446 */ 4447 private void copyIso8859_1ReadBuffer (int count, char mask) 4448 throws IOException 4449 { 4450 int i, j; 4451 for (i = 0, j = readBufferPos; i < count; i++, j++) { 4452 char c = (char) (rawReadBuffer [i] & 0xff); 4453 if ((c & mask) != 0) 4454 throw new CharConversionException ("non-ASCII character U+" 4455 + Integer.toHexString (c)); 4456 readBuffer [j] = c; 4457 if (c == '\r') { 4458 sawCR = true; 4459 } 4460 } 4461 readBufferLength = j; 4462 } 4463 4464 4465 /** 4466 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters 4467 * (as used in Java string manipulation). 4468 * 4469 * <p>When readDataChunk () calls this method, the raw bytes are in 4470 * rawReadBuffer, and the final characters will appear in 4471 * readBuffer. 4472 * @param count The number of bytes to convert. 4473 * @param shift1 The number of bits to shift byte 1. 4474 * @param shift2 The number of bits to shift byte 2 4475 * @see #readDataChunk 4476 * @see #rawReadBuffer 4477 * @see #readBuffer 4478 */ 4479 private void copyUcs2ReadBuffer (int count, int shift1, int shift2) 4480 throws SAXException 4481 { 4482 int j = readBufferPos; 4483 4484 if (count > 0 && (count % 2) != 0) { 4485 encodingError ("odd number of bytes in UCS-2 encoding", -1, count); 4486 } 4487 // The loops are faster with less internal brancing; hence two 4488 if (shift1 == 0) { // "UTF-16-LE" 4489 for (int i = 0; i < count; i += 2) { 4490 char c = (char) (rawReadBuffer [i + 1] << 8); 4491 c |= 0xff & rawReadBuffer [i]; 4492 readBuffer [j++] = c; 4493 if (c == '\r') 4494 sawCR = true; 4495 } 4496 } else { // "UTF-16-BE" 4497 for (int i = 0; i < count; i += 2) { 4498 char c = (char) (rawReadBuffer [i] << 8); 4499 c |= 0xff & rawReadBuffer [i + 1]; 4500 readBuffer [j++] = c; 4501 if (c == '\r') 4502 sawCR = true; 4503 } 4504 } 4505 readBufferLength = j; 4506 } 4507 4508 4509 /** 4510 * Convert a buffer of UCS-4-encoded bytes 
into UTF-16 characters. 4511 * 4512 * <p>When readDataChunk () calls this method, the raw bytes are in 4513 * rawReadBuffer, and the final characters will appear in 4514 * readBuffer. 4515 * <p>Java has Unicode chars, and this routine uses surrogate pairs 4516 * for ISO-10646 values between 0x00010000 and 0x000fffff. An 4517 * exception is thrown if the ISO-10646 character has no Unicode 4518 * representation. 4519 * 4520 * @param count The number of bytes to convert. 4521 * @param shift1 The number of bits to shift byte 1. 4522 * @param shift2 The number of bits to shift byte 2 4523 * @param shift3 The number of bits to shift byte 2 4524 * @param shift4 The number of bits to shift byte 2 4525 * @see #readDataChunk 4526 * @see #rawReadBuffer 4527 * @see #readBuffer 4528 */ 4529 private void copyUcs4ReadBuffer (int count, int shift1, int shift2, 4530 int shift3, int shift4) 4531 throws SAXException 4532 { 4533 int j = readBufferPos; 4534 int value; 4535 4536 if (count > 0 && (count % 4) != 0) { 4537 encodingError ( 4538 "number of bytes in UCS-4 encoding not divisible by 4", 4539 -1, count); 4540 } 4541 for (int i = 0; i < count; i += 4) { 4542 value = (((rawReadBuffer [i] & 0xff) << shift1) | 4543 ((rawReadBuffer [i + 1] & 0xff) << shift2) | 4544 ((rawReadBuffer [i + 2] & 0xff) << shift3) | 4545 ((rawReadBuffer [i + 3] & 0xff) << shift4)); 4546 if (value < 0x0000ffff) { 4547 readBuffer [j++] = (char) value; 4548 if (value == (int) '\r') { 4549 sawCR = true; 4550 } 4551 } else if (value < 0x0010ffff) { 4552 value -= 0x010000; 4553 readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); 4554 readBuffer [j++] = (char) (0xdc | (value & 0x03ff)); 4555 } else { 4556 encodingError ("UCS-4 value out of range for Unicode", 4557 value, i); 4558 } 4559 } 4560 readBufferLength = j; 4561 } 4562 4563 4564 /** 4565 * Report a character encoding error. 
4566 */ 4567 private void encodingError (String message, int value, int offset) 4568 throws SAXException 4569 { 4570 String uri; 4571 4572 if (value != -1) { 4573 message = message + " (character code: 0x" + 4574 Integer.toHexString (value) + ')'; 4575 } 4576 if (externalEntity != null) { 4577 uri = externalEntity.getURL ().toString (); 4578 } else { 4579 uri = baseURI; 4580 } 4581 handler.error (message, uri, -1, offset + currentByteCount); 4582 } 4583 4584 4585 ////////////////////////////////////////////////////////////////////// 4586 // Local Variables. 4587 ////////////////////////////////////////////////////////////////////// 4588 4589 /** 4590 * Re-initialize the variables for each parse. 4591 */ 4592 private void initializeVariables () 4593 { 4594 // First line 4595 line = 1; 4596 column = 0; 4597 4598 // Set up the buffers for data and names 4599 dataBufferPos = 0; 4600 dataBuffer = new char [DATA_BUFFER_INITIAL]; 4601 nameBufferPos = 0; 4602 nameBuffer = new char [NAME_BUFFER_INITIAL]; 4603 4604 // Set up the DTD hash tables 4605 elementInfo = new HashMap (); 4606 entityInfo = new HashMap (); 4607 notationInfo = new HashMap (); 4608 4609 // Set up the variables for the current 4610 // element context. 4611 currentElement = null; 4612 currentElementContent = CONTENT_UNDECLARED; 4613 4614 // Set up the input variables 4615 sourceType = INPUT_NONE; 4616 inputStack = new ArrayList (); 4617 entityStack = new ArrayList (); 4618 externalEntity = null; 4619 tagAttributePos = 0; 4620 tagAttributes = new String [100]; 4621 rawReadBuffer = new byte [READ_BUFFER_MAX]; 4622 readBufferOverflow = -1; 4623 4624 inLiteral = false; 4625 expandPE = false; 4626 peIsError = false; 4627 4628 inCDATA = false; 4629 4630 symbolTable = new Object [SYMBOL_TABLE_LENGTH][]; 4631 } 4632 4633 4634 /** 4635 * Clean up after the parse to allow some garbage collection. 
4636 */ 4637 private void cleanupVariables () 4638 { 4639 dataBuffer = null; 4640 nameBuffer = null; 4641 4642 elementInfo = null; 4643 entityInfo = null; 4644 notationInfo = null; 4645 4646 currentElement = null; 4647 4648 inputStack = null; 4649 entityStack = null; 4650 externalEntity = null; 4651 4652 tagAttributes = null; 4653 rawReadBuffer = null; 4654 4655 symbolTable = null; 4656 } 4657 4658 // 4659 // The current XML handler interface. 4660 // 4661 private SAXDriver handler; 4662 4663 // 4664 // I/O information. 4665 // 4666 private Reader reader; // current reader 4667 private InputStream is; // current input stream 4668 private int line; // current line number 4669 private int column; // current column number 4670 private int sourceType; // type of input source 4671 private ArrayList inputStack; // stack of input soruces 4672 private URLConnection externalEntity; // current external entity 4673 private int encoding; // current character encoding 4674 private int currentByteCount; // bytes read from current source 4675 4676 // 4677 // Buffers for decoded but unparsed character input. 4678 // 4679 private char readBuffer []; 4680 private int readBufferPos; 4681 private int readBufferLength; 4682 private int readBufferOverflow; // overflow from last data chunk. 4683 4684 4685 // 4686 // Buffer for undecoded raw byte input. 4687 // 4688 private final static int READ_BUFFER_MAX = 16384; 4689 private byte rawReadBuffer []; 4690 4691 4692 // 4693 // Buffer for parsed character data. 4694 // 4695 private static int DATA_BUFFER_INITIAL = 4096; 4696 private char dataBuffer []; 4697 private int dataBufferPos; 4698 4699 // 4700 // Buffer for parsed names. 4701 // 4702 private static int NAME_BUFFER_INITIAL = 1024; 4703 private char nameBuffer []; 4704 private int nameBufferPos; 4705 4706 4707 // 4708 // HashMaps for DTD information on elements, entities, and notations. 
4709 // 4710 private HashMap elementInfo; 4711 private HashMap entityInfo; 4712 private HashMap notationInfo; 4713 4714 4715 // 4716 // Element type currently in force. 4717 // 4718 private String currentElement; 4719 private int currentElementContent; 4720 4721 // 4722 // Base external identifiers for resolution. 4723 // 4724 private String basePublicId; 4725 private String baseURI; 4726 private int baseEncoding; 4727 private Reader baseReader; 4728 private InputStream baseInputStream; 4729 private char baseInputBuffer []; 4730 private int baseInputBufferStart; 4731 private int baseInputBufferLength; 4732 4733 // 4734 // Stack of entity names, to detect recursion. 4735 // 4736 private ArrayList entityStack; 4737 4738 // 4739 // PE expansion is enabled in most chunks of the DTD, not all. 4740 // When it's enabled, literals are treated differently. 4741 // 4742 private boolean inLiteral; 4743 private boolean expandPE; 4744 private boolean peIsError; 4745 4746 // 4747 // Symbol table, for caching interned names. 4748 // 4749 private final static int SYMBOL_TABLE_LENGTH = 1087; 4750 private Object symbolTable [][]; 4751 4752 // 4753 // Hash table of attributes found in current start tag. 4754 // 4755 private String tagAttributes []; 4756 private int tagAttributePos; 4757 4758 // 4759 // Utility flag: have we noticed a CR while reading the last 4760 // data chunk? If so, we will have to go back and normalise 4761 // CR or CR/LF line ends. 4762 // 4763 private boolean sawCR; 4764 4765 // 4766 // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. 4767 // 4768 private boolean inCDATA; 4769}