Source code

001/*
002 * Copyright (c) 1999-2000 by David Brownell.  All Rights Reserved.
003 *
004 * This program is open source software; you may use, copy, modify, and
005 * redistribute it under the terms of the LICENSE with which it was
006 * originally distributed.
007 *
008 * This program is distributed in the hope that it will be useful,
009 * but WITHOUT ANY WARRANTY; without even the implied warranty of
010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
011 * LICENSE for more details.
012 */
013
014//
015// Copyright (c) 1997, 1998 by Microstar Software Ltd.
016// From Microstar's README (the entire original license):
017//
018// AElfred is free for both commercial and non-commercial use and
019// redistribution, provided that Microstar's copyright and disclaimer are
020// retained intact.  You are free to modify AElfred for your own use and
021// to redistribute AElfred with your modifications, provided that the
022// modifications are clearly documented.
023//
024// This program is distributed in the hope that it will be useful, but
025// WITHOUT ANY WARRANTY; without even the implied warranty of
026// merchantability or fitness for a particular purpose.  Please use it AT
027// YOUR OWN RISK.
028//
029
030
031package org.dom4j.io.aelfred;
032
033import java.io.BufferedInputStream;
034import java.io.CharConversionException;
035import java.io.EOFException;
036import java.io.InputStream;
037import java.io.InputStreamReader;
038import java.io.IOException;
039import java.io.Reader;
040import java.net.URL;
041import java.net.URLConnection;
042import java.util.ArrayList;
043import java.util.HashMap;
044import java.util.Iterator;
045
046import org.xml.sax.SAXException;
047
048
049// $Id: XmlParser.java,v 1.4 2002/02/01 10:55:25 jstrachan Exp $
050
051/**
052 * Parse XML documents and return parse events through call-backs.
053 * Use the <code>SAXDriver</code> class as your entry point, as the
054 * internal parser interfaces are subject to change.
055 *
056 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
057 *      (version 1.2a with bugfixes)
058 * @author Updated by David Brownell &lt;david-b@pacbell.net&gt;
059 * @version $Date: 2002/02/01 10:55:25 $
060 * @see SAXDriver
061 */
062final class XmlParser
063{
064    //
065    // Use special cheats that speed up the code by
066    // avoiding per-character readCh () method calls.
067    //
068    private final static boolean USE_CHEATS = true;
069
070
071    //////////////////////////////////////////////////////////////////////
072    // Constructors.
073    ////////////////////////////////////////////////////////////////////////
074
075
076    /**
077     * Construct a new parser with no associated handler.
078     * @see #setHandler
079     * @see #parse
080     */
081    // package private
082    XmlParser ()
083    {
084            cleanupVariables ();
085    }
086
087
088    /**
089     * Set the handler that will receive parsing events.
090     * @param handler The handler to receive callback events.
091     * @see #parse
092     */
093    // package private
094    void setHandler (SAXDriver handler)
095    {
096            this.handler = handler;
097    }
098
099
100    /**
101     * Parse an XML document from the character stream, byte stream, or URI
102     * that you provide (in that order of preference).  Any URI that you
103     * supply will become the base URI for resolving relative URI, and may
104     * be used to acquire a reader or byte stream.
105     *
106     * <p>You may parse more than one document, but that must be done
107     * sequentially.  Only one thread at a time may use this parser.
108     *
109     * @param systemId The URI of the document; should never be null,
110     *  but may be so iff a reader <em>or</em> a stream is provided.
111     * @param publicId The public identifier of the document, or null.
112     * @param reader A character stream; must be null if stream isn't.
113     * @param stream A byte input stream; must be null if reader isn't.
114     * @param encoding The suggested encoding, or null if unknown.
115     * @exception java.lang.Exception Basically SAXException or IOException
116     */
117    // package private 
118    void doParse (
119        String          systemId,
120        String          publicId,
121        Reader          reader,
122        InputStream     stream,
123        String          encoding
124    ) throws Exception
125    {
126        if (handler == null)
127            throw new IllegalStateException ("no callback handler");
128
129        basePublicId = publicId;
130        baseURI = systemId;
131        baseReader = reader;
132        baseInputStream = stream;
133
134        initializeVariables ();
135
136        // predeclare the built-in entities here (replacement texts)
137        // we don't need to intern(), since we're guaranteed literals
138        // are always (globally) interned.
139        setInternalEntity ("amp", "&#38;");
140        setInternalEntity ("lt", "&#60;");
141        setInternalEntity ("gt", "&#62;");
142        setInternalEntity ("apos", "&#39;");
143        setInternalEntity ("quot", "&#34;");
144
145        handler.startDocument ();
146
147        pushURL ("[document]", basePublicId, baseURI,
148                baseReader, baseInputStream, encoding);
149
150        try {
151            parseDocument ();
152            handler.endDocument ();
153        } finally {
154            if (baseReader != null)
155                try { baseReader.close ();
156                } catch (IOException e) { /* ignore */ }
157            if (baseInputStream != null)
158                try { baseInputStream.close ();
159                } catch (IOException e) { /* ignore */ }
160            if (is != null)
161                try { is.close ();
162                } catch (IOException e) { /* ignore */ }
163            if (reader != null)
164                try {
165                    reader.close ();
166                } catch (IOException e) { /* ignore */
167                }
168            cleanupVariables ();
169        }
170    }
171
172
173    ////////////////////////////////////////////////////////////////////////
174    // Constants.
175    ////////////////////////////////////////////////////////////////////////
176
177    //
178    // Constants for element content type.
179    //
180
181    /**
182     * Constant: an element has not been declared.
183     * @see #getElementContentType
184     */
185    public final static int CONTENT_UNDECLARED = 0;
186
187    /**
188     * Constant: the element has a content model of ANY.
189     * @see #getElementContentType
190     */
191    public final static int CONTENT_ANY = 1;
192
193    /**
194     * Constant: the element has declared content of EMPTY.
195     * @see #getElementContentType
196     */
197    public final static int CONTENT_EMPTY = 2;
198
199    /**
200     * Constant: the element has mixed content.
201     * @see #getElementContentType
202     */
203    public final static int CONTENT_MIXED = 3;
204
205    /**
206     * Constant: the element has element content.
207     * @see #getElementContentType
208     */
209    public final static int CONTENT_ELEMENTS = 4;
210
211
212    //
213    // Constants for the entity type.
214    //
215
216    /**
217     * Constant: the entity has not been declared.
218     * @see #getEntityType
219     */
220    public final static int ENTITY_UNDECLARED = 0;
221
222    /**
223     * Constant: the entity is internal.
224     * @see #getEntityType
225     */
226    public final static int ENTITY_INTERNAL = 1;
227
228    /**
229     * Constant: the entity is external, non-XML data.
230     * @see #getEntityType
231     */
232    public final static int ENTITY_NDATA = 2;
233
234    /**
235     * Constant: the entity is external XML data.
236     * @see #getEntityType
237     */
238    public final static int ENTITY_TEXT = 3;
239
240
241    //
242    // Constants for attribute type.
243    //
244
245    /**
246     * Constant: the attribute has not been declared for this element type.
247     * @see #getAttributeType
248     */
249    public final static int ATTRIBUTE_UNDECLARED = 0;
250
251    /**
252     * Constant: the attribute value is a string value.
253     * @see #getAttributeType
254     */
255    public final static int ATTRIBUTE_CDATA = 1;
256
257    /**
258     * Constant: the attribute value is a unique identifier.
259     * @see #getAttributeType
260     */
261    public final static int ATTRIBUTE_ID = 2;
262
263    /**
264     * Constant: the attribute value is a reference to a unique identifier.
265     * @see #getAttributeType
266     */
267    public final static int ATTRIBUTE_IDREF = 3;
268
269    /**
270     * Constant: the attribute value is a list of ID references.
271     * @see #getAttributeType
272     */
273    public final static int ATTRIBUTE_IDREFS = 4;
274
275    /**
276     * Constant: the attribute value is the name of an entity.
277     * @see #getAttributeType
278     */
279    public final static int ATTRIBUTE_ENTITY = 5;
280
281    /**
282     * Constant: the attribute value is a list of entity names.
283     * @see #getAttributeType
284     */
285    public final static int ATTRIBUTE_ENTITIES = 6;
286
287    /**
288     * Constant: the attribute value is a name token.
289     * @see #getAttributeType
290     */
291    public final static int ATTRIBUTE_NMTOKEN = 7;
292
293    /**
294     * Constant: the attribute value is a list of name tokens.
295     * @see #getAttributeType
296     */
297    public final static int ATTRIBUTE_NMTOKENS = 8;
298
299    /**
300     * Constant: the attribute value is a token from an enumeration.
301     * @see #getAttributeType
302     */
303    public final static int ATTRIBUTE_ENUMERATED = 9;
304
305    /**
306     * Constant: the attribute is the name of a notation.
307     * @see #getAttributeType
308     */
309    public final static int ATTRIBUTE_NOTATION = 10;
310
311
312    //
313    // When the class is loaded, populate the hash table of
314    // attribute types.
315    //
316
317    /**
318     * Hash table of attribute types.
319     */
320    private static HashMap attributeTypeHash;
321    static {
322        attributeTypeHash = new HashMap (13);
323        attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
324        attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
325        attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
326        attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
327        attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
328        attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
329        attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
330        attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
331        attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
332    }
333
334
335    //
336    // Constants for supported encodings.  "external" is just a flag.
337    //
338    private final static int ENCODING_EXTERNAL = 0;
339    private final static int ENCODING_UTF_8 = 1;
340    private final static int ENCODING_ISO_8859_1 = 2;
341    private final static int ENCODING_UCS_2_12 = 3;
342    private final static int ENCODING_UCS_2_21 = 4;
343    private final static int ENCODING_UCS_4_1234 = 5;
344    private final static int ENCODING_UCS_4_4321 = 6;
345    private final static int ENCODING_UCS_4_2143 = 7;
346    private final static int ENCODING_UCS_4_3412 = 8;
347    private final static int ENCODING_ASCII = 9;
348
349
350    //
351    // Constants for attribute default value.
352    //
353
354    /**
355     * Constant: the attribute is not declared.
356     * @see #getAttributeDefaultValueType
357     */
358    public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
359
360    /**
361     * Constant: the attribute has a literal default value specified.
362     * @see #getAttributeDefaultValueType
363     * @see #getAttributeDefaultValue
364     */
365    public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
366
367    /**
368     * Constant: the attribute was declared #IMPLIED.
369     * @see #getAttributeDefaultValueType
370     */
371    public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
372
373    /**
374     * Constant: the attribute was declared #REQUIRED.
375     * @see #getAttributeDefaultValueType
376     */
377    public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
378
379    /**
380     * Constant: the attribute was declared #FIXED.
381     * @see #getAttributeDefaultValueType
382     * @see #getAttributeDefaultValue
383     */
384    public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
385
386
387    //
388    // Constants for input.
389    //
390    private final static int INPUT_NONE = 0;
391    private final static int INPUT_INTERNAL = 1;
392    private final static int INPUT_EXTERNAL = 2;
393    private final static int INPUT_STREAM = 3;
394    private final static int INPUT_BUFFER = 4;
395    private final static int INPUT_READER = 5;
396
397
398    //
399    // Flags for reading literals.
400    //
401        // expand general entity refs (attribute values in dtd and content)
402    private final static int LIT_ENTITY_REF = 2;
403        // normalize this value (whitespace etc) (attributes, public ids)
404    private final static int LIT_NORMALIZE = 4;
405        // literal is an attribute value 
406    private final static int LIT_ATTRIBUTE = 8;
407        // don't expand parameter entities
408    private final static int LIT_DISABLE_PE = 16;
409        // don't expand [or parse] character refs
410    private final static int LIT_DISABLE_CREF = 32;
411        // don't parse general entity refs
412    private final static int LIT_DISABLE_EREF = 64;
413        // don't expand general entities, but make sure we _could_
414    private final static int LIT_ENTITY_CHECK = 128;
415
416
417    //
418    // Flags affecting PE handling in DTDs (if expandPE is true).
419    // PEs expand with space padding, except inside literals.
420    //
421    private final static int CONTEXT_NORMAL = 0;
422    private final static int CONTEXT_LITERAL = 1;
423
424
425    //////////////////////////////////////////////////////////////////////
426    // Error reporting.
427    //////////////////////////////////////////////////////////////////////
428
429
430    /**
431     * Report an error.
432     * @param message The error message.
433     * @param textFound The text that caused the error (or null).
434     * @see SAXDriver#error
435     * @see #line
436     */
437    private void error (String message, String textFound, String textExpected)
438    throws SAXException
439    {
440        if (textFound != null) {
441            message = message + " (found \"" + textFound + "\")";
442        }
443        if (textExpected != null) {
444            message = message + " (expected \"" + textExpected + "\")";
445        }
446        String uri = null;
447
448        if (externalEntity != null) {
449            uri = externalEntity.getURL ().toString ();
450        }
451        handler.error (message, uri, line, column);
452
453        // "can't happen"
454        throw new SAXException (message);
455    }
456
457
458    /**
459     * Report a serious error.
460     * @param message The error message.
461     * @param textFound The text that caused the error (or null).
462     */
463    private void error (String message, char textFound, String textExpected)
464    throws SAXException
465    {
466        error (message, new Character (textFound).toString (), textExpected);
467    }
468
469    /** Report typical case fatal errors. */
470    private void error (String message)
471    throws SAXException
472    {
473        error (message, null, null);
474    }
475
476
477    //////////////////////////////////////////////////////////////////////
478    // Major syntactic productions.
479    //////////////////////////////////////////////////////////////////////
480
481
482    /**
483     * Parse an XML document.
484     * <pre>
485     * [1] document ::= prolog element Misc*
486     * </pre>
487     * <p>This is the top-level parsing function for a single XML
488     * document.  As a minimum, a well-formed document must have
489     * a document element, and a valid document must have a prolog
490     * (one with doctype) as well.
491     */
492    private void parseDocument ()
493    throws Exception
494    {
495        char c;
496        try {                                       // added by MHK
497            parseProlog ();
498            require ('<');
499            parseElement ();
500        } catch (EOFException ee) {                 // added by MHK
501            error("premature end of file", "[EOF]", null);
502        }
503        
504        try {
505            parseMisc ();   //skip all white, PIs, and comments
506            c = readCh ();    //if this doesn't throw an exception...
507            error ("unexpected characters after document end", c, null);
508        } catch (EOFException e) {
509            return;
510        }
511    }
512
513
514    /**
515     * Skip a comment.
516     * <pre>
517     * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
518     * </pre>
519     * <p> (The <code>&lt;!--</code> has already been read.)
520     */
521    private void parseComment ()
522    throws Exception
523    {
524        char c;
525        boolean saved = expandPE;
526
527        expandPE = false;
528        parseUntil ("--");
529        require ('>');
530        expandPE = saved;
531        handler.comment (dataBuffer, 0, dataBufferPos);
532        dataBufferPos = 0;
533    }
534
535
536    /**
537     * Parse a processing instruction and do a call-back.
538     * <pre>
539     * [16] PI ::= '&lt;?' PITarget
540     *          (S (Char* - (Char* '?&gt;' Char*)))?
541     *          '?&gt;'
542     * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
543     * </pre>
544     * <p> (The <code>&lt;?</code> has already been read.)
545     */
546    private void parsePI ()
547    throws SAXException, IOException
548    {
549        String name;
550        boolean saved = expandPE;
551
552        expandPE = false;
553        name = readNmtoken (true);
554        if ("xml".equalsIgnoreCase (name))
555            error ("Illegal processing instruction target", name, null);
556        if (!tryRead ("?>")) {
557            requireWhitespace ();
558            parseUntil ("?>");
559        }
560        expandPE = saved;
561        handler.processingInstruction (name, dataBufferToString ());
562    }
563
564
565    /**
566     * Parse a CDATA section.
567     * <pre>
568     * [18] CDSect ::= CDStart CData CDEnd
569     * [19] CDStart ::= '&lt;![CDATA['
570     * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
571     * [21] CDEnd ::= ']]&gt;'
572     * </pre>
573     * <p> (The '&lt;![CDATA[' has already been read.)
574     */
575    private void parseCDSect ()
576    throws Exception
577    {
578        parseUntil ("]]>");
579        dataBufferFlush ();
580    }
581
582
583    /**
584     * Parse the prolog of an XML document.
585     * <pre>
586     * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
587     * </pre>
588     * <p>There are a couple of tricks here.  First, it is necessary to
589     * declare the XML default attributes after the DTD (if present)
590     * has been read. [??]  Second, it is not possible to expand general
591     * references in attribute value literals until after the entire
592     * DTD (if present) has been parsed.
593     * <p>We do not look for the XML declaration here, because it was
594     * handled by pushURL ().
595     * @see pushURL
596     */
597    private void parseProlog ()
598    throws Exception
599    {
600        parseMisc ();
601
602        if (tryRead ("<!DOCTYPE")) {
603            parseDoctypedecl ();
604            parseMisc ();
605        }
606    }
607
608
609    /**
610     * Parse the XML declaration.
611     * <pre>
612     * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
613     * [24] VersionInfo ::= S 'version' Eq
614     *          ("'" VersionNum "'" | '"' VersionNum '"' )
615     * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
616     * [32] SDDecl ::= S 'standalone' Eq
617     *          ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
618     * [80] EncodingDecl ::= S 'encoding' Eq
619     *          ( "'" EncName "'" | "'" EncName "'" )
620     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
621     * </pre>
622     * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
623     * @return the encoding in the declaration, uppercased; or null
624     * @see #parseTextDecl
625     * @see #setupDecoding
626     */
627    private String parseXMLDecl (boolean ignoreEncoding)
628    throws SAXException, IOException
629    {
630        String  version;
631        String  encodingName = null;
632        String  standalone = null;
633        boolean white;
634        int     flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
635
636        // Read the version.
637        require ("version");
638        parseEq ();
639        version = readLiteral (flags);
640        if (!version.equals ("1.0")) {
641            error ("unsupported XML version", version, "1.0");
642        }
643
644        // Try reading an encoding declaration.
645        white = tryWhitespace ();
646        if (tryRead ("encoding")) {
647            if (!white)
648                error ("whitespace required before 'encoding='");
649            parseEq ();
650            encodingName = readLiteral (flags);
651            if (!ignoreEncoding)
652                setupDecoding (encodingName);
653        }
654
655        // Try reading a standalone declaration
656        if (encodingName != null)
657            white = tryWhitespace ();
658        if (tryRead ("standalone")) {
659            if (!white)
660                error ("whitespace required before 'standalone='");
661            parseEq ();
662            standalone = readLiteral (flags);
663            if (! ("yes".equals (standalone) || "no".equals (standalone)))
664                error ("standalone flag must be 'yes' or 'no'");
665        }
666
667        skipWhitespace ();
668        require ("?>");
669
670        return encodingName;
671    }
672
673
674    /**
675     * Parse a text declaration.
676     * <pre>
677     * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
678     * [80] EncodingDecl ::= S 'encoding' Eq
679     *          ( '"' EncName '"' | "'" EncName "'" )
680     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
681     * </pre>
682     * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
683     * @return the encoding in the declaration, uppercased; or null
684     * @see #parseXMLDecl
685     * @see #setupDecoding
686     */
687    private String parseTextDecl (boolean ignoreEncoding)
688    throws SAXException, IOException
689    {
690        String  encodingName = null;
691        int     flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
692
693        // Read an optional version.
694        if (tryRead ("version")) {
695            String version;
696            parseEq ();
697            version = readLiteral (flags);
698            if (!version.equals ("1.0")) {
699                error ("unsupported XML version", version, "1.0");
700            }
701            requireWhitespace ();
702        }
703
704
705        // Read the encoding.
706        require ("encoding");
707        parseEq ();
708        encodingName = readLiteral (flags);
709        if (!ignoreEncoding)
710            setupDecoding (encodingName);
711
712        skipWhitespace ();
713        require ("?>");
714
715        return encodingName;
716    }
717
718
719    /**
720     * Sets up internal state so that we can decode an entity using the
721     * specified encoding.  This is used when we start to read an entity
722     * and we have been given knowledge of its encoding before we start to
723     * read any data (e.g. from a SAX input source or from a MIME type).
724     *
725     * <p> It is also used after autodetection, at which point only very
726     * limited adjustments to the encoding may be used (switching between
727     * related builtin decoders).
728     *
729     * @param encodingName The name of the encoding specified by the user.
730     * @exception IOException if the encoding isn't supported either
731     *  internally to this parser, or by the hosting JVM.
732     * @see #parseXMLDecl
733     * @see #parseTextDecl
734     */
735    private void setupDecoding (String encodingName)
736    throws SAXException, IOException
737    {
738        encodingName = encodingName.toUpperCase ();
739
740        // ENCODING_EXTERNAL indicates an encoding that wasn't
741        // autodetected ... we can use builtin decoders, or
742        // ones from the JVM (InputStreamReader).
743
744        // Otherwise we can only tweak what was autodetected, and
745        // only for single byte (ASCII derived) builtin encodings.
746
747        // ASCII-derived encodings
748        if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
749            if (encodingName.equals ("ISO-8859-1")
750                    || encodingName.equals ("8859_1")
751                    || encodingName.equals ("ISO8859_1")
752              ) {
753                encoding = ENCODING_ISO_8859_1;
754                return;
755            } else if (encodingName.equals ("US-ASCII")
756                        || encodingName.equals ("ASCII")) {
757                encoding = ENCODING_ASCII;
758                return;
759            } else if (encodingName.equals ("UTF-8")
760                        || encodingName.equals ("UTF8")) {
761                encoding = ENCODING_UTF_8;
762                return;
763            } else if (encoding != ENCODING_EXTERNAL) {
764                // fatal error
765                error ("unsupported ASCII-derived encoding",
766                       encodingName,
767                       "UTF-8, US-ASCII, or ISO-8859-1");
768            }
769            // else fallthrough ...
770            // it's ASCII-ish and something other than a builtin
771        }
772
773        // Unicode and such
774        if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
775            if (!(encodingName.equals ("ISO-10646-UCS-2")
776                    || encodingName.equals ("UTF-16")
777                    || encodingName.equals ("UTF-16BE")
778                    || encodingName.equals ("UTF-16LE")))
779                error ("unsupported Unicode encoding",
780                       encodingName,
781                       "UTF-16");
782            return;
783        }
784
785        // four byte encodings
786        if (encoding == ENCODING_UCS_4_1234
787                || encoding == ENCODING_UCS_4_4321
788                || encoding == ENCODING_UCS_4_2143
789                || encoding == ENCODING_UCS_4_3412) {
790            if (!encodingName.equals ("ISO-10646-UCS-4"))
791                error ("unsupported 32-bit encoding",
792                       encodingName,
793                       "ISO-10646-UCS-4");
794            return;
795        }
796
797        // assert encoding == ENCODING_EXTERNAL
798        // if (encoding != ENCODING_EXTERNAL)
799        //     throw new RuntimeException ("encoding = " + encoding);
800
801        if (encodingName.equals ("UTF-16BE")) {
802            encoding = ENCODING_UCS_2_12;
803            return;
804        }
805        if (encodingName.equals ("UTF-16LE")) {
806            encoding = ENCODING_UCS_2_21;
807            return;
808        }
809
810        // We couldn't use the builtin decoders at all.  But we can try to
811        // create a reader, since we haven't messed up buffering.  Tweak
812        // the encoding name if necessary.
813
814        if (encodingName.equals ("UTF-16")
815                || encodingName.equals ("ISO-10646-UCS-2"))
816            encodingName = "Unicode";
817        // Ignoring all the EBCDIC aliases here
818
819        reader = new InputStreamReader (is, encodingName);
820        sourceType = INPUT_READER;
821        is = null;
822    }
823
824
825    /**
826     * Parse miscellaneous markup outside the document element and DOCTYPE
827     * declaration.
828     * <pre>
829     * [27] Misc ::= Comment | PI | S
830     * </pre>
831     */
832    private void parseMisc ()
833    throws Exception
834    {
835        while (true) {
836            skipWhitespace ();
837            if (tryRead ("<?")) {
838                parsePI ();
839            } else if (tryRead ("<!--")) {
840                parseComment ();
841            } else {
842                return;
843            }
844        }
845    }
846
847
848    /**
849     * Parse a document type declaration.
850     * <pre>
851     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
852     *          ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
853     * </pre>
854     * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
855     */
856    private void parseDoctypedecl ()
857    throws Exception
858    {
859        char c;
860        String doctypeName, ids[];
861
862        // Read the document type name.
863        requireWhitespace ();
864        doctypeName = readNmtoken (true);
865
866        // Read the External subset's IDs
867        skipWhitespace ();
868        ids = readExternalIds (false);
869
870        // report (a) declaration of name, (b) lexical info (ids)
871        handler.doctypeDecl (doctypeName, ids [0], ids [1]);
872
873        // Internal subset is parsed first, if present
874        skipWhitespace ();
875        if (tryRead ('[')) {
876
877            // loop until the subset ends
878            while (true) {
879                expandPE = true;
880                skipWhitespace ();
881                expandPE = false;
882                if (tryRead (']')) {
883                    break;              // end of subset
884                } else {
885                    // WFC, PEs in internal subset (only between decls)
886                    peIsError = expandPE = true;
887                    parseMarkupdecl ();
888                    peIsError = expandPE = false;
889                }
890            }
891        }
892
893        // Read the external subset, if any
894        if (ids [1] != null) {
895            pushURL ("[external subset]", ids [0], ids [1], null, null, null);
896
897            // Loop until we end up back at '>'
898            while (true) {
899                expandPE = true;
900                skipWhitespace ();
901                expandPE = false;
902                if (tryRead ('>')) {
903                    break;
904                } else {
905                    expandPE = true;
906                    parseMarkupdecl ();
907                    expandPE = false;
908                }
909            }
910        } else {
911            // No external subset.
912            skipWhitespace ();
913            require ('>');
914        }
915
916        // done dtd
917        handler.endDoctype ();
918        expandPE = false;
919    }
920
921
922    /**
923     * Parse a markup declaration in the internal or external DTD subset.
924     * <pre>
925     * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
926     *          | NotationDecl | PI | Comment
927     * [30] extSubsetDecl ::= (markupdecl | conditionalSect
928     *          | PEReference | S) *
929     * </pre>
930     * <p> Reading toplevel PE references is handled as a lexical issue
931     * by the caller, as is whitespace.
932     */
933    private void parseMarkupdecl ()
934    throws Exception
935    {
936        if (tryRead ("<!ELEMENT")) {
937            parseElementdecl ();
938        } else if (tryRead ("<!ATTLIST")) {
939            parseAttlistDecl ();
940        } else if (tryRead ("<!ENTITY")) {
941            parseEntityDecl ();
942        } else if (tryRead ("<!NOTATION")) {
943            parseNotationDecl ();
944        } else if (tryRead ("<?")) {
945            parsePI ();
946        } else if (tryRead ("<!--")) {
947            parseComment ();
948        } else if (tryRead ("<![")) {
949            if (inputStack.size () > 0)
950                parseConditionalSect ();
951            else
952                error ("conditional sections illegal in internal subset");
953        } else {
954            error ("expected markup declaration");
955        }
956    }
957
958
959    /**
960     * Parse an element, with its tags.
961     * <pre>
962     * [39] element ::= EmptyElementTag | STag content ETag
963     * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
964     * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
965     * </pre>
966     * <p> (The '&lt;' has already been read.)
967     * <p>NOTE: this method actually chains onto parseContent (), if necessary,
968     * and parseContent () will take care of calling parseETag ().
969     */
970    private void parseElement ()
971    throws Exception
972    {
973        String  gi;
974        char    c;
975        int     oldElementContent = currentElementContent;
976        String  oldElement = currentElement;
977        Object  element [];
978
979        // This is the (global) counter for the
980        // array of specified attributes.
981        tagAttributePos = 0;
982
983        // Read the element type name.
984        gi = readNmtoken (true);
985
986        // Determine the current content type.
987        currentElement = gi;
988        element = (Object []) elementInfo.get (gi);
989        currentElementContent = getContentType (element, CONTENT_ANY);
990
991        // Read the attributes, if any.
992        // After this loop, "c" is the closing delimiter.
993        boolean white = tryWhitespace ();
994        c = readCh ();
995        while (c != '/' && c != '>') {
996            unread (c);
997            if (!white)
998                error ("need whitespace between attributes");
999            parseAttribute (gi);
1000            white = tryWhitespace ();
1001            c = readCh ();
1002        }
1003
1004        // Supply any defaulted attributes.
1005        Iterator atts = declaredAttributes (element);
1006        if (atts != null) {
1007            String aname;
1008loop:
1009            while (atts.hasNext ()) {
1010                aname = (String) atts.next ();
1011                // See if it was specified.
1012                for (int i = 0; i < tagAttributePos; i++) {
1013                    if (tagAttributes [i] == aname) {
1014                        continue loop;
1015                    }
1016                }
1017                // I guess not...
1018                handler.attribute (aname,
1019                                   getAttributeExpandedValue (gi, aname),
1020                                   false);
1021            }
1022        }
1023
1024        // Figure out if this is a start tag
1025        // or an empty element, and dispatch an
1026        // event accordingly.
1027        switch (c) {
1028        case '>':
1029            handler.startElement (gi);
1030            parseContent ();
1031            break;
1032        case '/':
1033            require ('>');
1034            handler.startElement (gi);
1035            handler.endElement (gi);
1036            break;
1037        }
1038
1039        // Restore the previous state.
1040        currentElement = oldElement;
1041        currentElementContent = oldElementContent;
1042    }
1043
1044
1045    /**
1046     * Parse an attribute assignment.
1047     * <pre>
1048     * [41] Attribute ::= Name Eq AttValue
1049     * </pre>
1050     * @param name The name of the attribute's element.
1051     * @see SAXDriver#attribute
1052     */
1053    private void parseAttribute (String name)
1054    throws Exception
1055    {
1056        String aname;
1057        int type;
1058        String value;
1059        int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1060
1061        // Read the attribute name.
1062        aname = readNmtoken (true);
1063        type = getAttributeType (name, aname);
1064
1065        // Parse '='
1066        parseEq ();
1067
1068        // Read the value, normalizing whitespace
1069        // unless it is CDATA.
1070        if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
1071            value = readLiteral (flags);
1072        } else {
1073            value = readLiteral (flags | LIT_NORMALIZE);
1074        }
1075
1076        // WFC: no duplicate attributes
1077        for (int i = 0; i < tagAttributePos; i++)
1078            if (aname.equals (tagAttributes [i]))
1079                error ("duplicate attribute", aname, null);
1080
1081        // Inform the handler about the
1082        // attribute.
1083        handler.attribute (aname, value, true);
1084        dataBufferPos = 0;
1085
1086        // Note that the attribute has been
1087        // specified.
1088        if (tagAttributePos == tagAttributes.length) {
1089            String newAttrib[] = new String [tagAttributes.length * 2];
1090            System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1091            tagAttributes = newAttrib;
1092        }
1093        tagAttributes [tagAttributePos++] = aname;
1094    }
1095
1096
1097    /**
1098     * Parse an equals sign surrounded by optional whitespace.
1099     * <pre>
1100     * [25] Eq ::= S? '=' S?
1101     * </pre>
1102     */
1103    private void parseEq ()
1104    throws SAXException, IOException
1105    {
1106        skipWhitespace ();
1107        require ('=');
1108        skipWhitespace ();
1109    }
1110
1111
1112    /**
1113     * Parse an end tag.
1114     * <pre>
1115     * [42] ETag ::= '</' Name S? '>'
1116     * </pre>
1117     * <p>NOTE: parseContent () chains to here, we already read the
1118     * "&lt;/".
1119     */
1120    private void parseETag ()
1121    throws Exception
1122    {
1123        require (currentElement);
1124        skipWhitespace ();
1125        require ('>');
1126        handler.endElement (currentElement);
1127        // not re-reporting any SAXException re bogus end tags,
1128        // even though that diagnostic might be clearer ...
1129    }
1130
1131
1132    /**
1133     * Parse the content of an element.
1134     * <pre>
1135     * [43] content ::= (element | CharData | Reference
1136     *          | CDSect | PI | Comment)*
1137     * [67] Reference ::= EntityRef | CharRef
1138     * </pre>
1139     * <p> NOTE: consumes ETtag.
1140     */
1141    private void parseContent ()
1142    throws Exception
1143    {
1144        String data;
1145        char c;
1146
1147        while (true) {
1148            switch (currentElementContent) {
1149                case CONTENT_ANY:
1150                case CONTENT_MIXED:
1151                case CONTENT_UNDECLARED:    // this line added by MHK 24 May 2000
1152                case CONTENT_EMPTY:         // this line added by MHK 8 Sept 2000
1153                        parseCharData ();
1154                        break;
1155                case CONTENT_ELEMENTS:
1156                        parseWhitespace ();
1157                        break;
1158            }
1159
1160            // Handle delimiters
1161            c = readCh ();
1162            switch (c) {
1163
1164            case '&':                   // Found "&"
1165
1166                c = readCh ();
1167                if (c == '#') {
1168                    parseCharRef ();
1169                } else {
1170                    unread (c);
1171                    parseEntityRef (true);
1172                }
1173                break;
1174
1175            case '<':                   // Found "<"
1176                dataBufferFlush ();
1177                c = readCh ();
1178                switch (c) {
1179                  case '!':                     // Found "<!"
1180                    c = readCh ();
1181                    switch (c) {
1182                      case '-':                 // Found "<!-"
1183                                require ('-');
1184                                parseComment ();
1185                                break;
1186                      case '[':                 // Found "<!["
1187                                require ("CDATA[");
1188                                handler.startCDATA ();
1189                                inCDATA = true;
1190                                parseCDSect ();
1191                                inCDATA = false;
1192                                handler.endCDATA ();
1193                                break;
1194                      default:
1195                                error ("expected comment or CDATA section", c, null);
1196                                break;
1197                    }
1198                    break;
1199
1200                  case '?':             // Found "<?"
1201                    parsePI ();
1202                    break;
1203
1204                  case '/':             // Found "</"
1205                    parseETag ();
1206                    return;
1207
1208                  default:              // Found "<" followed by something else
1209                    unread (c);
1210                    parseElement ();
1211                    break;
1212                }
1213                }
1214            }
1215    }
1216
1217
1218    /**
1219     * Parse an element type declaration.
1220     * <pre>
1221     * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1222     * </pre>
1223     * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1224     */
1225    private void parseElementdecl ()
1226    throws Exception
1227    {
1228        String name;
1229
1230        requireWhitespace ();
1231        // Read the element type name.
1232        name = readNmtoken (true);
1233
1234        requireWhitespace ();
1235        // Read the content model.
1236        parseContentspec (name);
1237
1238        skipWhitespace ();
1239        require ('>');
1240    }
1241
1242
1243    /**
1244     * Content specification.
1245     * <pre>
1246     * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1247     * </pre>
1248     */
1249    private void parseContentspec (String name)
1250    throws Exception
1251    {
1252        if (tryRead ("EMPTY")) {
1253            setElement (name, CONTENT_EMPTY, null, null);
1254            return;
1255        } else if (tryRead ("ANY")) {
1256            setElement (name, CONTENT_ANY, null, null);
1257            return;
1258        } else {
1259            require ('(');
1260            dataBufferAppend ('(');
1261            skipWhitespace ();
1262            if (tryRead ("#PCDATA")) {
1263                dataBufferAppend ("#PCDATA");
1264                parseMixed ();
1265                setElement (name, CONTENT_MIXED, dataBufferToString (), null);
1266            } else {
1267                parseElements ();
1268                setElement (name, CONTENT_ELEMENTS,
1269                        dataBufferToString (), null);
1270            }
1271        }
1272    }
1273
1274
1275    /**
1276     * Parse an element-content model.
1277     * <pre>
1278     * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1279     * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1280     * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1281     * </pre>
1282     *
1283     * <p> NOTE: the opening '(' and S have already been read.
1284     */
1285    private void parseElements ()
1286    throws Exception
1287    {
1288        char c;
1289        char sep;
1290
1291        // Parse the first content particle
1292        skipWhitespace ();
1293        parseCp ();
1294
1295        // Check for end or for a separator.
1296        skipWhitespace ();
1297        c = readCh ();
1298        switch (c) {
1299        case ')':
1300            dataBufferAppend (')');
1301            c = readCh ();
1302            switch (c) {
1303            case '*':
1304            case '+':
1305            case '?':
1306                dataBufferAppend (c);
1307                break;
1308            default:
1309                unread (c);
1310            }
1311            return;
1312        case ',':                       // Register the separator.
1313        case '|':
1314            sep = c;
1315            dataBufferAppend (c);
1316            break;
1317        default:
1318            error ("bad separator in content model", c, null);
1319            return;
1320        }
1321
1322        // Parse the rest of the content model.
1323        while (true) {
1324            skipWhitespace ();
1325            parseCp ();
1326            skipWhitespace ();
1327            c = readCh ();
1328            if (c == ')') {
1329                dataBufferAppend (')');
1330                break;
1331            } else if (c != sep) {
1332                error ("bad separator in content model", c, null);
1333                return;
1334            } else {
1335                dataBufferAppend (c);
1336            }
1337        }
1338
1339        // Check for the occurrence indicator.
1340        c = readCh ();
1341        switch (c) {
1342        case '?':
1343        case '*':
1344        case '+':
1345            dataBufferAppend (c);
1346            return;
1347        default:
1348            unread (c);
1349            return;
1350        }
1351    }
1352
1353
1354    /**
1355     * Parse a content particle.
1356     * <pre>
1357     * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1358     * </pre>
1359     */
1360    private void parseCp ()
1361    throws Exception
1362    {
1363        char c;
1364
1365        if (tryRead ('(')) {
1366            dataBufferAppend ('(');
1367            parseElements ();
1368        } else {
1369            dataBufferAppend (readNmtoken (true));
1370            c = readCh ();
1371            switch (c) {
1372            case '?':
1373            case '*':
1374            case '+':
1375                dataBufferAppend (c);
1376                break;
1377            default:
1378                unread (c);
1379                break;
1380            }
1381        }
1382    }
1383
1384
1385    /**
1386     * Parse mixed content.
1387     * <pre>
1388     * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1389     *        | '(' S? ('#PCDATA') S? ')'
1390     * </pre>
1391     */
1392    private void parseMixed ()
1393    throws Exception
1394    {
1395        char c;
1396
1397        // Check for PCDATA alone.
1398        skipWhitespace ();
1399        if (tryRead (')')) {
1400            dataBufferAppend (")*");
1401            tryRead ('*');
1402            return;
1403        }
1404
1405        // Parse mixed content.
1406        skipWhitespace ();
1407        while (!tryRead (")*")) {
1408            require ('|');
1409            dataBufferAppend ('|');
1410            skipWhitespace ();
1411            dataBufferAppend (readNmtoken (true));
1412            skipWhitespace ();
1413        }
1414        dataBufferAppend (")*");
1415    }
1416
1417
1418    /**
1419     * Parse an attribute list declaration.
1420     * <pre>
1421     * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1422     * </pre>
1423     * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1424     */
1425    private void parseAttlistDecl ()
1426    throws Exception
1427    {
1428        String elementName;
1429
1430        requireWhitespace ();
1431        elementName = readNmtoken (true);
1432        boolean white = tryWhitespace ();
1433        while (!tryRead ('>')) {
1434            if (!white)
1435                error ("whitespace required before attribute definition");
1436            parseAttDef (elementName);
1437            white = tryWhitespace ();
1438        }
1439    }
1440
1441
1442    /**
1443     * Parse a single attribute definition.
1444     * <pre>
1445     * [53] AttDef ::= S Name S AttType S DefaultDecl
1446     * </pre>
1447     */
1448    private void parseAttDef (String elementName)
1449    throws Exception
1450    {
1451      String name;
1452      int type;
1453      String myEnum = null;
1454    
1455      // Read the attribute name.
1456      name = readNmtoken (true);
1457    
1458      // Read the attribute type.
1459      requireWhitespace ();
1460      type = readAttType ();
1461    
1462      // Get the string of enumerated values
1463      // if necessary.
1464      if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1465          myEnum = dataBufferToString ();
1466      }
1467    
1468      // Read the default value.
1469      requireWhitespace ();
1470      parseDefault (elementName, name, type, myEnum);
1471    }
1472
1473
1474    /**
1475     * Parse the attribute type.
1476     * <pre>
1477     * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1478     * [55] StringType ::= 'CDATA'
1479     * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1480     *          | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1481     * [57] EnumeratedType ::= NotationType | Enumeration
1482     * </pre>
1483     */
1484    private int readAttType ()
1485    throws Exception
1486    {
1487      String typeString;
1488      Integer type;
1489    
1490      if (tryRead ('(')) {
1491          parseEnumeration (false);
1492          return ATTRIBUTE_ENUMERATED;
1493      } else {
1494          typeString = readNmtoken (true);
1495          if (typeString.equals ("NOTATION")) {
1496        parseNotationType ();
1497          }
1498          type = (Integer) attributeTypeHash.get (typeString);
1499          if (type == null) {
1500        error ("illegal attribute type", typeString, null);
1501        return ATTRIBUTE_UNDECLARED;
1502          } else {
1503        return type.intValue ();
1504          }
1505      }
1506    }
1507
1508
1509    /**
1510     * Parse an enumeration.
1511     * <pre>
1512     * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1513     * </pre>
1514     * <p>NOTE: the '(' has already been read.
1515     */
1516    private void parseEnumeration (boolean isNames)
1517    throws Exception
1518    {
1519        char c;
1520
1521        dataBufferAppend ('(');
1522
1523        // Read the first token.
1524        skipWhitespace ();
1525        dataBufferAppend (readNmtoken (isNames));
1526        // Read the remaining tokens.
1527        skipWhitespace ();
1528        while (!tryRead (')')) {
1529            require ('|');
1530            dataBufferAppend ('|');
1531            skipWhitespace ();
1532            dataBufferAppend (readNmtoken (isNames));
1533            skipWhitespace ();
1534        }
1535        dataBufferAppend (')');
1536    }
1537
1538
1539    /**
1540     * Parse a notation type for an attribute.
1541     * <pre>
1542     * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1543     *          (S? '|' S? name)* S? ')'
1544     * </pre>
1545     * <p>NOTE: the 'NOTATION' has already been read
1546     */
1547    private void parseNotationType ()
1548    throws Exception
1549    {
1550        requireWhitespace ();
1551        require ('(');
1552
1553        parseEnumeration (true);
1554    }
1555
1556
1557    /**
1558     * Parse the default value for an attribute.
1559     * <pre>
1560     * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1561     *          | (('#FIXED' S)? AttValue)
1562     * </pre>
1563     */
1564    private void parseDefault (
1565        String elementName,
1566        String name,
1567        int type,
1568        String myEnum
1569    ) throws Exception
1570    {
1571        int     valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1572        String  value = null;
1573        int     flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK;
1574
1575        // Note: char refs not checked here, and input not normalized,
1576        // since it's done correctly later when we actually expand any
1577        // entity refs.  We ought to report char ref syntax errors now,
1578        // but don't.  Cost: unused defaults mean unreported WF errs.
1579        
1580        // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1581        // chars to spaces (doesn't matter when that's done if it doesn't
1582        // interfere with char refs expanding to whitespace).
1583
1584        if (tryRead ('#')) {
1585            if (tryRead ("FIXED")) {
1586                valueType = ATTRIBUTE_DEFAULT_FIXED;
1587                requireWhitespace ();
1588                value = readLiteral (flags);
1589            } else if (tryRead ("REQUIRED")) {
1590                valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1591            } else if (tryRead ("IMPLIED")) {
1592                valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1593            } else {
1594                error ("illegal keyword for attribute default value");
1595            }
1596        } else
1597            value = readLiteral (flags);
1598        setAttribute (elementName, name, type, myEnum, value, valueType);
1599    }
1600
1601
1602    /**
1603     * Parse a conditional section.
1604     * <pre>
1605     * [61] conditionalSect ::= includeSect || ignoreSect
1606     * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
1607     *          extSubsetDecl ']]&gt;'
1608     * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
1609     *          ignoreSectContents* ']]&gt;'
1610     * [64] ignoreSectContents ::= Ignore
1611     *          ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
1612     * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
1613     * </pre>
1614     * <p> NOTE: the '&gt;![' has already been read.
1615     */
1616    private void parseConditionalSect ()
1617    throws Exception
1618    {
1619        skipWhitespace ();
1620        if (tryRead ("INCLUDE")) {
1621            skipWhitespace ();
1622            require ('[');
1623            skipWhitespace ();
1624            while (!tryRead ("]]>")) {
1625                parseMarkupdecl ();
1626                skipWhitespace ();
1627            }
1628        } else if (tryRead ("IGNORE")) {
1629            skipWhitespace ();
1630            require ('[');
1631            int nesting = 1;
1632            char c;
1633            expandPE = false;
1634            for (int nest = 1; nest > 0;) {
1635                c = readCh ();
1636                switch (c) {
1637                case '<':
1638                    if (tryRead ("![")) {
1639                        nest++;
1640                    }
1641                case ']':
1642                    if (tryRead ("]>")) {
1643                        nest--;
1644                    }
1645                }
1646            }
1647            expandPE = true;
1648        } else {
1649            error ("conditional section must begin with INCLUDE or IGNORE");
1650        }
1651    }
1652
1653
1654    /**
1655     * Read and interpret a character reference.
1656     * <pre>
1657     * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1658     * </pre>
1659     * <p>NOTE: the '&#' has already been read.
1660     */
1661    private void parseCharRef ()
1662    throws SAXException, IOException
1663    {
1664        int value = 0;
1665        char c;
1666
1667        if (tryRead ('x')) {
1668loop1:
1669            while (true) {
1670                c = readCh ();
1671                switch (c) {
1672                case '0':
1673                case '1':
1674                case '2':
1675                case '3':
1676                case '4':
1677                case '5':
1678                case '6':
1679                case '7':
1680                case '8':
1681                case '9':
1682                case 'a':
1683                case 'A':
1684                case 'b':
1685                case 'B':
1686                case 'c':
1687                case 'C':
1688                case 'd':
1689                case 'D':
1690                case 'e':
1691                case 'E':
1692                case 'f':
1693                case 'F':
1694                    value *= 16;
1695                    value += Integer.parseInt (new Character (c).toString (),
1696                                    16);
1697                    break;
1698                case ';':
1699                    break loop1;
1700                default:
1701                    error ("illegal character in character reference", c, null);
1702                    break loop1;
1703                }
1704            }
1705        } else {
1706loop2:
1707            while (true) {
1708                c = readCh ();
1709                switch (c) {
1710                case '0':
1711                case '1':
1712                case '2':
1713                case '3':
1714                case '4':
1715                case '5':
1716                case '6':
1717                case '7':
1718                case '8':
1719                case '9':
1720                    value *= 10;
1721                    value += Integer.parseInt (new Character (c).toString (),
1722                                    10);
1723                    break;
1724                case ';':
1725                    break loop2;
1726                default:
1727                    error ("illegal character in character reference", c, null);
1728                    break loop2;
1729                }
1730            }
1731        }
1732
1733        // check for character refs being legal XML
1734        if ((value < 0x0020
1735                && ! (value == '\n' || value == '\t' || value == '\r'))
1736                || (value >= 0xD800 && value <= 0xDFFF)
1737                || value == 0xFFFE || value == 0xFFFF
1738                || value > 0x0010ffff)
1739            error ("illegal XML character reference U+"
1740                    + Integer.toHexString (value));
1741
1742        // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1743        //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1744        if (value <= 0x0000ffff) {
1745            // no surrogates needed
1746            dataBufferAppend ((char) value);
1747        } else if (value <= 0x0010ffff) {
1748            value -= 0x10000;
1749            // > 16 bits, surrogate needed
1750            dataBufferAppend ((char) (0xd800 | (value >> 10)));
1751            dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1752        } else {
1753            // too big for surrogate
1754            error ("character reference " + value + " is too large for UTF-16",
1755                   new Integer (value).toString (), null);
1756        }
1757    }
1758
1759
1760    /**
1761     * Parse and expand an entity reference.
1762     * <pre>
1763     * [68] EntityRef ::= '&' Name ';'
1764     * </pre>
1765     * <p>NOTE: the '&amp;' has already been read.
1766     * @param externalAllowed External entities are allowed here.
1767     */
1768    private void parseEntityRef (boolean externalAllowed)
1769    throws SAXException, IOException
1770    {
1771        String name;
1772
1773        name = readNmtoken (true);
1774        require (';');
1775        switch (getEntityType (name)) {
1776        case ENTITY_UNDECLARED:
1777            error ("reference to undeclared entity", name, null);
1778            break;
1779        case ENTITY_INTERNAL:
1780            pushString (name, getEntityValue (name));
1781            break;
1782        case ENTITY_TEXT:
1783            if (externalAllowed) {
1784                pushURL (name, getEntityPublicId (name),
1785                         getEntitySystemId (name),
1786                         null, null, null);
1787            } else {
1788                error ("reference to external entity in attribute value.",
1789                        name, null);
1790            }
1791            break;
1792        case ENTITY_NDATA:
1793            if (externalAllowed) {
1794                error ("unparsed entity reference in content", name, null);
1795            } else {
1796                error ("reference to external entity in attribute value.",
1797                        name, null);
1798            }
1799            break;
1800        }
1801    }
1802
1803
1804    /**
1805     * Parse and expand a parameter entity reference.
1806     * <pre>
1807     * [69] PEReference ::= '%' Name ';'
1808     * </pre>
1809     * <p>NOTE: the '%' has already been read.
1810     */
1811    private void parsePEReference ()
1812    throws SAXException, IOException
1813    {
1814        String name;
1815
1816        name = "%" + readNmtoken (true);
1817        require (';');
1818        switch (getEntityType (name)) {
1819        case ENTITY_UNDECLARED:
1820            // this is a validity problem, not a WFC violation ... but
1821            // we should disable handling of all subsequent declarations
1822            // unless this is a standalone document
1823            // warn ("reference to undeclared parameter entity", name, null);
1824
1825            break;
1826        case ENTITY_INTERNAL:
1827            if (inLiteral)
1828                pushString (name, getEntityValue (name));
1829            else
1830                pushString (name, " " + getEntityValue (name) + ' ');
1831            break;
1832        case ENTITY_TEXT:
1833            if (!inLiteral)
1834                pushString (null, " ");
1835            pushURL (name, getEntityPublicId (name),
1836                     getEntitySystemId (name),
1837                     null, null, null);
1838            if (!inLiteral)
1839                pushString (null, " ");
1840            break;
1841        }
1842    }
1843
1844    /**
1845     * Parse an entity declaration.
1846     * <pre>
1847     * [70] EntityDecl ::= GEDecl | PEDecl
1848     * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
1849     * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
1850     * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1851     * [74] PEDef ::= EntityValue | ExternalID
1852     * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1853     *             | 'PUBLIC' S PubidLiteral S SystemLiteral
1854     * [76] NDataDecl ::= S 'NDATA' S Name
1855     * </pre>
1856     * <p>NOTE: the '&lt;!ENTITY' has already been read.
1857     */
1858    private void parseEntityDecl ()
1859    throws Exception
1860    {
1861        char c;
1862        boolean peFlag = false;
1863        String name, value, notationName, ids[];
1864
1865        // Check for a parameter entity.
1866        expandPE = false;
1867        requireWhitespace ();
1868        if (tryRead ('%')) {
1869            peFlag = true;
1870            requireWhitespace ();
1871        }
1872        expandPE = true;
1873
1874        // Read the entity name, and prepend
1875        // '%' if necessary.
1876        name = readNmtoken (true);
1877        if (peFlag) {
1878            name = "%" + name;
1879        }
1880
1881        // Read the entity value.
1882        requireWhitespace ();
1883        c = readCh ();
1884        unread (c);
1885        if (c == '"' || c == '\'') {
1886            // Internal entity ... replacement text has expanded refs
1887            // to characters and PEs, but not to general entities
1888            value = readLiteral (0);
1889            setInternalEntity (name, value);
1890        } else {
1891            // Read the external IDs
1892            ids = readExternalIds (false);
1893            if (ids [1] == null) {
1894                error ("system identifer missing", name, null);
1895            }
1896
1897            // Check for NDATA declaration.
1898            boolean white = tryWhitespace ();
1899            if (!peFlag && tryRead ("NDATA")) {
1900                if (!white)
1901                    error ("whitespace required before NDATA");
1902                requireWhitespace ();
1903                notationName = readNmtoken (true);
1904                setExternalDataEntity (name, ids [0], ids [1], notationName);
1905            } else {
1906                setExternalTextEntity (name, ids [0], ids [1]);
1907            }
1908        }
1909
1910        // Finish the declaration.
1911        skipWhitespace ();
1912        require ('>');
1913    }
1914
1915
1916    /**
1917     * Parse a notation declaration.
1918     * <pre>
1919     * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
1920     *          (ExternalID | PublicID) S? '&gt;'
1921     * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1922     * </pre>
1923     * <P>NOTE: the '&lt;!NOTATION' has already been read.
1924     */
1925    private void parseNotationDecl ()
1926    throws Exception
1927    {
1928        String nname, ids[];
1929
1930
1931        requireWhitespace ();
1932        nname = readNmtoken (true);
1933
1934        requireWhitespace ();
1935
1936        // Read the external identifiers.
1937        ids = readExternalIds (true);
1938        if (ids [0] == null && ids [1] == null) {
1939            error ("external identifer missing", nname, null);
1940        }
1941
1942        // Register the notation.
1943        setNotation (nname, ids [0], ids [1]);
1944
1945        skipWhitespace ();
1946        require ('>');
1947    }
1948
1949
1950    /**
1951     * Parse character data.
1952     * <pre>
1953     * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
1954     * </pre>
1955     */
1956    private void parseCharData ()
1957    throws Exception
1958    {
1959        char c;
1960
1961        // Start with a little cheat -- in most
1962        // cases, the entire sequence of
1963        // character data will already be in
1964        // the readBuffer; if not, fall through to
1965        // the normal approach.
1966        if (USE_CHEATS) {
1967            int lineAugment = 0;
1968            int columnAugment = 0;
1969
1970loop:
1971            for (int i = readBufferPos; i < readBufferLength; i++) {
1972                switch (c = readBuffer [i]) {
1973                case '\n':
1974                    lineAugment++;
1975                    columnAugment = 0;
1976                    break;
1977                case '&':
1978                case '<':
1979                    int start = readBufferPos;
1980                    columnAugment++;
1981                    readBufferPos = i;
1982                    if (lineAugment > 0) {
1983                        line += lineAugment;
1984                        column = columnAugment;
1985                    } else {
1986                        column += columnAugment;
1987                    }
1988                    dataBufferAppend (readBuffer, start, i - start);
1989                    return;
1990                case ']':
1991                    // XXX missing two end-of-buffer cases
1992                    if ((i + 2) < readBufferLength) {
1993                        if (readBuffer [i + 1] == ']'
1994                                && readBuffer [i + 2] == '>') {
1995                            error ("character data may not contain ']]>'");
1996                        }
1997                    }
1998                    columnAugment++;
1999                    break;
2000                default:
2001                    if (c < 0x0020 || c > 0xFFFD)
2002                        error ("illegal XML character U+"
2003                                + Integer.toHexString (c));
2004                    // FALLTHROUGH
2005                case '\r':
2006                case '\t':
2007                    columnAugment++;
2008                }
2009            }
2010        }
2011
2012        // OK, the cheat didn't work; start over
2013        // and do it by the book.
2014        while (true) {
2015            c = readCh ();
2016            switch (c) {
2017            case '<':
2018            case '&':
2019                unread (c);
2020                return;
2021            // XXX "]]>" precluded ...
2022            default:
2023                dataBufferAppend (c);
2024                break;
2025            }
2026        }
2027    }
2028
2029
2030    //////////////////////////////////////////////////////////////////////
2031    // High-level reading and scanning methods.
2032    //////////////////////////////////////////////////////////////////////
2033
2034    /**
2035     * Require whitespace characters.
2036     */
2037    private void requireWhitespace ()
2038    throws SAXException, IOException
2039    {
2040        char c = readCh ();
2041        if (isWhitespace (c)) {
2042            skipWhitespace ();
2043        } else {
2044            error ("whitespace required", c, null);
2045        }
2046    }
2047
2048
2049    /**
2050     * Parse whitespace characters, and leave them in the data buffer.
2051     */
2052    private void parseWhitespace ()
2053    throws Exception
2054    {
2055        char c = readCh ();
2056        while (isWhitespace (c)) {
2057            dataBufferAppend (c);
2058            c = readCh ();
2059        }
2060        unread (c);
2061    }
2062
2063
2064    /**
2065     * Skip whitespace characters.
2066     * <pre>
2067     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2068     * </pre>
2069     */
2070    private void skipWhitespace ()
2071    throws SAXException, IOException
2072    {
2073        // Start with a little cheat.  Most of
2074        // the time, the white space will fall
2075        // within the current read buffer; if
2076        // not, then fall through.
2077        if (USE_CHEATS) {
2078            int lineAugment = 0;
2079            int columnAugment = 0;
2080
2081loop:
2082            for (int i = readBufferPos; i < readBufferLength; i++) {
2083                switch (readBuffer [i]) {
2084                case ' ':
2085                case '\t':
2086                case '\r':
2087                    columnAugment++;
2088                    break;
2089                case '\n':
2090                    lineAugment++;
2091                    columnAugment = 0;
2092                    break;
2093                case '%':
2094                    if (expandPE)
2095                        break loop;
2096                    // else fall through...
2097                default:
2098                    readBufferPos = i;
2099                    if (lineAugment > 0) {
2100                        line += lineAugment;
2101                        column = columnAugment;
2102                    } else {
2103                        column += columnAugment;
2104                    }
2105                    return;
2106                }
2107            }
2108        }
2109
2110        // OK, do it by the book.
2111        char c = readCh ();
2112        while (isWhitespace (c)) {
2113            c = readCh ();
2114        }
2115        unread (c);
2116    }
2117
2118
2119    /**
2120     * Read a name or (when parsing an enumeration) name token.
2121     * <pre>
2122     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2123     * [7] Nmtoken ::= (NameChar)+
2124     * </pre>
2125     */
2126    private String readNmtoken (boolean isName)
2127    throws SAXException, IOException
2128    {
2129        char c;
2130
2131        if (USE_CHEATS) {
2132loop:
2133            for (int i = readBufferPos; i < readBufferLength; i++) {
2134                c = readBuffer [i];
2135                switch (c) {
2136                  case '%':
2137                    if (expandPE)
2138                        break loop;
2139                    // else fall through...
2140
2141                    // What may legitimately come AFTER a name/nmtoken?
2142                  case '<': case '>': case '&':
2143                  case ',': case '|': case '*': case '+': case '?':
2144                  case ')':
2145                  case '=':
2146                  case '\'': case '"':
2147                  case '[':
2148                  case ' ': case '\t': case '\r': case '\n':
2149                  case ';':
2150                  case '/':
2151                    int start = readBufferPos;
2152                    if (i == start)
2153                        error ("name expected", readBuffer [i], null);
2154                    readBufferPos = i;
2155                    return intern (readBuffer, start, i - start);
2156
2157                  default:
2158                    // punt on exact tests from Appendix A; approximate
2159                    // them using the Unicode ID start/part rules
2160                    if (i == readBufferPos && isName) {
2161                        if (!Character.isUnicodeIdentifierStart (c)
2162                                && c != ':' && c != '_')
2163                            error ("Not a name start character, U+"
2164                                  + Integer.toHexString (c));
2165                    } else if (!Character.isUnicodeIdentifierPart (c)
2166                            && c != '-' && c != ':' && c != '_' && c != '.'
2167                            && !isExtender (c))
2168                        error ("Not a name character, U+"
2169                                + Integer.toHexString (c));
2170                }
2171            }
2172        }
2173
2174        nameBufferPos = 0;
2175
2176        // Read the first character.
2177loop:
2178        while (true) {
2179            c = readCh ();
2180            switch (c) {
2181            case '%':
2182            case '<': case '>': case '&':
2183            case ',': case '|': case '*': case '+': case '?':
2184            case ')':
2185            case '=':
2186            case '\'': case '"':
2187            case '[':
2188            case ' ': case '\t': case '\n': case '\r':
2189            case ';':
2190            case '/':
2191                unread (c);
2192                if (nameBufferPos == 0) {
2193                    error ("name expected");
2194                }
2195                // punt on exact tests from Appendix A, but approximate them
2196                if (isName
2197                        && !Character.isUnicodeIdentifierStart (
2198                                nameBuffer [0])
2199                        && ":_".indexOf (nameBuffer [0]) == -1)
2200                    error ("Not a name start character, U+"
2201                              + Integer.toHexString (nameBuffer [0]));
2202                String s = intern (nameBuffer, 0, nameBufferPos);
2203                nameBufferPos = 0;
2204                return s;
2205            default:
2206                // punt on exact tests from Appendix A, but approximate them
2207
2208                if ((nameBufferPos != 0 || !isName)
2209                        && !Character.isUnicodeIdentifierPart (c)
2210                        && ":-_.".indexOf (c) == -1
2211                        && !isExtender (c))
2212                    error ("Not a name character, U+"
2213                            + Integer.toHexString (c));
2214                if (nameBufferPos >= nameBuffer.length)
2215                    nameBuffer =
2216                        (char[]) extendArray (nameBuffer,
2217                                    nameBuffer.length, nameBufferPos);
2218                nameBuffer [nameBufferPos++] = c;
2219            }
2220        }
2221    }
2222
2223    private static boolean isExtender (char c)
2224    {
2225        // [88] Extender ::= ...
2226        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2227               || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2228               || (c >= 0x3031 && c <= 0x3035)
2229               || (c >= 0x309d && c <= 0x309e)
2230               || (c >= 0x30fc && c <= 0x30fe);
2231    }
2232
2233
2234    /**
2235     * Read a literal.  With matching single or double quotes as
2236     * delimiters (and not embedded!) this is used to parse:
2237     * <pre>
2238     *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
2239     *  [10] AttValue ::= ... ([^<&] | Reference)* ...
2240     *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
2241     *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2242     * </pre>
2243     * as well as the quoted strings in XML and text declarations
2244     * (for version, encoding, and standalone) which have their
2245     * own constraints.
2246     */
2247    private String readLiteral (int flags)
2248    throws SAXException, IOException
2249    {
2250        char    delim, c;
2251        int     startLine = line;
2252        boolean saved = expandPE;
2253
2254        // Find the first delimiter.
2255        delim = readCh ();
2256        if (delim != '"' && delim != '\'' && delim != (char) 0) {
2257            error ("expected '\"' or \"'\"", delim, null);
2258            return null;
2259        }
2260        inLiteral = true;
2261        if ((flags & LIT_DISABLE_PE) != 0)
2262            expandPE = false;
2263
2264        // Each level of input source has its own buffer; remember
2265        // ours, so we won't read the ending delimiter from any
2266        // other input source, regardless of entity processing.
2267        char ourBuf [] = readBuffer;
2268
2269        // Read the literal.
2270        try {
2271            c = readCh ();
2272loop:
2273            while (! (c == delim && readBuffer == ourBuf)) {
2274                switch (c) {
2275                    // Can't escape this normalization for attributes
2276                case '\n':
2277                case '\r':
2278                case '\t':
2279                    if ((flags & LIT_ATTRIBUTE) != 0)
2280                        c = ' ';
2281                    break;
2282                case '&':
2283                    c = readCh ();
2284                    // Char refs are expanded immediately, except for
2285                    // all the cases where it's deferred.
2286                    if (c == '#') {
2287                        if ((flags & LIT_DISABLE_CREF) != 0) {
2288                            dataBufferAppend ('&');
2289                            dataBufferAppend ('#');
2290                            continue;
2291                        }
2292                        parseCharRef ();
2293
2294                    // It looks like an entity ref ...
2295                    } else {
2296                        unread (c);
2297                        // Expand it?
2298                        if ((flags & LIT_ENTITY_REF) > 0) {
2299                            parseEntityRef (false);
2300
2301                        // Is it just data?
2302                        } else if ((flags & LIT_DISABLE_EREF) != 0) {
2303                            dataBufferAppend ('&');
2304
2305                        // OK, it will be an entity ref -- expanded later.
2306                        } else {
2307                            String name = readNmtoken (true);
2308                            require (';');
2309                            if ((flags & LIT_ENTITY_CHECK) != 0
2310                                    && getEntityType (name) ==
2311                                            ENTITY_UNDECLARED) {
2312                                error ("General entity '" + name
2313                                    + "' must be declared before use");
2314                            }
2315                            dataBufferAppend ('&');
2316                            dataBufferAppend (name);
2317                            dataBufferAppend (';');
2318                        }
2319                    }
2320                    c = readCh ();
2321                    continue loop;
2322
2323                case '<':
2324                    // and why?  Perhaps so "&foo;" expands the same
2325                    // inside and outside an attribute?
2326                    if ((flags & LIT_ATTRIBUTE) != 0)
2327                        error ("attribute values may not contain '<'");
2328                    break;
2329
2330                // We don't worry about case '%' and PE refs, readCh does.
2331
2332                default:
2333                    break;
2334                }
2335                dataBufferAppend (c);
2336                c = readCh ();
2337            }
2338        } catch (EOFException e) {
2339            error ("end of input while looking for delimiter (started on line "
2340                   + startLine + ')', null, new Character (delim).toString ());
2341        }
2342        inLiteral = false;
2343        expandPE = saved;
2344
2345        // Normalise whitespace if necessary.
2346        if ((flags & LIT_NORMALIZE) > 0) {
2347            dataBufferNormalize ();
2348        }
2349
2350        // Return the value.
2351        return dataBufferToString ();
2352    }
2353
2354
2355    /**
2356     * Try reading external identifiers.
2357     * A system identifier is not required for notations.
2358     * @param inNotation Are we in a notation?
2359     * @return A two-member String array containing the identifiers.
2360     */
2361    private String[] readExternalIds (boolean inNotation)
2362    throws Exception
2363    {
2364        char    c;
2365        String  ids[] = new String [2];
2366        int     flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2367
2368        if (tryRead ("PUBLIC")) {
2369            requireWhitespace ();
2370            ids [0] = readLiteral (LIT_NORMALIZE | flags);
2371            if (inNotation) {
2372                skipWhitespace ();
2373                c = readCh ();
2374                unread (c);
2375                if (c == '"' || c == '\'') {
2376                    ids [1] = readLiteral (flags);
2377                }
2378            } else {
2379                requireWhitespace ();
2380                ids [1] = readLiteral (flags);
2381            }
2382
2383            for (int i = 0; i < ids [0].length (); i++) {
2384                c = ids [0].charAt (i);
2385                if (c >= 'a' && c <= 'z')
2386                    continue;
2387                if (c >= 'A' && c <= 'Z')
2388                    continue;
2389                if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2390                    continue;
2391                error ("illegal PUBLIC id character U+"
2392                        + Integer.toHexString (c));
2393            }
2394        } else if (tryRead ("SYSTEM")) {
2395            requireWhitespace ();
2396            ids [1] = readLiteral (flags);
2397        }
2398
2399        // XXX should normalize system IDs as follows:
2400        // - Convert to UTF-8
2401        // - Map reserved and non-ASCII characters to %HH
2402
2403        return ids;
2404    }
2405
2406
2407    /**
2408     * Test if a character is whitespace.
2409     * <pre>
2410     * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2411     * </pre>
2412     * @param c The character to test.
2413     * @return true if the character is whitespace.
2414     */
2415    private final boolean isWhitespace (char c)
2416    {
2417        if (c > 0x20)
2418            return false;
2419        if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2420            return true;
2421        return false;   // illegal ...
2422    }
2423
2424
2425    //////////////////////////////////////////////////////////////////////
2426    // Utility routines.
2427    //////////////////////////////////////////////////////////////////////
2428
2429
2430    /**
2431     * Add a character to the data buffer.
2432     */
2433    private void dataBufferAppend (char c)
2434    {
2435        // Expand buffer if necessary.
2436        if (dataBufferPos >= dataBuffer.length)
2437            dataBuffer =
2438                (char[]) extendArray (dataBuffer,
2439                        dataBuffer.length, dataBufferPos);
2440        dataBuffer [dataBufferPos++] = c;
2441    }
2442
2443
2444    /**
2445     * Add a string to the data buffer.
2446     */
2447    private void dataBufferAppend (String s)
2448    {
2449        dataBufferAppend (s.toCharArray (), 0, s.length ());
2450    }
2451
2452
2453    /**
2454     * Append (part of) a character array to the data buffer.
2455     */
2456    private void dataBufferAppend (char ch[], int start, int length)
2457    {
2458        dataBuffer = (char[])
2459                extendArray (dataBuffer, dataBuffer.length,
2460                                    dataBufferPos + length);
2461
2462        System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2463        dataBufferPos += length;
2464    }
2465
2466
2467    /**
2468     * Normalise whitespace in the data buffer.
2469     */
2470    private void dataBufferNormalize ()
2471    {
2472        int i = 0;
2473        int j = 0;
2474        int end = dataBufferPos;
2475
2476        // Skip whitespace at the start.
2477        while (j < end && isWhitespace (dataBuffer [j])) {
2478            j++;
2479        }
2480
2481        // Skip whitespace at the end.
2482        while (end > j && isWhitespace (dataBuffer [end - 1])) {
2483            end --;
2484        }
2485
2486        // Start copying to the left.
2487        while (j < end) {
2488
2489            char c = dataBuffer [j++];
2490
2491            // Normalise all other whitespace to
2492            // a single space.
2493            if (isWhitespace (c)) {
2494                while (j < end && isWhitespace (dataBuffer [j++])) {}
2495
2496                dataBuffer [i++] = ' ';
2497                dataBuffer [i++] = dataBuffer [j - 1];
2498            } else {
2499                dataBuffer [i++] = c;
2500            }
2501        }
2502
2503        // The new length is <= the old one.
2504        dataBufferPos = i;
2505    }
2506
2507
2508    /**
2509     * Convert the data buffer to a string.
2510     */
2511    private String dataBufferToString ()
2512    {
2513        String s = new String (dataBuffer, 0, dataBufferPos);
2514        dataBufferPos = 0;
2515        return s;
2516    }
2517
2518
2519    /**
2520     * Flush the contents of the data buffer to the handler, as
2521     * appropriate, and reset the buffer for new input.
2522     */
2523    private void dataBufferFlush ()
2524    throws SAXException
2525    {
2526        if (currentElementContent == CONTENT_ELEMENTS
2527                && dataBufferPos > 0
2528                && !inCDATA
2529                ) {
2530            // We can't just trust the buffer to be whitespace, there
2531            // are cases when it isn't
2532            for (int i = 0; i < dataBufferPos; i++) {
2533                if (!isWhitespace (dataBuffer [i])) {
2534                    handler.charData (dataBuffer, 0, dataBufferPos);
2535                    dataBufferPos = 0;
2536                }
2537            }
2538            if (dataBufferPos > 0) {
2539                handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2540                dataBufferPos = 0;
2541            }
2542        } else if (dataBufferPos > 0) {
2543            handler.charData (dataBuffer, 0, dataBufferPos);
2544            dataBufferPos = 0;
2545        }
2546    }
2547
2548
2549    /**
2550     * Require a string to appear, or throw an exception.
2551     * <p><em>Precondition:</em> Entity expansion is not required.
2552     * <p><em>Precondition:</em> data buffer has no characters that
2553     * will get sent to the application.
2554     */
2555    private void require (String delim)
2556    throws SAXException, IOException
2557    {
2558        int     length = delim.length ();
2559        char    ch [];
2560                
2561        if (length < dataBuffer.length) {
2562            ch = dataBuffer;
2563            delim.getChars (0, length, ch, 0);
2564        } else
2565            ch = delim.toCharArray ();
2566
2567        if (USE_CHEATS
2568                && length <= (readBufferLength - readBufferPos)) {
2569            int offset = readBufferPos;
2570
2571            for (int i = 0; i < length; i++, offset++)
2572                if (ch [i] != readBuffer [offset])
2573                    error ("required string", null, delim);
2574            readBufferPos = offset;
2575            
2576        } else {
2577            for (int i = 0; i < length; i++)
2578                require (ch [i]);
2579        }
2580    }
2581
2582
2583    /**
2584     * Require a character to appear, or throw an exception.
2585     */
2586    private void require (char delim)
2587    throws SAXException, IOException
2588    {
2589        char c = readCh ();
2590
2591        if (c != delim) {
2592            error ("required character", c, new Character (delim).toString ());
2593        }
2594    }
2595
2596
2597    /**
2598     * Create an interned string from a character array.
2599     * &AElig;lfred uses this method to create an interned version
2600     * of all names and name tokens, so that it can test equality
2601     * with <code>==</code> instead of <code>String.equals ()</code>.
2602     *
2603     * <p>This is much more efficient than constructing a non-interned
2604     * string first, and then interning it.
2605     *
2606     * @param ch an array of characters for building the string.
2607     * @param start the starting position in the array.
2608     * @param length the number of characters to place in the string.
2609     * @return an interned string.
2610     * @see #intern (String)
2611     * @see java.lang.String#intern
2612     */
2613    public String intern (char ch[], int start, int length)
2614    {
2615        int     index = 0;
2616        int     hash = 0;
2617        Object  bucket [];
2618
2619        // Generate a hash code.
2620        for (int i = start; i < start + length; i++)
2621            hash = 31 * hash + ch [i];
2622        hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
2623
2624        // Get the bucket -- consists of {array,String} pairs
2625        if ((bucket = symbolTable [hash]) == null) {
2626            // first string in this bucket
2627            bucket = new Object [8];
2628
2629        // Search for a matching tuple, and
2630        // return the string if we find one.
2631        } else {
2632            while (index < bucket.length) {
2633                char chFound [] = (char []) bucket [index];
2634
2635                // Stop when we hit a null index.
2636                if (chFound == null)
2637                    break;
2638
2639                // If they're the same length, check for a match.
2640                if (chFound.length == length) {
2641                    for (int i = 0; i < chFound.length; i++) {
2642                        // continue search on failure
2643                        if (ch [start + i] != chFound [i]) {
2644                            break;
2645                        } else if (i == length - 1) {
2646                            // That's it, we have a match!
2647                            return (String) bucket [index + 1];
2648                        }
2649                    }
2650                }
2651                index += 2;
2652            }
2653            // Not found -- we'll have to add it.
2654
2655            // Do we have to grow the bucket?
2656            bucket = (Object []) extendArray (bucket, bucket.length, index);
2657        }
2658        symbolTable [hash] = bucket;
2659
2660        // OK, add it to the end of the bucket -- "local" interning.
2661        // Intern "globally" to let applications share interning benefits.
2662        String s = new String (ch, start, length).intern ();
2663        bucket [index] = s.toCharArray ();
2664        bucket [index + 1] = s;
2665        return s;
2666    }
2667
2668
2669    /**
2670     * Ensure the capacity of an array, allocating a new one if
2671     * necessary.  Usually called only a handful of times.
2672     */
2673    private Object extendArray (Object array, int currentSize, int requiredSize)
2674    {
2675        if (requiredSize < currentSize) {
2676            return array;
2677        } else {
2678            Object newArray = null;
2679            int newSize = currentSize * 2;
2680
2681            if (newSize <= requiredSize)
2682                newSize = requiredSize + 1;
2683
2684            if (array instanceof char[])
2685                newArray = new char [newSize];
2686            else if (array instanceof Object[])
2687                newArray = new Object [newSize];
2688            else
2689                throw new RuntimeException ();
2690
2691            System.arraycopy (array, 0, newArray, 0, currentSize);
2692            return newArray;
2693        }
2694    }
2695
2696
2697    //////////////////////////////////////////////////////////////////////
2698    // XML query routines.
2699    //////////////////////////////////////////////////////////////////////
2700
2701
2702    //
2703    // Elements
2704    //
2705
2706    /**
2707     * Get the declared elements for an XML document.
2708     * <p>The results will be valid only after the DTD (if any) has been
2709     * parsed.
2710     * @return An enumeration of all element types declared for this
2711     *   document (as Strings).
2712     * @see #getElementContentType
2713     * @see #getElementContentModel
2714     */
2715    public Iterator declaredElements ()
2716    {
2717        return elementInfo.keySet().iterator();
2718    }
2719
2720
2721    /**
2722     * Look up the content type of an element.
2723     * @param element element info vector
2724     * @param defaultType value for null vector
2725     * @return An integer constant representing the content type.
2726     * @see #CONTENT_UNDECLARED
2727     * @see #CONTENT_ANY
2728     * @see #CONTENT_EMPTY
2729     * @see #CONTENT_MIXED
2730     * @see #CONTENT_ELEMENTS
2731     */
2732    private int getContentType (Object element [], int defaultType)
2733    {
2734        if (element == null)
2735            return defaultType;
2736        else
2737            return ((Integer) element [0]).intValue ();
2738    }
2739
2740
2741    /**
2742     * Look up the content type of an element.
2743     * @param name The element type name.
2744     * @return An integer constant representing the content type.
2745     * @see #getElementContentModel
2746     * @see #CONTENT_UNDECLARED
2747     * @see #CONTENT_ANY
2748     * @see #CONTENT_EMPTY
2749     * @see #CONTENT_MIXED
2750     * @see #CONTENT_ELEMENTS
2751     */
2752    public int getElementContentType (String name)
2753    {
2754        Object element [] = (Object []) elementInfo.get (name);
2755        return getContentType (element, CONTENT_UNDECLARED);
2756    }
2757
2758
2759    /**
2760     * Look up the content model of an element.
2761     * <p>The result will always be null unless the content type is
2762     * CONTENT_ELEMENTS or CONTENT_MIXED.
2763     * @param name The element type name.
2764     * @return The normalised content model, as a string.
2765     * @see #getElementContentType
2766     */
2767    public String getElementContentModel (String name)
2768    {
2769        Object element[] = (Object[]) elementInfo.get (name);
2770        if (element == null) {
2771            return null;
2772        } else {
2773            return (String) element [1];
2774        }
2775    }
2776
2777
2778    /**
2779     * Register an element.
2780     * Array format:
2781     *  element type
2782     *  attribute hash table
2783     */
2784    private void setElement (String name, int contentType,
2785                      String contentModel, HashMap attributes)
2786    throws Exception
2787    {
2788        Object element[];
2789
2790        // Try looking up the element
2791        element = (Object[]) elementInfo.get (name);
2792
2793        // Make a new one if necessary.
2794        if (element == null) {
2795            element = new Object [3];
2796            element [0] = new Integer (CONTENT_UNDECLARED);
2797            element [1] = null;
2798            element [2] = null;
2799        } else if (contentType != CONTENT_UNDECLARED
2800                && ((Integer) element [0]).intValue () != CONTENT_UNDECLARED
2801                ) {
2802            // warn ("multiple declarations for element type", name, null);
2803            return;
2804        }
2805
2806        // Insert the content type, if any.
2807        if (contentType != CONTENT_UNDECLARED) {
2808            element [0] = new Integer (contentType);
2809        }
2810
2811        // Insert the content model, if any.
2812        if (contentModel != null) {
2813            element [1] = contentModel;
2814        }
2815
2816        // Insert the attributes, if any.
2817        if (attributes != null) {
2818            element [2] = attributes;
2819        }
2820
2821        // Save the element info.
2822        elementInfo.put (name, element);
2823    }
2824
2825
2826    /**
2827     * Look up the attribute hash table for an element.
2828     * The hash table is the second item in the element array.
2829     */
2830    private HashMap getElementAttributes (String name)
2831    {
2832        Object element[] = (Object[]) elementInfo.get (name);
2833        if (element == null) {
2834            return null;
2835        } else {
2836            return (HashMap) element [2];
2837        }
2838    }
2839
2840
2841
2842    //
2843    // Attributes
2844    //
2845
2846    /**
2847     * Get the declared attributes for an element type.
2848     * @param elname The name of the element type.
2849     * @return An Iterator of all the attributes declared for
2850     *   a specific element type.  The results will be valid only
2851     *   after the DTD (if any) has been parsed.
2852     * @see #getAttributeType
2853     * @see #getAttributeIterator
2854     * @see #getAttributeDefaultValueType
2855     * @see #getAttributeDefaultValue
2856     * @see #getAttributeExpandedValue
2857     */
2858    private Iterator declaredAttributes (Object element [])
2859    {
2860        HashMap attlist;
2861
2862        if (element == null)
2863            return null;
2864        if ((attlist = (HashMap) element [2]) == null)
2865            return null;
2866        return attlist.keySet().iterator();
2867    }
2868
2869    /**
2870     * Get the declared attributes for an element type.
2871     * @param elname The name of the element type.
2872     * @return An Iterator of all the attributes declared for
2873     *   a specific element type.  The results will be valid only
2874     *   after the DTD (if any) has been parsed.
2875     * @see #getAttributeType
2876     * @see #getAttributeIterator
2877     * @see #getAttributeDefaultValueType
2878     * @see #getAttributeDefaultValue
2879     * @see #getAttributeExpandedValue
2880     */
2881    public Iterator declaredAttributes (String elname)
2882    {
2883        return declaredAttributes ((Object []) elementInfo.get (elname));
2884    }
2885
2886
2887    /**
2888     * Retrieve the declared type of an attribute.
2889     * @param name The name of the associated element.
2890     * @param aname The name of the attribute.
2891     * @return An integer constant representing the attribute type.
2892     * @see #ATTRIBUTE_UNDECLARED
2893     * @see #ATTRIBUTE_CDATA
2894     * @see #ATTRIBUTE_ID
2895     * @see #ATTRIBUTE_IDREF
2896     * @see #ATTRIBUTE_IDREFS
2897     * @see #ATTRIBUTE_ENTITY
2898     * @see #ATTRIBUTE_ENTITIES
2899     * @see #ATTRIBUTE_NMTOKEN
2900     * @see #ATTRIBUTE_NMTOKENS
2901     * @see #ATTRIBUTE_ENUMERATED
2902     * @see #ATTRIBUTE_NOTATION
2903     */
2904    public int getAttributeType (String name, String aname)
2905    {
2906        Object attribute[] = getAttribute (name, aname);
2907        if (attribute == null) {
2908            return ATTRIBUTE_UNDECLARED;
2909        } else {
2910            return ((Integer) attribute [0]).intValue ();
2911        }
2912    }
2913
2914
2915    /**
2916     * Retrieve the allowed values for an enumerated attribute type.
2917     * @param name The name of the associated element.
2918     * @param aname The name of the attribute.
2919     * @return A string containing the token list.
2920     * @see #ATTRIBUTE_ENUMERATED
2921     * @see #ATTRIBUTE_NOTATION
2922     */
2923    public String getAttributeIterator (String name, String aname)
2924    {
2925        Object attribute[] = getAttribute (name, aname);
2926        if (attribute == null) {
2927            return null;
2928        } else {
2929            return (String) attribute [3];
2930        }
2931    }
2932
2933
2934    /**
2935     * Retrieve the default value of a declared attribute.
2936     * @param name The name of the associated element.
2937     * @param aname The name of the attribute.
2938     * @return The default value, or null if the attribute was
2939     *   #IMPLIED or simply undeclared and unspecified.
2940     * @see #getAttributeExpandedValue
2941     */
2942    public String getAttributeDefaultValue (String name, String aname)
2943    {
2944        Object attribute[] = getAttribute (name, aname);
2945        if (attribute == null) {
2946            return null;
2947        } else {
2948            return (String) attribute [1];
2949        }
2950    }
2951
2952
2953    /**
2954     * Retrieve the expanded value of a declared attribute.
2955     * <p>General entities will be expanded (once).
2956     * @param name The name of the associated element.
2957     * @param aname The name of the attribute.
2958     * @return The expanded default value, or null if the attribute was
2959     *   #IMPLIED or simply undeclared
2960     * @see #getAttributeDefaultValue
2961     */
2962    public String getAttributeExpandedValue (String name, String aname)
2963    throws Exception
2964    {
2965        Object attribute[] = getAttribute (name, aname);
2966
2967        if (attribute == null) {
2968            return null;
2969        } else if (attribute [4] == null && attribute [1] != null) {
2970            // we MUST use the same buf for both quotes else the literal
2971            // can't be properly terminated
2972            char buf [] = new char [1];
2973            int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
2974            int type = getAttributeType (name, aname);
2975
2976            if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED)
2977                flags |= LIT_NORMALIZE;
2978            buf [0] = '"';
2979            pushCharArray (null, buf, 0, 1);
2980            pushString (null, (String) attribute [1]);
2981            pushCharArray (null, buf, 0, 1);
2982            attribute [4] = readLiteral (flags);
2983        }
2984        return (String) attribute [4];
2985    }
2986
2987
2988    /**
2989     * Retrieve the default value type of a declared attribute.
2990     * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2991     * @see #ATTRIBUTE_DEFAULT_IMPLIED
2992     * @see #ATTRIBUTE_DEFAULT_REQUIRED
2993     * @see #ATTRIBUTE_DEFAULT_FIXED
2994     */
2995    public int getAttributeDefaultValueType (String name, String aname)
2996    {
2997        Object attribute[] = getAttribute (name, aname);
2998        if (attribute == null) {
2999            return ATTRIBUTE_DEFAULT_UNDECLARED;
3000        } else {
3001            return ((Integer) attribute [2]).intValue ();
3002        }
3003    }
3004
3005
3006    /**
3007     * Register an attribute declaration for later retrieval.
3008     * Format:
3009     * - String type
3010     * - String default value
3011     * - int value type
3012     */
3013    private void setAttribute (String elName, String name, int type,
3014                        String myEnum,
3015                        String value, int valueType)
3016    throws Exception
3017    {
3018        HashMap attlist;
3019        Object attribute[];
3020
3021        // Create a new hashtable if necessary.
3022        attlist = getElementAttributes (elName);
3023        if (attlist == null) {
3024            attlist = new HashMap ();
3025        }
3026
3027        // ignore multiple attribute declarations!
3028        if (attlist.get (name) != null) {
3029            return;
3030        } else {
3031            attribute = new Object [5];
3032            attribute [0] = new Integer (type);
3033            attribute [1] = value;
3034            attribute [2] = new Integer (valueType);
3035            attribute [3] = myEnum;
3036            attribute [4] = null;
3037            attlist.put (name, attribute);
3038
3039            // Use CONTENT_UNDECLARED to avoid overwriting
3040            // existing element declaration.
3041            setElement (elName, CONTENT_UNDECLARED, null, attlist);
3042        }
3043    }
3044
3045
3046    /**
3047     * Retrieve the three-member array representing an
3048     * attribute declaration.
3049     */
3050    private Object[] getAttribute (String elName, String name)
3051    {
3052        HashMap attlist;
3053        Object attribute[];
3054
3055        attlist = getElementAttributes (elName);
3056        if (attlist == null) {
3057            return null;
3058        }
3059
3060        attribute = (Object[]) attlist.get (name);
3061        return attribute;
3062    }
3063
3064
3065    //
3066    // Entities
3067    //
3068
3069    /**
3070     * Get declared entities.
3071     * @return An Iterator of all the entities declared for
3072     *   this XML document.  The results will be valid only
3073     *   after the DTD (if any) has been parsed.
3074     * @see #getEntityType
3075     * @see #getEntityPublicId
3076     * @see #getEntitySystemId
3077     * @see #getEntityValue
3078     * @see #getEntityNotationName
3079     */
3080    public Iterator declaredEntities ()
3081    {
3082        return entityInfo.keySet().iterator();
3083    }
3084
3085
3086    /**
3087     * Find the type of an entity.
3088     * @returns An integer constant representing the entity type.
3089     * @see #ENTITY_UNDECLARED
3090     * @see #ENTITY_INTERNAL
3091     * @see #ENTITY_NDATA
3092     * @see #ENTITY_TEXT
3093     */
3094    public int getEntityType (String ename)
3095    {
3096        Object entity[] = (Object[]) entityInfo.get (ename);
3097        if (entity == null) {
3098            return ENTITY_UNDECLARED;
3099        } else {
3100            return ((Integer) entity [0]).intValue ();
3101        }
3102    }
3103
3104
3105    /**
3106     * Return an external entity's public identifier, if any.
3107     * @param ename The name of the external entity.
3108     * @return The entity's system identifier, or null if the
3109     *   entity was not declared, if it is not an
3110     *   external entity, or if no public identifier was
3111     *   provided.
3112     * @see #getEntityType
3113     */
3114    public String getEntityPublicId (String ename)
3115    {
3116        Object entity[] = (Object[]) entityInfo.get (ename);
3117        if (entity == null) {
3118            return null;
3119        } else {
3120            return (String) entity [1];
3121        }
3122    }
3123
3124
3125    /**
3126     * Return an external entity's system identifier.
3127     * @param ename The name of the external entity.
3128     * @return The entity's system identifier, or null if the
3129     *   entity was not declared, or if it is not an
3130     *   external entity.
3131     * @see #getEntityType
3132     */
3133    public String getEntitySystemId (String ename)
3134    {
3135        Object entity[] = (Object[]) entityInfo.get (ename);
3136        if (entity == null) {
3137            return null;
3138        } else {
3139            return (String) entity [2];
3140        }
3141    }
3142
3143
3144    /**
3145     * Return the value of an internal entity.
3146     * @param ename The name of the internal entity.
3147     * @return The entity's value, or null if the entity was
3148     *   not declared, or if it is not an internal entity.
3149     * @see #getEntityType
3150     */
3151    public String getEntityValue (String ename)
3152    {
3153        Object entity[] = (Object[]) entityInfo.get (ename);
3154        if (entity == null) {
3155            return null;
3156        } else {
3157            return (String) entity [3];
3158        }
3159    }
3160
3161
3162    /**
3163     * Get the notation name associated with an NDATA entity.
3164     * @param ename The NDATA entity name.
3165     * @return The associated notation name, or null if the
3166     *   entity was not declared, or if it is not an
3167     *   NDATA entity.
3168     * @see #getEntityType
3169     */
3170    public String getEntityNotationName (String eName)
3171    {
3172        Object entity[] = (Object[]) entityInfo.get (eName);
3173        if (entity == null) {
3174            return null;
3175        } else {
3176            return (String) entity [4];
3177        }
3178    }
3179
3180
3181    /**
3182     * Register an entity declaration for later retrieval.
3183     */
3184    private void setInternalEntity (String eName, String value)
3185    {
3186        setEntity (eName, ENTITY_INTERNAL, null, null, value, null);
3187    }
3188
3189
3190    /**
3191     * Register an external data entity.
3192     */
3193    private void setExternalDataEntity (String eName, String pubid,
3194                                 String sysid, String nName)
3195    {
3196        setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName);
3197    }
3198
3199
3200    /**
3201     * Register an external text entity.
3202     */
3203    private void setExternalTextEntity (String eName,
3204                    String pubid, String sysid)
3205    {
3206        setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null);
3207    }
3208
3209
3210    /**
3211     * Register an entity declaration for later retrieval.
3212     */
3213    private void setEntity (String eName, int eClass,
3214                     String pubid, String sysid,
3215                     String value, String nName)
3216    {
3217        Object entity[];
3218
3219        if (entityInfo.get (eName) == null) {
3220            entity = new Object [5];
3221            entity [0] = new Integer (eClass);
3222            entity [1] = pubid;
3223            entity [2] = sysid;
3224            entity [3] = value;
3225            entity [4] = nName;
3226
3227            entityInfo.put (eName, entity);
3228        }
3229    }
3230
3231
3232    //
3233    // Notations.
3234    //
3235
3236    /**
3237     * Get declared notations.
3238     * @return An Iterator of all the notations declared for
3239     *   this XML document.  The results will be valid only
3240     *   after the DTD (if any) has been parsed.
3241     * @see #getNotationPublicId
3242     * @see #getNotationSystemId
3243     */
3244    public Iterator declaredNotations ()
3245    {
3246        return notationInfo.keySet().iterator();
3247    }
3248
3249
3250    /**
3251     * Look up the public identifier for a notation.
3252     * You will normally use this method to look up a notation
3253     * that was provided as an attribute value or for an NDATA entity.
3254     * @param nname The name of the notation.
3255     * @return A string containing the public identifier, or null
3256     *   if none was provided or if no such notation was
3257     *   declared.
3258     * @see #getNotationSystemId
3259     */
3260    public String getNotationPublicId (String nname)
3261    {
3262        Object notation[] = (Object[]) notationInfo.get (nname);
3263        if (notation == null) {
3264            return null;
3265        } else {
3266            return (String) notation [0];
3267        }
3268    }
3269
3270
3271    /**
3272     * Look up the system identifier for a notation.
3273     * You will normally use this method to look up a notation
3274     * that was provided as an attribute value or for an NDATA entity.
3275     * @param nname The name of the notation.
3276     * @return A string containing the system identifier, or null
3277     *   if no such notation was declared.
3278     * @see #getNotationPublicId
3279     */
3280    public String getNotationSystemId (String nname)
3281    {
3282        Object notation[] = (Object[]) notationInfo.get (nname);
3283        if (notation == null) {
3284            return null;
3285        } else {
3286            return (String) notation [1];
3287        }
3288    }
3289
3290
3291    /**
3292     * Register a notation declaration for later retrieval.
3293     * Format:
3294     * - public id
3295     * - system id
3296     */
3297    private void setNotation (String nname, String pubid, String sysid)
3298    throws Exception
3299    {
3300        Object notation[];
3301
3302        if (notationInfo.get (nname) == null) {
3303            notation = new Object [2];
3304            notation [0] = pubid;
3305            notation [1] = sysid;
3306            notationInfo.put (nname, notation);
3307        } else {
3308            // VC: Unique Notation Name
3309            // (it's not fatal)
3310        }
3311    }
3312
3313
3314    //
3315    // Location.
3316    //
3317
3318
3319    /**
3320     * Return the current line number.
3321     */
3322    public int getLineNumber ()
3323    {
3324        return line;
3325    }
3326
3327
3328    /**
3329     * Return the current column number.
3330     */
3331    public int getColumnNumber ()
3332    {
3333        return column;
3334    }
3335
3336
3337    //////////////////////////////////////////////////////////////////////
3338    // High-level I/O.
3339    //////////////////////////////////////////////////////////////////////
3340
3341
3342    /**
3343     * Read a single character from the readBuffer.
3344     * <p>The readDataChunk () method maintains the buffer.
3345     * <p>If we hit the end of an entity, try to pop the stack and
3346     * keep going.
3347     * <p> (This approach doesn't really enforce XML's rules about
3348     * entity boundaries, but this is not currently a validating
3349     * parser).
3350     * <p>This routine also attempts to keep track of the current
3351     * position in external entities, but it's not entirely accurate.
3352     * @return The next available input character.
3353     * @see #unread (char)
3354     * @see #unread (String)
3355     * @see #readDataChunk
3356     * @see #readBuffer
3357     * @see #line
3358     * @return The next character from the current input source.
3359     */
3360    private char readCh ()
3361    throws SAXException, IOException
3362    {
3363        char c;
3364
3365        // As long as there's nothing in the
3366        // read buffer, try reading more data
3367        // (for an external entity) or popping
3368        // the entity stack (for either).
3369        while (readBufferPos >= readBufferLength) {
3370            switch (sourceType) {
3371            case INPUT_READER:
3372            case INPUT_EXTERNAL:
3373            case INPUT_STREAM:
3374                readDataChunk ();
3375                while (readBufferLength < 1) {
3376                    popInput ();
3377                    if (readBufferLength < 1) {
3378                        readDataChunk ();
3379                    }
3380                }
3381                break;
3382
3383            default:
3384
3385                popInput ();
3386                break;
3387            }
3388        }
3389
3390        c = readBuffer [readBufferPos++];
3391
3392        if (c == '\n') {
3393            line++;
3394            column = 0;
3395        } else {
3396            if (c == '<')
3397                /* favorite return to parseContent () .. NOP */ ;
3398            else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3399                error ("illegal XML character U+"
3400                        + Integer.toHexString (c));
3401
3402            // If we're in the DTD and in a context where PEs get expanded,
3403            // do so ... 1/14/2000 errata identify those contexts.  There
3404            // are also spots in the internal subset where PE refs are fatal
3405            // errors, hence yet another flag.
3406            else if (c == '%' && expandPE) {
3407                if (peIsError)
3408                    error ("PE reference within decl in internal subset.");
3409                parsePEReference ();
3410                return readCh ();
3411            }
3412            column++;
3413        }
3414
3415        return c;
3416    }
3417
3418
3419    /**
3420     * Push a single character back onto the current input stream.
3421     * <p>This method usually pushes the character back onto
3422     * the readBuffer, while the unread (String) method treats the
3423     * string as a new internal entity.
3424     * <p>I don't think that this would ever be called with 
3425     * readBufferPos = 0, because the methods always reads a character
3426     * before unreading it, but just in case, I've added a boundary
3427     * condition.
3428     * @param c The character to push back.
3429     * @see #readCh
3430     * @see #unread (String)
3431     * @see #unread (char[])
3432     * @see #readBuffer
3433     */
3434    private void unread (char c)
3435    throws SAXException
3436    {
3437        // Normal condition.
3438        if (c == '\n') {
3439            line--;
3440            column = -1;
3441        }
3442        if (readBufferPos > 0) {
3443            readBuffer [--readBufferPos] = c;
3444        } else {
3445            pushString (null, new Character (c).toString ());
3446        }
3447    }
3448
3449
3450    /**
3451     * Push a char array back onto the current input stream.
3452     * <p>NOTE: you must <em>never</em> push back characters that you
3453     * haven't actually read: use pushString () instead.
3454     * @see #readCh
3455     * @see #unread (char)
3456     * @see #unread (String)
3457     * @see #readBuffer
3458     * @see #pushString
3459     */
3460    private void unread (char ch[], int length)
3461    throws SAXException
3462    {
3463        for (int i = 0; i < length; i++) {
3464            if (ch [i] == '\n') {
3465                line--;
3466                column = -1;
3467            }
3468        }
3469        if (length < readBufferPos) {
3470            readBufferPos -= length;
3471        } else {
3472            pushCharArray (null, ch, 0, length);
3473            sourceType = INPUT_BUFFER;
3474        }
3475    }
3476
3477
3478    /**
3479     * Push a new external input source.
3480     * The source will be some kind of parsed entity, such as a PE
3481     * (including the external DTD subset) or content for the body.
3482     * <p>TODO: Right now, this method always attempts to autodetect
3483     * the encoding; in the future, it should allow the caller to 
3484     * request an encoding explicitly, and it should also look at the
3485     * headers with an HTTP connection.
3486     * @param url The java.net.URL object for the entity.
3487     * @see SAXDriver#resolveEntity
3488     * @see #pushString
3489     * @see #sourceType
3490     * @see #pushInput
3491     * @see #detectEncoding
3492     * @see #sourceType
3493     * @see #readBuffer
3494     */
3495    private void pushURL (
3496        String          ename,
3497        String          publicId,
3498        String          systemId,
3499        Reader          reader,
3500        InputStream     stream,
3501        String          encoding
3502    ) throws SAXException, IOException
3503    {
3504        URL     url;
3505        boolean ignoreEncoding = false;
3506
3507        // Push the existing status.
3508        pushInput (ename);
3509
3510        // Create a new read buffer.
3511        // (Note the four-character margin)
3512        readBuffer = new char [READ_BUFFER_MAX + 4];
3513        readBufferPos = 0;
3514        readBufferLength = 0;
3515        readBufferOverflow = -1;
3516        is = null;
3517        line = 1;
3518
3519        currentByteCount = 0;
3520
3521        // Make any system ID (URI/URL) absolute.  There's one case
3522        // where it may be null:  parser was invoked without providing
3523        // one, e.g. since the XML data came from a memory buffer.
3524
3525        if (systemId != null && externalEntity != null) {
3526            systemId = new URL (externalEntity.getURL (), systemId).toString ();
3527        } else if (baseURI != null) {
3528            systemId = new URL (new URL (baseURI), systemId).toString ();
3529            // throws IOException if couldn't create new URL
3530        }
3531
3532        // See if the application wants to
3533        // redirect the system ID and/or
3534        // supply its own character stream.
3535        if (reader == null && stream == null && systemId != null) {
3536            Object input = handler.resolveEntity (publicId, systemId);
3537            if (input != null) {
3538                if (input instanceof String) {
3539                    systemId = (String) input;
3540                } else if (input instanceof InputStream) {
3541                    stream = (InputStream) input;
3542                } else if (input instanceof Reader) {
3543                    reader = (Reader) input;
3544                }
3545            }
3546        }
3547
3548        // Start the entity.
3549        if (systemId != null) {
3550            handler.startExternalEntity (systemId);
3551        } else {
3552            handler.startExternalEntity ("[unidentified data stream]");
3553        }
3554
3555        // If there's an explicit character stream, just
3556        // ignore encoding declarations.
3557        if (reader != null) {
3558            sourceType = INPUT_READER;
3559            this.reader = reader;
3560            tryEncodingDecl (true);
3561            return;
3562        }
3563        
3564        // Else we handle the conversion, and need to ensure
3565        // it's done right.
3566        if (stream != null) {
3567            sourceType = INPUT_STREAM;
3568            is = stream;
3569            url = null;
3570        } else {
3571            // We have to open our own stream to the URL.
3572
3573            // Set the new status
3574            sourceType = INPUT_EXTERNAL;
3575            url = new URL (systemId);
3576
3577            externalEntity = url.openConnection ();
3578            externalEntity.connect ();
3579            is = externalEntity.getInputStream ();
3580        }
3581
3582        // If we get to here, there must be
3583        // an InputStream available.
3584        if (!is.markSupported ()) {
3585            is = new BufferedInputStream (is);
3586        }
3587
3588        // Get any external encoding label.
3589        if (encoding == null && externalEntity != null) {
3590            // External labels can be untrustworthy; filesystems in
3591            // particular often have the wrong default for content
3592            // that wasn't locally originated.  Those we autodetect.
3593            if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3594                int temp;
3595
3596                // application/xml;charset=something;otherAttr=...
3597                // ... with many variants on 'something'
3598                encoding = externalEntity.getContentType ();
3599
3600                // MHK code (fix for Saxon 5.5.1/007): protect against encoding==null
3601                if (encoding==null) {
3602                    temp = -1;
3603                } else {
3604                    temp = encoding.indexOf ("charset");
3605                }
3606
3607                // RFC 2376 sez MIME text defaults to ASCII, but since the
3608                // JDK will create a MIME type out of thin air, we always
3609                // autodetect when there's no explicit charset attribute.
3610                if (temp < 0)
3611                    encoding = null;    // autodetect
3612                else {
3613                    temp = encoding.indexOf ('=', temp + 7);
3614                    encoding = encoding.substring (temp);
3615                    if ((temp = encoding.indexOf (';')) > 0)
3616                        encoding = encoding.substring (0, temp);
3617
3618                    // attributes can have comment fields (RFC 822)
3619                    if ((temp = encoding.indexOf ('(')) > 0)
3620                        encoding = encoding.substring (0, temp);
3621                    // ... and values may be quoted
3622                    if ((temp = encoding.indexOf ('"')) > 0)
3623                        encoding = encoding.substring (temp + 1,
3624                                encoding.indexOf ('"', temp + 2));
3625                    encoding.trim ();
3626                }
3627            }
3628        }
3629
3630        // if we got an external encoding label, use it ...
3631        if (encoding != null) {
3632            this.encoding = ENCODING_EXTERNAL;
3633            setupDecoding (encoding);
3634            ignoreEncoding = true;
3635        
3636        // ... else autodetect
3637        } else {
3638            detectEncoding ();
3639            ignoreEncoding = false;
3640        }
3641
3642        // Read any XML or text declaration.
3643        tryEncodingDecl (ignoreEncoding);
3644    }
3645
3646
3647    /**
3648     * Check for an encoding declaration.  This is the second part of the
3649     * XML encoding autodetection algorithm, relying on detectEncoding to
3650     * get to the point that this part can read any encoding declaration
3651     * in the document (using only US-ASCII characters).
3652     *
3653     * <p> Because this part starts to fill parser buffers with this data,
3654     * it's tricky to to a reader so that Java's built-in decoders can be
3655     * used for the character encodings that aren't built in to this parser
3656     * (such as EUC-JP, KOI8-R, Big5, etc).
3657     *
3658     * @return any encoding in the declaration, uppercased; or null
3659     * @see detectEncoding
3660     */
3661    private String tryEncodingDecl (boolean ignoreEncoding)
3662    throws SAXException, IOException
3663    {
3664        // Read the XML/text declaration.
3665        if (tryRead ("<?xml")) {
3666            dataBufferFlush ();
3667            if (tryWhitespace ()) {
3668                if (inputStack.size () > 0) {
3669                    return parseTextDecl (ignoreEncoding);
3670                } else {
3671                    return parseXMLDecl (ignoreEncoding);
3672                }
3673            } else {
3674                unread ("xml".toCharArray (), 3);
3675                parsePI ();
3676            }
3677        }
3678        return null;
3679    }
3680
3681
3682    /**
3683     * Attempt to detect the encoding of an entity.
3684     * <p>The trick here (as suggested in the XML standard) is that
3685     * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 
3686     * <b>must</b> begin with an XML declaration or an encoding
3687     * declaration; we simply have to look for "&lt;?xml" in various
3688     * encodings.
3689     * <p>This method has no way to distinguish among 8-bit encodings.
3690     * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3691     * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
3692     * should work, but most will be rejected later by setupDecoding ().
3693     * <p>I don't currently detect EBCDIC, since I'm concerned that it
3694     * could also be a valid UTF-8 sequence; I'll have to do more checking
3695     * later.
3696     * @see #tryEncoding (byte[], byte, byte, byte, byte)
3697     * @see #tryEncoding (byte[], byte, byte)
3698     * @see #setupDecoding
3699     * @see #read8bitEncodingDeclaration
3700     */
3701    private void detectEncoding ()
3702    throws SAXException, IOException
3703    {
3704        byte signature[] = new byte [4];
3705
3706        // Read the first four bytes for
3707        // autodetection.
3708        is.mark (4);
3709        is.read (signature);
3710        is.reset ();
3711
3712        //
3713        // FIRST:  four byte encodings (who uses these?)
3714        //
3715        if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3716                          (byte) 0x00, (byte) 0x3c)) {
3717            // UCS-4 must begin with "<?xml"
3718            // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3719            encoding = ENCODING_UCS_4_1234;
3720
3721        } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3722                                 (byte) 0x00, (byte) 0x00)) {
3723            // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3724            encoding = ENCODING_UCS_4_4321;
3725
3726        } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3727                                 (byte) 0x3c, (byte) 0x00)) {
3728            // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3729            encoding = ENCODING_UCS_4_2143;
3730
3731        } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3732                                 (byte) 0x00, (byte) 0x00)) {
3733            // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3734            encoding = ENCODING_UCS_4_3412;
3735
3736            // 00 00 fe ff UCS_4_1234 (with BOM)
3737            // ff fe 00 00 UCS_4_4321 (with BOM)
3738        }
3739
3740        //
3741        // SECOND:  two byte encodings
3742        // note ... with 1/14/2000 errata the XML spec identifies some
3743        // more "broken UTF-16" autodetection cases, with no XML decl,
3744        // which we don't handle here (that's legal too).
3745        //
3746        else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
3747            // UCS-2 with a byte-order marker. (UTF-16)
3748            // 0xfe 0xff: UCS-2, big-endian (12)
3749            encoding = ENCODING_UCS_2_12;
3750            is.read (); is.read ();
3751
3752        } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
3753            // UCS-2 with a byte-order marker. (UTF-16)
3754            // 0xff 0xfe: UCS-2, little-endian (21)
3755            encoding = ENCODING_UCS_2_21;
3756            is.read (); is.read ();
3757
3758        } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3759                                 (byte) 0x00, (byte) 0x3f)) {
3760            // UTF-16-BE (otherwise, malformed UTF-16)
3761            // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3762            encoding = ENCODING_UCS_2_12;
3763            error ("no byte-order mark for UCS-2 entity");
3764
3765        } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3766                                 (byte) 0x3f, (byte) 0x00)) {
3767            // UTF-16-LE (otherwise, malformed UTF-16)
3768            // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3769            encoding = ENCODING_UCS_2_21;
3770            error ("no byte-order mark for UCS-2 entity");
3771        }
3772
3773        //
3774        // THIRD:  ASCII-derived encodings, fixed and variable lengths
3775        //
3776        else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
3777                               (byte) 0x78, (byte) 0x6d)) {
3778            // ASCII derived
3779            // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3780            encoding = ENCODING_UTF_8;
3781            read8bitEncodingDeclaration ();
3782
3783        } else {
3784            // 4c 6f a7 94 ... we don't understand EBCDIC flavors
3785            // ... but we COULD at least kick in some fixed code page
3786
3787            // (default) UTF-8 without encoding/XML declaration
3788            encoding = ENCODING_UTF_8;
3789        }
3790    }
3791
3792
3793    /**
3794     * Check for a four-byte signature.
3795     * <p>Utility routine for detectEncoding ().
3796     * <p>Always looks for some part of "<?XML" in a specific encoding.
3797     * @param sig The first four bytes read.
3798     * @param b1 The first byte of the signature
3799     * @param b2 The second byte of the signature
3800     * @param b3 The third byte of the signature
3801     * @param b4 The fourth byte of the signature
3802     * @see #detectEncoding
3803     */
3804    private static boolean tryEncoding (
3805        byte sig[], byte b1, byte b2, byte b3, byte b4)
3806    {
3807        return (sig [0] == b1 && sig [1] == b2
3808                && sig [2] == b3 && sig [3] == b4);
3809    }
3810
3811
3812    /**
3813     * Check for a two-byte signature.
3814     * <p>Looks for a UCS-2 byte-order mark.
3815     * <p>Utility routine for detectEncoding ().
3816     * @param sig The first four bytes read.
3817     * @param b1 The first byte of the signature
3818     * @param b2 The second byte of the signature
3819     * @see #detectEncoding
3820     */
3821    private static boolean tryEncoding (byte sig[], byte b1, byte b2)
3822    {
3823        return ((sig [0] == b1) && (sig [1] == b2));
3824    }
3825
3826
3827    /**
3828     * This method pushes a string back onto input.
3829     * <p>It is useful either as the expansion of an internal entity, 
3830     * or for backtracking during the parse.
3831     * <p>Call pushCharArray () to do the actual work.
3832     * @param s The string to push back onto input.
3833     * @see #pushCharArray
3834     */
3835    private void pushString (String ename, String s)
3836    throws SAXException
3837    {
3838        char ch[] = s.toCharArray ();
3839        pushCharArray (ename, ch, 0, ch.length);
3840    }
3841
3842
3843    /**
3844     * Push a new internal input source.
3845     * <p>This method is useful for expanding an internal entity,
3846     * or for unreading a string of characters.  It creates a new
3847     * readBuffer containing the characters in the array, instead
3848     * of characters converted from an input byte stream.
3849     * @param ch The char array to push.
3850     * @see #pushString
3851     * @see #pushURL
3852     * @see #readBuffer
3853     * @see #sourceType
3854     * @see #pushInput
3855     */
3856    private void pushCharArray (String ename, char ch[], int start, int length)
3857    throws SAXException
3858    {
3859        // Push the existing status
3860        pushInput (ename);
3861        sourceType = INPUT_INTERNAL;
3862        readBuffer = ch;
3863        readBufferPos = start;
3864        readBufferLength = length;
3865        readBufferOverflow = -1;
3866    }
3867
3868
3869    /**
3870     * Save the current input source onto the stack.
3871     * <p>This method saves all of the global variables associated with
3872     * the current input source, so that they can be restored when a new
3873     * input source has finished.  It also tests for entity recursion.
3874     * <p>The method saves the following global variables onto a stack
3875     * using a fixed-length array:
3876     * <ol>
3877     * <li>sourceType
3878     * <li>externalEntity
3879     * <li>readBuffer
3880     * <li>readBufferPos
3881     * <li>readBufferLength
3882     * <li>line
3883     * <li>encoding
3884     * </ol>
3885     * @param ename The name of the entity (if any) causing the new input.
3886     * @see #popInput
3887     * @see #sourceType
3888     * @see #externalEntity
3889     * @see #readBuffer
3890     * @see #readBufferPos
3891     * @see #readBufferLength
3892     * @see #line
3893     * @see #encoding
3894     */
3895    private void pushInput (String ename)
3896    throws SAXException
3897    {
3898        Object input[] = new Object [12];
3899
3900        // Check for entity recursion.
3901        if (ename != null) {
3902            Iterator entities = entityStack.iterator ();
3903            while (entities.hasNext ()) {
3904                String e = (String) entities.next ();
3905                if (e == ename) {
3906                    error ("recursive reference to entity", ename, null);
3907                }
3908            }
3909        }
3910        entityStack.add (ename);
3911
3912        // Don't bother if there is no current input.
3913        if (sourceType == INPUT_NONE) {
3914            return;
3915        }
3916
3917        // Set up a snapshot of the current
3918        // input source.
3919        input [0] = new Integer (sourceType);
3920        input [1] = externalEntity;
3921        input [2] = readBuffer;
3922        input [3] = new Integer (readBufferPos);
3923        input [4] = new Integer (readBufferLength);
3924        input [5] = new Integer (line);
3925        input [6] = new Integer (encoding);
3926        input [7] = new Integer (readBufferOverflow);
3927        input [8] = is;
3928        input [9] = new Integer (currentByteCount);
3929        input [10] = new Integer (column);
3930        input [11] = reader;
3931
3932        // Push it onto the stack.
3933        inputStack.add (input);
3934    }
3935
3936
3937    /**
3938     * Restore a previous input source.
3939     * <p>This method restores all of the global variables associated with
3940     * the current input source.
3941     * @exception java.io.EOFException
3942     *    If there are no more entries on the input stack.
3943     * @see #pushInput
3944     * @see #sourceType
3945     * @see #externalEntity
3946     * @see #readBuffer
3947     * @see #readBufferPos
3948     * @see #readBufferLength
3949     * @see #line
3950     * @see #encoding
3951     */
3952    private void popInput ()
3953    throws SAXException, IOException
3954    {
3955        Object input[];
3956
3957
3958        switch (sourceType) {
3959
3960        case INPUT_EXTERNAL:
3961            if (externalEntity != null) {
3962                handler.endExternalEntity (
3963                        externalEntity.getURL ().toString ());
3964            }
3965            break;
3966        case INPUT_STREAM:
3967            if (baseURI != null) {
3968                handler.endExternalEntity (baseURI);
3969            }
3970            is.close ();
3971            break;
3972        case INPUT_READER:
3973            if (baseURI != null) {
3974                handler.endExternalEntity (baseURI);
3975            }
3976            reader.close ();
3977            break;
3978        }
3979
3980        // Throw an EOFException if there
3981        // is nothing else to pop.
3982        if (inputStack.isEmpty ()) {
3983            throw new EOFException ("no more input");
3984        } else {
3985            String s;
3986            input = (Object[]) inputStack.remove ( inputStack.size() - 1 );
3987            s = (String) entityStack.remove ( entityStack.size() - 1 );
3988        }
3989
3990        sourceType = ((Integer) input [0]).intValue ();
3991        externalEntity = (URLConnection) input [1];
3992        readBuffer = (char[]) input [2];
3993        readBufferPos = ((Integer) input [3]).intValue ();
3994        readBufferLength = ((Integer) input [4]).intValue ();
3995        line = ((Integer) input [5]).intValue ();
3996        encoding = ((Integer) input [6]).intValue ();
3997        readBufferOverflow = ((Integer) input [7]).intValue ();
3998        is = (InputStream) input [8];
3999        currentByteCount = ((Integer) input [9]).intValue ();
4000        column = ((Integer) input [10]).intValue ();
4001        reader = (Reader) input [11];
4002    }
4003
4004
4005    /**
4006     * Return true if we can read the expected character.
4007     * <p>Note that the character will be removed from the input stream
4008     * on success, but will be put back on failure.  Do not attempt to
4009     * read the character again if the method succeeds.
4010     * @param delim The character that should appear next.  For a
4011     *        insensitive match, you must supply this in upper-case.
4012     * @return true if the character was successfully read, or false if
4013     *   it was not.
4014     * @see #tryRead (String)
4015     */
4016    private boolean tryRead (char delim)
4017    throws SAXException, IOException
4018    {
4019        char c;
4020
4021        // Read the character
4022        c = readCh ();
4023
4024        // Test for a match, and push the character
4025        // back if the match fails.
4026        if (c == delim) {
4027            return true;
4028        } else {
4029            unread (c);
4030            return false;
4031        }
4032    }
4033
4034
4035    /**
4036     * Return true if we can read the expected string.
4037     * <p>This is simply a convenience method.
4038     * <p>Note that the string will be removed from the input stream
4039     * on success, but will be put back on failure.  Do not attempt to
4040     * read the string again if the method succeeds.
4041     * <p>This method will push back a character rather than an
4042     * array whenever possible (probably the majority of cases).
4043     * <p><b>NOTE:</b> This method currently has a hard-coded limit
4044     * of 100 characters for the delimiter.
4045     * @param delim The string that should appear next.
4046     * @return true if the string was successfully read, or false if
4047     *   it was not.
4048     * @see #tryRead (char)
4049     */
4050    private boolean tryRead (String delim)
4051    throws SAXException, IOException
4052    {
4053        char ch[] = delim.toCharArray ();
4054        char c;
4055
4056        // Compare the input, character-
4057        // by character.
4058
4059        for (int i = 0; i < ch.length; i++) {
4060            c = readCh ();
4061            if (c != ch [i]) {
4062                unread (c);
4063                if (i != 0) {
4064                    unread (ch, i);
4065                }
4066                return false;
4067            }
4068        }
4069        return true;
4070    }
4071
4072
4073
4074    /**
4075     * Return true if we can read some whitespace.
4076     * <p>This is simply a convenience method.
4077     * <p>This method will push back a character rather than an
4078     * array whenever possible (probably the majority of cases).
4079     * @return true if whitespace was found.
4080     */
4081    private boolean tryWhitespace ()
4082    throws SAXException, IOException
4083    {
4084        char c;
4085        c = readCh ();
4086        if (isWhitespace (c)) {
4087            skipWhitespace ();
4088            return true;
4089        } else {
4090            unread (c);
4091            return false;
4092        }
4093    }
4094
4095
4096    /**
4097     * Read all data until we find the specified string.
4098     * This is useful for scanning CDATA sections and PIs.
4099     * <p>This is inefficient right now, since it calls tryRead ()
4100     * for every character.
4101     * @param delim The string delimiter
4102     * @see #tryRead (String, boolean)
4103     * @see #readCh
4104     */
4105    private void parseUntil (String delim)
4106    throws SAXException, IOException
4107    {
4108        char c;
4109        int startLine = line;
4110
4111        try {
4112            while (!tryRead (delim)) {
4113                c = readCh ();
4114                dataBufferAppend (c);
4115            }
4116        } catch (EOFException e) {
4117            error ("end of input while looking for delimiter "
4118                + "(started on line " + startLine
4119                + ')', null, delim);
4120        }
4121    }
4122
4123
4124    /**
4125     * Read just the encoding declaration (or XML declaration) at the 
4126     * start of an external entity.
4127     * When this method is called, we know that the declaration is
4128     * present (or appears to be).  We also know that the entity is
4129     * in some sort of ASCII-derived 8-bit encoding.
4130     * The idea of this is to let us read what the 8-bit encoding is
4131     * before we've committed to converting any more of the file; the
4132     * XML or encoding declaration must be in 7-bit ASCII, so we're
4133     * safe as long as we don't go past it.
4134     */
4135    private void read8bitEncodingDeclaration ()
4136    throws SAXException, IOException
4137    {
4138        int ch;
4139        readBufferPos = readBufferLength = 0;
4140
4141        while (true) {
4142            ch = is.read ();
4143            readBuffer [readBufferLength++] = (char) ch;
4144            switch (ch) {
4145              case (int) '>':
4146                return;
4147              case - 1:
4148                error ("end of file before end of XML or encoding declaration.",
4149                       null, "?>");
4150            }
4151            if (readBuffer.length == readBufferLength)
4152                error ("unfinished XML or encoding declaration");
4153        }
4154    }
4155
4156
4157    //////////////////////////////////////////////////////////////////////
4158    // Low-level I/O.
4159    //////////////////////////////////////////////////////////////////////
4160
4161
4162    /**
4163     * Read a chunk of data from an external input source.
4164     * <p>This is simply a front-end that fills the rawReadBuffer
4165     * with bytes, then calls the appropriate encoding handler.
4166     * @see #encoding
4167     * @see #rawReadBuffer
4168     * @see #readBuffer
4169     * @see #filterCR
4170     * @see #copyUtf8ReadBuffer
4171     * @see #copyIso8859_1ReadBuffer
4172     * @see #copyUcs_2ReadBuffer
4173     * @see #copyUcs_4ReadBuffer
4174     */
4175    private void readDataChunk ()
4176    throws SAXException, IOException
4177    {
4178        int count, i, j;
4179
4180        // See if we have any overflow (filterCR sets for CR at end)
4181        if (readBufferOverflow > -1) {
4182            readBuffer [0] = (char) readBufferOverflow;
4183            readBufferOverflow = -1;
4184            readBufferPos = 1;
4185            sawCR = true;
4186        } else {
4187            readBufferPos = 0;
4188            sawCR = false;
4189        }
4190
4191        // input from a character stream.
4192        if (sourceType == INPUT_READER) {
4193            count = reader.read (readBuffer,
4194                            readBufferPos, READ_BUFFER_MAX - readBufferPos);
4195            if (count < 0)
4196                readBufferLength = readBufferPos;
4197            else
4198                readBufferLength = readBufferPos + count;
4199            if (readBufferLength > 0)
4200                filterCR (count >= 0);
4201            sawCR = false;
4202            return;
4203        }
4204
4205        // Read as many bytes as possible into the raw buffer.
4206        count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4207
4208        // Dispatch to an encoding-specific reader method to populate
4209        // the readBuffer.  In most parser speed profiles, these routines
4210        // show up at the top of the CPU usage chart.
4211        if (count > 0) {
4212            switch (encoding) {
4213              // one byte builtins
4214              case ENCODING_ASCII:
4215                copyIso8859_1ReadBuffer (count, (char) 0x0080);
4216                break;
4217              case ENCODING_UTF_8:
4218                copyUtf8ReadBuffer (count);
4219                break;
4220              case ENCODING_ISO_8859_1:
4221                copyIso8859_1ReadBuffer (count, (char) 0);
4222                break;
4223
4224              // two byte builtins
4225              case ENCODING_UCS_2_12:
4226                copyUcs2ReadBuffer (count, 8, 0);
4227                break;
4228              case ENCODING_UCS_2_21:
4229                copyUcs2ReadBuffer (count, 0, 8);
4230                break;
4231
4232              // four byte builtins
4233              case ENCODING_UCS_4_1234:
4234                copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4235                break;
4236              case ENCODING_UCS_4_4321:
4237                copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4238                break;
4239              case ENCODING_UCS_4_2143:
4240                copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4241                break;
4242              case ENCODING_UCS_4_3412:
4243                copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4244                break;
4245            }
4246        } else
4247            readBufferLength = readBufferPos;
4248
4249        readBufferPos = 0;
4250
4251        // Filter out all carriage returns if we've seen any
4252        // (including any saved from a previous read)
4253        if (sawCR) {
4254            filterCR (count >= 0);
4255            sawCR = false;
4256
4257            // must actively report EOF, lest some CRs get lost.
4258            if (readBufferLength == 0 && count >= 0)
4259                readDataChunk ();
4260        }
4261
4262        if (count > 0)
4263            currentByteCount += count;
4264    }
4265
4266
4267    /**
4268     * Filter carriage returns in the read buffer.
4269     * CRLF becomes LF; CR becomes LF.
4270     * @param moreData true iff more data might come from the same source
4271     * @see #readDataChunk
4272     * @see #readBuffer
4273     * @see #readBufferOverflow
4274     */
4275    private void filterCR (boolean moreData)
4276    {
4277        int i, j;
4278
4279        readBufferOverflow = -1;
4280
4281loop:
4282        for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4283            switch (readBuffer [j]) {
4284            case '\r':
4285                if (j == readBufferLength - 1) {
4286                    if (moreData) {
4287                        readBufferOverflow = '\r';
4288                        readBufferLength--;
4289                    } else      // CR at end of buffer
4290                        readBuffer [i++] = '\n';
4291                    break loop;
4292                } else if (readBuffer [j + 1] == '\n') {
4293                    j++;
4294                }
4295                readBuffer [i] = '\n';
4296                break;
4297
4298            case '\n':
4299            default:
4300                readBuffer [i] = readBuffer [j];
4301                break;
4302            }
4303        }
4304        readBufferLength = i;
4305    }
4306
4307    /**
4308     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
4309     * <p>When readDataChunk () calls this method, the raw bytes are in 
4310     * rawReadBuffer, and the final characters will appear in 
4311     * readBuffer.
4312     * @param count The number of bytes to convert.
4313     * @see #readDataChunk
4314     * @see #rawReadBuffer
4315     * @see #readBuffer
4316     * @see #getNextUtf8Byte
4317     */
4318    private void copyUtf8ReadBuffer (int count)
4319    throws SAXException, IOException
4320    {
4321        int     i = 0;
4322        int     j = readBufferPos;
4323        int     b1;
4324        char    c = 0;
4325
4326        /*
4327        // check once, so the runtime won't (if it's smart enough)
4328        if (count < 0 || count > rawReadBuffer.length)
4329            throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
4330        */
4331
4332        while (i < count) {
4333            b1 = rawReadBuffer [i++];
4334
4335            // Determine whether we are dealing
4336            // with a one-, two-, three-, or four-
4337            // byte sequence.
4338            if (b1 < 0) {
4339                if ((b1 & 0xe0) == 0xc0) {
4340                    // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
4341                    c = (char) (((b1 & 0x1f) << 6)
4342                                | getNextUtf8Byte (i++, count));
4343                } else if ((b1 & 0xf0) == 0xe0) {
4344                    // 3-byte sequence:
4345                    // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
4346                    // most CJKV characters
4347                    c = (char) (((b1 & 0x0f) << 12) |
4348                                   (getNextUtf8Byte (i++, count) << 6) |
4349                                   getNextUtf8Byte (i++, count));
4350                } else if ((b1 & 0xf8) == 0xf0) {
4351                    // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
4352                    //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
4353                    // (uuuuu = wwww + 1)
4354                    // "Surrogate Pairs" ... from the "Astral Planes"
4355                    int iso646 = b1 & 07;
4356                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4357                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4358                    iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4359
4360                    if (iso646 <= 0xffff) {
4361                        c = (char) iso646;
4362                    } else {
4363                        if (iso646 > 0x0010ffff)
4364                            encodingError (
4365                                "UTF-8 value out of range for Unicode",
4366                                iso646, 0);
4367                        iso646 -= 0x010000;
4368                        readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
4369                        readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
4370                        continue;
4371                    }
4372                } else {
4373                    // The five and six byte encodings aren't supported;
4374                    // they exceed the Unicode (and XML) range.
4375                    encodingError (
4376                            "unsupported five or six byte UTF-8 sequence",
4377                            0xff & b1, i);
4378                    // NOTREACHED
4379                    c = 0;
4380                }
4381            } else {
4382                // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
4383                // (US-ASCII character, "common" case, one branch to here)
4384                c = (char) b1;
4385            }
4386            readBuffer [j++] = c;
4387            if (c == '\r')
4388                sawCR = true;
4389        }
4390        // How many characters have we read?
4391        readBufferLength = j;
4392    }
4393
4394
4395    /**
4396     * Return the next byte value in a UTF-8 sequence.
4397     * If it is not possible to get a byte from the current
4398     * entity, throw an exception.
4399     * @param pos The current position in the rawReadBuffer.
4400     * @param count The number of bytes in the rawReadBuffer
4401     * @return The significant six bits of a non-initial byte in
4402     *   a UTF-8 sequence.
4403     * @exception EOFException If the sequence is incomplete.
4404     */
4405    private int getNextUtf8Byte (int pos, int count)
4406    throws SAXException, IOException
4407    {
4408        int val;
4409
4410        // Take a character from the buffer
4411        // or from the actual input stream.
4412        if (pos < count) {
4413            val = rawReadBuffer [pos];
4414        } else {
4415            val = is.read ();
4416            if (val == -1) {
4417                encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4418                        -1, pos);
4419            }
4420        }
4421
4422        // Check for the correct bits at the start.
4423        if ((val & 0xc0) != 0x80) {
4424            encodingError ("bad continuation of multi-byte UTF-8 sequence",
4425                    val, pos + 1);
4426        }
4427
4428        // Return the significant bits.
4429        return (val & 0x3f);
4430    }
4431
4432
4433    /**
4434     * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4435     * UTF-16 characters.
4436     *
4437     * <p>When readDataChunk () calls this method, the raw bytes are in 
4438     * rawReadBuffer, and the final characters will appear in 
4439     * readBuffer.
4440     *
4441     * @param count The number of bytes to convert.
4442     * @param mask For ASCII conversion, 0x7f; else, 0xff.
4443     * @see #readDataChunk
4444     * @see #rawReadBuffer
4445     * @see #readBuffer
4446     */
4447    private void copyIso8859_1ReadBuffer (int count, char mask)
4448    throws IOException
4449    {
4450        int i, j;
4451        for (i = 0, j = readBufferPos; i < count; i++, j++) {
4452            char c = (char) (rawReadBuffer [i] & 0xff);
4453            if ((c & mask) != 0)
4454                throw new CharConversionException ("non-ASCII character U+"
4455                                                    + Integer.toHexString (c));
4456            readBuffer [j] = c;
4457            if (c == '\r') {
4458                sawCR = true;
4459            }
4460        }
4461        readBufferLength = j;
4462    }
4463
4464
4465    /**
4466     * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4467     * (as used in Java string manipulation).
4468     *
4469     * <p>When readDataChunk () calls this method, the raw bytes are in 
4470     * rawReadBuffer, and the final characters will appear in 
4471     * readBuffer.
4472     * @param count The number of bytes to convert.
4473     * @param shift1 The number of bits to shift byte 1.
4474     * @param shift2 The number of bits to shift byte 2
4475     * @see #readDataChunk
4476     * @see #rawReadBuffer
4477     * @see #readBuffer
4478     */
4479    private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4480    throws SAXException
4481    {
4482        int j = readBufferPos;
4483
4484        if (count > 0 && (count % 2) != 0) {
4485            encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4486        }
4487        // The loops are faster with less internal brancing; hence two
4488        if (shift1 == 0) {      // "UTF-16-LE"
4489            for (int i = 0; i < count; i += 2) {
4490                char c = (char) (rawReadBuffer [i + 1] << 8);
4491                c |= 0xff & rawReadBuffer [i];
4492                readBuffer [j++] = c;
4493                if (c == '\r')
4494                    sawCR = true;
4495            }
4496        } else {        // "UTF-16-BE"
4497            for (int i = 0; i < count; i += 2) {
4498                char c = (char) (rawReadBuffer [i] << 8);
4499                c |= 0xff & rawReadBuffer [i + 1];
4500                readBuffer [j++] = c;
4501                if (c == '\r')
4502                    sawCR = true;
4503            }
4504        }
4505        readBufferLength = j;
4506    }
4507
4508
4509    /**
4510     * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4511     *
4512     * <p>When readDataChunk () calls this method, the raw bytes are in 
4513     * rawReadBuffer, and the final characters will appear in 
4514     * readBuffer.
4515     * <p>Java has Unicode chars, and this routine uses surrogate pairs
4516     * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
4517     * exception is thrown if the ISO-10646 character has no Unicode
4518     * representation.
4519     *
4520     * @param count The number of bytes to convert.
4521     * @param shift1 The number of bits to shift byte 1.
4522     * @param shift2 The number of bits to shift byte 2
4523     * @param shift3 The number of bits to shift byte 2
4524     * @param shift4 The number of bits to shift byte 2
4525     * @see #readDataChunk
4526     * @see #rawReadBuffer
4527     * @see #readBuffer
4528     */
4529    private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4530                              int shift3, int shift4)
4531    throws SAXException
4532    {
4533        int j = readBufferPos;
4534        int value;
4535
4536        if (count > 0 && (count % 4) != 0) {
4537            encodingError (
4538                    "number of bytes in UCS-4 encoding not divisible by 4",
4539                    -1, count);
4540        }
4541        for (int i = 0; i < count; i += 4) {
4542            value = (((rawReadBuffer [i] & 0xff) << shift1) |
4543                      ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4544                      ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4545                      ((rawReadBuffer [i + 3] & 0xff) << shift4));
4546            if (value < 0x0000ffff) {
4547                readBuffer [j++] = (char) value;
4548                if (value == (int) '\r') {
4549                    sawCR = true;
4550                }
4551            } else if (value < 0x0010ffff) {
4552                value -= 0x010000;
4553                readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4554                readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4555            } else {
4556                encodingError ("UCS-4 value out of range for Unicode",
4557                               value, i);
4558            }
4559        }
4560        readBufferLength = j;
4561    }
4562
4563
4564    /**
4565     * Report a character encoding error.
4566     */
4567    private void encodingError (String message, int value, int offset)
4568    throws SAXException
4569    {
4570        String uri;
4571
4572        if (value != -1) {
4573            message = message + " (character code: 0x" +
4574                      Integer.toHexString (value) + ')';
4575        }
4576        if (externalEntity != null) {
4577            uri = externalEntity.getURL ().toString ();
4578        } else {
4579            uri = baseURI;
4580        }
4581        handler.error (message, uri, -1, offset + currentByteCount);
4582    }
4583
4584
4585    //////////////////////////////////////////////////////////////////////
4586    // Local Variables.
4587    //////////////////////////////////////////////////////////////////////
4588
4589    /**
4590     * Re-initialize the variables for each parse.
4591     */
4592    private void initializeVariables ()
4593    {
4594        // First line
4595        line = 1;
4596        column = 0;
4597
4598        // Set up the buffers for data and names
4599        dataBufferPos = 0;
4600        dataBuffer = new char [DATA_BUFFER_INITIAL];
4601        nameBufferPos = 0;
4602        nameBuffer = new char [NAME_BUFFER_INITIAL];
4603
4604        // Set up the DTD hash tables
4605        elementInfo = new HashMap ();
4606        entityInfo = new HashMap ();
4607        notationInfo = new HashMap ();
4608
4609        // Set up the variables for the current
4610        // element context.
4611        currentElement = null;
4612        currentElementContent = CONTENT_UNDECLARED;
4613
4614        // Set up the input variables
4615        sourceType = INPUT_NONE;
4616        inputStack = new ArrayList ();
4617        entityStack = new ArrayList ();
4618        externalEntity = null;
4619        tagAttributePos = 0;
4620        tagAttributes = new String [100];
4621        rawReadBuffer = new byte [READ_BUFFER_MAX];
4622        readBufferOverflow = -1;
4623
4624        inLiteral = false;
4625        expandPE = false;
4626        peIsError = false;
4627
4628        inCDATA = false;
4629
4630        symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4631    }
4632
4633
4634    /**
4635     * Clean up after the parse to allow some garbage collection.
4636     */
4637    private void cleanupVariables ()
4638    {
4639        dataBuffer = null;
4640        nameBuffer = null;
4641
4642        elementInfo = null;
4643        entityInfo = null;
4644        notationInfo = null;
4645
4646        currentElement = null;
4647
4648        inputStack = null;
4649        entityStack = null;
4650        externalEntity = null;
4651
4652        tagAttributes = null;
4653        rawReadBuffer = null;
4654
4655        symbolTable = null;
4656    }
4657
4658    //
4659    // The current XML handler interface.
4660    //
4661    private SAXDriver   handler;
4662
4663    //
4664    // I/O information.
4665    //
4666    private Reader      reader;         // current reader
4667    private InputStream is;             // current input stream
4668    private int         line;           // current line number
4669    private int         column;         // current column number
4670    private int         sourceType;     // type of input source
4671    private ArrayList   inputStack;     // stack of input soruces
4672    private URLConnection externalEntity; // current external entity
4673    private int         encoding;       // current character encoding
4674    private int         currentByteCount; // bytes read from current source
4675
4676    //
4677    // Buffers for decoded but unparsed character input.
4678    //
4679    private char        readBuffer [];
4680    private int         readBufferPos;
4681    private int         readBufferLength;
4682    private int         readBufferOverflow;  // overflow from last data chunk.
4683
4684
4685    //
4686    // Buffer for undecoded raw byte input.
4687    //
4688    private final static int READ_BUFFER_MAX = 16384;
4689    private byte        rawReadBuffer [];
4690
4691
4692    //
4693    // Buffer for parsed character data.
4694    //
4695    private static int DATA_BUFFER_INITIAL = 4096;
4696    private char        dataBuffer [];
4697    private int         dataBufferPos;
4698
4699    //
4700    // Buffer for parsed names.
4701    //
4702    private static int NAME_BUFFER_INITIAL = 1024;
4703    private char        nameBuffer [];
4704    private int         nameBufferPos;
4705
4706
4707    //
4708    // HashMaps for DTD information on elements, entities, and notations.
4709    //
4710    private HashMap     elementInfo;
4711    private HashMap     entityInfo;
4712    private HashMap     notationInfo;
4713
4714
4715    //
4716    // Element type currently in force.
4717    //
4718    private String      currentElement;
4719    private int         currentElementContent;
4720
4721    //
4722    // Base external identifiers for resolution.
4723    //
4724    private String      basePublicId;
4725    private String      baseURI;
4726    private int         baseEncoding;
4727    private Reader      baseReader;
4728    private InputStream baseInputStream;
4729    private char        baseInputBuffer [];
4730    private int         baseInputBufferStart;
4731    private int         baseInputBufferLength;
4732
4733    //
4734    // Stack of entity names, to detect recursion.
4735    //
4736    private ArrayList   entityStack;
4737
4738    //
4739    // PE expansion is enabled in most chunks of the DTD, not all.
4740    // When it's enabled, literals are treated differently.
4741    //
4742    private boolean     inLiteral;
4743    private boolean     expandPE;
4744    private boolean     peIsError;
4745
4746    //
4747    // Symbol table, for caching interned names.
4748    //
4749    private final static int SYMBOL_TABLE_LENGTH = 1087;
4750    private Object      symbolTable [][];
4751
4752    //
4753    // Hash table of attributes found in current start tag.
4754    //
4755    private String      tagAttributes [];
4756    private int         tagAttributePos;
4757
4758    //
4759    // Utility flag: have we noticed a CR while reading the last
4760    // data chunk?  If so, we will have to go back and normalise
4761    // CR or CR/LF line ends.
4762    //
4763    private boolean     sawCR;
4764
4765    //
4766    // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
4767    // 
4768    private boolean     inCDATA;
4769}