001/* ----------------------------------------------------------------------------
002   The Kiwi Toolkit - A Java Class Library
003   Copyright (C) 1998-2004 Mark A. Lindner
004
005   This library is free software; you can redistribute it and/or
006   modify it under the terms of the GNU General Public License as
007   published by the Free Software Foundation; either version 2 of the
008   License, or (at your option) any later version.
009
010   This library is distributed in the hope that it will be useful,
011   but WITHOUT ANY WARRANTY; without even the implied warranty of
012   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013   General Public License for more details.
014
015   You should have received a copy of the GNU General Public License
016   along with this library; if not, write to the Free Software
017   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
018   02111-1307, USA.
019 
020   The author may be contacted at: mark_a_lindner@yahoo.com
021   ----------------------------------------------------------------------------
022   $Log: XMLParser.java,v $
023   Revision 1.2  2004/05/05 22:47:37  markl
024   comment block updates
025
026   Revision 1.1  2004/05/05 18:47:08  markl
027   classes renamed
028
029   Revision 1.4  2003/01/19 09:34:27  markl
030   Javadoc & comment header updates.
031
032   Revision 1.3  2001/03/12 02:18:29  markl
033   Source code cleanup.
034
035   Revision 1.2  1999/01/10 03:37:18  markl
036   added GPL header & RCS tag
037   ----------------------------------------------------------------------------
038*/
039
040package kiwi.text;
041
042import java.io.*;
043import java.util.*;
044
045/** A very simple XML parser. <code>XMLParser</code> tokenizes XML
046  * into a series of <i>tags</i> and <i>strings</i>, which are passed
047  * in series to a consumer. The parser does not support CDATA
048  * sections, entities, or comments.
049  *
050  * <p> Here is a trivial example that recognizes a few HTML tags read
051  * from standard input:
052  *
053  * <pre>
054  * class HTMLParser implements XMLConsumer
055  *   {
056  *   XMLParser parser;
057  *
058  *   HTMLParser()
059  *     {
060  *     parser = new XMLParser(new InputStreamReader(System.in), this);
061  *     }
062  *
063  *   void parse()
064  *     {
065  *     try
066  *       {
067  *       parser.parse();
068  *       }
069  *     catch(IOException ex)
070  *       {
071  *       System.out.println("End of input, or other error.");
072  *       }
073  *     }
074  *
075  *   public void consumeText(String text)
076  *     {
077  *     System.out.println("Text: " + text);
078  *     }
079  *
080  *   public void consumeElement(XMLElement e)
081  *     {
082  *     String tag = e.getTag();
083  *     if(tag.equalsIgnoreCase("b"))
084  *       System.out.println("Bold " + (tag.isEnd() ? "end" : "begin"));
085  *     else if(tag.equalsIgnoreCase("center"))
086  *       System.out.println("Centering " + (tag.isEnd() ? "end" : "begin"));
087  *     }
088  *   }
089  * </pre>
090  *
091  * @author Mark Lindner
092  * @author Alex Lian (bug fixes)
093  * @author Eric Lunt (bug fixes)
094  */
095
096public class XMLParser
097  {
098  private XMLConsumer consumer;
099  private StreamTokenizer st;
100  private Reader reader;
101  private boolean pre = false;
102  private StringBuffer text;
103  private static final int STATE_NONE = 0, STATE_TAG = 1, STATE_NAME = 2,
104    STATE_EQUALS = 3, STATE_VALUE = 4, STATE_COMMENT = 5;
105
106  /** Construct a new <code>XMLParser</code>.
107    *
108    * @param reader The reader to parse input from.
109    * @param consumer The <code>XMLConsumer</code> that will consume text
110    * and XML elements decoded from the reader.
111    */
112
113  public XMLParser(Reader reader, XMLConsumer consumer)
114    {
115    this.consumer = consumer;
116    this.reader = reader;
117    text = new StringBuffer(200);
118
119    st = new StreamTokenizer(reader);
120    st.slashStarComments(false);
121    st.slashSlashComments(false);
122    st.eolIsSignificant(false);
123
124    st.ordinaryChar('<');
125    st.ordinaryChar('>');
126    st.ordinaryChar('!');
127    st.ordinaryChars('#', ';');
128    st.ordinaryChars('?', '@');
129    st.ordinaryChars('[', '~');
130    st.ordinaryChar('&');
131    st.ordinaryChar('\"');
132    st.wordChars('#', ';');
133    st.wordChars('?', '@');
134    st.wordChars('[', '~');
135
136    st.whitespaceChars(0x0, 0x20);
137    st.wordChars(0x7F, 0xFF);
138    }
139
140  /** Parse the input. Data is read from the input stream and tokenized streams
141    * and tags are passed to the consumer until there is no more data available
142    * on the stream.
143    *
144    * @exception java.io.IOException If an error occurs on the input stream.
145    */
146
147  public void parse() throws IOException
148    {
149    boolean quote = false;
150    String pname = null, pvalue = "";
151    XMLElement e = null;
152    int state = STATE_NONE;
153
154    _whitespaceOn();
155    _whitespaceOff();
156
157      out:
158    for(;;)
159      {
160      switch(st.nextToken())
161        {
162        case StreamTokenizer.TT_EOF:
163          break out;
164
165        case StreamTokenizer.TT_WORD:
166          if(quote)
167            {
168            pvalue = st.sval;
169            }
170          else
171            {
172            switch(state)
173              {
174              case STATE_NONE:
175                //if(text.length() > 0) text.append(' ');
176                text.append(st.sval);
177                break;
178
179              case STATE_TAG:
180                if(st.sval.charAt(0) == '/')
181                  {
182                  e.setEnd(true);
183                  e.setTag(st.sval.substring(1));
184                  }
185                else e.setTag(st.sval);
186                state = STATE_NAME;
187                break;
188
189              case STATE_NAME:
190                pname = st.sval;
191                state = STATE_EQUALS;
192                break;
193
194              case STATE_VALUE:
195                e.addParam(pname, st.sval);
196                state = STATE_NAME;
197                break;
198
199              case STATE_EQUALS:
200                e.addParam(pname, null);
201                pname = st.sval;
202                state = STATE_EQUALS;
203                break;
204              }
205            }
206          break;
207
208        case '>':
209          if(state == STATE_TAG || state == STATE_NAME)
210            sendElement(e);
211          else if(state == STATE_EQUALS || state == STATE_VALUE)
212            {
213            e.addParam(pname, null);
214            sendElement(e);
215            }
216          else if(state == STATE_NONE)
217            text.append('>');
218
219          st.wordChars('"', '"');
220          st.wordChars('!', '!');
221          state = STATE_NONE;
222          _whitespaceOff();
223          break;
224
225        case '<':
226          _whitespaceOn();
227          flushText();
228          e = new XMLElement();
229          st.ordinaryChar('\"');
230          st.ordinaryChar('!');
231          state = STATE_TAG;
232          break;
233
234        case '\"':
235          if(state == STATE_VALUE)
236            {
237            if(!quote)
238              {
239              quote = true;
240              pvalue = "";
241              st.wordChars(' ', ' ');
242              st.wordChars('\t', '\t');
243              st.wordChars('<', '>');
244              }
245            else
246              {
247              quote = false;
248              e.addParam(pname, pvalue);
249              st.whitespaceChars(' ', ' ');
250              st.whitespaceChars('\t', '\t');
251              st.ordinaryChars('<', '>');
252              state = STATE_NAME;
253              }
254            }
255          break;
256
257        case '=':
258          if(state == STATE_EQUALS) state = STATE_VALUE;
259          break;
260
261        case '!':
262          if(state == STATE_TAG)
263            {
264            state = STATE_COMMENT;
265            }
266          else if(state != STATE_COMMENT)
267            text.append('!');
268          break;
269        }
270      }
271    flushText();
272    }
273
274  /* flush the text buffer to the consumer */
275
276  private void flushText()
277    {
278    if(text.length() > 0)
279      {
280      consumer.consumeText(text.toString());
281      text = new StringBuffer(200);
282      }
283    }
284
285  /* send a fully constructed element to the consumer */
286
287  private void sendElement(XMLElement e)
288    {
289    boolean r = consumer.consumeElement(e);  
290
291    if(r)
292      {
293      if(pre)
294        {
295                                // turn formatting on  
296        _whitespaceOn();
297        pre = false;
298        }
299      else
300        {
301                                // turn formatting off
302        _whitespaceOff();
303        pre = true;
304        }
305      }
306    }
307
308  /* turn off whitespace */
309
310  private void _whitespaceOff()
311    {
312    st.wordChars('\n', '\n');
313    st.wordChars('\f', '\f');
314    st.wordChars('\t', '\t');
315    st.wordChars(' ', ' ');
316    }
317
318  /* turn on whitespace */
319
320  private void _whitespaceOn()
321    {
322    st.ordinaryChars('\n', '\n');
323    st.ordinaryChars('\f', '\f');
324    st.ordinaryChars('\t', '\t');
325    st.ordinaryChars(' ', ' ');
326    }
327  
328  }
329
330/* end of source file */