001/* ---------------------------------------------------------------------------- 002 The Kiwi Toolkit - A Java Class Library 003 Copyright (C) 1998-2004 Mark A. Lindner 004 005 This library is free software; you can redistribute it and/or 006 modify it under the terms of the GNU General Public License as 007 published by the Free Software Foundation; either version 2 of the 008 License, or (at your option) any later version. 009 010 This library is distributed in the hope that it will be useful, 011 but WITHOUT ANY WARRANTY; without even the implied warranty of 012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013 General Public License for more details. 014 015 You should have received a copy of the GNU General Public License 016 along with this library; if not, write to the Free Software 017 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 018 02111-1307, USA. 019 020 The author may be contacted at: mark_a_lindner@yahoo.com 021 ---------------------------------------------------------------------------- 022 $Log: XMLParser.java,v $ 023 Revision 1.2 2004/05/05 22:47:37 markl 024 comment block updates 025 026 Revision 1.1 2004/05/05 18:47:08 markl 027 classes renamed 028 029 Revision 1.4 2003/01/19 09:34:27 markl 030 Javadoc & comment header updates. 031 032 Revision 1.3 2001/03/12 02:18:29 markl 033 Source code cleanup. 034 035 Revision 1.2 1999/01/10 03:37:18 markl 036 added GPL header & RCS tag 037 ---------------------------------------------------------------------------- 038*/ 039 040package kiwi.text; 041 042import java.io.*; 043import java.util.*; 044 045/** A very simple XML parser. <code>XMLParser</code> tokenizes XML 046 * into a series of <i>tags</i> and <i>strings</i>, which are passed 047 * in series to a consumer. The parser does not support CDATA 048 * sections, entities, or comments. 049 * 050 * <p> Here is a trivial example that recognizes a few HTML tags read 051 * from standard input: 052 * 053 * <pre> 054 * class HTMLParser implements XMLConsumer 055 * { 056 * XMLParser parser; 057 * 058 * HTMLParser() 059 * { 060 * parser = new XMLParser(new InputStreamReader(System.in), this); 061 * } 062 * 063 * void parse() 064 * { 065 * try 066 * { 067 * parser.parse(); 068 * } 069 * catch(IOException ex) 070 * { 071 * System.out.println("End of input, or other error."); 072 * } 073 * } 074 * 075 * public void consumeText(String text) 076 * { 077 * System.out.println("Text: " + text); 078 * } 079 * 080 * public void consumeElement(XMLElement e) 081 * { 082 * String tag = e.getTag(); 083 * if(tag.equalsIgnoreCase("b")) 084 * System.out.println("Bold " + (tag.isEnd() ? "end" : "begin")); 085 * else if(tag.equalsIgnoreCase("center")) 086 * System.out.println("Centering " + (tag.isEnd() ? "end" : "begin")); 087 * } 088 * } 089 * </pre> 090 * 091 * @author Mark Lindner 092 * @author Alex Lian (bug fixes) 093 * @author Eric Lunt (bug fixes) 094 */ 095 096public class XMLParser 097 { 098 private XMLConsumer consumer; 099 private StreamTokenizer st; 100 private Reader reader; 101 private boolean pre = false; 102 private StringBuffer text; 103 private static final int STATE_NONE = 0, STATE_TAG = 1, STATE_NAME = 2, 104 STATE_EQUALS = 3, STATE_VALUE = 4, STATE_COMMENT = 5; 105 106 /** Construct a new <code>XMLParser</code>. 107 * 108 * @param reader The reader to parse input from. 109 * @param consumer The <code>XMLConsumer</code> that will consume text 110 * and XML elements decoded from the reader. 111 */ 112 113 public XMLParser(Reader reader, XMLConsumer consumer) 114 { 115 this.consumer = consumer; 116 this.reader = reader; 117 text = new StringBuffer(200); 118 119 st = new StreamTokenizer(reader); 120 st.slashStarComments(false); 121 st.slashSlashComments(false); 122 st.eolIsSignificant(false); 123 124 st.ordinaryChar('<'); 125 st.ordinaryChar('>'); 126 st.ordinaryChar('!'); 127 st.ordinaryChars('#', ';'); 128 st.ordinaryChars('?', '@'); 129 st.ordinaryChars('[', '~'); 130 st.ordinaryChar('&'); 131 st.ordinaryChar('\"'); 132 st.wordChars('#', ';'); 133 st.wordChars('?', '@'); 134 st.wordChars('[', '~'); 135 136 st.whitespaceChars(0x0, 0x20); 137 st.wordChars(0x7F, 0xFF); 138 } 139 140 /** Parse the input. Data is read from the input stream and tokenized streams 141 * and tags are passed to the consumer until there is no more data available 142 * on the stream. 143 * 144 * @exception java.io.IOException If an error occurs on the input stream. 145 */ 146 147 public void parse() throws IOException 148 { 149 boolean quote = false; 150 String pname = null, pvalue = ""; 151 XMLElement e = null; 152 int state = STATE_NONE; 153 154 _whitespaceOn(); 155 _whitespaceOff(); 156 157 out: 158 for(;;) 159 { 160 switch(st.nextToken()) 161 { 162 case StreamTokenizer.TT_EOF: 163 break out; 164 165 case StreamTokenizer.TT_WORD: 166 if(quote) 167 { 168 pvalue = st.sval; 169 } 170 else 171 { 172 switch(state) 173 { 174 case STATE_NONE: 175 //if(text.length() > 0) text.append(' '); 176 text.append(st.sval); 177 break; 178 179 case STATE_TAG: 180 if(st.sval.charAt(0) == '/') 181 { 182 e.setEnd(true); 183 e.setTag(st.sval.substring(1)); 184 } 185 else e.setTag(st.sval); 186 state = STATE_NAME; 187 break; 188 189 case STATE_NAME: 190 pname = st.sval; 191 state = STATE_EQUALS; 192 break; 193 194 case STATE_VALUE: 195 e.addParam(pname, st.sval); 196 state = STATE_NAME; 197 break; 198 199 case STATE_EQUALS: 200 e.addParam(pname, null); 201 pname = st.sval; 202 state = STATE_EQUALS; 203 break; 204 } 205 } 206 break; 207 208 case '>': 209 if(state == STATE_TAG || state == STATE_NAME) 210 sendElement(e); 211 else if(state == STATE_EQUALS || state == STATE_VALUE) 212 { 213 e.addParam(pname, null); 214 sendElement(e); 215 } 216 else if(state == STATE_NONE) 217 text.append('>'); 218 219 st.wordChars('"', '"'); 220 st.wordChars('!', '!'); 221 state = STATE_NONE; 222 _whitespaceOff(); 223 break; 224 225 case '<': 226 _whitespaceOn(); 227 flushText(); 228 e = new XMLElement(); 229 st.ordinaryChar('\"'); 230 st.ordinaryChar('!'); 231 state = STATE_TAG; 232 break; 233 234 case '\"': 235 if(state == STATE_VALUE) 236 { 237 if(!quote) 238 { 239 quote = true; 240 pvalue = ""; 241 st.wordChars(' ', ' '); 242 st.wordChars('\t', '\t'); 243 st.wordChars('<', '>'); 244 } 245 else 246 { 247 quote = false; 248 e.addParam(pname, pvalue); 249 st.whitespaceChars(' ', ' '); 250 st.whitespaceChars('\t', '\t'); 251 st.ordinaryChars('<', '>'); 252 state = STATE_NAME; 253 } 254 } 255 break; 256 257 case '=': 258 if(state == STATE_EQUALS) state = STATE_VALUE; 259 break; 260 261 case '!': 262 if(state == STATE_TAG) 263 { 264 state = STATE_COMMENT; 265 } 266 else if(state != STATE_COMMENT) 267 text.append('!'); 268 break; 269 } 270 } 271 flushText(); 272 } 273 274 /* flush the text buffer to the consumer */ 275 276 private void flushText() 277 { 278 if(text.length() > 0) 279 { 280 consumer.consumeText(text.toString()); 281 text = new StringBuffer(200); 282 } 283 } 284 285 /* send a fully constructed element to the consumer */ 286 287 private void sendElement(XMLElement e) 288 { 289 boolean r = consumer.consumeElement(e); 290 291 if(r) 292 { 293 if(pre) 294 { 295 // turn formatting on 296 _whitespaceOn(); 297 pre = false; 298 } 299 else 300 { 301 // turn formatting off 302 _whitespaceOff(); 303 pre = true; 304 } 305 } 306 } 307 308 /* turn off whitespace */ 309 310 private void _whitespaceOff() 311 { 312 st.wordChars('\n', '\n'); 313 st.wordChars('\f', '\f'); 314 st.wordChars('\t', '\t'); 315 st.wordChars(' ', ' '); 316 } 317 318 /* turn on whitespace */ 319 320 private void _whitespaceOn() 321 { 322 st.ordinaryChars('\n', '\n'); 323 st.ordinaryChars('\f', '\f'); 324 st.ordinaryChars('\t', '\t'); 325 st.ordinaryChars(' ', ' '); 326 } 327 328 } 329 330/* end of source file */