package org.jsoup.parser;

import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.*;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

/**
 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
 * document.
 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
 *
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {
    /**
     * Settings this builder uses by default.
     *
     * @return {@link ParseSettings#preserveCase}
     */
    ParseSettings defaultSettings() {
        return ParseSettings.preserveCase;
    }

    /**
     * Parses the input with error tracking disabled and {@link ParseSettings#preserveCase} settings.
     *
     * @param input   reader supplying the XML to parse
     * @param baseUri base URI handed on to the parse
     * @return the parsed document
     */
    Document parse(Reader input, String baseUri) {
        return parse(input, baseUri, ParseErrorList.noTracking(), ParseSettings.preserveCase);
    }

    /**
     * Parses the input string with error tracking disabled and {@link ParseSettings#preserveCase} settings.
     *
     * @param input   the XML to parse
     * @param baseUri base URI handed on to the parse
     * @return the parsed document
     */
    Document parse(String input, String baseUri) {
        return parse(new StringReader(input), baseUri, ParseErrorList.noTracking(), ParseSettings.preserveCase);
    }

    /**
     * Runs the superclass initialisation, then pushes the document onto the stack and switches the document's
     * output syntax to XML.
     */
    @Override
    protected void initialiseParse(Reader input, String baseUri, ParseErrorList errors, ParseSettings settings) {
        super.initialiseParse(input, baseUri, errors, settings);
        stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
        doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    }

    /**
     * Dispatches a single token to the matching insert / close handler.
     *
     * @param token the token to handle
     * @return always {@code true}
     */
    @Override
    protected boolean process(Token token) {
        // start tag, end tag, doctype, comment, character, eof
        switch (token.type) {
            case StartTag:
                insert(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Comment:
                insert(token.asComment());
                break;
            case Character:
                insert(token.asCharacter());
                break;
            case Doctype:
                insert(token.asDoctype());
                break;
            case EOF: // could put some normalisation here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    /**
     * Appends the node as the last child of the current element.
     *
     * @param node node to append
     */
    private void insertNode(Node node) {
        currentElement().appendChild(node);
    }

    /**
     * Creates an element for the start tag and appends it to the current element. An element that is not
     * self closing is also pushed onto the stack, so that following nodes become its children.
     *
     * @param startTag the start tag token
     * @return the newly created element
     */
    Element insert(Token.StartTag startTag) {
        Tag tag = Tag.valueOf(startTag.name(), settings);
        // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
        Element el = new Element(tag, baseUri, settings.normalizeAttributes(startTag.attributes));
        insertNode(el);
        if (startTag.isSelfClosing()) {
            if (!tag.isKnownTag()) { // unknown tag, remember this is self closing for output. see above.
                tag.setSelfClosing();
            }
        } else {
            stack.add(el);
        }
        return el;
    }

    /**
     * Appends a comment node for the token. A bogus comment whose data starts with {@code !} or {@code ?} is
     * re-parsed as a tag and, when that yields an element, inserted as an {@link XmlDeclaration} instead. If the
     * re-parse yields no element (a malformed declaration), the data is kept as a plain comment.
     *
     * @param commentToken the comment token
     */
    void insert(Token.Comment commentToken) {
        Comment comment = new Comment(commentToken.getData());
        Node insert = comment;
        if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml)
            // so we do a bit of a hack and parse the data as an element to pull the attributes out
            String data = comment.getData();
            if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
                // the first and last characters of the data are dropped before wrapping it in < >
                Document declDoc = Jsoup.parse("<" + data.substring(1, data.length() - 1) + ">", baseUri, Parser.xmlParser());
                if (!declDoc.children().isEmpty()) {
                    Element el = declDoc.child(0);
                    insert = new XmlDeclaration(settings.normalizeTag(el.tagName()), data.startsWith("!"));
                    insert.attributes().addAll(el.attributes());
                } // else: couldn't be parsed as a declaration, so leave it as a comment
            }
        }
        insertNode(insert);
    }

    /**
     * Appends a text node holding the token's data to the current element.
     *
     * @param characterToken the character token
     */
    void insert(Token.Character characterToken) {
        Node node = new TextNode(characterToken.getData());
        insertNode(node);
    }

    /**
     * Appends a document type node built from the token's name, public identifier, system identifier and
     * pub/sys key.
     *
     * @param d the doctype token
     */
    void insert(Token.Doctype d) {
        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier());
        doctypeNode.setPubSysKey(d.getPubSysKey());
        insertNode(doctypeNode);
    }

    /**
     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
     * found, skips.
     *
     * @param endTag tag to close
     */
    private void popStackToClose(Token.EndTag endTag) {
        String elName = endTag.name();
        Element firstFound = null;

        // search from the top of the stack down for the nearest element with a matching name
        for (int pos = stack.size() - 1; pos >= 0; pos--) {
            Element next = stack.get(pos);
            if (next.nodeName().equals(elName)) {
                firstFound = next;
                break;
            }
        }
        if (firstFound == null) {
            return; // not found, skip
        }

        // pop everything above the match, and then the match itself
        for (int pos = stack.size() - 1; pos >= 0; pos--) {
            Element next = stack.get(pos);
            stack.remove(pos);
            if (next == firstFound) {
                break;
            }
        }
    }

    /**
     * Initialises a parse over the fragment, runs the parser, and returns the document's child nodes.
     *
     * @param inputFragment the XML fragment to parse
     * @param baseUri       base URI handed on to the parse
     * @param errors        error list handed on to the parse
     * @param settings      settings handed on to the parse
     * @return the child nodes of the parsed document
     */
    List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors, ParseSettings settings) {
        initialiseParse(new StringReader(inputFragment), baseUri, errors, settings);
        runParser();
        return doc.childNodes();
    }
}