package org.jsoup.parser;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

/**
 * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods
 * in {@link org.jsoup.Jsoup}.
 */
public class Parser {
    private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.

    private TreeBuilder treeBuilder;
    private int maxErrors = DEFAULT_MAX_ERRORS;
    private ParseErrorList errors;
    private ParseSettings settings;

    /**
     * Create a new Parser, using the specified TreeBuilder. The parser starts out with the TreeBuilder's default
     * settings and a non-tracking error list.
     * @param treeBuilder TreeBuilder to use to parse input into Documents.
     */
    public Parser(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        settings = treeBuilder.defaultSettings();
        // Initialise here so getErrors() never returns null, even before the first parse.
        errors = ParseErrorList.noTracking();
    }

    /**
     * Parse the supplied HTML string into a Document, using this parser's TreeBuilder, settings and error tracking
     * configuration. The error list returned by {@link #getErrors()} is replaced on every call.
     * @param html input to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @return parsed Document
     */
    public Document parseInput(String html, String baseUri) {
        errors = newErrorList();
        return treeBuilder.parse(new StringReader(html), baseUri, errors, settings);
    }

    /**
     * Parse the content of the supplied Reader into a Document, using this parser's TreeBuilder, settings and error
     * tracking configuration. The error list returned by {@link #getErrors()} is replaced on every call.
     * @param inputHtml reader supplying the input to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @return parsed Document
     */
    public Document parseInput(Reader inputHtml, String baseUri) {
        errors = newErrorList();
        return treeBuilder.parse(inputHtml, baseUri, errors, settings);
    }

    // Builds the error list for one parse run: tracking up to maxErrors if enabled, otherwise non-tracking.
    private ParseErrorList newErrorList() {
        return isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
    }

    // gets & sets
    /**
     * Get the TreeBuilder currently in use.
     * @return current TreeBuilder.
     */
    public TreeBuilder getTreeBuilder() {
        return treeBuilder;
    }

    /**
     * Update the TreeBuilder used when parsing content. The current settings are left untouched.
     * @param treeBuilder the new TreeBuilder to use
     * @return this, for chaining
     */
    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        return this;
    }

    /**
     * Check if parse error tracking is enabled.
     * @return true if the maximum number of tracked errors is greater than zero.
     */
    public boolean isTrackErrors() {
        return maxErrors > 0;
    }

    /**
     * Enable or disable parse error tracking for the next parse.
     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
     * @return this, for chaining
     */
    public Parser setTrackErrors(int maxErrors) {
        this.maxErrors = maxErrors;
        return this;
    }

    /**
     * Retrieve the parse errors, if any, from the last parse.
     * @return list of parse errors, up to the size of the maximum errors tracked. Never null; before the first parse
     * this is the non-tracking list created in the constructor.
     */
    public List<ParseError> getErrors() {
        return errors;
    }

    /**
     * Replace the settings handed to the TreeBuilder on subsequent parses.
     * @param settings the settings to use
     * @return this, for chaining
     */
    public Parser settings(ParseSettings settings) {
        this.settings = settings;
        return this;
    }

    /**
     * Get the settings currently handed to the TreeBuilder when parsing.
     * @return current settings
     */
    public ParseSettings settings() {
        return settings;
    }

    // static parse functions below
    /**
     * Parse HTML into a Document.
     *
     * @param html HTML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return parsed Document
     */
    public static Document parse(String html, String baseUri) {
        TreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parse(new StringReader(html), baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @param errorList list to add errors to
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, errorList, treeBuilder.defaultSettings());
    }

    /**
     * Parse a fragment of XML into a list of nodes.
     *
     * @param fragmentXml the fragment of XML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @return list of nodes parsed from the input XML.
     */
    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentXml, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
    }

    /**
     * Parse a fragment of HTML into the {@code body} of a Document.
     *
     * @param bodyHtml fragment of HTML
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return Document, with empty head, and HTML parsed into body
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        Document doc = Document.createShell(baseUri);
        Element body = doc.body();
        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
        Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
        // Detach from the end of the list backwards.
        // NOTE(review): index 0 is deliberately not removed here; presumably appendChild below re-parents it -- confirm.
        for (int i = nodes.length - 1; i > 0; i--) {
            nodes[i].remove();
        }
        for (Node node : nodes) {
            body.appendChild(node);
        }
        return doc;
    }

    /**
     * Utility method to unescape HTML entities from a string
     * @param string HTML escaped string
     * @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
     * @return an unescaped string
     */
    public static String unescapeEntities(String string, boolean inAttribute) {
        Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking());
        return tokeniser.unescapeEntities(inAttribute);
    }

    /**
     * Parse HTML into a Document; simply delegates to {@link #parse(String, String)}.
     *
     * @param bodyHtml HTML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return parsed Document
     * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
     */
    @Deprecated
    public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
        return parse(bodyHtml, baseUri);
    }

    // builders

    /**
     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
     * based on a knowledge of the semantics of the incoming tags.
     * @return a new HTML parser.
     */
    public static Parser htmlParser() {
        return new Parser(new HtmlTreeBuilder());
    }

    /**
     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
     * rather creates a simple tree directly from the input.
     * @return a new simple XML parser.
     */
    public static Parser xmlParser() {
        return new Parser(new XmlTreeBuilder());
    }
}