001package org.jsoup.parser;
002
003import org.jsoup.nodes.Document;
004import org.jsoup.nodes.Element;
005import org.jsoup.nodes.Node;
006
007import java.io.Reader;
008import java.io.StringReader;
009import java.util.List;
010
011/**
012 * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the  more convenient parse methods
013 * in {@link org.jsoup.Jsoup}.
014 */
015public class Parser {
016    private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.
017    
018    private TreeBuilder treeBuilder;
019    private int maxErrors = DEFAULT_MAX_ERRORS;
020    private ParseErrorList errors;
021    private ParseSettings settings;
022
023    /**
024     * Create a new Parser, using the specified TreeBuilder
025     * @param treeBuilder TreeBuilder to use to parse input into Documents.
026     */
027    public Parser(TreeBuilder treeBuilder) {
028        this.treeBuilder = treeBuilder;
029        settings = treeBuilder.defaultSettings();
030    }
031    
032    public Document parseInput(String html, String baseUri) {
033        errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
034        return treeBuilder.parse(new StringReader(html), baseUri, errors, settings);
035    }
036
037    public Document parseInput(Reader inputHtml, String baseUri) {
038        errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
039        return treeBuilder.parse(inputHtml, baseUri, errors, settings);
040    }
041
042    // gets & sets
043    /**
044     * Get the TreeBuilder currently in use.
045     * @return current TreeBuilder.
046     */
047    public TreeBuilder getTreeBuilder() {
048        return treeBuilder;
049    }
050
051    /**
052     * Update the TreeBuilder used when parsing content.
053     * @param treeBuilder current TreeBuilder
054     * @return this, for chaining
055     */
056    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
057        this.treeBuilder = treeBuilder;
058        return this;
059    }
060
061    /**
062     * Check if parse error tracking is enabled.
063     * @return current track error state.
064     */
065    public boolean isTrackErrors() {
066        return maxErrors > 0;
067    }
068
069    /**
070     * Enable or disable parse error tracking for the next parse.
071     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
072     * @return this, for chaining
073     */
074    public Parser setTrackErrors(int maxErrors) {
075        this.maxErrors = maxErrors;
076        return this;
077    }
078
079    /**
080     * Retrieve the parse errors, if any, from the last parse.
081     * @return list of parse errors, up to the size of the maximum errors tracked.
082     */
083    public List<ParseError> getErrors() {
084        return errors;
085    }
086
087    public Parser settings(ParseSettings settings) {
088        this.settings = settings;
089        return this;
090    }
091
092    public ParseSettings settings() {
093        return settings;
094    }
095
096    // static parse functions below
097    /**
098     * Parse HTML into a Document.
099     *
100     * @param html HTML to parse
101     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
102     *
103     * @return parsed Document
104     */
105    public static Document parse(String html, String baseUri) {
106        TreeBuilder treeBuilder = new HtmlTreeBuilder();
107        return treeBuilder.parse(new StringReader(html), baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
108    }
109
110    /**
111     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
112     *
113     * @param fragmentHtml the fragment of HTML to parse
114     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
115     * provides stack context (for implicit element creation).
116     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
117     *
118     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
119     */
120    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
121        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
122        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
123    }
124
125    /**
126     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
127     *
128     * @param fragmentHtml the fragment of HTML to parse
129     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
130     * provides stack context (for implicit element creation).
131     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
132     * @param errorList list to add errors to
133     *
134     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
135     */
136    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
137        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
138        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, errorList, treeBuilder.defaultSettings());
139    }
140
141    /**
142     * Parse a fragment of XML into a list of nodes.
143     *
144     * @param fragmentXml the fragment of XML to parse
145     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
146     * @return list of nodes parsed from the input XML.
147     */
148    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
149        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
150        return treeBuilder.parseFragment(fragmentXml, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings());
151    }
152
153    /**
154     * Parse a fragment of HTML into the {@code body} of a Document.
155     *
156     * @param bodyHtml fragment of HTML
157     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
158     *
159     * @return Document, with empty head, and HTML parsed into body
160     */
161    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
162        Document doc = Document.createShell(baseUri);
163        Element body = doc.body();
164        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
165        Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
166        for (int i = nodes.length - 1; i > 0; i--) {
167            nodes[i].remove();
168        }
169        for (Node node : nodes) {
170            body.appendChild(node);
171        }
172        return doc;
173    }
174
175    /**
176     * Utility method to unescape HTML entities from a string
177     * @param string HTML escaped string
178     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
179     * @return an unescaped string
180     */
181    public static String unescapeEntities(String string, boolean inAttribute) {
182        Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking());
183        return tokeniser.unescapeEntities(inAttribute);
184    }
185
186    /**
187     * @param bodyHtml HTML to parse
188     * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
189     *
190     * @return parsed Document
191     * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
192     */
193    public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
194        return parse(bodyHtml, baseUri);
195    }
196    
197    // builders
198
199    /**
200     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
201     * based on a knowledge of the semantics of the incoming tags.
202     * @return a new HTML parser.
203     */
204    public static Parser htmlParser() {
205        return new Parser(new HtmlTreeBuilder());
206    }
207
208    /**
209     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
210     * rather creates a simple tree directly from the input.
211     * @return a new simple XML parser.
212     */
213    public static Parser xmlParser() {
214        return new Parser(new XmlTreeBuilder());
215    }
216}