Source code

001package org.jsoup;
002
003import org.jsoup.nodes.Document;
004import org.jsoup.parser.Parser;
005import org.jsoup.safety.Cleaner;
006import org.jsoup.safety.Whitelist;
007import org.jsoup.helper.DataUtil;
008import org.jsoup.helper.HttpConnection;
009
010import java.io.File;
011import java.io.IOException;
012import java.io.InputStream;
013import java.net.URL;
014
015/**
016 The core public access point to the jsoup functionality.
017
018 @author Jonathan Hedley */
019public class Jsoup {
020    private Jsoup() {}
021
022    /**
023     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
024
025     @param html    HTML to parse
026     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
027     before the HTML declares a {@code <base href>} tag.
028     @return sane HTML
029     */
030    public static Document parse(String html, String baseUri) {
031        return Parser.parse(html, baseUri);
032    }
033
034    /**
035     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
036     (non-HTML) parser.
037
038     @param html    HTML to parse
039     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
040     before the HTML declares a {@code <base href>} tag.
041     @param parser alternate {@link Parser#xmlParser() parser} to use.
042     @return sane HTML
043     */
044    public static Document parse(String html, String baseUri, Parser parser) {
045        return parser.parseInput(html, baseUri);
046    }
047
048    /**
049     Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
050     {@code <base href>} tag.
051
052     @param html HTML to parse
053     @return sane HTML
054
055     @see #parse(String, String)
056     */
057    public static Document parse(String html) {
058        return Parser.parse(html, "");
059    }
060
061    /**
062     * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
063     * <p>
064     * Use examples:
065     * <ul>
066     *  <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
067     *  <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
068     * </ul>
069     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
070     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
071     */
072    public static Connection connect(String url) {
073        return HttpConnection.connect(url);
074    }
075
076    /**
077     Parse the contents of a file as HTML.
078
079     @param in          file to load HTML from
080     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
081     present, or fall back to {@code UTF-8} (which is often safe to do).
082     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
083     @return sane HTML
084
085     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
086     */
087    public static Document parse(File in, String charsetName, String baseUri) throws IOException {
088        return DataUtil.load(in, charsetName, baseUri);
089    }
090
091    /**
092     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
093
094     @param in          file to load HTML from
095     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
096     present, or fall back to {@code UTF-8} (which is often safe to do).
097     @return sane HTML
098
099     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
100     @see #parse(File, String, String)
101     */
102    public static Document parse(File in, String charsetName) throws IOException {
103        return DataUtil.load(in, charsetName, in.getAbsolutePath());
104    }
105
106     /**
107     Read an input stream, and parse it to a Document.
108
109     @param in          input stream to read. Make sure to close it after parsing.
110     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
111     present, or fall back to {@code UTF-8} (which is often safe to do).
112     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
113     @return sane HTML
114
115     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
116     */
117    public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
118        return DataUtil.load(in, charsetName, baseUri);
119    }
120
121    /**
122     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
123     (non-HTML) parser.
124
125     @param in          input stream to read. Make sure to close it after parsing.
126     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
127     present, or fall back to {@code UTF-8} (which is often safe to do).
128     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
129     @param parser alternate {@link Parser#xmlParser() parser} to use.
130     @return sane HTML
131
132     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
133     */
134    public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
135        return DataUtil.load(in, charsetName, baseUri, parser);
136    }
137
138    /**
139     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
140
141     @param bodyHtml body HTML fragment
142     @param baseUri  URL to resolve relative URLs against.
143     @return sane HTML document
144
145     @see Document#body()
146     */
147    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
148        return Parser.parseBodyFragment(bodyHtml, baseUri);
149    }
150
151    /**
152     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
153
154     @param bodyHtml body HTML fragment
155     @return sane HTML document
156
157     @see Document#body()
158     */
159    public static Document parseBodyFragment(String bodyHtml) {
160        return Parser.parseBodyFragment(bodyHtml, "");
161    }
162
163    /**
164     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
165     <p>
166     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
167
168     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
169     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
170     @return The parsed HTML.
171
172     @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
173     @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
174     @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
175     @throws java.net.SocketTimeoutException if the connection times out
176     @throws IOException if a connection or read error occurs
177
178     @see #connect(String)
179     */
180    public static Document parse(URL url, int timeoutMillis) throws IOException {
181        Connection con = HttpConnection.connect(url);
182        con.timeout(timeoutMillis);
183        return con.get();
184    }
185
186    /**
187     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
188     tags and attributes.
189
190     @param bodyHtml  input untrusted HTML (body fragment)
191     @param baseUri   URL to resolve relative URLs against
192     @param whitelist white-list of permitted HTML elements
193     @return safe HTML (body fragment)
194
195     @see Cleaner#clean(Document)
196     */
197    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
198        Document dirty = parseBodyFragment(bodyHtml, baseUri);
199        Cleaner cleaner = new Cleaner(whitelist);
200        Document clean = cleaner.clean(dirty);
201        return clean.body().html();
202    }
203
204    /**
205     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
206     tags and attributes.
207
208     @param bodyHtml  input untrusted HTML (body fragment)
209     @param whitelist white-list of permitted HTML elements
210     @return safe HTML (body fragment)
211
212     @see Cleaner#clean(Document)
213     */
214    public static String clean(String bodyHtml, Whitelist whitelist) {
215        return clean(bodyHtml, "", whitelist);
216    }
217
218    /**
219     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of
220     * permitted tags and attributes.
221     * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an
222     * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add
223     * structural tags (<code>html, head, body</code> etc) to the whitelist.
224     *
225     * @param bodyHtml input untrusted HTML (body fragment)
226     * @param baseUri URL to resolve relative URLs against
227     * @param whitelist white-list of permitted HTML elements
228     * @param outputSettings document output settings; use to control pretty-printing and entity escape modes
229     * @return safe HTML (body fragment)
230     * @see Cleaner#clean(Document)
231     */
232    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist, Document.OutputSettings outputSettings) {
233        Document dirty = parseBodyFragment(bodyHtml, baseUri);
234        Cleaner cleaner = new Cleaner(whitelist);
235        Document clean = cleaner.clean(dirty);
236        clean.outputSettings(outputSettings);
237        return clean.body().html();
238    }
239
240    /**
241     Test if the input body HTML has only tags and attributes allowed by the Whitelist. Useful for form validation.
242     <p>The input HTML should still be run through the cleaner to set up enforced attributes, and to tidy the output.
243     <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.)
244     @param bodyHtml HTML to test
245     @param whitelist whitelist to test against
246     @return true if no tags or attributes were removed; false otherwise
247     @see #clean(String, org.jsoup.safety.Whitelist) 
248     */
249    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
250        return new Cleaner(whitelist).isValidBodyHtml(bodyHtml);
251    }
252    
253}