001package org.jsoup; 002 003import org.jsoup.nodes.Document; 004import org.jsoup.parser.Parser; 005import org.jsoup.safety.Cleaner; 006import org.jsoup.safety.Whitelist; 007import org.jsoup.helper.DataUtil; 008import org.jsoup.helper.HttpConnection; 009 010import java.io.File; 011import java.io.IOException; 012import java.io.InputStream; 013import java.net.URL; 014 015/** 016 The core public access point to the jsoup functionality. 017 018 @author Jonathan Hedley */ 019public class Jsoup { 020 private Jsoup() {} 021 022 /** 023 Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. 024 025 @param html HTML to parse 026 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 027 before the HTML declares a {@code <base href>} tag. 028 @return sane HTML 029 */ 030 public static Document parse(String html, String baseUri) { 031 return Parser.parse(html, baseUri); 032 } 033 034 /** 035 Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML 036 (non-HTML) parser. 037 038 @param html HTML to parse 039 @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur 040 before the HTML declares a {@code <base href>} tag. 041 @param parser alternate {@link Parser#xmlParser() parser} to use. 042 @return sane HTML 043 */ 044 public static Document parse(String html, String baseUri, Parser parser) { 045 return parser.parseInput(html, baseUri); 046 } 047 048 /** 049 Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a 050 {@code <base href>} tag. 051 052 @param html HTML to parse 053 @return sane HTML 054 055 @see #parse(String, String) 056 */ 057 public static Document parse(String html) { 058 return Parser.parse(html, ""); 059 } 060 061 /** 062 * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. 063 * <p> 064 * Use examples: 065 * <ul> 066 * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> 067 * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> 068 * </ul> 069 * @param url URL to connect to. The protocol must be {@code http} or {@code https}. 070 * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. 071 */ 072 public static Connection connect(String url) { 073 return HttpConnection.connect(url); 074 } 075 076 /** 077 Parse the contents of a file as HTML. 078 079 @param in file to load HTML from 080 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 081 present, or fall back to {@code UTF-8} (which is often safe to do). 082 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 083 @return sane HTML 084 085 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 086 */ 087 public static Document parse(File in, String charsetName, String baseUri) throws IOException { 088 return DataUtil.load(in, charsetName, baseUri); 089 } 090 091 /** 092 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. 093 094 @param in file to load HTML from 095 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 096 present, or fall back to {@code UTF-8} (which is often safe to do). 097 @return sane HTML 098 099 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 100 @see #parse(File, String, String) 101 */ 102 public static Document parse(File in, String charsetName) throws IOException { 103 return DataUtil.load(in, charsetName, in.getAbsolutePath()); 104 } 105 106 /** 107 Read an input stream, and parse it to a Document. 108 109 @param in input stream to read. Make sure to close it after parsing. 110 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 111 present, or fall back to {@code UTF-8} (which is often safe to do). 112 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 113 @return sane HTML 114 115 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 116 */ 117 public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { 118 return DataUtil.load(in, charsetName, baseUri); 119 } 120 121 /** 122 Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML 123 (non-HTML) parser. 124 125 @param in input stream to read. Make sure to close it after parsing. 126 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if 127 present, or fall back to {@code UTF-8} (which is often safe to do). 128 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. 129 @param parser alternate {@link Parser#xmlParser() parser} to use. 130 @return sane HTML 131 132 @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 133 */ 134 public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { 135 return DataUtil.load(in, charsetName, baseUri, parser); 136 } 137 138 /** 139 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 140 141 @param bodyHtml body HTML fragment 142 @param baseUri URL to resolve relative URLs against. 143 @return sane HTML document 144 145 @see Document#body() 146 */ 147 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 148 return Parser.parseBodyFragment(bodyHtml, baseUri); 149 } 150 151 /** 152 Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. 153 154 @param bodyHtml body HTML fragment 155 @return sane HTML document 156 157 @see Document#body() 158 */ 159 public static Document parseBodyFragment(String bodyHtml) { 160 return Parser.parseBodyFragment(bodyHtml, ""); 161 } 162 163 /** 164 Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. 165 <p> 166 The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. 167 168 @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. 169 @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. 170 @return The parsed HTML. 171 172 @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed 173 @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored 174 @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored 175 @throws java.net.SocketTimeoutException if the connection times out 176 @throws IOException if a connection or read error occurs 177 178 @see #connect(String) 179 */ 180 public static Document parse(URL url, int timeoutMillis) throws IOException { 181 Connection con = HttpConnection.connect(url); 182 con.timeout(timeoutMillis); 183 return con.get(); 184 } 185 186 /** 187 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted 188 tags and attributes. 189 190 @param bodyHtml input untrusted HTML (body fragment) 191 @param baseUri URL to resolve relative URLs against 192 @param whitelist white-list of permitted HTML elements 193 @return safe HTML (body fragment) 194 195 @see Cleaner#clean(Document) 196 */ 197 public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { 198 Document dirty = parseBodyFragment(bodyHtml, baseUri); 199 Cleaner cleaner = new Cleaner(whitelist); 200 Document clean = cleaner.clean(dirty); 201 return clean.body().html(); 202 } 203 204 /** 205 Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted 206 tags and attributes. 207 208 @param bodyHtml input untrusted HTML (body fragment) 209 @param whitelist white-list of permitted HTML elements 210 @return safe HTML (body fragment) 211 212 @see Cleaner#clean(Document) 213 */ 214 public static String clean(String bodyHtml, Whitelist whitelist) { 215 return clean(bodyHtml, "", whitelist); 216 } 217 218 /** 219 * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of 220 * permitted tags and attributes. 221 * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an 222 * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add 223 * structural tags (<code>html, head, body</code> etc) to the whitelist. 224 * 225 * @param bodyHtml input untrusted HTML (body fragment) 226 * @param baseUri URL to resolve relative URLs against 227 * @param whitelist white-list of permitted HTML elements 228 * @param outputSettings document output settings; use to control pretty-printing and entity escape modes 229 * @return safe HTML (body fragment) 230 * @see Cleaner#clean(Document) 231 */ 232 public static String clean(String bodyHtml, String baseUri, Whitelist whitelist, Document.OutputSettings outputSettings) { 233 Document dirty = parseBodyFragment(bodyHtml, baseUri); 234 Cleaner cleaner = new Cleaner(whitelist); 235 Document clean = cleaner.clean(dirty); 236 clean.outputSettings(outputSettings); 237 return clean.body().html(); 238 } 239 240 /** 241 Test if the input body HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. 242 <p>The input HTML should still be run through the cleaner to set up enforced attributes, and to tidy the output. 243 <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) 244 @param bodyHtml HTML to test 245 @param whitelist whitelist to test against 246 @return true if no tags or attributes were removed; false otherwise 247 @see #clean(String, org.jsoup.safety.Whitelist) 248 */ 249 public static boolean isValid(String bodyHtml, Whitelist whitelist) { 250 return new Cleaner(whitelist).isValidBodyHtml(bodyHtml); 251 } 252 253}