package org.jsoup.nodes;

import org.jsoup.helper.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 A HTML Document.

 @author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
    private OutputSettings outputSettings = new OutputSettings();
    private QuirksMode quirksMode = QuirksMode.noQuirks;
    private String location;
    private boolean updateMetaCharset = false;

    /**
     Create a new, empty Document.
     @param baseUri base URI of document; also recorded as the document's {@link #location()}
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String baseUri) {
        super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri);
        this.location = baseUri;
    }

    /**
     Create a valid, empty shell of a document, suitable for adding more elements to.
     @param baseUri baseUri of document; must not be null
     @return document with html, head, and body elements.
     */
    public static Document createShell(String baseUri) {
        Validate.notNull(baseUri);

        Document doc = new Document(baseUri);
        Element html = doc.appendElement("html");
        html.appendElement("head");
        html.appendElement("body");

        return doc;
    }

    /**
     * Get the URL this Document was parsed from. If the starting URL is a redirect,
     * this will return the final URL from which the document was served from.
     * @return location
     */
    public String location() {
        return location;
    }

    /**
     Accessor to the document's {@code head} element.
     @return {@code head}, or {@code null} if the document does not contain one
     */
    public Element head() {
        return findFirstElementByTagName("head", this);
    }

    /**
     Accessor to the document's {@code body} element.
     @return {@code body}, or {@code null} if the document does not contain one
     */
    public Element body() {
        return findFirstElementByTagName("body", this);
    }

    /**
     Get the string contents of the document's {@code title} element.
     @return Trimmed title, or empty string if none set.
     */
    public String title() {
        // title is a preserve whitespace tag (for document output), but normalised here
        Element titleEl = getElementsByTag("title").first();
        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
    }

    /**
     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
     not present. If the document has no {@code head} yet, one is created (inside the {@code html} element, which is
     itself created when missing) rather than failing.
     @param title string to set as title; must not be null
     */
    public void title(String title) {
        Validate.notNull(title);
        Element titleEl = getElementsByTag("title").first();
        if (titleEl == null) { // add to head
            Element headEl = head();
            if (headEl == null) { // bare document: build the minimum structure needed to hold a title
                ensureHtmlElement().prependElement("head");
                headEl = head();
            }
            titleEl = headEl.appendElement("title");
        }
        titleEl.text(title);
    }

    /**
     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
     @param tagName element tag name (e.g. {@code a})
     @return new element
     */
    public Element createElement(String tagName) {
        return new Element(Tag.valueOf(tagName, ParseSettings.preserveCase), this.baseUri());
    }

    /**
     Normalise the document. This happens after the parse phase so generally does not need to be called.
     Moves any text content that is not in the body element into the body.
     @return this document after normalisation
     */
    public Document normalise() {
        Element htmlEl = ensureHtmlElement();
        if (head() == null)
            htmlEl.prependElement("head");
        if (body() == null)
            htmlEl.appendElement("body");

        // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
        // of. do in inverse order to maintain text order.
        normaliseTextNodes(head());
        normaliseTextNodes(htmlEl);
        normaliseTextNodes(this);

        normaliseStructure("head", htmlEl);
        normaliseStructure("body", htmlEl);

        ensureMetaCharsetElement();

        return this;
    }

    // finds the document's <html> element, appending a new one to the root when there is none
    private Element ensureHtmlElement() {
        Element htmlEl = findFirstElementByTagName("html", this);
        return htmlEl != null ? htmlEl : appendElement("html");
    }

    // moves the non-blank text nodes that are direct children of element to the front of <body>. does not recurse.
    private void normaliseTextNodes(Element element) {
        // collect first, move afterwards: the child list must not be modified while it is being iterated
        List<Node> toMove = new ArrayList<>();
        for (Node node: element.childNodes) {
            if (node instanceof TextNode) {
                TextNode tn = (TextNode) node;
                if (!tn.isBlank())
                    toMove.add(tn);
            }
        }

        // walk backwards and prepend, so the moved nodes end up in their original relative order
        for (int i = toMove.size()-1; i >= 0; i--) {
            Node node = toMove.get(i);
            element.removeChild(node);
            body().prependChild(new TextNode(" ")); // separator between the moved text and what follows it
            body().prependChild(node);
        }
    }

    // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html>
    private void normaliseStructure(String tag, Element htmlEl) {
        Elements elements = this.getElementsByTag(tag);
        Element master = elements.first(); // will always be available as created above if not existent
        if (elements.size() > 1) { // dupes, move contents to master
            List<Node> toMove = new ArrayList<>();
            for (int i = 1; i < elements.size(); i++) {
                Node dupe = elements.get(i);
                toMove.addAll(dupe.ensureChildNodes());
                dupe.remove();
            }

            for (Node dupe : toMove)
                master.appendChild(dupe);
        }
        // ensure parented by <html>
        if (!master.parent().equals(htmlEl)) {
            htmlEl.appendChild(master); // includes remove()
        }
    }

    // fast method to get first by tag name, used for html, head, body finders. depth-first, in document order;
    // returns null when neither node nor any of its descendants has the given node name.
    private Element findFirstElementByTagName(String tag, Node node) {
        if (node.nodeName().equals(tag))
            return (Element) node;
        else {
            int size = node.childNodeSize();
            for (int i = 0; i < size; i++) {
                Element found = findFirstElementByTagName(tag, node.childNode(i));
                if (found != null)
                    return found;
            }
        }
        return null;
    }

    /**
     Get the document's HTML. As the document root has no tag of its own, this is the HTML of its children only.
     @return the document's HTML
     */
    @Override
    public String outerHtml() {
        return super.html(); // no outer wrapper tag
    }

    /**
     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
     If the document has no {@code body} yet, one is created (inside the {@code html} element, which is itself
     created when missing) rather than failing.
     @param text unencoded text
     @return this document
     */
    @Override
    public Element text(String text) {
        Element bodyEl = body();
        if (bodyEl == null) // bare document: build the minimum structure needed to hold the text
            bodyEl = ensureHtmlElement().appendElement("body");
        bodyEl.text(text); // overridden to not nuke doc structure
        return this;
    }

    @Override
    public String nodeName() {
        return "#document";
    }

    /**
     * Sets the charset used in this document. This method is equivalent
     * to {@link OutputSettings#charset(java.nio.charset.Charset)
     * OutputSettings.charset(Charset)} but in addition it updates the
     * charset / encoding element within the document.
     *
     * <p>This enables
     * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
     *
     * <p>If there's no element with charset / encoding information yet it will
     * be created. Obsolete charset / encoding definitions are removed!</p>
     *
     * <p><b>Elements used:</b></p>
     *
     * <ul>
     * <li><b>Html:</b> {@code <meta charset="CHARSET">}</li>
     * <li><b>Xml:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
     * </ul>
     *
     * @param charset Charset
     *
     * @see #updateMetaCharsetElement(boolean)
     * @see OutputSettings#charset(java.nio.charset.Charset)
     */
    public void charset(Charset charset) {
        updateMetaCharsetElement(true);
        outputSettings.charset(charset);
        ensureMetaCharsetElement();
    }

    /**
     * Returns the charset used in this document. This method is equivalent
     * to {@link OutputSettings#charset()}.
     *
     * @return Current Charset
     *
     * @see OutputSettings#charset()
     */
    public Charset charset() {
        return outputSettings.charset();
    }

    /**
     * Sets whether the element with charset information in this document is
     * updated on changes through {@link #charset(java.nio.charset.Charset)
     * Document.charset(Charset)} or not.
     *
     * <p>If set to {@code false} <i>(default)</i> there are no elements
     * modified.</p>
     *
     * @param update If {@code true} the element updated on charset
     * changes, {@code false} if not
     *
     * @see #charset(java.nio.charset.Charset)
     */
    public void updateMetaCharsetElement(boolean update) {
        this.updateMetaCharset = update;
    }

    /**
     * Returns whether the element with charset information in this document is
     * updated on changes through {@link #charset(java.nio.charset.Charset)
     * Document.charset(Charset)} or not.
     *
     * @return Returns {@code true} if the element is updated on charset
     * changes, {@code false} if not
     */
    public boolean updateMetaCharsetElement() {
        return updateMetaCharset;
    }

    /**
     * Creates a copy of this document that has its own copy of the output settings.
     *
     * @return the cloned document
     */
    @Override
    public Document clone() {
        Document clone = (Document) super.clone();
        clone.outputSettings = this.outputSettings.clone();
        return clone;
    }

    /**
     * Ensures a meta charset (html) or xml declaration (xml) with the current
     * encoding used. This only applies with
     * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
     * {@code true}, otherwise this method does nothing.
     *
     * <ul>
     * <li>An existing element gets updated with the current charset</li>
     * <li>If there's no element yet it will be inserted</li>
     * <li>Obsolete elements are removed</li>
     * </ul>
     *
     * <p><b>Elements used:</b></p>
     *
     * <ul>
     * <li><b>Html:</b> {@code <meta charset="CHARSET">}</li>
     * <li><b>Xml:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
     * </ul>
     */
    private void ensureMetaCharsetElement() {
        if (!updateMetaCharset)
            return;

        OutputSettings.Syntax syntax = outputSettings().syntax();

        if (syntax == OutputSettings.Syntax.html) {
            Element metaCharset = select("meta[charset]").first();

            if (metaCharset != null) {
                metaCharset.attr("charset", charset().displayName());
            } else {
                Element head = head();

                // without a head there is nowhere to put the meta element, so nothing is added
                if (head != null) {
                    head.appendElement("meta").attr("charset", charset().displayName());
                }
            }

            // Remove obsolete elements
            select("meta[name=charset]").remove();
        } else if (syntax == OutputSettings.Syntax.xml) {
            // a document without any child nodes has no first node to inspect; asking for index 0 would throw
            Node first = childNodeSize() > 0 ? childNodes().get(0) : null;

            if (first instanceof XmlDeclaration && ((XmlDeclaration) first).name().equals("xml")) {
                // the document already starts with an <?xml ...> declaration: update it in place
                XmlDeclaration decl = (XmlDeclaration) first;
                decl.attr("encoding", charset().displayName());

                // NOTE(review): if attr() yields an empty string (not null) for a missing attribute, this
                // condition is always true and version is always set to 1.0 - confirm that is intended.
                final String version = decl.attr("version");

                if (version != null) {
                    decl.attr("version", "1.0");
                }
            } else {
                // no child nodes, or the first node is not an <?xml ...> declaration: insert a new one at the front
                XmlDeclaration decl = new XmlDeclaration("xml", false);
                decl.attr("version", "1.0");
                decl.attr("encoding", charset().displayName());

                prependChild(decl);
            }
        }
    }


    /**
     * A Document's output settings control the form of the text() and html() methods.
     */
    public static class OutputSettings implements Cloneable {
        /**
         * The output serialization syntax.
         */
        public enum Syntax {html, xml}

        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
        private Charset charset;
        CharsetEncoder encoder; // initialized by start of OuterHtmlVisitor and cleared at end
        Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8

        private boolean prettyPrint = true;
        private boolean outline = false;
        private int indentAmount = 1;
        private Syntax syntax = Syntax.html;

        /**
         * Create output settings with the defaults: UTF-8 charset, {@code base} escape mode, {@code html} syntax,
         * pretty printing enabled, outline mode disabled, and an indent amount of 1.
         */
        public OutputSettings() {
            // canonical charset name; every Java platform is required to support it
            charset(Charset.forName("UTF-8"));
        }

        /**
         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
         * which uses the complete set of HTML named entities.
         * <p>
         * The default escape mode is <code>base</code>.
         * @return the document's current escape mode
         */
        public Entities.EscapeMode escapeMode() {
            return escapeMode;
        }

        /**
         * Set the document's escape mode, which determines how characters are escaped when the output character set
         * does not support a given character:- using either a named or a numbered escape.
         * @param escapeMode the new escape mode to use
         * @return the document's output settings, for chaining
         */
        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
            this.escapeMode = escapeMode;
            return this;
        }

        /**
         * Get the document's current output charset, which is used to control which characters are escaped when
         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
         * <p>
         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
         * input charset. Otherwise, it defaults to UTF-8.
         * @return the document's current charset.
         */
        public Charset charset() {
            return charset;
        }

        /**
         * Update the document's output charset.
         * @param charset the new charset to use.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(Charset charset) {
            this.charset = charset;
            return this;
        }

        /**
         * Update the document's output charset.
         * @param charset the new charset (by name) to use.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(String charset) {
            charset(Charset.forName(charset));
            return this;
        }

        // creates a fresh encoder for the current charset, and picks the matching core charset
        CharsetEncoder prepareEncoder() {
            encoder = charset.newEncoder(); // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
            coreCharset = Entities.CoreCharset.byName(encoder.charset().name());
            return encoder;
        }

        /**
         * Get the document's current output syntax.
         * @return current syntax
         */
        public Syntax syntax() {
            return syntax;
        }

        /**
         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
         * {@code xml}, with self-closing tags.
         * @param syntax serialization syntax
         * @return the document's output settings, for chaining
         */
        public OutputSettings syntax(Syntax syntax) {
            this.syntax = syntax;
            return this;
        }

        /**
         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
         * the output, and the output will generally look like the input.
         * @return if pretty printing is enabled.
         */
        public boolean prettyPrint() {
            return prettyPrint;
        }

        /**
         * Enable or disable pretty printing.
         * @param pretty new pretty print setting
         * @return this, for chaining
         */
        public OutputSettings prettyPrint(boolean pretty) {
            prettyPrint = pretty;
            return this;
        }

        /**
         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
         * all tags as block.
         * @return if outline mode is enabled.
         */
        public boolean outline() {
            return outline;
        }

        /**
         * Enable or disable HTML outline mode.
         * @param outlineMode new outline setting
         * @return this, for chaining
         */
        public OutputSettings outline(boolean outlineMode) {
            outline = outlineMode;
            return this;
        }

        /**
         * Get the current tag indent amount, used when pretty printing.
         * @return the current indent amount
         */
        public int indentAmount() {
            return indentAmount;
        }

        /**
         * Set the indent amount for pretty printing
         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
         * @return this, for chaining
         */
        public OutputSettings indentAmount(int indentAmount) {
            Validate.isTrue(indentAmount >= 0);
            this.indentAmount = indentAmount;
            return this;
        }

        /**
         * Create a copy of these output settings.
         * @return the copy
         */
        @Override
        public OutputSettings clone() {
            OutputSettings clone;
            try {
                clone = (OutputSettings) super.clone();
            } catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
            clone.charset(charset.name()); // look the charset up again by name for the copy
            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
            // indentAmount, prettyPrint are primitives so object.clone() will handle.
            // encoder and coreCharset are copied as plain references here; prepareEncoder() replaces both.
            return clone;
        }
    }

    /**
     * Get the document's current output settings.
     * @return the document's current output settings.
     */
    public OutputSettings outputSettings() {
        return outputSettings;
    }

    /**
     * Set the document's output settings.
     * @param outputSettings new output settings; must not be null.
     * @return this document, for chaining.
     */
    public Document outputSettings(OutputSettings outputSettings) {
        Validate.notNull(outputSettings);
        this.outputSettings = outputSettings;
        return this;
    }

    /**
     * The quirks modes a document can be flagged with.
     */
    public enum QuirksMode {
        noQuirks, quirks, limitedQuirks
    }

    /**
     * Get the document's quirks mode. A new document starts as {@code noQuirks}.
     * @return the current quirks mode
     */
    public QuirksMode quirksMode() {
        return quirksMode;
    }

    /**
     * Set the document's quirks mode.
     * @param quirksMode the new quirks mode
     * @return this document, for chaining
     */
    public Document quirksMode(QuirksMode quirksMode) {
        this.quirksMode = quirksMode;
        return this;
    }
}