001package org.jsoup.nodes; 002 003import org.jsoup.SerializationException; 004import org.jsoup.helper.StringUtil; 005import org.jsoup.helper.Validate; 006import org.jsoup.parser.Parser; 007import org.jsoup.select.NodeFilter; 008import org.jsoup.select.NodeTraversor; 009import org.jsoup.select.NodeVisitor; 010 011import java.io.IOException; 012import java.util.*; 013 014/** 015 The base, abstract Node model. Elements, Documents, Comments etc are all Node instances. 016 017 @author Jonathan Hedley, jonathan@hedley.net */ 018public abstract class Node implements Cloneable { 019 static final String EmptyString = ""; 020 Node parentNode; 021 int siblingIndex; 022 023 /** 024 * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. 025 */ 026 protected Node() { 027 } 028 029 /** 030 Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof). 031 @return node name 032 */ 033 public abstract String nodeName(); 034 035 /** 036 * Check if this Node has an actual Attributes object. 037 */ 038 protected abstract boolean hasAttributes(); 039 040 public boolean hasParent() { 041 return parentNode != null; 042 } 043 044 /** 045 * Get an attribute's value by its key. <b>Case insensitive</b> 046 * <p> 047 * To get an absolute URL from an attribute that may be a relative URL, prefix the key with <code><b>abs</b></code>, 048 * which is a shortcut to the {@link #absUrl} method. 049 * </p> 050 * E.g.: 051 * <blockquote><code>String url = a.attr("abs:href");</code></blockquote> 052 * 053 * @param attributeKey The attribute key. 054 * @return The attribute, or empty string if not present (to avoid nulls). 055 * @see #attributes() 056 * @see #hasAttr(String) 057 * @see #absUrl(String) 058 */ 059 public String attr(String attributeKey) { 060 Validate.notNull(attributeKey); 061 if (!hasAttributes()) 062 return EmptyString; 063 064 String val = attributes().getIgnoreCase(attributeKey); 065 if (val.length() > 0) 066 return val; 067 else if (attributeKey.startsWith("abs:")) 068 return absUrl(attributeKey.substring("abs:".length())); 069 else return ""; 070 } 071 072 /** 073 * Get all of the element's attributes. 074 * @return attributes (which implements iterable, in same order as presented in original HTML). 075 */ 076 public abstract Attributes attributes(); 077 078 /** 079 * Set an attribute (key=value). If the attribute already exists, it is replaced. The attribute key comparison is 080 * <b>case insensitive</b>. 081 * @param attributeKey The attribute key. 082 * @param attributeValue The attribute value. 083 * @return this (for chaining) 084 */ 085 public Node attr(String attributeKey, String attributeValue) { 086 attributes().putIgnoreCase(attributeKey, attributeValue); 087 return this; 088 } 089 090 /** 091 * Test if this element has an attribute. <b>Case insensitive</b> 092 * @param attributeKey The attribute key to check. 093 * @return true if the attribute exists, false if not. 094 */ 095 public boolean hasAttr(String attributeKey) { 096 Validate.notNull(attributeKey); 097 098 if (attributeKey.startsWith("abs:")) { 099 String key = attributeKey.substring("abs:".length()); 100 if (attributes().hasKeyIgnoreCase(key) && !absUrl(key).equals("")) 101 return true; 102 } 103 return attributes().hasKeyIgnoreCase(attributeKey); 104 } 105 106 /** 107 * Remove an attribute from this element. 108 * @param attributeKey The attribute to remove. 109 * @return this (for chaining) 110 */ 111 public Node removeAttr(String attributeKey) { 112 Validate.notNull(attributeKey); 113 attributes().removeIgnoreCase(attributeKey); 114 return this; 115 } 116 117 /** 118 * Clear (remove) all of the attributes in this node. 119 * @return this, for chaining 120 */ 121 public Node clearAttributes() { 122 Iterator<Attribute> it = attributes().iterator(); 123 while (it.hasNext()) { 124 it.next(); 125 it.remove(); 126 } 127 return this; 128 } 129 130 /** 131 Get the base URI of this node. 132 @return base URI 133 */ 134 public abstract String baseUri(); 135 136 /** 137 * Set the baseUri for just this node (not its descendants), if this Node tracks base URIs. 138 * @param baseUri new URI 139 */ 140 protected abstract void doSetBaseUri(String baseUri); 141 142 /** 143 Update the base URI of this node and all of its descendants. 144 @param baseUri base URI to set 145 */ 146 public void setBaseUri(final String baseUri) { 147 Validate.notNull(baseUri); 148 149 traverse(new NodeVisitor() { 150 public void head(Node node, int depth) { 151 node.doSetBaseUri(baseUri); 152 } 153 154 public void tail(Node node, int depth) { 155 } 156 }); 157 } 158 159 /** 160 * Get an absolute URL from a URL attribute that may be relative (i.e. an <code><a href></code> or 161 * <code><img src></code>). 162 * <p> 163 * E.g.: <code>String absUrl = linkEl.absUrl("href");</code> 164 * </p> 165 * <p> 166 * If the attribute value is already absolute (i.e. it starts with a protocol, like 167 * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is 168 * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made 169 * absolute using that. 170 * </p> 171 * <p> 172 * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.: 173 * <code>String absUrl = linkEl.attr("abs:href");</code> 174 * </p> 175 * 176 * @param attributeKey The attribute key 177 * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or 178 * could not be made successfully into a URL. 179 * @see #attr 180 * @see java.net.URL#URL(java.net.URL, String) 181 */ 182 public String absUrl(String attributeKey) { 183 Validate.notEmpty(attributeKey); 184 185 if (!hasAttr(attributeKey)) { 186 return ""; // nothing to make absolute with 187 } else { 188 return StringUtil.resolve(baseUri(), attr(attributeKey)); 189 } 190 } 191 192 protected abstract List<Node> ensureChildNodes(); 193 194 /** 195 Get a child node by its 0-based index. 196 @param index index of child node 197 @return the child node at this index. Throws a {@code IndexOutOfBoundsException} if the index is out of bounds. 198 */ 199 public Node childNode(int index) { 200 return ensureChildNodes().get(index); 201 } 202 203 /** 204 Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes 205 themselves can be manipulated. 206 @return list of children. If no children, returns an empty list. 207 */ 208 public List<Node> childNodes() { 209 return Collections.unmodifiableList(ensureChildNodes()); 210 } 211 212 /** 213 * Returns a deep copy of this node's children. Changes made to these nodes will not be reflected in the original 214 * nodes 215 * @return a deep copy of this node's children 216 */ 217 public List<Node> childNodesCopy() { 218 final List<Node> nodes = ensureChildNodes(); 219 final ArrayList<Node> children = new ArrayList<>(nodes.size()); 220 for (Node node : nodes) { 221 children.add(node.clone()); 222 } 223 return children; 224 } 225 226 /** 227 * Get the number of child nodes that this node holds. 228 * @return the number of child nodes that this node holds. 229 */ 230 public abstract int childNodeSize(); 231 232 protected Node[] childNodesAsArray() { 233 return ensureChildNodes().toArray(new Node[childNodeSize()]); 234 } 235 236 /** 237 Gets this node's parent node. 238 @return parent node; or null if no parent. 239 */ 240 public Node parent() { 241 return parentNode; 242 } 243 244 /** 245 Gets this node's parent node. Not overridable by extending classes, so useful if you really just need the Node type. 246 @return parent node; or null if no parent. 247 */ 248 public final Node parentNode() { 249 return parentNode; 250 } 251 252 /** 253 * Get this node's root node; that is, its topmost ancestor. If this node is the top ancestor, returns {@code this}. 254 * @return topmost ancestor. 255 */ 256 public Node root() { 257 Node node = this; 258 while (node.parentNode != null) 259 node = node.parentNode; 260 return node; 261 } 262 263 /** 264 * Gets the Document associated with this Node. 265 * @return the Document associated with this Node, or null if there is no such Document. 266 */ 267 public Document ownerDocument() { 268 Node root = root(); 269 return (root instanceof Document) ? (Document) root : null; 270 } 271 272 /** 273 * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. 274 */ 275 public void remove() { 276 Validate.notNull(parentNode); 277 parentNode.removeChild(this); 278 } 279 280 /** 281 * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling). 282 * @param html HTML to add before this node 283 * @return this node, for chaining 284 * @see #after(String) 285 */ 286 public Node before(String html) { 287 addSiblingHtml(siblingIndex, html); 288 return this; 289 } 290 291 /** 292 * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). 293 * @param node to add before this node 294 * @return this node, for chaining 295 * @see #after(Node) 296 */ 297 public Node before(Node node) { 298 Validate.notNull(node); 299 Validate.notNull(parentNode); 300 301 parentNode.addChildren(siblingIndex, node); 302 return this; 303 } 304 305 /** 306 * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). 307 * @param html HTML to add after this node 308 * @return this node, for chaining 309 * @see #before(String) 310 */ 311 public Node after(String html) { 312 addSiblingHtml(siblingIndex + 1, html); 313 return this; 314 } 315 316 /** 317 * Insert the specified node into the DOM after this node (i.e. as a following sibling). 318 * @param node to add after this node 319 * @return this node, for chaining 320 * @see #before(Node) 321 */ 322 public Node after(Node node) { 323 Validate.notNull(node); 324 Validate.notNull(parentNode); 325 326 parentNode.addChildren(siblingIndex + 1, node); 327 return this; 328 } 329 330 private void addSiblingHtml(int index, String html) { 331 Validate.notNull(html); 332 Validate.notNull(parentNode); 333 334 Element context = parent() instanceof Element ? (Element) parent() : null; 335 List<Node> nodes = Parser.parseFragment(html, context, baseUri()); 336 parentNode.addChildren(index, nodes.toArray(new Node[nodes.size()])); 337 } 338 339 /** 340 Wrap the supplied HTML around this node. 341 @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. 342 @return this node, for chaining. 343 */ 344 public Node wrap(String html) { 345 Validate.notEmpty(html); 346 347 Element context = parent() instanceof Element ? (Element) parent() : null; 348 List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri()); 349 Node wrapNode = wrapChildren.get(0); 350 if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop 351 return null; 352 353 Element wrap = (Element) wrapNode; 354 Element deepest = getDeepChild(wrap); 355 parentNode.replaceChild(this, wrap); 356 deepest.addChildren(this); 357 358 // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder 359 if (wrapChildren.size() > 0) { 360 //noinspection ForLoopReplaceableByForEach (beacause it allocates an Iterator which is wasteful here) 361 for (int i = 0; i < wrapChildren.size(); i++) { 362 Node remainder = wrapChildren.get(i); 363 remainder.parentNode.removeChild(remainder); 364 wrap.appendChild(remainder); 365 } 366 } 367 return this; 368 } 369 370 /** 371 * Removes this node from the DOM, and moves its children up into the node's parent. This has the effect of dropping 372 * the node but keeping its children. 373 * <p> 374 * For example, with the input html: 375 * </p> 376 * <p>{@code <div>One <span>Two <b>Three</b></span></div>}</p> 377 * Calling {@code element.unwrap()} on the {@code span} element will result in the html: 378 * <p>{@code <div>One Two <b>Three</b></div>}</p> 379 * and the {@code "Two "} {@link TextNode} being returned. 380 * 381 * @return the first child of this node, after the node has been unwrapped. Null if the node had no children. 382 * @see #remove() 383 * @see #wrap(String) 384 */ 385 public Node unwrap() { 386 Validate.notNull(parentNode); 387 final List<Node> childNodes = ensureChildNodes(); 388 Node firstChild = childNodes.size() > 0 ? childNodes.get(0) : null; 389 parentNode.addChildren(siblingIndex, this.childNodesAsArray()); 390 this.remove(); 391 392 return firstChild; 393 } 394 395 private Element getDeepChild(Element el) { 396 List<Element> children = el.children(); 397 if (children.size() > 0) 398 return getDeepChild(children.get(0)); 399 else 400 return el; 401 } 402 403 void nodelistChanged() { 404 // Element overrides this to clear its shadow children elements 405 } 406 407 /** 408 * Replace this node in the DOM with the supplied node. 409 * @param in the node that will will replace the existing node. 410 */ 411 public void replaceWith(Node in) { 412 Validate.notNull(in); 413 Validate.notNull(parentNode); 414 parentNode.replaceChild(this, in); 415 } 416 417 protected void setParentNode(Node parentNode) { 418 Validate.notNull(parentNode); 419 if (this.parentNode != null) 420 this.parentNode.removeChild(this); 421 this.parentNode = parentNode; 422 } 423 424 protected void replaceChild(Node out, Node in) { 425 Validate.isTrue(out.parentNode == this); 426 Validate.notNull(in); 427 if (in.parentNode != null) 428 in.parentNode.removeChild(in); 429 430 final int index = out.siblingIndex; 431 ensureChildNodes().set(index, in); 432 in.parentNode = this; 433 in.setSiblingIndex(index); 434 out.parentNode = null; 435 } 436 437 protected void removeChild(Node out) { 438 Validate.isTrue(out.parentNode == this); 439 final int index = out.siblingIndex; 440 ensureChildNodes().remove(index); 441 reindexChildren(index); 442 out.parentNode = null; 443 } 444 445 protected void addChildren(Node... children) { 446 //most used. short circuit addChildren(int), which hits reindex children and array copy 447 final List<Node> nodes = ensureChildNodes(); 448 449 for (Node child: children) { 450 reparentChild(child); 451 nodes.add(child); 452 child.setSiblingIndex(nodes.size()-1); 453 } 454 } 455 456 protected void addChildren(int index, Node... children) { 457 Validate.noNullElements(children); 458 final List<Node> nodes = ensureChildNodes(); 459 460 for (Node child : children) { 461 reparentChild(child); 462 } 463 nodes.addAll(index, Arrays.asList(children)); 464 reindexChildren(index); 465 } 466 467 protected void reparentChild(Node child) { 468 child.setParentNode(this); 469 } 470 471 private void reindexChildren(int start) { 472 final List<Node> childNodes = ensureChildNodes(); 473 474 for (int i = start; i < childNodes.size(); i++) { 475 childNodes.get(i).setSiblingIndex(i); 476 } 477 } 478 479 /** 480 Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not 481 include this node (a node is not a sibling of itself). 482 @return node siblings. If the node has no parent, returns an empty list. 483 */ 484 public List<Node> siblingNodes() { 485 if (parentNode == null) 486 return Collections.emptyList(); 487 488 List<Node> nodes = parentNode.ensureChildNodes(); 489 List<Node> siblings = new ArrayList<>(nodes.size() - 1); 490 for (Node node: nodes) 491 if (node != this) 492 siblings.add(node); 493 return siblings; 494 } 495 496 /** 497 Get this node's next sibling. 498 @return next sibling, or null if this is the last sibling 499 */ 500 public Node nextSibling() { 501 if (parentNode == null) 502 return null; // root 503 504 final List<Node> siblings = parentNode.ensureChildNodes(); 505 final int index = siblingIndex+1; 506 if (siblings.size() > index) 507 return siblings.get(index); 508 else 509 return null; 510 } 511 512 /** 513 Get this node's previous sibling. 514 @return the previous sibling, or null if this is the first sibling 515 */ 516 public Node previousSibling() { 517 if (parentNode == null) 518 return null; // root 519 520 if (siblingIndex > 0) 521 return parentNode.ensureChildNodes().get(siblingIndex-1); 522 else 523 return null; 524 } 525 526 /** 527 * Get the list index of this node in its node sibling list. I.e. if this is the first node 528 * sibling, returns 0. 529 * @return position in node sibling list 530 * @see org.jsoup.nodes.Element#elementSiblingIndex() 531 */ 532 public int siblingIndex() { 533 return siblingIndex; 534 } 535 536 protected void setSiblingIndex(int siblingIndex) { 537 this.siblingIndex = siblingIndex; 538 } 539 540 /** 541 * Perform a depth-first traversal through this node and its descendants. 542 * @param nodeVisitor the visitor callbacks to perform on each node 543 * @return this node, for chaining 544 */ 545 public Node traverse(NodeVisitor nodeVisitor) { 546 Validate.notNull(nodeVisitor); 547 NodeTraversor.traverse(nodeVisitor, this); 548 return this; 549 } 550 551 /** 552 * Perform a depth-first filtering through this node and its descendants. 553 * @param nodeFilter the filter callbacks to perform on each node 554 * @return this node, for chaining 555 */ 556 public Node filter(NodeFilter nodeFilter) { 557 Validate.notNull(nodeFilter); 558 NodeTraversor.filter(nodeFilter, this); 559 return this; 560 } 561 562 /** 563 Get the outer HTML of this node. 564 @return HTML 565 */ 566 public String outerHtml() { 567 StringBuilder accum = new StringBuilder(128); 568 outerHtml(accum); 569 return accum.toString(); 570 } 571 572 protected void outerHtml(Appendable accum) { 573 NodeTraversor.traverse(new OuterHtmlVisitor(accum, getOutputSettings()), this); 574 } 575 576 // if this node has no document (or parent), retrieve the default output settings 577 Document.OutputSettings getOutputSettings() { 578 Document owner = ownerDocument(); 579 return owner != null ? owner.outputSettings() : (new Document("")).outputSettings(); 580 } 581 582 /** 583 Get the outer HTML of this node. 584 @param accum accumulator to place HTML into 585 @throws IOException if appending to the given accumulator fails. 586 */ 587 abstract void outerHtmlHead(final Appendable accum, int depth, final Document.OutputSettings out) throws IOException; 588 589 abstract void outerHtmlTail(final Appendable accum, int depth, final Document.OutputSettings out) throws IOException; 590 591 /** 592 * Write this node and its children to the given {@link Appendable}. 593 * 594 * @param appendable the {@link Appendable} to write to. 595 * @return the supplied {@link Appendable}, for chaining. 596 */ 597 public <T extends Appendable> T html(T appendable) { 598 outerHtml(appendable); 599 return appendable; 600 } 601 602 public String toString() { 603 return outerHtml(); 604 } 605 606 protected void indent(Appendable accum, int depth, Document.OutputSettings out) throws IOException { 607 accum.append('\n').append(StringUtil.padding(depth * out.indentAmount())); 608 } 609 610 /** 611 * Check if this node is the same instance of another (object identity test). 612 * @param o other object to compare to 613 * @return true if the content of this node is the same as the other 614 * @see Node#hasSameValue(Object) to compare nodes by their value 615 */ 616 @Override 617 public boolean equals(Object o) { 618 // implemented just so that javadoc is clear this is an identity test 619 return this == o; 620 } 621 622 /** 623 * Check if this node is has the same content as another node. A node is considered the same if its name, attributes and content match the 624 * other node; particularly its position in the tree does not influence its similarity. 625 * @param o other object to compare to 626 * @return true if the content of this node is the same as the other 627 */ 628 629 public boolean hasSameValue(Object o) { 630 if (this == o) return true; 631 if (o == null || getClass() != o.getClass()) return false; 632 633 return this.outerHtml().equals(((Node) o).outerHtml()); 634 } 635 636 /** 637 * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or 638 * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the 639 * original node. 640 * <p> 641 * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}. 642 * @return stand-alone cloned node 643 */ 644 @Override 645 public Node clone() { 646 Node thisClone = doClone(null); // splits for orphan 647 648 // Queue up nodes that need their children cloned (BFS). 649 final LinkedList<Node> nodesToProcess = new LinkedList<>(); 650 nodesToProcess.add(thisClone); 651 652 while (!nodesToProcess.isEmpty()) { 653 Node currParent = nodesToProcess.remove(); 654 655 final int size = currParent.childNodeSize(); 656 for (int i = 0; i < size; i++) { 657 final List<Node> childNodes = currParent.ensureChildNodes(); 658 Node childClone = childNodes.get(i).doClone(currParent); 659 childNodes.set(i, childClone); 660 nodesToProcess.add(childClone); 661 } 662 } 663 664 return thisClone; 665 } 666 667 /* 668 * Return a clone of the node using the given parent (which can be null). 669 * Not a deep copy of children. 670 */ 671 protected Node doClone(Node parent) { 672 Node clone; 673 674 try { 675 clone = (Node) super.clone(); 676 } catch (CloneNotSupportedException e) { 677 throw new RuntimeException(e); 678 } 679 680 clone.parentNode = parent; // can be null, to create an orphan split 681 clone.siblingIndex = parent == null ? 0 : siblingIndex; 682 683 return clone; 684 } 685 686 private static class OuterHtmlVisitor implements NodeVisitor { 687 private Appendable accum; 688 private Document.OutputSettings out; 689 690 OuterHtmlVisitor(Appendable accum, Document.OutputSettings out) { 691 this.accum = accum; 692 this.out = out; 693 out.prepareEncoder(); 694 } 695 696 public void head(Node node, int depth) { 697 try { 698 node.outerHtmlHead(accum, depth, out); 699 } catch (IOException exception) { 700 throw new SerializationException(exception); 701 } 702 } 703 704 public void tail(Node node, int depth) { 705 if (!node.nodeName().equals("#text")) { // saves a void hit. 706 try { 707 node.outerHtmlTail(accum, depth, out); 708 } catch (IOException exception) { 709 throw new SerializationException(exception); 710 } 711 } 712 } 713 } 714}