001package org.jsoup.parser; 002 003import org.jsoup.helper.StringUtil; 004import org.jsoup.helper.Validate; 005import org.jsoup.nodes.Comment; 006import org.jsoup.nodes.DataNode; 007import org.jsoup.nodes.Document; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.FormElement; 010import org.jsoup.nodes.Node; 011import org.jsoup.nodes.TextNode; 012import org.jsoup.select.Elements; 013 014import java.io.Reader; 015import java.io.StringReader; 016import java.util.ArrayList; 017import java.util.List; 018 019import static org.jsoup.helper.StringUtil.inSorted; 020 021/** 022 * HTML Tree Builder; creates a DOM from Tokens. 023 */ 024public class HtmlTreeBuilder extends TreeBuilder { 025 // tag searches. must be sorted, used in inSorted. MUST update HtmlTreeBuilderTest if more arrays are added. 026 static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"}; 027 static final String[] TagSearchList = new String[]{"ol", "ul"}; 028 static final String[] TagSearchButton = new String[]{"button"}; 029 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 030 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 031 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rp", "rt"}; 032 static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", 033 "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", 034 "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", 035 "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", 036 "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", 037 "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", 038 "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", 039 "title", "tr", "ul", "wbr", "xmp"}; 040 041 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 042 043 private HtmlTreeBuilderState state; // the current state 044 private HtmlTreeBuilderState originalState; // original / marked state 045 046 private boolean baseUriSetFromDoc; 047 private Element headElement; // the current head element 048 private FormElement formElement; // the current form element 049 private Element contextElement; // fragment parse context -- could be null even if fragment parsing 050 private ArrayList<Element> formattingElements; // active (open) formatting elements 051 private List<String> pendingTableCharacters; // chars in table to be shifted out 052 private Token.EndTag emptyEnd; // reused empty end tag 053 054 private boolean framesetOk; // if ok to go into frameset 055 private boolean fosterInserts; // if next inserts should be fostered 056 private boolean fragmentParsing; // if parsing a fragment of html 057 058 HtmlTreeBuilder() {} 059 060 ParseSettings defaultSettings() { 061 return ParseSettings.htmlDefault; 062 } 063 064 @Override 065 protected void initialiseParse(Reader input, String baseUri, ParseErrorList errors, ParseSettings settings) { 066 super.initialiseParse(input, baseUri, errors, settings); 067 068 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 069 state = HtmlTreeBuilderState.Initial; 070 originalState = null; 071 baseUriSetFromDoc = false; 072 headElement = null; 073 formElement = null; 074 contextElement = null; 075 formattingElements = new ArrayList<>(); 076 pendingTableCharacters = new ArrayList<>(); 077 emptyEnd = new Token.EndTag(); 078 framesetOk = true; 079 fosterInserts = false; 080 fragmentParsing = false; 081 } 082 083 List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors, ParseSettings settings) { 084 // context may be null 085 state = HtmlTreeBuilderState.Initial; 086 initialiseParse(new StringReader(inputFragment), baseUri, errors, settings); 087 contextElement = context; 088 fragmentParsing = true; 089 Element root = null; 090 091 if (context != null) { 092 if (context.ownerDocument() != null) // quirks setup: 093 doc.quirksMode(context.ownerDocument().quirksMode()); 094 095 // initialise the tokeniser state: 096 String contextTag = context.tagName(); 097 if (StringUtil.in(contextTag, "title", "textarea")) 098 tokeniser.transition(TokeniserState.Rcdata); 099 else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) 100 tokeniser.transition(TokeniserState.Rawtext); 101 else if (contextTag.equals("script")) 102 tokeniser.transition(TokeniserState.ScriptData); 103 else if (contextTag.equals(("noscript"))) 104 tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext 105 else if (contextTag.equals("plaintext")) 106 tokeniser.transition(TokeniserState.Data); 107 else 108 tokeniser.transition(TokeniserState.Data); // default 109 110 root = new Element(Tag.valueOf("html", settings), baseUri); 111 doc.appendChild(root); 112 stack.add(root); 113 resetInsertionMode(); 114 115 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 116 // with form correctly 117 Elements contextChain = context.parents(); 118 contextChain.add(0, context); 119 for (Element parent: contextChain) { 120 if (parent instanceof FormElement) { 121 formElement = (FormElement) parent; 122 break; 123 } 124 } 125 } 126 127 runParser(); 128 if (context != null) 129 return root.childNodes(); 130 else 131 return doc.childNodes(); 132 } 133 134 @Override 135 protected boolean process(Token token) { 136 currentToken = token; 137 return this.state.process(token, this); 138 } 139 140 boolean process(Token token, HtmlTreeBuilderState state) { 141 currentToken = token; 142 return state.process(token, this); 143 } 144 145 void transition(HtmlTreeBuilderState state) { 146 this.state = state; 147 } 148 149 HtmlTreeBuilderState state() { 150 return state; 151 } 152 153 void markInsertionMode() { 154 originalState = state; 155 } 156 157 HtmlTreeBuilderState originalState() { 158 return originalState; 159 } 160 161 void framesetOk(boolean framesetOk) { 162 this.framesetOk = framesetOk; 163 } 164 165 boolean framesetOk() { 166 return framesetOk; 167 } 168 169 Document getDocument() { 170 return doc; 171 } 172 173 String getBaseUri() { 174 return baseUri; 175 } 176 177 void maybeSetBaseUri(Element base) { 178 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 179 return; 180 181 String href = base.absUrl("href"); 182 if (href.length() != 0) { // ignore <base target> etc 183 baseUri = href; 184 baseUriSetFromDoc = true; 185 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 186 } 187 } 188 189 boolean isFragmentParsing() { 190 return fragmentParsing; 191 } 192 193 void error(HtmlTreeBuilderState state) { 194 if (errors.canAddError()) 195 errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state)); 196 } 197 198 Element insert(Token.StartTag startTag) { 199 // handle empty unknown tags 200 // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. 201 if (startTag.isSelfClosing()) { 202 Element el = insertEmpty(startTag); 203 stack.add(el); 204 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 205 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 206 return el; 207 } 208 209 Element el = new Element(Tag.valueOf(startTag.name(), settings), baseUri, settings.normalizeAttributes(startTag.attributes)); 210 insert(el); 211 return el; 212 } 213 214 Element insertStartTag(String startTagName) { 215 Element el = new Element(Tag.valueOf(startTagName, settings), baseUri); 216 insert(el); 217 return el; 218 } 219 220 void insert(Element el) { 221 insertNode(el); 222 stack.add(el); 223 } 224 225 Element insertEmpty(Token.StartTag startTag) { 226 Tag tag = Tag.valueOf(startTag.name(), settings); 227 Element el = new Element(tag, baseUri, startTag.attributes); 228 insertNode(el); 229 if (startTag.isSelfClosing()) { 230 if (tag.isKnownTag()) { 231 if (!tag.isEmpty()) 232 tokeniser.error("Tag cannot be self closing; not a void tag"); 233 } 234 else // unknown tag, remember this is self closing for output 235 tag.setSelfClosing(); 236 } 237 return el; 238 } 239 240 FormElement insertForm(Token.StartTag startTag, boolean onStack) { 241 Tag tag = Tag.valueOf(startTag.name(), settings); 242 FormElement el = new FormElement(tag, baseUri, startTag.attributes); 243 setFormElement(el); 244 insertNode(el); 245 if (onStack) 246 stack.add(el); 247 return el; 248 } 249 250 void insert(Token.Comment commentToken) { 251 Comment comment = new Comment(commentToken.getData()); 252 insertNode(comment); 253 } 254 255 void insert(Token.Character characterToken) { 256 Node node; 257 // characters in script and style go in as datanodes, not text nodes 258 String tagName = currentElement().tagName(); 259 if (tagName.equals("script") || tagName.equals("style")) 260 node = new DataNode(characterToken.getData()); 261 else 262 node = new TextNode(characterToken.getData()); 263 currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 264 } 265 266 private void insertNode(Node node) { 267 // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc 268 if (stack.size() == 0) 269 doc.appendChild(node); 270 else if (isFosterInserts()) 271 insertInFosterParent(node); 272 else 273 currentElement().appendChild(node); 274 275 // connect form controls to their form element 276 if (node instanceof Element && ((Element) node).tag().isFormListed()) { 277 if (formElement != null) 278 formElement.addElement((Element) node); 279 } 280 } 281 282 Element pop() { 283 int size = stack.size(); 284 return stack.remove(size-1); 285 } 286 287 void push(Element element) { 288 stack.add(element); 289 } 290 291 ArrayList<Element> getStack() { 292 return stack; 293 } 294 295 boolean onStack(Element el) { 296 return isElementInQueue(stack, el); 297 } 298 299 private boolean isElementInQueue(ArrayList<Element> queue, Element element) { 300 for (int pos = queue.size() -1; pos >= 0; pos--) { 301 Element next = queue.get(pos); 302 if (next == element) { 303 return true; 304 } 305 } 306 return false; 307 } 308 309 Element getFromStack(String elName) { 310 for (int pos = stack.size() -1; pos >= 0; pos--) { 311 Element next = stack.get(pos); 312 if (next.nodeName().equals(elName)) { 313 return next; 314 } 315 } 316 return null; 317 } 318 319 boolean removeFromStack(Element el) { 320 for (int pos = stack.size() -1; pos >= 0; pos--) { 321 Element next = stack.get(pos); 322 if (next == el) { 323 stack.remove(pos); 324 return true; 325 } 326 } 327 return false; 328 } 329 330 void popStackToClose(String elName) { 331 for (int pos = stack.size() -1; pos >= 0; pos--) { 332 Element next = stack.get(pos); 333 stack.remove(pos); 334 if (next.nodeName().equals(elName)) 335 break; 336 } 337 } 338 339 // elnames is sorted, comes from Constants 340 void popStackToClose(String... elNames) { 341 for (int pos = stack.size() -1; pos >= 0; pos--) { 342 Element next = stack.get(pos); 343 stack.remove(pos); 344 if (inSorted(next.nodeName(), elNames)) 345 break; 346 } 347 } 348 349 void popStackToBefore(String elName) { 350 for (int pos = stack.size() -1; pos >= 0; pos--) { 351 Element next = stack.get(pos); 352 if (next.nodeName().equals(elName)) { 353 break; 354 } else { 355 stack.remove(pos); 356 } 357 } 358 } 359 360 void clearStackToTableContext() { 361 clearStackToContext("table"); 362 } 363 364 void clearStackToTableBodyContext() { 365 clearStackToContext("tbody", "tfoot", "thead", "template"); 366 } 367 368 void clearStackToTableRowContext() { 369 clearStackToContext("tr", "template"); 370 } 371 372 private void clearStackToContext(String... nodeNames) { 373 for (int pos = stack.size() -1; pos >= 0; pos--) { 374 Element next = stack.get(pos); 375 if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) 376 break; 377 else 378 stack.remove(pos); 379 } 380 } 381 382 Element aboveOnStack(Element el) { 383 assert onStack(el); 384 for (int pos = stack.size() -1; pos >= 0; pos--) { 385 Element next = stack.get(pos); 386 if (next == el) { 387 return stack.get(pos-1); 388 } 389 } 390 return null; 391 } 392 393 void insertOnStackAfter(Element after, Element in) { 394 int i = stack.lastIndexOf(after); 395 Validate.isTrue(i != -1); 396 stack.add(i+1, in); 397 } 398 399 void replaceOnStack(Element out, Element in) { 400 replaceInQueue(stack, out, in); 401 } 402 403 private void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 404 int i = queue.lastIndexOf(out); 405 Validate.isTrue(i != -1); 406 queue.set(i, in); 407 } 408 409 void resetInsertionMode() { 410 boolean last = false; 411 for (int pos = stack.size() -1; pos >= 0; pos--) { 412 Element node = stack.get(pos); 413 if (pos == 0) { 414 last = true; 415 node = contextElement; 416 } 417 String name = node.nodeName(); 418 if ("select".equals(name)) { 419 transition(HtmlTreeBuilderState.InSelect); 420 break; // frag 421 } else if (("td".equals(name) || "th".equals(name) && !last)) { 422 transition(HtmlTreeBuilderState.InCell); 423 break; 424 } else if ("tr".equals(name)) { 425 transition(HtmlTreeBuilderState.InRow); 426 break; 427 } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { 428 transition(HtmlTreeBuilderState.InTableBody); 429 break; 430 } else if ("caption".equals(name)) { 431 transition(HtmlTreeBuilderState.InCaption); 432 break; 433 } else if ("colgroup".equals(name)) { 434 transition(HtmlTreeBuilderState.InColumnGroup); 435 break; // frag 436 } else if ("table".equals(name)) { 437 transition(HtmlTreeBuilderState.InTable); 438 break; 439 } else if ("head".equals(name)) { 440 transition(HtmlTreeBuilderState.InBody); 441 break; // frag 442 } else if ("body".equals(name)) { 443 transition(HtmlTreeBuilderState.InBody); 444 break; 445 } else if ("frameset".equals(name)) { 446 transition(HtmlTreeBuilderState.InFrameset); 447 break; // frag 448 } else if ("html".equals(name)) { 449 transition(HtmlTreeBuilderState.BeforeHead); 450 break; // frag 451 } else if (last) { 452 transition(HtmlTreeBuilderState.InBody); 453 break; // frag 454 } 455 } 456 } 457 458 // todo: tidy up in specific scope methods 459 private String[] specificScopeTarget = {null}; 460 461 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 462 specificScopeTarget[0] = targetName; 463 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 464 } 465 466 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { 467 int depth = stack.size() -1; 468 if (depth > MaxScopeSearchDepth) { 469 depth = MaxScopeSearchDepth; 470 } 471 for (int pos = depth; pos >= 0; pos--) { 472 Element el = stack.get(pos); 473 String elName = el.nodeName(); 474 if (inSorted(elName, targetNames)) 475 return true; 476 if (inSorted(elName, baseTypes)) 477 return false; 478 if (extraTypes != null && inSorted(elName, extraTypes)) 479 return false; 480 } 481 Validate.fail("Should not be reachable"); 482 return false; 483 } 484 485 boolean inScope(String[] targetNames) { 486 return inSpecificScope(targetNames, TagsSearchInScope, null); 487 } 488 489 boolean inScope(String targetName) { 490 return inScope(targetName, null); 491 } 492 493 boolean inScope(String targetName, String[] extras) { 494 return inSpecificScope(targetName, TagsSearchInScope, extras); 495 // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml 496 // todo: in svg namespace: forignOjbect, desc, title 497 } 498 499 boolean inListItemScope(String targetName) { 500 return inScope(targetName, TagSearchList); 501 } 502 503 boolean inButtonScope(String targetName) { 504 return inScope(targetName, TagSearchButton); 505 } 506 507 boolean inTableScope(String targetName) { 508 return inSpecificScope(targetName, TagSearchTableScope, null); 509 } 510 511 boolean inSelectScope(String targetName) { 512 for (int pos = stack.size() -1; pos >= 0; pos--) { 513 Element el = stack.get(pos); 514 String elName = el.nodeName(); 515 if (elName.equals(targetName)) 516 return true; 517 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 518 return false; 519 } 520 Validate.fail("Should not be reachable"); 521 return false; 522 } 523 524 void setHeadElement(Element headElement) { 525 this.headElement = headElement; 526 } 527 528 Element getHeadElement() { 529 return headElement; 530 } 531 532 boolean isFosterInserts() { 533 return fosterInserts; 534 } 535 536 void setFosterInserts(boolean fosterInserts) { 537 this.fosterInserts = fosterInserts; 538 } 539 540 FormElement getFormElement() { 541 return formElement; 542 } 543 544 void setFormElement(FormElement formElement) { 545 this.formElement = formElement; 546 } 547 548 void newPendingTableCharacters() { 549 pendingTableCharacters = new ArrayList<>(); 550 } 551 552 List<String> getPendingTableCharacters() { 553 return pendingTableCharacters; 554 } 555 556 void setPendingTableCharacters(List<String> pendingTableCharacters) { 557 this.pendingTableCharacters = pendingTableCharacters; 558 } 559 560 /** 561 11.2.5.2 Closing elements that have implied end tags<p/> 562 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a 563 dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element, 564 the UA must pop the current node off the stack of open elements. 565 566 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 567 process, then the UA must perform the above steps as if that element was not in the above list. 568 */ 569 void generateImpliedEndTags(String excludeTag) { 570 while ((excludeTag != null && !currentElement().nodeName().equals(excludeTag)) && 571 inSorted(currentElement().nodeName(), TagSearchEndTags)) 572 pop(); 573 } 574 575 void generateImpliedEndTags() { 576 generateImpliedEndTags(null); 577 } 578 579 boolean isSpecial(Element el) { 580 // todo: mathml's mi, mo, mn 581 // todo: svg's foreigObject, desc, title 582 String name = el.nodeName(); 583 return inSorted(name, TagSearchSpecial); 584 } 585 586 Element lastFormattingElement() { 587 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 588 } 589 590 Element removeLastFormattingElement() { 591 int size = formattingElements.size(); 592 if (size > 0) 593 return formattingElements.remove(size-1); 594 else 595 return null; 596 } 597 598 // active formatting elements 599 void pushActiveFormattingElements(Element in) { 600 int numSeen = 0; 601 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 602 Element el = formattingElements.get(pos); 603 if (el == null) // marker 604 break; 605 606 if (isSameFormattingElement(in, el)) 607 numSeen++; 608 609 if (numSeen == 3) { 610 formattingElements.remove(pos); 611 break; 612 } 613 } 614 formattingElements.add(in); 615 } 616 617 private boolean isSameFormattingElement(Element a, Element b) { 618 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 619 return a.nodeName().equals(b.nodeName()) && 620 // a.namespace().equals(b.namespace()) && 621 a.attributes().equals(b.attributes()); 622 // todo: namespaces 623 } 624 625 void reconstructFormattingElements() { 626 Element last = lastFormattingElement(); 627 if (last == null || onStack(last)) 628 return; 629 630 Element entry = last; 631 int size = formattingElements.size(); 632 int pos = size - 1; 633 boolean skip = false; 634 while (true) { 635 if (pos == 0) { // step 4. if none before, skip to 8 636 skip = true; 637 break; 638 } 639 entry = formattingElements.get(--pos); // step 5. one earlier than entry 640 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 641 break; // jump to 8, else continue back to 4 642 } 643 while(true) { 644 if (!skip) // step 7: on later than entry 645 entry = formattingElements.get(++pos); 646 Validate.notNull(entry); // should not occur, as we break at last element 647 648 // 8. create new element from element, 9 insert into current node, onto stack 649 skip = false; // can only skip increment from 4. 650 Element newEl = insertStartTag(entry.nodeName()); // todo: avoid fostering here? 651 // newEl.namespace(entry.namespace()); // todo: namespaces 652 newEl.attributes().addAll(entry.attributes()); 653 654 // 10. replace entry with new entry 655 formattingElements.set(pos, newEl); 656 657 // 11 658 if (pos == size-1) // if not last entry in list, jump to 7 659 break; 660 } 661 } 662 663 void clearFormattingElementsToLastMarker() { 664 while (!formattingElements.isEmpty()) { 665 Element el = removeLastFormattingElement(); 666 if (el == null) 667 break; 668 } 669 } 670 671 void removeFromActiveFormattingElements(Element el) { 672 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 673 Element next = formattingElements.get(pos); 674 if (next == el) { 675 formattingElements.remove(pos); 676 break; 677 } 678 } 679 } 680 681 boolean isInActiveFormattingElements(Element el) { 682 return isElementInQueue(formattingElements, el); 683 } 684 685 Element getActiveFormattingElement(String nodeName) { 686 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 687 Element next = formattingElements.get(pos); 688 if (next == null) // scope marker 689 break; 690 else if (next.nodeName().equals(nodeName)) 691 return next; 692 } 693 return null; 694 } 695 696 void replaceActiveFormattingElement(Element out, Element in) { 697 replaceInQueue(formattingElements, out, in); 698 } 699 700 void insertMarkerToFormattingElements() { 701 formattingElements.add(null); 702 } 703 704 void insertInFosterParent(Node in) { 705 Element fosterParent; 706 Element lastTable = getFromStack("table"); 707 boolean isLastTableParent = false; 708 if (lastTable != null) { 709 if (lastTable.parent() != null) { 710 fosterParent = lastTable.parent(); 711 isLastTableParent = true; 712 } else 713 fosterParent = aboveOnStack(lastTable); 714 } else { // no table == frag 715 fosterParent = stack.get(0); 716 } 717 718 if (isLastTableParent) { 719 Validate.notNull(lastTable); // last table cannot be null by this point. 720 lastTable.before(in); 721 } 722 else 723 fosterParent.appendChild(in); 724 } 725 726 @Override 727 public String toString() { 728 return "TreeBuilder{" + 729 "currentToken=" + currentToken + 730 ", state=" + state + 731 ", currentElement=" + currentElement() + 732 '}'; 733 } 734}