001package org.jsoup.parser; 002 003import org.jsoup.helper.StringUtil; 004import org.jsoup.nodes.Attribute; 005import org.jsoup.nodes.Attributes; 006import org.jsoup.nodes.Document; 007import org.jsoup.nodes.DocumentType; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.Node; 010 011import java.util.ArrayList; 012 013/** 014 * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. 015 */ 016enum HtmlTreeBuilderState { 017 Initial { 018 boolean process(Token t, HtmlTreeBuilder tb) { 019 if (isWhitespace(t)) { 020 return true; // ignore whitespace 021 } else if (t.isComment()) { 022 tb.insert(t.asComment()); 023 } else if (t.isDoctype()) { 024 // todo: parse error check on expected doctypes 025 // todo: quirk state check on doctype ids 026 Token.Doctype d = t.asDoctype(); 027 DocumentType doctype = new DocumentType( 028 tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier()); 029 doctype.setPubSysKey(d.getPubSysKey()); 030 tb.getDocument().appendChild(doctype); 031 if (d.isForceQuirks()) 032 tb.getDocument().quirksMode(Document.QuirksMode.quirks); 033 tb.transition(BeforeHtml); 034 } else { 035 // todo: check not iframe srcdoc 036 tb.transition(BeforeHtml); 037 return tb.process(t); // re-process token 038 } 039 return true; 040 } 041 }, 042 BeforeHtml { 043 boolean process(Token t, HtmlTreeBuilder tb) { 044 if (t.isDoctype()) { 045 tb.error(this); 046 return false; 047 } else if (t.isComment()) { 048 tb.insert(t.asComment()); 049 } else if (isWhitespace(t)) { 050 return true; // ignore whitespace 051 } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) { 052 tb.insert(t.asStartTag()); 053 tb.transition(BeforeHead); 054 } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().normalName(), "head", "body", "html", "br"))) { 055 return anythingElse(t, tb); 056 } else if (t.isEndTag()) { 057 tb.error(this); 058 return false; 059 } else { 060 
return anythingElse(t, tb); 061 } 062 return true; 063 } 064 065 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 066 tb.insertStartTag("html"); 067 tb.transition(BeforeHead); 068 return tb.process(t); 069 } 070 }, 071 BeforeHead { 072 boolean process(Token t, HtmlTreeBuilder tb) { 073 if (isWhitespace(t)) { 074 return true; 075 } else if (t.isComment()) { 076 tb.insert(t.asComment()); 077 } else if (t.isDoctype()) { 078 tb.error(this); 079 return false; 080 } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) { 081 return InBody.process(t, tb); // does not transition 082 } else if (t.isStartTag() && t.asStartTag().normalName().equals("head")) { 083 Element head = tb.insert(t.asStartTag()); 084 tb.setHeadElement(head); 085 tb.transition(InHead); 086 } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().normalName(), "head", "body", "html", "br"))) { 087 tb.processStartTag("head"); 088 return tb.process(t); 089 } else if (t.isEndTag()) { 090 tb.error(this); 091 return false; 092 } else { 093 tb.processStartTag("head"); 094 return tb.process(t); 095 } 096 return true; 097 } 098 }, 099 InHead { 100 boolean process(Token t, HtmlTreeBuilder tb) { 101 if (isWhitespace(t)) { 102 tb.insert(t.asCharacter()); 103 return true; 104 } 105 switch (t.type) { 106 case Comment: 107 tb.insert(t.asComment()); 108 break; 109 case Doctype: 110 tb.error(this); 111 return false; 112 case StartTag: 113 Token.StartTag start = t.asStartTag(); 114 String name = start.normalName(); 115 if (name.equals("html")) { 116 return InBody.process(t, tb); 117 } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { 118 Element el = tb.insertEmpty(start); 119 // jsoup special: update base the frist time it is seen 120 if (name.equals("base") && el.hasAttr("href")) 121 tb.maybeSetBaseUri(el); 122 } else if (name.equals("meta")) { 123 Element meta = tb.insertEmpty(start); 124 // todo: charset switches 125 } else if (name.equals("title")) { 126 
handleRcData(start, tb); 127 } else if (StringUtil.in(name, "noframes", "style")) { 128 handleRawtext(start, tb); 129 } else if (name.equals("noscript")) { 130 // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) 131 tb.insert(start); 132 tb.transition(InHeadNoscript); 133 } else if (name.equals("script")) { 134 // skips some script rules as won't execute them 135 136 tb.tokeniser.transition(TokeniserState.ScriptData); 137 tb.markInsertionMode(); 138 tb.transition(Text); 139 tb.insert(start); 140 } else if (name.equals("head")) { 141 tb.error(this); 142 return false; 143 } else { 144 return anythingElse(t, tb); 145 } 146 break; 147 case EndTag: 148 Token.EndTag end = t.asEndTag(); 149 name = end.normalName(); 150 if (name.equals("head")) { 151 tb.pop(); 152 tb.transition(AfterHead); 153 } else if (StringUtil.in(name, "body", "html", "br")) { 154 return anythingElse(t, tb); 155 } else { 156 tb.error(this); 157 return false; 158 } 159 break; 160 default: 161 return anythingElse(t, tb); 162 } 163 return true; 164 } 165 166 private boolean anythingElse(Token t, TreeBuilder tb) { 167 tb.processEndTag("head"); 168 return tb.process(t); 169 } 170 }, 171 InHeadNoscript { 172 boolean process(Token t, HtmlTreeBuilder tb) { 173 if (t.isDoctype()) { 174 tb.error(this); 175 } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) { 176 return tb.process(t, InBody); 177 } else if (t.isEndTag() && t.asEndTag().normalName().equals("noscript")) { 178 tb.pop(); 179 tb.transition(InHead); 180 } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().normalName(), 181 "basefont", "bgsound", "link", "meta", "noframes", "style"))) { 182 return tb.process(t, InHead); 183 } else if (t.isEndTag() && t.asEndTag().normalName().equals("br")) { 184 return anythingElse(t, tb); 185 } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().normalName(), "head", "noscript")) || t.isEndTag()) { 
186 tb.error(this); 187 return false; 188 } else { 189 return anythingElse(t, tb); 190 } 191 return true; 192 } 193 194 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 195 tb.error(this); 196 tb.insert(new Token.Character().data(t.toString())); 197 return true; 198 } 199 }, 200 AfterHead { 201 boolean process(Token t, HtmlTreeBuilder tb) { 202 if (isWhitespace(t)) { 203 tb.insert(t.asCharacter()); 204 } else if (t.isComment()) { 205 tb.insert(t.asComment()); 206 } else if (t.isDoctype()) { 207 tb.error(this); 208 } else if (t.isStartTag()) { 209 Token.StartTag startTag = t.asStartTag(); 210 String name = startTag.normalName(); 211 if (name.equals("html")) { 212 return tb.process(t, InBody); 213 } else if (name.equals("body")) { 214 tb.insert(startTag); 215 tb.framesetOk(false); 216 tb.transition(InBody); 217 } else if (name.equals("frameset")) { 218 tb.insert(startTag); 219 tb.transition(InFrameset); 220 } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { 221 tb.error(this); 222 Element head = tb.getHeadElement(); 223 tb.push(head); 224 tb.process(t, InHead); 225 tb.removeFromStack(head); 226 } else if (name.equals("head")) { 227 tb.error(this); 228 return false; 229 } else { 230 anythingElse(t, tb); 231 } 232 } else if (t.isEndTag()) { 233 if (StringUtil.in(t.asEndTag().normalName(), "body", "html")) { 234 anythingElse(t, tb); 235 } else { 236 tb.error(this); 237 return false; 238 } 239 } else { 240 anythingElse(t, tb); 241 } 242 return true; 243 } 244 245 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 246 tb.processStartTag("body"); 247 tb.framesetOk(true); 248 return tb.process(t); 249 } 250 }, 251 InBody { 252 boolean process(Token t, HtmlTreeBuilder tb) { 253 switch (t.type) { 254 case Character: { 255 Token.Character c = t.asCharacter(); 256 if (c.getData().equals(nullString)) { 257 // todo confirm that check 258 tb.error(this); 259 return false; 260 } else if 
(tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed 261 tb.reconstructFormattingElements(); 262 tb.insert(c); 263 } else { 264 tb.reconstructFormattingElements(); 265 tb.insert(c); 266 tb.framesetOk(false); 267 } 268 break; 269 } 270 case Comment: { 271 tb.insert(t.asComment()); 272 break; 273 } 274 case Doctype: { 275 tb.error(this); 276 return false; 277 } 278 case StartTag: 279 Token.StartTag startTag = t.asStartTag(); 280 // todo - refactor to a switch statement 281 String name = startTag.normalName(); 282 if (name.equals("a")) { 283 if (tb.getActiveFormattingElement("a") != null) { 284 tb.error(this); 285 tb.processEndTag("a"); 286 287 // still on stack? 288 Element remainingA = tb.getFromStack("a"); 289 if (remainingA != null) { 290 tb.removeFromActiveFormattingElements(remainingA); 291 tb.removeFromStack(remainingA); 292 } 293 } 294 tb.reconstructFormattingElements(); 295 Element a = tb.insert(startTag); 296 tb.pushActiveFormattingElements(a); 297 } else if (StringUtil.inSorted(name, Constants.InBodyStartEmptyFormatters)) { 298 tb.reconstructFormattingElements(); 299 tb.insertEmpty(startTag); 300 tb.framesetOk(false); 301 } else if (StringUtil.inSorted(name, Constants.InBodyStartPClosers)) { 302 if (tb.inButtonScope("p")) { 303 tb.processEndTag("p"); 304 } 305 tb.insert(startTag); 306 } else if (name.equals("span")) { 307 // same as final else, but short circuits lots of checks 308 tb.reconstructFormattingElements(); 309 tb.insert(startTag); 310 } else if (name.equals("li")) { 311 tb.framesetOk(false); 312 ArrayList<Element> stack = tb.getStack(); 313 for (int i = stack.size() - 1; i > 0; i--) { 314 Element el = stack.get(i); 315 if (el.nodeName().equals("li")) { 316 tb.processEndTag("li"); 317 break; 318 } 319 if (tb.isSpecial(el) && !StringUtil.inSorted(el.nodeName(), Constants.InBodyStartLiBreakers)) 320 break; 321 } 322 if (tb.inButtonScope("p")) { 323 tb.processEndTag("p"); 324 } 325 tb.insert(startTag); 326 } 
else if (name.equals("html")) { 327 tb.error(this); 328 // merge attributes onto real html 329 Element html = tb.getStack().get(0); 330 for (Attribute attribute : startTag.getAttributes()) { 331 if (!html.hasAttr(attribute.getKey())) 332 html.attributes().put(attribute); 333 } 334 } else if (StringUtil.inSorted(name, Constants.InBodyStartToHead)) { 335 return tb.process(t, InHead); 336 } else if (name.equals("body")) { 337 tb.error(this); 338 ArrayList<Element> stack = tb.getStack(); 339 if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { 340 // only in fragment case 341 return false; // ignore 342 } else { 343 tb.framesetOk(false); 344 Element body = stack.get(1); 345 for (Attribute attribute : startTag.getAttributes()) { 346 if (!body.hasAttr(attribute.getKey())) 347 body.attributes().put(attribute); 348 } 349 } 350 } else if (name.equals("frameset")) { 351 tb.error(this); 352 ArrayList<Element> stack = tb.getStack(); 353 if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { 354 // only in fragment case 355 return false; // ignore 356 } else if (!tb.framesetOk()) { 357 return false; // ignore frameset 358 } else { 359 Element second = stack.get(1); 360 if (second.parent() != null) 361 second.remove(); 362 // pop up to html element 363 while (stack.size() > 1) 364 stack.remove(stack.size()-1); 365 tb.insert(startTag); 366 tb.transition(InFrameset); 367 } 368 } else if (StringUtil.inSorted(name, Constants.Headings)) { 369 if (tb.inButtonScope("p")) { 370 tb.processEndTag("p"); 371 } 372 if (StringUtil.inSorted(tb.currentElement().nodeName(), Constants.Headings)) { 373 tb.error(this); 374 tb.pop(); 375 } 376 tb.insert(startTag); 377 } else if (StringUtil.inSorted(name, Constants.InBodyStartPreListing)) { 378 if (tb.inButtonScope("p")) { 379 tb.processEndTag("p"); 380 } 381 tb.insert(startTag); 382 // todo: ignore LF if next token 383 tb.framesetOk(false); 384 } else if (name.equals("form")) { 
385 if (tb.getFormElement() != null) { 386 tb.error(this); 387 return false; 388 } 389 if (tb.inButtonScope("p")) { 390 tb.processEndTag("p"); 391 } 392 tb.insertForm(startTag, true); 393 } else if (StringUtil.inSorted(name, Constants.DdDt)) { 394 tb.framesetOk(false); 395 ArrayList<Element> stack = tb.getStack(); 396 for (int i = stack.size() - 1; i > 0; i--) { 397 Element el = stack.get(i); 398 if (StringUtil.inSorted(el.nodeName(), Constants.DdDt)) { 399 tb.processEndTag(el.nodeName()); 400 break; 401 } 402 if (tb.isSpecial(el) && !StringUtil.inSorted(el.nodeName(), Constants.InBodyStartLiBreakers)) 403 break; 404 } 405 if (tb.inButtonScope("p")) { 406 tb.processEndTag("p"); 407 } 408 tb.insert(startTag); 409 } else if (name.equals("plaintext")) { 410 if (tb.inButtonScope("p")) { 411 tb.processEndTag("p"); 412 } 413 tb.insert(startTag); 414 tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out 415 } else if (name.equals("button")) { 416 if (tb.inButtonScope("button")) { 417 // close and reprocess 418 tb.error(this); 419 tb.processEndTag("button"); 420 tb.process(startTag); 421 } else { 422 tb.reconstructFormattingElements(); 423 tb.insert(startTag); 424 tb.framesetOk(false); 425 } 426 } else if (StringUtil.inSorted(name, Constants.Formatters)) { 427 tb.reconstructFormattingElements(); 428 Element el = tb.insert(startTag); 429 tb.pushActiveFormattingElements(el); 430 } else if (name.equals("nobr")) { 431 tb.reconstructFormattingElements(); 432 if (tb.inScope("nobr")) { 433 tb.error(this); 434 tb.processEndTag("nobr"); 435 tb.reconstructFormattingElements(); 436 } 437 Element el = tb.insert(startTag); 438 tb.pushActiveFormattingElements(el); 439 } else if (StringUtil.inSorted(name, Constants.InBodyStartApplets)) { 440 tb.reconstructFormattingElements(); 441 tb.insert(startTag); 442 tb.insertMarkerToFormattingElements(); 443 tb.framesetOk(false); 444 } else if (name.equals("table")) { 445 if (tb.getDocument().quirksMode() != 
Document.QuirksMode.quirks && tb.inButtonScope("p")) { 446 tb.processEndTag("p"); 447 } 448 tb.insert(startTag); 449 tb.framesetOk(false); 450 tb.transition(InTable); 451 } else if (name.equals("input")) { 452 tb.reconstructFormattingElements(); 453 Element el = tb.insertEmpty(startTag); 454 if (!el.attr("type").equalsIgnoreCase("hidden")) 455 tb.framesetOk(false); 456 } else if (StringUtil.inSorted(name, Constants.InBodyStartMedia)) { 457 tb.insertEmpty(startTag); 458 } else if (name.equals("hr")) { 459 if (tb.inButtonScope("p")) { 460 tb.processEndTag("p"); 461 } 462 tb.insertEmpty(startTag); 463 tb.framesetOk(false); 464 } else if (name.equals("image")) { 465 if (tb.getFromStack("svg") == null) 466 return tb.process(startTag.name("img")); // change <image> to <img>, unless in svg 467 else 468 tb.insert(startTag); 469 } else if (name.equals("isindex")) { 470 // how much do we care about the early 90s? 471 tb.error(this); 472 if (tb.getFormElement() != null) 473 return false; 474 475 tb.processStartTag("form"); 476 if (startTag.attributes.hasKey("action")) { 477 Element form = tb.getFormElement(); 478 form.attr("action", startTag.attributes.get("action")); 479 } 480 tb.processStartTag("hr"); 481 tb.processStartTag("label"); 482 // hope you like english. 483 String prompt = startTag.attributes.hasKey("prompt") ? 484 startTag.attributes.get("prompt") : 485 "This is a searchable index. 
Enter search keywords: "; 486 487 tb.process(new Token.Character().data(prompt)); 488 489 // input 490 Attributes inputAttribs = new Attributes(); 491 for (Attribute attr : startTag.attributes) { 492 if (!StringUtil.inSorted(attr.getKey(), Constants.InBodyStartInputAttribs)) 493 inputAttribs.put(attr); 494 } 495 inputAttribs.put("name", "isindex"); 496 tb.processStartTag("input", inputAttribs); 497 tb.processEndTag("label"); 498 tb.processStartTag("hr"); 499 tb.processEndTag("form"); 500 } else if (name.equals("textarea")) { 501 tb.insert(startTag); 502 // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) 503 tb.tokeniser.transition(TokeniserState.Rcdata); 504 tb.markInsertionMode(); 505 tb.framesetOk(false); 506 tb.transition(Text); 507 } else if (name.equals("xmp")) { 508 if (tb.inButtonScope("p")) { 509 tb.processEndTag("p"); 510 } 511 tb.reconstructFormattingElements(); 512 tb.framesetOk(false); 513 handleRawtext(startTag, tb); 514 } else if (name.equals("iframe")) { 515 tb.framesetOk(false); 516 handleRawtext(startTag, tb); 517 } else if (name.equals("noembed")) { 518 // also handle noscript if script enabled 519 handleRawtext(startTag, tb); 520 } else if (name.equals("select")) { 521 tb.reconstructFormattingElements(); 522 tb.insert(startTag); 523 tb.framesetOk(false); 524 525 HtmlTreeBuilderState state = tb.state(); 526 if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) 527 tb.transition(InSelectInTable); 528 else 529 tb.transition(InSelect); 530 } else if (StringUtil.inSorted(name, Constants.InBodyStartOptions)) { 531 if (tb.currentElement().nodeName().equals("option")) 532 tb.processEndTag("option"); 533 tb.reconstructFormattingElements(); 534 tb.insert(startTag); 535 } else if (StringUtil.inSorted(name, 
Constants.InBodyStartRuby)) { 536 if (tb.inScope("ruby")) { 537 tb.generateImpliedEndTags(); 538 if (!tb.currentElement().nodeName().equals("ruby")) { 539 tb.error(this); 540 tb.popStackToBefore("ruby"); // i.e. close up to but not include name 541 } 542 tb.insert(startTag); 543 } 544 } else if (name.equals("math")) { 545 tb.reconstructFormattingElements(); 546 // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) 547 tb.insert(startTag); 548 } else if (name.equals("svg")) { 549 tb.reconstructFormattingElements(); 550 // todo: handle A start tag whose tag name is "svg" (xlink, svg) 551 tb.insert(startTag); 552 } else if (StringUtil.inSorted(name, Constants.InBodyStartDrop)) { 553 tb.error(this); 554 return false; 555 } else { 556 tb.reconstructFormattingElements(); 557 tb.insert(startTag); 558 } 559 break; 560 561 case EndTag: 562 Token.EndTag endTag = t.asEndTag(); 563 name = endTag.normalName(); 564 if (StringUtil.inSorted(name, Constants.InBodyEndAdoptionFormatters)) { 565 // Adoption Agency Algorithm. 
566 for (int i = 0; i < 8; i++) { 567 Element formatEl = tb.getActiveFormattingElement(name); 568 if (formatEl == null) 569 return anyOtherEndTag(t, tb); 570 else if (!tb.onStack(formatEl)) { 571 tb.error(this); 572 tb.removeFromActiveFormattingElements(formatEl); 573 return true; 574 } else if (!tb.inScope(formatEl.nodeName())) { 575 tb.error(this); 576 return false; 577 } else if (tb.currentElement() != formatEl) 578 tb.error(this); 579 580 Element furthestBlock = null; 581 Element commonAncestor = null; 582 boolean seenFormattingElement = false; 583 ArrayList<Element> stack = tb.getStack(); 584 // the spec doesn't limit to < 64, but in degenerate cases (9000+ stack depth) this prevents 585 // run-aways 586 final int stackSize = stack.size(); 587 for (int si = 0; si < stackSize && si < 64; si++) { 588 Element el = stack.get(si); 589 if (el == formatEl) { 590 commonAncestor = stack.get(si - 1); 591 seenFormattingElement = true; 592 } else if (seenFormattingElement && tb.isSpecial(el)) { 593 furthestBlock = el; 594 break; 595 } 596 } 597 if (furthestBlock == null) { 598 tb.popStackToClose(formatEl.nodeName()); 599 tb.removeFromActiveFormattingElements(formatEl); 600 return true; 601 } 602 603 // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. 604 // does that mean: int pos of format el in list? 
605 Element node = furthestBlock; 606 Element lastNode = furthestBlock; 607 for (int j = 0; j < 3; j++) { 608 if (tb.onStack(node)) 609 node = tb.aboveOnStack(node); 610 if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check 611 tb.removeFromStack(node); 612 continue; 613 } else if (node == formatEl) 614 break; 615 616 Element replacement = new Element(Tag.valueOf(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri()); 617 // case will follow the original node (so honours ParseSettings) 618 tb.replaceActiveFormattingElement(node, replacement); 619 tb.replaceOnStack(node, replacement); 620 node = replacement; 621 622 if (lastNode == furthestBlock) { 623 // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. 624 // not getting how this bookmark both straddles the element above, but is inbetween here... 625 } 626 if (lastNode.parent() != null) 627 lastNode.remove(); 628 node.appendChild(lastNode); 629 630 lastNode = node; 631 } 632 633 if (StringUtil.inSorted(commonAncestor.nodeName(), Constants.InBodyEndTableFosters)) { 634 if (lastNode.parent() != null) 635 lastNode.remove(); 636 tb.insertInFosterParent(lastNode); 637 } else { 638 if (lastNode.parent() != null) 639 lastNode.remove(); 640 commonAncestor.appendChild(lastNode); 641 } 642 643 Element adopter = new Element(formatEl.tag(), tb.getBaseUri()); 644 adopter.attributes().addAll(formatEl.attributes()); 645 Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodeSize()]); 646 for (Node childNode : childNodes) { 647 adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod. 648 } 649 furthestBlock.appendChild(adopter); 650 tb.removeFromActiveFormattingElements(formatEl); 651 // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. 
652 tb.removeFromStack(formatEl); 653 tb.insertOnStackAfter(furthestBlock, adopter); 654 } 655 } else if (StringUtil.inSorted(name, Constants.InBodyEndClosers)) { 656 if (!tb.inScope(name)) { 657 // nothing to close 658 tb.error(this); 659 return false; 660 } else { 661 tb.generateImpliedEndTags(); 662 if (!tb.currentElement().nodeName().equals(name)) 663 tb.error(this); 664 tb.popStackToClose(name); 665 } 666 } else if (name.equals("span")) { 667 // same as final fall through, but saves short circuit 668 return anyOtherEndTag(t, tb); 669 } else if (name.equals("li")) { 670 if (!tb.inListItemScope(name)) { 671 tb.error(this); 672 return false; 673 } else { 674 tb.generateImpliedEndTags(name); 675 if (!tb.currentElement().nodeName().equals(name)) 676 tb.error(this); 677 tb.popStackToClose(name); 678 } 679 } else if (name.equals("body")) { 680 if (!tb.inScope("body")) { 681 tb.error(this); 682 return false; 683 } else { 684 // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html 685 tb.transition(AfterBody); 686 } 687 } else if (name.equals("html")) { 688 boolean notIgnored = tb.processEndTag("body"); 689 if (notIgnored) 690 return tb.process(endTag); 691 } else if (name.equals("form")) { 692 Element currentForm = tb.getFormElement(); 693 tb.setFormElement(null); 694 if (currentForm == null || !tb.inScope(name)) { 695 tb.error(this); 696 return false; 697 } else { 698 tb.generateImpliedEndTags(); 699 if (!tb.currentElement().nodeName().equals(name)) 700 tb.error(this); 701 // remove currentForm from stack. will shift anything under up. 
tb.removeFromStack(currentForm);
                        }
                    } else if (name.equals("p")) {
                        if (!tb.inButtonScope(name)) {
                            tb.error(this);
                            tb.processStartTag(name); // if no p to close, creates an empty <p></p>
                            return tb.process(endTag);
                        } else {
                            tb.generateImpliedEndTags(name);
                            if (!tb.currentElement().nodeName().equals(name))
                                tb.error(this);
                            tb.popStackToClose(name);
                        }
                    } else if (StringUtil.inSorted(name, Constants.DdDt)) {
                        if (!tb.inScope(name)) {
                            tb.error(this);
                            return false;
                        } else {
                            tb.generateImpliedEndTags(name);
                            if (!tb.currentElement().nodeName().equals(name))
                                tb.error(this);
                            tb.popStackToClose(name);
                        }
                    } else if (StringUtil.inSorted(name, Constants.Headings)) {
                        // any open heading closes on any heading end tag, hence the scope check on the whole set
                        if (!tb.inScope(Constants.Headings)) {
                            tb.error(this);
                            return false;
                        } else {
                            tb.generateImpliedEndTags(name);
                            if (!tb.currentElement().nodeName().equals(name))
                                tb.error(this);
                            tb.popStackToClose(Constants.Headings);
                        }
                    } else if (name.equals("sarcasm")) {
                        // *sigh*
                        return anyOtherEndTag(t, tb);
                    } else if (StringUtil.inSorted(name, Constants.InBodyStartApplets)) {
                        // Scope is tested against the end tag's own name. (Previously this branch was also
                        // guarded by inScope("name") -- a string literal -- which silently skipped the
                        // whole branch whenever an element literally called "name" was in scope.)
                        if (!tb.inScope(name)) {
                            tb.error(this);
                            return false;
                        }
                        tb.generateImpliedEndTags();
                        if (!tb.currentElement().nodeName().equals(name))
                            tb.error(this);
                        tb.popStackToClose(name);
                        tb.clearFormattingElementsToLastMarker();
                    } else if (name.equals("br")) {
                        tb.error(this);
                        tb.processStartTag("br");
                        return false;
                    } else {
                        return anyOtherEndTag(t, tb);
                    }

                    break;
                case EOF:
                    // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html
                    // stop parsing
                    break;
            }
            return true;
        }

        /**
         * Generic end-tag handling: walks the stack from the top looking for an element with the
         * (settings-normalized) end tag name. On a match it generates implied end tags, reports an
         * error if the current element is not the match, and pops the stack to close it. If a special
         * element is met before a match, the tag is an error.
         *
         * @param t the end tag token
         * @param tb the tree builder
         * @return false if a special element blocked the search, true otherwise
         */
        boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
            String name = tb.settings.normalizeTag(t.asEndTag().name()); // matches with case sensitivity if enabled
ArrayList<Element> stack = tb.getStack(); 770 for (int pos = stack.size() -1; pos >= 0; pos--) { 771 Element node = stack.get(pos); 772 if (node.nodeName().equals(name)) { 773 tb.generateImpliedEndTags(name); 774 if (!name.equals(tb.currentElement().nodeName())) 775 tb.error(this); 776 tb.popStackToClose(name); 777 break; 778 } else { 779 if (tb.isSpecial(node)) { 780 tb.error(this); 781 return false; 782 } 783 } 784 } 785 return true; 786 } 787 }, 788 Text { 789 // in script, style etc. normally treated as data tags 790 boolean process(Token t, HtmlTreeBuilder tb) { 791 if (t.isCharacter()) { 792 tb.insert(t.asCharacter()); 793 } else if (t.isEOF()) { 794 tb.error(this); 795 // if current node is script: already started 796 tb.pop(); 797 tb.transition(tb.originalState()); 798 return tb.process(t); 799 } else if (t.isEndTag()) { 800 // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts 801 tb.pop(); 802 tb.transition(tb.originalState()); 803 } 804 return true; 805 } 806 }, 807 InTable { 808 boolean process(Token t, HtmlTreeBuilder tb) { 809 if (t.isCharacter()) { 810 tb.newPendingTableCharacters(); 811 tb.markInsertionMode(); 812 tb.transition(InTableText); 813 return tb.process(t); 814 } else if (t.isComment()) { 815 tb.insert(t.asComment()); 816 return true; 817 } else if (t.isDoctype()) { 818 tb.error(this); 819 return false; 820 } else if (t.isStartTag()) { 821 Token.StartTag startTag = t.asStartTag(); 822 String name = startTag.normalName(); 823 if (name.equals("caption")) { 824 tb.clearStackToTableContext(); 825 tb.insertMarkerToFormattingElements(); 826 tb.insert(startTag); 827 tb.transition(InCaption); 828 } else if (name.equals("colgroup")) { 829 tb.clearStackToTableContext(); 830 tb.insert(startTag); 831 tb.transition(InColumnGroup); 832 } else if (name.equals("col")) { 833 tb.processStartTag("colgroup"); 834 return tb.process(t); 835 } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { 836 
tb.clearStackToTableContext(); 837 tb.insert(startTag); 838 tb.transition(InTableBody); 839 } else if (StringUtil.in(name, "td", "th", "tr")) { 840 tb.processStartTag("tbody"); 841 return tb.process(t); 842 } else if (name.equals("table")) { 843 tb.error(this); 844 boolean processed = tb.processEndTag("table"); 845 if (processed) // only ignored if in fragment 846 return tb.process(t); 847 } else if (StringUtil.in(name, "style", "script")) { 848 return tb.process(t, InHead); 849 } else if (name.equals("input")) { 850 if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { 851 return anythingElse(t, tb); 852 } else { 853 tb.insertEmpty(startTag); 854 } 855 } else if (name.equals("form")) { 856 tb.error(this); 857 if (tb.getFormElement() != null) 858 return false; 859 else { 860 tb.insertForm(startTag, false); 861 } 862 } else { 863 return anythingElse(t, tb); 864 } 865 return true; // todo: check if should return processed http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intable 866 } else if (t.isEndTag()) { 867 Token.EndTag endTag = t.asEndTag(); 868 String name = endTag.normalName(); 869 870 if (name.equals("table")) { 871 if (!tb.inTableScope(name)) { 872 tb.error(this); 873 return false; 874 } else { 875 tb.popStackToClose("table"); 876 } 877 tb.resetInsertionMode(); 878 } else if (StringUtil.in(name, 879 "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { 880 tb.error(this); 881 return false; 882 } else { 883 return anythingElse(t, tb); 884 } 885 return true; // todo: as above todo 886 } else if (t.isEOF()) { 887 if (tb.currentElement().nodeName().equals("html")) 888 tb.error(this); 889 return true; // stops parsing 890 } 891 return anythingElse(t, tb); 892 } 893 894 boolean anythingElse(Token t, HtmlTreeBuilder tb) { 895 tb.error(this); 896 boolean processed; 897 if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { 898 
tb.setFosterInserts(true); 899 processed = tb.process(t, InBody); 900 tb.setFosterInserts(false); 901 } else { 902 processed = tb.process(t, InBody); 903 } 904 return processed; 905 } 906 }, 907 InTableText { 908 boolean process(Token t, HtmlTreeBuilder tb) { 909 switch (t.type) { 910 case Character: 911 Token.Character c = t.asCharacter(); 912 if (c.getData().equals(nullString)) { 913 tb.error(this); 914 return false; 915 } else { 916 tb.getPendingTableCharacters().add(c.getData()); 917 } 918 break; 919 default: 920 // todo - don't really like the way these table character data lists are built 921 if (tb.getPendingTableCharacters().size() > 0) { 922 for (String character : tb.getPendingTableCharacters()) { 923 if (!isWhitespace(character)) { 924 // InTable anything else section: 925 tb.error(this); 926 if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { 927 tb.setFosterInserts(true); 928 tb.process(new Token.Character().data(character), InBody); 929 tb.setFosterInserts(false); 930 } else { 931 tb.process(new Token.Character().data(character), InBody); 932 } 933 } else 934 tb.insert(new Token.Character().data(character)); 935 } 936 tb.newPendingTableCharacters(); 937 } 938 tb.transition(tb.originalState()); 939 return tb.process(t); 940 } 941 return true; 942 } 943 }, 944 InCaption { 945 boolean process(Token t, HtmlTreeBuilder tb) { 946 if (t.isEndTag() && t.asEndTag().normalName().equals("caption")) { 947 Token.EndTag endTag = t.asEndTag(); 948 String name = endTag.normalName(); 949 if (!tb.inTableScope(name)) { 950 tb.error(this); 951 return false; 952 } else { 953 tb.generateImpliedEndTags(); 954 if (!tb.currentElement().nodeName().equals("caption")) 955 tb.error(this); 956 tb.popStackToClose("caption"); 957 tb.clearFormattingElementsToLastMarker(); 958 tb.transition(InTable); 959 } 960 } else if (( 961 t.isStartTag() && StringUtil.in(t.asStartTag().normalName(), 962 "caption", "col", "colgroup", "tbody", "td", 
"tfoot", "th", "thead", "tr") || 963 t.isEndTag() && t.asEndTag().normalName().equals("table")) 964 ) { 965 tb.error(this); 966 boolean processed = tb.processEndTag("caption"); 967 if (processed) 968 return tb.process(t); 969 } else if (t.isEndTag() && StringUtil.in(t.asEndTag().normalName(), 970 "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { 971 tb.error(this); 972 return false; 973 } else { 974 return tb.process(t, InBody); 975 } 976 return true; 977 } 978 }, 979 InColumnGroup { 980 boolean process(Token t, HtmlTreeBuilder tb) { 981 if (isWhitespace(t)) { 982 tb.insert(t.asCharacter()); 983 return true; 984 } 985 switch (t.type) { 986 case Comment: 987 tb.insert(t.asComment()); 988 break; 989 case Doctype: 990 tb.error(this); 991 break; 992 case StartTag: 993 Token.StartTag startTag = t.asStartTag(); 994 switch (startTag.normalName()) { 995 case "html": 996 return tb.process(t, InBody); 997 case "col": 998 tb.insertEmpty(startTag); 999 break; 1000 default: 1001 return anythingElse(t, tb); 1002 } 1003 break; 1004 case EndTag: 1005 Token.EndTag endTag = t.asEndTag(); 1006 if (endTag.normalName.equals("colgroup")) { 1007 if (tb.currentElement().nodeName().equals("html")) { // frag case 1008 tb.error(this); 1009 return false; 1010 } else { 1011 tb.pop(); 1012 tb.transition(InTable); 1013 } 1014 } else 1015 return anythingElse(t, tb); 1016 break; 1017 case EOF: 1018 if (tb.currentElement().nodeName().equals("html")) 1019 return true; // stop parsing; frag case 1020 else 1021 return anythingElse(t, tb); 1022 default: 1023 return anythingElse(t, tb); 1024 } 1025 return true; 1026 } 1027 1028 private boolean anythingElse(Token t, TreeBuilder tb) { 1029 boolean processed = tb.processEndTag("colgroup"); 1030 if (processed) // only ignored in frag case 1031 return tb.process(t); 1032 return true; 1033 } 1034 }, 1035 InTableBody { 1036 boolean process(Token t, HtmlTreeBuilder tb) { 1037 switch (t.type) { 1038 case StartTag: 1039 
Token.StartTag startTag = t.asStartTag(); 1040 String name = startTag.normalName(); 1041 if (name.equals("template")) { 1042 tb.insert(startTag); 1043 } else if (name.equals("tr")) { 1044 tb.clearStackToTableBodyContext(); 1045 tb.insert(startTag); 1046 tb.transition(InRow); 1047 } else if (StringUtil.in(name, "th", "td")) { 1048 tb.error(this); 1049 tb.processStartTag("tr"); 1050 return tb.process(startTag); 1051 } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { 1052 return exitTableBody(t, tb); 1053 } else 1054 return anythingElse(t, tb); 1055 break; 1056 case EndTag: 1057 Token.EndTag endTag = t.asEndTag(); 1058 name = endTag.normalName(); 1059 if (StringUtil.in(name, "tbody", "tfoot", "thead")) { 1060 if (!tb.inTableScope(name)) { 1061 tb.error(this); 1062 return false; 1063 } else { 1064 tb.clearStackToTableBodyContext(); 1065 tb.pop(); 1066 tb.transition(InTable); 1067 } 1068 } else if (name.equals("table")) { 1069 return exitTableBody(t, tb); 1070 } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { 1071 tb.error(this); 1072 return false; 1073 } else 1074 return anythingElse(t, tb); 1075 break; 1076 default: 1077 return anythingElse(t, tb); 1078 } 1079 return true; 1080 } 1081 1082 private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { 1083 if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { 1084 // frag case 1085 tb.error(this); 1086 return false; 1087 } 1088 tb.clearStackToTableBodyContext(); 1089 tb.processEndTag(tb.currentElement().nodeName()); // tbody, tfoot, thead 1090 return tb.process(t); 1091 } 1092 1093 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 1094 return tb.process(t, InTable); 1095 } 1096 }, 1097 InRow { 1098 boolean process(Token t, HtmlTreeBuilder tb) { 1099 if (t.isStartTag()) { 1100 Token.StartTag startTag = t.asStartTag(); 1101 String name = startTag.normalName(); 1102 1103 if 
(name.equals("template")) { 1104 tb.insert(startTag); 1105 } else if (StringUtil.in(name, "th", "td")) { 1106 tb.clearStackToTableRowContext(); 1107 tb.insert(startTag); 1108 tb.transition(InCell); 1109 tb.insertMarkerToFormattingElements(); 1110 } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { 1111 return handleMissingTr(t, tb); 1112 } else { 1113 return anythingElse(t, tb); 1114 } 1115 } else if (t.isEndTag()) { 1116 Token.EndTag endTag = t.asEndTag(); 1117 String name = endTag.normalName(); 1118 1119 if (name.equals("tr")) { 1120 if (!tb.inTableScope(name)) { 1121 tb.error(this); // frag 1122 return false; 1123 } 1124 tb.clearStackToTableRowContext(); 1125 tb.pop(); // tr 1126 tb.transition(InTableBody); 1127 } else if (name.equals("table")) { 1128 return handleMissingTr(t, tb); 1129 } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { 1130 if (!tb.inTableScope(name)) { 1131 tb.error(this); 1132 return false; 1133 } 1134 tb.processEndTag("tr"); 1135 return tb.process(t); 1136 } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { 1137 tb.error(this); 1138 return false; 1139 } else { 1140 return anythingElse(t, tb); 1141 } 1142 } else { 1143 return anythingElse(t, tb); 1144 } 1145 return true; 1146 } 1147 1148 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 1149 return tb.process(t, InTable); 1150 } 1151 1152 private boolean handleMissingTr(Token t, TreeBuilder tb) { 1153 boolean processed = tb.processEndTag("tr"); 1154 if (processed) 1155 return tb.process(t); 1156 else 1157 return false; 1158 } 1159 }, 1160 InCell { 1161 boolean process(Token t, HtmlTreeBuilder tb) { 1162 if (t.isEndTag()) { 1163 Token.EndTag endTag = t.asEndTag(); 1164 String name = endTag.normalName(); 1165 1166 if (StringUtil.in(name, "td", "th")) { 1167 if (!tb.inTableScope(name)) { 1168 tb.error(this); 1169 tb.transition(InRow); // might not be in scope if empty: <td /> and processing 
fake end tag 1170 return false; 1171 } 1172 tb.generateImpliedEndTags(); 1173 if (!tb.currentElement().nodeName().equals(name)) 1174 tb.error(this); 1175 tb.popStackToClose(name); 1176 tb.clearFormattingElementsToLastMarker(); 1177 tb.transition(InRow); 1178 } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { 1179 tb.error(this); 1180 return false; 1181 } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { 1182 if (!tb.inTableScope(name)) { 1183 tb.error(this); 1184 return false; 1185 } 1186 closeCell(tb); 1187 return tb.process(t); 1188 } else { 1189 return anythingElse(t, tb); 1190 } 1191 } else if (t.isStartTag() && 1192 StringUtil.in(t.asStartTag().normalName(), 1193 "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { 1194 if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { 1195 tb.error(this); 1196 return false; 1197 } 1198 closeCell(tb); 1199 return tb.process(t); 1200 } else { 1201 return anythingElse(t, tb); 1202 } 1203 return true; 1204 } 1205 1206 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 1207 return tb.process(t, InBody); 1208 } 1209 1210 private void closeCell(HtmlTreeBuilder tb) { 1211 if (tb.inTableScope("td")) 1212 tb.processEndTag("td"); 1213 else 1214 tb.processEndTag("th"); // only here if th or td in scope 1215 } 1216 }, 1217 InSelect { 1218 boolean process(Token t, HtmlTreeBuilder tb) { 1219 switch (t.type) { 1220 case Character: 1221 Token.Character c = t.asCharacter(); 1222 if (c.getData().equals(nullString)) { 1223 tb.error(this); 1224 return false; 1225 } else { 1226 tb.insert(c); 1227 } 1228 break; 1229 case Comment: 1230 tb.insert(t.asComment()); 1231 break; 1232 case Doctype: 1233 tb.error(this); 1234 return false; 1235 case StartTag: 1236 Token.StartTag start = t.asStartTag(); 1237 String name = start.normalName(); 1238 if (name.equals("html")) 1239 return tb.process(start, InBody); 1240 else if (name.equals("option")) { 1241 if 
(tb.currentElement().nodeName().equals("option")) 1242 tb.processEndTag("option"); 1243 tb.insert(start); 1244 } else if (name.equals("optgroup")) { 1245 if (tb.currentElement().nodeName().equals("option")) 1246 tb.processEndTag("option"); 1247 else if (tb.currentElement().nodeName().equals("optgroup")) 1248 tb.processEndTag("optgroup"); 1249 tb.insert(start); 1250 } else if (name.equals("select")) { 1251 tb.error(this); 1252 return tb.processEndTag("select"); 1253 } else if (StringUtil.in(name, "input", "keygen", "textarea")) { 1254 tb.error(this); 1255 if (!tb.inSelectScope("select")) 1256 return false; // frag 1257 tb.processEndTag("select"); 1258 return tb.process(start); 1259 } else if (name.equals("script")) { 1260 return tb.process(t, InHead); 1261 } else { 1262 return anythingElse(t, tb); 1263 } 1264 break; 1265 case EndTag: 1266 Token.EndTag end = t.asEndTag(); 1267 name = end.normalName(); 1268 switch (name) { 1269 case "optgroup": 1270 if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) 1271 tb.processEndTag("option"); 1272 if (tb.currentElement().nodeName().equals("optgroup")) 1273 tb.pop(); 1274 else 1275 tb.error(this); 1276 break; 1277 case "option": 1278 if (tb.currentElement().nodeName().equals("option")) 1279 tb.pop(); 1280 else 1281 tb.error(this); 1282 break; 1283 case "select": 1284 if (!tb.inSelectScope(name)) { 1285 tb.error(this); 1286 return false; 1287 } else { 1288 tb.popStackToClose(name); 1289 tb.resetInsertionMode(); 1290 } 1291 break; 1292 default: 1293 return anythingElse(t, tb); 1294 } 1295 break; 1296 case EOF: 1297 if (!tb.currentElement().nodeName().equals("html")) 1298 tb.error(this); 1299 break; 1300 default: 1301 return anythingElse(t, tb); 1302 } 1303 return true; 1304 } 1305 1306 private boolean anythingElse(Token t, HtmlTreeBuilder tb) { 1307 tb.error(this); 1308 return false; 1309 } 1310 }, 1311 
InSelectInTable { 1312 boolean process(Token t, HtmlTreeBuilder tb) { 1313 if (t.isStartTag() && StringUtil.in(t.asStartTag().normalName(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { 1314 tb.error(this); 1315 tb.processEndTag("select"); 1316 return tb.process(t); 1317 } else if (t.isEndTag() && StringUtil.in(t.asEndTag().normalName(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { 1318 tb.error(this); 1319 if (tb.inTableScope(t.asEndTag().normalName())) { 1320 tb.processEndTag("select"); 1321 return (tb.process(t)); 1322 } else 1323 return false; 1324 } else { 1325 return tb.process(t, InSelect); 1326 } 1327 } 1328 }, 1329 AfterBody { 1330 boolean process(Token t, HtmlTreeBuilder tb) { 1331 if (isWhitespace(t)) { 1332 return tb.process(t, InBody); 1333 } else if (t.isComment()) { 1334 tb.insert(t.asComment()); // into html node 1335 } else if (t.isDoctype()) { 1336 tb.error(this); 1337 return false; 1338 } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) { 1339 return tb.process(t, InBody); 1340 } else if (t.isEndTag() && t.asEndTag().normalName().equals("html")) { 1341 if (tb.isFragmentParsing()) { 1342 tb.error(this); 1343 return false; 1344 } else { 1345 tb.transition(AfterAfterBody); 1346 } 1347 } else if (t.isEOF()) { 1348 // chillax! 
we're done 1349 } else { 1350 tb.error(this); 1351 tb.transition(InBody); 1352 return tb.process(t); 1353 } 1354 return true; 1355 } 1356 }, 1357 InFrameset { 1358 boolean process(Token t, HtmlTreeBuilder tb) { 1359 if (isWhitespace(t)) { 1360 tb.insert(t.asCharacter()); 1361 } else if (t.isComment()) { 1362 tb.insert(t.asComment()); 1363 } else if (t.isDoctype()) { 1364 tb.error(this); 1365 return false; 1366 } else if (t.isStartTag()) { 1367 Token.StartTag start = t.asStartTag(); 1368 switch (start.normalName()) { 1369 case "html": 1370 return tb.process(start, InBody); 1371 case "frameset": 1372 tb.insert(start); 1373 break; 1374 case "frame": 1375 tb.insertEmpty(start); 1376 break; 1377 case "noframes": 1378 return tb.process(start, InHead); 1379 default: 1380 tb.error(this); 1381 return false; 1382 } 1383 } else if (t.isEndTag() && t.asEndTag().normalName().equals("frameset")) { 1384 if (tb.currentElement().nodeName().equals("html")) { // frag 1385 tb.error(this); 1386 return false; 1387 } else { 1388 tb.pop(); 1389 if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { 1390 tb.transition(AfterFrameset); 1391 } 1392 } 1393 } else if (t.isEOF()) { 1394 if (!tb.currentElement().nodeName().equals("html")) { 1395 tb.error(this); 1396 return true; 1397 } 1398 } else { 1399 tb.error(this); 1400 return false; 1401 } 1402 return true; 1403 } 1404 }, 1405 AfterFrameset { 1406 boolean process(Token t, HtmlTreeBuilder tb) { 1407 if (isWhitespace(t)) { 1408 tb.insert(t.asCharacter()); 1409 } else if (t.isComment()) { 1410 tb.insert(t.asComment()); 1411 } else if (t.isDoctype()) { 1412 tb.error(this); 1413 return false; 1414 } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) { 1415 return tb.process(t, InBody); 1416 } else if (t.isEndTag() && t.asEndTag().normalName().equals("html")) { 1417 tb.transition(AfterAfterFrameset); 1418 } else if (t.isStartTag() && t.asStartTag().normalName().equals("noframes")) { 1419 return 
tb.process(t, InHead); 1420 } else if (t.isEOF()) { 1421 // cool your heels, we're complete 1422 } else { 1423 tb.error(this); 1424 return false; 1425 } 1426 return true; 1427 } 1428 }, 1429 AfterAfterBody { 1430 boolean process(Token t, HtmlTreeBuilder tb) { 1431 if (t.isComment()) { 1432 tb.insert(t.asComment()); 1433 } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) { 1434 return tb.process(t, InBody); 1435 } else if (t.isEOF()) { 1436 // nice work chuck 1437 } else { 1438 tb.error(this); 1439 tb.transition(InBody); 1440 return tb.process(t); 1441 } 1442 return true; 1443 } 1444 }, 1445 AfterAfterFrameset { 1446 boolean process(Token t, HtmlTreeBuilder tb) { 1447 if (t.isComment()) { 1448 tb.insert(t.asComment()); 1449 } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) { 1450 return tb.process(t, InBody); 1451 } else if (t.isEOF()) { 1452 // nice work chuck 1453 } else if (t.isStartTag() && t.asStartTag().normalName().equals("noframes")) { 1454 return tb.process(t, InHead); 1455 } else { 1456 tb.error(this); 1457 return false; 1458 } 1459 return true; 1460 } 1461 }, 1462 ForeignContent { 1463 boolean process(Token t, HtmlTreeBuilder tb) { 1464 return true; 1465 // todo: implement. Also; how do we get here? 
1466 } 1467 }; 1468 1469 private static String nullString = String.valueOf('\u0000'); 1470 1471 abstract boolean process(Token t, HtmlTreeBuilder tb); 1472 1473 private static boolean isWhitespace(Token t) { 1474 if (t.isCharacter()) { 1475 String data = t.asCharacter().getData(); 1476 return isWhitespace(data); 1477 } 1478 return false; 1479 } 1480 1481 private static boolean isWhitespace(String data) { 1482 // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " 1483 for (int i = 0; i < data.length(); i++) { 1484 char c = data.charAt(i); 1485 if (!StringUtil.isWhitespace(c)) 1486 return false; 1487 } 1488 return true; 1489 } 1490 1491 private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { 1492 tb.tokeniser.transition(TokeniserState.Rcdata); 1493 tb.markInsertionMode(); 1494 tb.transition(Text); 1495 tb.insert(startTag); 1496 } 1497 1498 private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { 1499 tb.tokeniser.transition(TokeniserState.Rawtext); 1500 tb.markInsertionMode(); 1501 tb.transition(Text); 1502 tb.insert(startTag); 1503 } 1504 1505 // lists of tags to search through. A little harder to read here, but causes less GC than dynamic varargs. 1506 // was contributing around 10% of parse GC load. 1507 // must make sure these are sorted, as used in findSorted. MUST update HtmlTreebuilderStateTest if more arrays added. 
1508 static final class Constants { 1509 static final String[] InBodyStartToHead = new String[]{"base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title"}; 1510 static final String[] InBodyStartPClosers = new String[]{"address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", 1511 "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", 1512 "p", "section", "summary", "ul"}; 1513 static final String[] Headings = new String[]{"h1", "h2", "h3", "h4", "h5", "h6"}; 1514 static final String[] InBodyStartPreListing = new String[]{"listing", "pre"}; 1515 static final String[] InBodyStartLiBreakers = new String[]{"address", "div", "p"}; 1516 static final String[] DdDt = new String[]{"dd", "dt"}; 1517 static final String[] Formatters = new String[]{"b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u"}; 1518 static final String[] InBodyStartApplets = new String[]{"applet", "marquee", "object"}; 1519 static final String[] InBodyStartEmptyFormatters = new String[]{"area", "br", "embed", "img", "keygen", "wbr"}; 1520 static final String[] InBodyStartMedia = new String[]{"param", "source", "track"}; 1521 static final String[] InBodyStartInputAttribs = new String[]{"action", "name", "prompt"}; 1522 static final String[] InBodyStartOptions = new String[]{"optgroup", "option"}; 1523 static final String[] InBodyStartRuby = new String[]{"rp", "rt"}; 1524 static final String[] InBodyStartDrop = new String[]{"caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr"}; 1525 static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", 1526 "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", 1527 "nav", "ol", "pre", "section", "summary", "ul"}; 1528 static final String[] InBodyEndAdoptionFormatters = new 
String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"}; 1529 static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"}; 1530 } 1531}