001package org.jsoup.parser;
002
003import org.jsoup.helper.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.Comment;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.FormElement;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.TextNode;
012import org.jsoup.select.Elements;
013
014import java.io.Reader;
015import java.io.StringReader;
016import java.util.ArrayList;
017import java.util.List;
018
019import static org.jsoup.helper.StringUtil.inSorted;
020
021/**
022 * HTML Tree Builder; creates a DOM from Tokens.
023 */
024public class HtmlTreeBuilder extends TreeBuilder {
025    // tag searches. must be sorted, used in inSorted. MUST update HtmlTreeBuilderTest if more arrays are added.
026    static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"};
027    static final String[] TagSearchList = new String[]{"ol", "ul"};
028    static final String[] TagSearchButton = new String[]{"button"};
029    static final String[] TagSearchTableScope = new String[]{"html", "table"};
030    static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"};
031    static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rp", "rt"};
032    static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
033        "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
034        "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
035        "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
036        "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
037        "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
038        "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
039        "title", "tr", "ul", "wbr", "xmp"};
040
041    public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
042
043    private HtmlTreeBuilderState state; // the current state
044    private HtmlTreeBuilderState originalState; // original / marked state
045
046    private boolean baseUriSetFromDoc;
047    private Element headElement; // the current head element
048    private FormElement formElement; // the current form element
049    private Element contextElement; // fragment parse context -- could be null even if fragment parsing
050    private ArrayList<Element> formattingElements; // active (open) formatting elements
051    private List<String> pendingTableCharacters; // chars in table to be shifted out
052    private Token.EndTag emptyEnd; // reused empty end tag
053
054    private boolean framesetOk; // if ok to go into frameset
055    private boolean fosterInserts; // if next inserts should be fostered
056    private boolean fragmentParsing; // if parsing a fragment of html
057
058    HtmlTreeBuilder() {}
059
060    ParseSettings defaultSettings() {
061        return ParseSettings.htmlDefault;
062    }
063
064    @Override
065    protected void initialiseParse(Reader input, String baseUri, ParseErrorList errors, ParseSettings settings) {
066        super.initialiseParse(input, baseUri, errors, settings);
067
068        // this is a bit mucky. todo - probably just create new parser objects to ensure all reset.
069        state = HtmlTreeBuilderState.Initial;
070        originalState = null;
071        baseUriSetFromDoc = false;
072        headElement = null;
073        formElement = null;
074        contextElement = null;
075        formattingElements = new ArrayList<>();
076        pendingTableCharacters = new ArrayList<>();
077        emptyEnd = new Token.EndTag();
078        framesetOk = true;
079        fosterInserts = false;
080        fragmentParsing = false;
081    }
082
083    List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors, ParseSettings settings) {
084        // context may be null
085        state = HtmlTreeBuilderState.Initial;
086        initialiseParse(new StringReader(inputFragment), baseUri, errors, settings);
087        contextElement = context;
088        fragmentParsing = true;
089        Element root = null;
090
091        if (context != null) {
092            if (context.ownerDocument() != null) // quirks setup:
093                doc.quirksMode(context.ownerDocument().quirksMode());
094
095            // initialise the tokeniser state:
096            String contextTag = context.tagName();
097            if (StringUtil.in(contextTag, "title", "textarea"))
098                tokeniser.transition(TokeniserState.Rcdata);
099            else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp"))
100                tokeniser.transition(TokeniserState.Rawtext);
101            else if (contextTag.equals("script"))
102                tokeniser.transition(TokeniserState.ScriptData);
103            else if (contextTag.equals(("noscript")))
104                tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext
105            else if (contextTag.equals("plaintext"))
106                tokeniser.transition(TokeniserState.Data);
107            else
108                tokeniser.transition(TokeniserState.Data); // default
109
110            root = new Element(Tag.valueOf("html", settings), baseUri);
111            doc.appendChild(root);
112            stack.add(root);
113            resetInsertionMode();
114
115            // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
116            // with form correctly
117            Elements contextChain = context.parents();
118            contextChain.add(0, context);
119            for (Element parent: contextChain) {
120                if (parent instanceof FormElement) {
121                    formElement = (FormElement) parent;
122                    break;
123                }
124            }
125        }
126
127        runParser();
128        if (context != null)
129            return root.childNodes();
130        else
131            return doc.childNodes();
132    }
133
134    @Override
135    protected boolean process(Token token) {
136        currentToken = token;
137        return this.state.process(token, this);
138    }
139
140    boolean process(Token token, HtmlTreeBuilderState state) {
141        currentToken = token;
142        return state.process(token, this);
143    }
144
145    void transition(HtmlTreeBuilderState state) {
146        this.state = state;
147    }
148
149    HtmlTreeBuilderState state() {
150        return state;
151    }
152
153    void markInsertionMode() {
154        originalState = state;
155    }
156
157    HtmlTreeBuilderState originalState() {
158        return originalState;
159    }
160
161    void framesetOk(boolean framesetOk) {
162        this.framesetOk = framesetOk;
163    }
164
165    boolean framesetOk() {
166        return framesetOk;
167    }
168
169    Document getDocument() {
170        return doc;
171    }
172
173    String getBaseUri() {
174        return baseUri;
175    }
176
177    void maybeSetBaseUri(Element base) {
178        if (baseUriSetFromDoc) // only listen to the first <base href> in parse
179            return;
180
181        String href = base.absUrl("href");
182        if (href.length() != 0) { // ignore <base target> etc
183            baseUri = href;
184            baseUriSetFromDoc = true;
185            doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
186        }
187    }
188
189    boolean isFragmentParsing() {
190        return fragmentParsing;
191    }
192
193    void error(HtmlTreeBuilderState state) {
194        if (errors.canAddError())
195            errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state));
196    }
197
198    Element insert(Token.StartTag startTag) {
199        // handle empty unknown tags
200        // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
201        if (startTag.isSelfClosing()) {
202            Element el = insertEmpty(startTag);
203            stack.add(el);
204            tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data
205            tokeniser.emit(emptyEnd.reset().name(el.tagName()));  // ensure we get out of whatever state we are in. emitted for yielded processing
206            return el;
207        }
208        
209        Element el = new Element(Tag.valueOf(startTag.name(), settings), baseUri, settings.normalizeAttributes(startTag.attributes));
210        insert(el);
211        return el;
212    }
213
214    Element insertStartTag(String startTagName) {
215        Element el = new Element(Tag.valueOf(startTagName, settings), baseUri);
216        insert(el);
217        return el;
218    }
219
220    void insert(Element el) {
221        insertNode(el);
222        stack.add(el);
223    }
224
225    Element insertEmpty(Token.StartTag startTag) {
226        Tag tag = Tag.valueOf(startTag.name(), settings);
227        Element el = new Element(tag, baseUri, startTag.attributes);
228        insertNode(el);
229        if (startTag.isSelfClosing()) {
230            if (tag.isKnownTag()) {
231                if (!tag.isEmpty())
232                    tokeniser.error("Tag cannot be self closing; not a void tag");
233            }
234            else // unknown tag, remember this is self closing for output
235                tag.setSelfClosing();
236        }
237        return el;
238    }
239
240    FormElement insertForm(Token.StartTag startTag, boolean onStack) {
241        Tag tag = Tag.valueOf(startTag.name(), settings);
242        FormElement el = new FormElement(tag, baseUri, startTag.attributes);
243        setFormElement(el);
244        insertNode(el);
245        if (onStack)
246            stack.add(el);
247        return el;
248    }
249
250    void insert(Token.Comment commentToken) {
251        Comment comment = new Comment(commentToken.getData());
252        insertNode(comment);
253    }
254
255    void insert(Token.Character characterToken) {
256        Node node;
257        // characters in script and style go in as datanodes, not text nodes
258        String tagName = currentElement().tagName();
259        if (tagName.equals("script") || tagName.equals("style"))
260            node = new DataNode(characterToken.getData());
261        else
262            node = new TextNode(characterToken.getData());
263        currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack.
264    }
265
266    private void insertNode(Node node) {
267        // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
268        if (stack.size() == 0)
269            doc.appendChild(node);
270        else if (isFosterInserts())
271            insertInFosterParent(node);
272        else
273            currentElement().appendChild(node);
274
275        // connect form controls to their form element
276        if (node instanceof Element && ((Element) node).tag().isFormListed()) {
277            if (formElement != null)
278                formElement.addElement((Element) node);
279        }
280    }
281
282    Element pop() {
283        int size = stack.size();
284        return stack.remove(size-1);
285    }
286
287    void push(Element element) {
288        stack.add(element);
289    }
290
291    ArrayList<Element> getStack() {
292        return stack;
293    }
294
295    boolean onStack(Element el) {
296        return isElementInQueue(stack, el);
297    }
298
299    private boolean isElementInQueue(ArrayList<Element> queue, Element element) {
300        for (int pos = queue.size() -1; pos >= 0; pos--) {
301            Element next = queue.get(pos);
302            if (next == element) {
303                return true;
304            }
305        }
306        return false;
307    }
308
309    Element getFromStack(String elName) {
310        for (int pos = stack.size() -1; pos >= 0; pos--) {
311            Element next = stack.get(pos);
312            if (next.nodeName().equals(elName)) {
313                return next;
314            }
315        }
316        return null;
317    }
318
319    boolean removeFromStack(Element el) {
320        for (int pos = stack.size() -1; pos >= 0; pos--) {
321            Element next = stack.get(pos);
322            if (next == el) {
323                stack.remove(pos);
324                return true;
325            }
326        }
327        return false;
328    }
329
330    void popStackToClose(String elName) {
331        for (int pos = stack.size() -1; pos >= 0; pos--) {
332            Element next = stack.get(pos);
333            stack.remove(pos);
334            if (next.nodeName().equals(elName))
335                break;
336        }
337    }
338
339    // elnames is sorted, comes from Constants
340    void popStackToClose(String... elNames) {
341        for (int pos = stack.size() -1; pos >= 0; pos--) {
342            Element next = stack.get(pos);
343            stack.remove(pos);
344            if (inSorted(next.nodeName(), elNames))
345                break;
346        }
347    }
348
349    void popStackToBefore(String elName) {
350        for (int pos = stack.size() -1; pos >= 0; pos--) {
351            Element next = stack.get(pos);
352            if (next.nodeName().equals(elName)) {
353                break;
354            } else {
355                stack.remove(pos);
356            }
357        }
358    }
359
360    void clearStackToTableContext() {
361        clearStackToContext("table");
362    }
363
364    void clearStackToTableBodyContext() {
365        clearStackToContext("tbody", "tfoot", "thead", "template");
366    }
367
368    void clearStackToTableRowContext() {
369        clearStackToContext("tr", "template");
370    }
371
372    private void clearStackToContext(String... nodeNames) {
373        for (int pos = stack.size() -1; pos >= 0; pos--) {
374            Element next = stack.get(pos);
375            if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html"))
376                break;
377            else
378                stack.remove(pos);
379        }
380    }
381
382    Element aboveOnStack(Element el) {
383        assert onStack(el);
384        for (int pos = stack.size() -1; pos >= 0; pos--) {
385            Element next = stack.get(pos);
386            if (next == el) {
387                return stack.get(pos-1);
388            }
389        }
390        return null;
391    }
392
393    void insertOnStackAfter(Element after, Element in) {
394        int i = stack.lastIndexOf(after);
395        Validate.isTrue(i != -1);
396        stack.add(i+1, in);
397    }
398
399    void replaceOnStack(Element out, Element in) {
400        replaceInQueue(stack, out, in);
401    }
402
403    private void replaceInQueue(ArrayList<Element> queue, Element out, Element in) {
404        int i = queue.lastIndexOf(out);
405        Validate.isTrue(i != -1);
406        queue.set(i, in);
407    }
408
409    void resetInsertionMode() {
410        boolean last = false;
411        for (int pos = stack.size() -1; pos >= 0; pos--) {
412            Element node = stack.get(pos);
413            if (pos == 0) {
414                last = true;
415                node = contextElement;
416            }
417            String name = node.nodeName();
418            if ("select".equals(name)) {
419                transition(HtmlTreeBuilderState.InSelect);
420                break; // frag
421            } else if (("td".equals(name) || "th".equals(name) && !last)) {
422                transition(HtmlTreeBuilderState.InCell);
423                break;
424            } else if ("tr".equals(name)) {
425                transition(HtmlTreeBuilderState.InRow);
426                break;
427            } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) {
428                transition(HtmlTreeBuilderState.InTableBody);
429                break;
430            } else if ("caption".equals(name)) {
431                transition(HtmlTreeBuilderState.InCaption);
432                break;
433            } else if ("colgroup".equals(name)) {
434                transition(HtmlTreeBuilderState.InColumnGroup);
435                break; // frag
436            } else if ("table".equals(name)) {
437                transition(HtmlTreeBuilderState.InTable);
438                break;
439            } else if ("head".equals(name)) {
440                transition(HtmlTreeBuilderState.InBody);
441                break; // frag
442            } else if ("body".equals(name)) {
443                transition(HtmlTreeBuilderState.InBody);
444                break;
445            } else if ("frameset".equals(name)) {
446                transition(HtmlTreeBuilderState.InFrameset);
447                break; // frag
448            } else if ("html".equals(name)) {
449                transition(HtmlTreeBuilderState.BeforeHead);
450                break; // frag
451            } else if (last) {
452                transition(HtmlTreeBuilderState.InBody);
453                break; // frag
454            }
455        }
456    }
457
458    // todo: tidy up in specific scope methods
459    private String[] specificScopeTarget = {null};
460
461    private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
462        specificScopeTarget[0] = targetName;
463        return inSpecificScope(specificScopeTarget, baseTypes, extraTypes);
464    }
465
466    private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) {
467        int depth = stack.size() -1;
468        if (depth > MaxScopeSearchDepth) {
469            depth = MaxScopeSearchDepth;
470        }
471        for (int pos = depth; pos >= 0; pos--) {
472            Element el = stack.get(pos);
473            String elName = el.nodeName();
474            if (inSorted(elName, targetNames))
475                return true;
476            if (inSorted(elName, baseTypes))
477                return false;
478            if (extraTypes != null && inSorted(elName, extraTypes))
479                return false;
480        }
481        Validate.fail("Should not be reachable");
482        return false;
483    }
484
485    boolean inScope(String[] targetNames) {
486        return inSpecificScope(targetNames, TagsSearchInScope, null);
487    }
488
489    boolean inScope(String targetName) {
490        return inScope(targetName, null);
491    }
492
493    boolean inScope(String targetName, String[] extras) {
494        return inSpecificScope(targetName, TagsSearchInScope, extras);
495        // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml
496        // todo: in svg namespace: forignOjbect, desc, title
497    }
498
499    boolean inListItemScope(String targetName) {
500        return inScope(targetName, TagSearchList);
501    }
502
503    boolean inButtonScope(String targetName) {
504        return inScope(targetName, TagSearchButton);
505    }
506
507    boolean inTableScope(String targetName) {
508        return inSpecificScope(targetName, TagSearchTableScope, null);
509    }
510
511    boolean inSelectScope(String targetName) {
512        for (int pos = stack.size() -1; pos >= 0; pos--) {
513            Element el = stack.get(pos);
514            String elName = el.nodeName();
515            if (elName.equals(targetName))
516                return true;
517            if (!inSorted(elName, TagSearchSelectScope)) // all elements except
518                return false;
519        }
520        Validate.fail("Should not be reachable");
521        return false;
522    }
523
524    void setHeadElement(Element headElement) {
525        this.headElement = headElement;
526    }
527
528    Element getHeadElement() {
529        return headElement;
530    }
531
532    boolean isFosterInserts() {
533        return fosterInserts;
534    }
535
536    void setFosterInserts(boolean fosterInserts) {
537        this.fosterInserts = fosterInserts;
538    }
539
540    FormElement getFormElement() {
541        return formElement;
542    }
543
544    void setFormElement(FormElement formElement) {
545        this.formElement = formElement;
546    }
547
548    void newPendingTableCharacters() {
549        pendingTableCharacters = new ArrayList<>();
550    }
551
552    List<String> getPendingTableCharacters() {
553        return pendingTableCharacters;
554    }
555
556    void setPendingTableCharacters(List<String> pendingTableCharacters) {
557        this.pendingTableCharacters = pendingTableCharacters;
558    }
559
560    /**
561     11.2.5.2 Closing elements that have implied end tags<p/>
562     When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a
563     dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element,
564     the UA must pop the current node off the stack of open elements.
565
566     @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
567     process, then the UA must perform the above steps as if that element was not in the above list.
568     */
569    void generateImpliedEndTags(String excludeTag) {
570        while ((excludeTag != null && !currentElement().nodeName().equals(excludeTag)) &&
571                inSorted(currentElement().nodeName(), TagSearchEndTags))
572            pop();
573    }
574
575    void generateImpliedEndTags() {
576        generateImpliedEndTags(null);
577    }
578
579    boolean isSpecial(Element el) {
580        // todo: mathml's mi, mo, mn
581        // todo: svg's foreigObject, desc, title
582        String name = el.nodeName();
583        return inSorted(name, TagSearchSpecial);
584    }
585
586    Element lastFormattingElement() {
587        return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null;
588    }
589
590    Element removeLastFormattingElement() {
591        int size = formattingElements.size();
592        if (size > 0)
593            return formattingElements.remove(size-1);
594        else
595            return null;
596    }
597
598    // active formatting elements
599    void pushActiveFormattingElements(Element in) {
600        int numSeen = 0;
601        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
602            Element el = formattingElements.get(pos);
603            if (el == null) // marker
604                break;
605
606            if (isSameFormattingElement(in, el))
607                numSeen++;
608
609            if (numSeen == 3) {
610                formattingElements.remove(pos);
611                break;
612            }
613        }
614        formattingElements.add(in);
615    }
616
617    private boolean isSameFormattingElement(Element a, Element b) {
618        // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children
619        return a.nodeName().equals(b.nodeName()) &&
620                // a.namespace().equals(b.namespace()) &&
621                a.attributes().equals(b.attributes());
622        // todo: namespaces
623    }
624
625    void reconstructFormattingElements() {
626        Element last = lastFormattingElement();
627        if (last == null || onStack(last))
628            return;
629
630        Element entry = last;
631        int size = formattingElements.size();
632        int pos = size - 1;
633        boolean skip = false;
634        while (true) {
635            if (pos == 0) { // step 4. if none before, skip to 8
636                skip = true;
637                break;
638            }
639            entry = formattingElements.get(--pos); // step 5. one earlier than entry
640            if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack
641                break; // jump to 8, else continue back to 4
642        }
643        while(true) {
644            if (!skip) // step 7: on later than entry
645                entry = formattingElements.get(++pos);
646            Validate.notNull(entry); // should not occur, as we break at last element
647
648            // 8. create new element from element, 9 insert into current node, onto stack
649            skip = false; // can only skip increment from 4.
650            Element newEl = insertStartTag(entry.nodeName()); // todo: avoid fostering here?
651            // newEl.namespace(entry.namespace()); // todo: namespaces
652            newEl.attributes().addAll(entry.attributes());
653
654            // 10. replace entry with new entry
655            formattingElements.set(pos, newEl);
656
657            // 11
658            if (pos == size-1) // if not last entry in list, jump to 7
659                break;
660        }
661    }
662
663    void clearFormattingElementsToLastMarker() {
664        while (!formattingElements.isEmpty()) {
665            Element el = removeLastFormattingElement();
666            if (el == null)
667                break;
668        }
669    }
670
671    void removeFromActiveFormattingElements(Element el) {
672        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
673            Element next = formattingElements.get(pos);
674            if (next == el) {
675                formattingElements.remove(pos);
676                break;
677            }
678        }
679    }
680
681    boolean isInActiveFormattingElements(Element el) {
682        return isElementInQueue(formattingElements, el);
683    }
684
685    Element getActiveFormattingElement(String nodeName) {
686        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
687            Element next = formattingElements.get(pos);
688            if (next == null) // scope marker
689                break;
690            else if (next.nodeName().equals(nodeName))
691                return next;
692        }
693        return null;
694    }
695
696    void replaceActiveFormattingElement(Element out, Element in) {
697        replaceInQueue(formattingElements, out, in);
698    }
699
700    void insertMarkerToFormattingElements() {
701        formattingElements.add(null);
702    }
703
704    void insertInFosterParent(Node in) {
705        Element fosterParent;
706        Element lastTable = getFromStack("table");
707        boolean isLastTableParent = false;
708        if (lastTable != null) {
709            if (lastTable.parent() != null) {
710                fosterParent = lastTable.parent();
711                isLastTableParent = true;
712            } else
713                fosterParent = aboveOnStack(lastTable);
714        } else { // no table == frag
715            fosterParent = stack.get(0);
716        }
717
718        if (isLastTableParent) {
719            Validate.notNull(lastTable); // last table cannot be null by this point.
720            lastTable.before(in);
721        }
722        else
723            fosterParent.appendChild(in);
724    }
725
726    @Override
727    public String toString() {
728        return "TreeBuilder{" +
729                "currentToken=" + currentToken +
730                ", state=" + state +
731                ", currentElement=" + currentElement() +
732                '}';
733    }
734}