001package org.jsoup.safety;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.ParseErrorList;
012import org.jsoup.parser.Parser;
013import org.jsoup.parser.Tag;
014import org.jsoup.select.NodeTraversor;
015import org.jsoup.select.NodeVisitor;
016
017import java.util.List;
018
019
/**
 The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
 that you are expecting; no junk, and no cross-site scripting attacks!
 <p>
 The HTML cleaner parses the input as HTML and then runs it through a whitelist, so the output HTML can only contain
 HTML that is allowed by the whitelist.
 </p>
 <p>
 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
 canned whitelists only allow body-contained tags.
 </p>
 <p>
 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 </p>
 */
035public class Cleaner {
036    private Whitelist whitelist;
037
038    /**
039     Create a new cleaner, that sanitizes documents using the supplied whitelist.
040     @param whitelist white-list to clean with
041     */
042    public Cleaner(Whitelist whitelist) {
043        Validate.notNull(whitelist);
044        this.whitelist = whitelist;
045    }
046
047    /**
048     Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
049     The original document is not modified. Only elements from the dirt document's <code>body</code> are used.
050     @param dirtyDocument Untrusted base document to clean.
051     @return cleaned document.
052     */
053    public Document clean(Document dirtyDocument) {
054        Validate.notNull(dirtyDocument);
055
056        Document clean = Document.createShell(dirtyDocument.baseUri());
057        if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
058            copySafeNodes(dirtyDocument.body(), clean.body());
059
060        return clean;
061    }
062
063    /**
064     Determines if the input document <b>body</b>is valid, against the whitelist. It is considered valid if all the tags and attributes
065     in the input HTML are allowed by the whitelist, and that there is no content in the <code>head</code>.
066     <p>
067     This method can be used as a validator for user input. An invalid document will still be cleaned successfully
068     using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document
069     to ensure enforced attributes are set correctly, and that the output is tidied.
070     </p>
071     @param dirtyDocument document to test
072     @return true if no tags or attributes need to be removed; false if they do
073     */
074    public boolean isValid(Document dirtyDocument) {
075        Validate.notNull(dirtyDocument);
076
077        Document clean = Document.createShell(dirtyDocument.baseUri());
078        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
079        return numDiscarded == 0
080            && dirtyDocument.head().childNodes().size() == 0; // because we only look at the body, but we start from a shell, make sure there's nothing in the head
081    }
082
083    public boolean isValidBodyHtml(String bodyHtml) {
084        Document clean = Document.createShell("");
085        Document dirty = Document.createShell("");
086        ParseErrorList errorList = ParseErrorList.tracking(1);
087        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
088        dirty.body().insertChildren(0, nodes);
089        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
090        return numDiscarded == 0 && errorList.size() == 0;
091    }
092
093    /**
094     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
095     */
096    private final class CleaningVisitor implements NodeVisitor {
097        private int numDiscarded = 0;
098        private final Element root;
099        private Element destination; // current element to append nodes to
100
101        private CleaningVisitor(Element root, Element destination) {
102            this.root = root;
103            this.destination = destination;
104        }
105
106        public void head(Node source, int depth) {
107            if (source instanceof Element) {
108                Element sourceEl = (Element) source;
109
110                if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
111                    ElementMeta meta = createSafeElement(sourceEl);
112                    Element destChild = meta.el;
113                    destination.appendChild(destChild);
114
115                    numDiscarded += meta.numAttribsDiscarded;
116                    destination = destChild;
117                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
118                    numDiscarded++;
119                }
120            } else if (source instanceof TextNode) {
121                TextNode sourceText = (TextNode) source;
122                TextNode destText = new TextNode(sourceText.getWholeText());
123                destination.appendChild(destText);
124            } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
125              DataNode sourceData = (DataNode) source;
126              DataNode destData = new DataNode(sourceData.getWholeData());
127              destination.appendChild(destData);
128            } else { // else, we don't care about comments, xml proc instructions, etc
129                numDiscarded++;
130            }
131        }
132
133        public void tail(Node source, int depth) {
134            if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
135                destination = destination.parent(); // would have descended, so pop destination stack
136            }
137        }
138    }
139
140    private int copySafeNodes(Element source, Element dest) {
141        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
142        NodeTraversor.traverse(cleaningVisitor, source);
143        return cleaningVisitor.numDiscarded;
144    }
145
146    private ElementMeta createSafeElement(Element sourceEl) {
147        String sourceTag = sourceEl.tagName();
148        Attributes destAttrs = new Attributes();
149        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
150        int numDiscarded = 0;
151
152        Attributes sourceAttrs = sourceEl.attributes();
153        for (Attribute sourceAttr : sourceAttrs) {
154            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
155                destAttrs.put(sourceAttr);
156            else
157                numDiscarded++;
158        }
159        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
160        destAttrs.addAll(enforcedAttrs);
161
162        return new ElementMeta(dest, numDiscarded);
163    }
164
165    private static class ElementMeta {
166        Element el;
167        int numAttribsDiscarded;
168
169        ElementMeta(Element el, int numAttribsDiscarded) {
170            this.el = el;
171            this.numAttribsDiscarded = numAttribsDiscarded;
172        }
173    }
174
175}