001package org.jsoup.safety;
002
003/*
004    Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired
005    this whitelist configuration, and the initial defaults.
006 */
007
008import org.jsoup.helper.Validate;
009import org.jsoup.nodes.Attribute;
010import org.jsoup.nodes.Attributes;
011import org.jsoup.nodes.Element;
012
013import java.util.HashMap;
014import java.util.HashSet;
015import java.util.Map;
016import java.util.Set;
017
018import static org.jsoup.internal.Normalizer.lowerCase;
019
020
021/**
022 Whitelists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
023 <p>
024 Start with one of the defaults:
025 </p>
026 <ul>
027 <li>{@link #none}
028 <li>{@link #simpleText}
029 <li>{@link #basic}
030 <li>{@link #basicWithImages}
031 <li>{@link #relaxed}
032 </ul>
033 <p>
034 If you need to allow more through (please be careful!), tweak a base whitelist with:
035 </p>
036 <ul>
037 <li>{@link #addTags}
038 <li>{@link #addAttributes}
039 <li>{@link #addEnforcedAttribute}
040 <li>{@link #addProtocols}
041 </ul>
042 <p>
043 You can remove any setting from an existing whitelist with:
044 </p>
045 <ul>
046 <li>{@link #removeTags}
047 <li>{@link #removeAttributes}
048 <li>{@link #removeEnforcedAttribute}
049 <li>{@link #removeProtocols}
050 </ul>
051 
052 <p>
053 The cleaner and these whitelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
054 supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, either wrap the
055 document HTML around the cleaned body HTML, or create a whitelist that allows <code>html</code> and <code>head</code>
056 elements as appropriate.
057 </p>
058 <p>
059 If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to
060 XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See 
061 http://ha.ckers.org/xss.html for some XSS attack examples.
062 </p>
063
064 @author Jonathan Hedley
065 */
066public class Whitelist {
067    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]
068    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
069    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values
070    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes
071    private boolean preserveRelativeLinks; // option to preserve relative links
072
073    /**
074     This whitelist allows only text nodes: all HTML will be stripped.
075
076     @return whitelist
077     */
078    public static Whitelist none() {
079        return new Whitelist();
080    }
081
082    /**
083     This whitelist allows only simple text formatting: <code>b, em, i, strong, u</code>. All other HTML (tags and
084     attributes) will be removed.
085
086     @return whitelist
087     */
088    public static Whitelist simpleText() {
089        return new Whitelist()
090                .addTags("b", "em", "i", "strong", "u")
091                ;
092    }
093
094    /**
095     <p>
096     This whitelist allows a fuller range of text nodes: <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li,
097     ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul</code>, and appropriate attributes.
098     </p>
099     <p>
100     Links (<code>a</code> elements) can point to <code>http, https, ftp, mailto</code>, and have an enforced
101     <code>rel=nofollow</code> attribute.
102     </p>
103     <p>
104     Does not allow images.
105     </p>
106
107     @return whitelist
108     */
109    public static Whitelist basic() {
110        return new Whitelist()
111                .addTags(
112                        "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em",
113                        "i", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong", "sub",
114                        "sup", "u", "ul")
115
116                .addAttributes("a", "href")
117                .addAttributes("blockquote", "cite")
118                .addAttributes("q", "cite")
119
120                .addProtocols("a", "href", "ftp", "http", "https", "mailto")
121                .addProtocols("blockquote", "cite", "http", "https")
122                .addProtocols("cite", "cite", "http", "https")
123
124                .addEnforcedAttribute("a", "rel", "nofollow")
125                ;
126
127    }
128
129    /**
130     This whitelist allows the same text tags as {@link #basic}, and also allows <code>img</code> tags, with appropriate
131     attributes, with <code>src</code> pointing to <code>http</code> or <code>https</code>.
132
133     @return whitelist
134     */
135    public static Whitelist basicWithImages() {
136        return basic()
137                .addTags("img")
138                .addAttributes("img", "align", "alt", "height", "src", "title", "width")
139                .addProtocols("img", "src", "http", "https")
140                ;
141    }
142
143    /**
144     This whitelist allows a full range of text and structural body HTML: <code>a, b, blockquote, br, caption, cite,
145     code, col, colgroup, dd, div, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, span, strike, strong, sub,
146     sup, table, tbody, td, tfoot, th, thead, tr, u, ul</code>
147     <p>
148     Links do not have an enforced <code>rel=nofollow</code> attribute, but you can add that if desired.
149     </p>
150
151     @return whitelist
152     */
153    public static Whitelist relaxed() {
154        return new Whitelist()
155                .addTags(
156                        "a", "b", "blockquote", "br", "caption", "cite", "code", "col",
157                        "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
158                        "i", "img", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong",
159                        "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
160                        "ul")
161
162                .addAttributes("a", "href", "title")
163                .addAttributes("blockquote", "cite")
164                .addAttributes("col", "span", "width")
165                .addAttributes("colgroup", "span", "width")
166                .addAttributes("img", "align", "alt", "height", "src", "title", "width")
167                .addAttributes("ol", "start", "type")
168                .addAttributes("q", "cite")
169                .addAttributes("table", "summary", "width")
170                .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width")
171                .addAttributes(
172                        "th", "abbr", "axis", "colspan", "rowspan", "scope",
173                        "width")
174                .addAttributes("ul", "type")
175
176                .addProtocols("a", "href", "ftp", "http", "https", "mailto")
177                .addProtocols("blockquote", "cite", "http", "https")
178                .addProtocols("cite", "cite", "http", "https")
179                .addProtocols("img", "src", "http", "https")
180                .addProtocols("q", "cite", "http", "https")
181                ;
182    }
183
184    /**
185     Create a new, empty whitelist. Generally it will be better to start with a default prepared whitelist instead.
186
187     @see #basic()
188     @see #basicWithImages()
189     @see #simpleText()
190     @see #relaxed()
191     */
192    public Whitelist() {
193        tagNames = new HashSet<>();
194        attributes = new HashMap<>();
195        enforcedAttributes = new HashMap<>();
196        protocols = new HashMap<>();
197        preserveRelativeLinks = false;
198    }
199
200    /**
201     Add a list of allowed elements to a whitelist. (If a tag is not allowed, it will be removed from the HTML.)
202
203     @param tags tag names to allow
204     @return this (for chaining)
205     */
206    public Whitelist addTags(String... tags) {
207        Validate.notNull(tags);
208
209        for (String tagName : tags) {
210            Validate.notEmpty(tagName);
211            tagNames.add(TagName.valueOf(tagName));
212        }
213        return this;
214    }
215
216    /**
217     Remove a list of allowed elements from a whitelist. (If a tag is not allowed, it will be removed from the HTML.)
218
219     @param tags tag names to disallow
220     @return this (for chaining)
221     */
222    public Whitelist removeTags(String... tags) {
223        Validate.notNull(tags);
224
225        for(String tag: tags) {
226            Validate.notEmpty(tag);
227            TagName tagName = TagName.valueOf(tag);
228
229            if(tagNames.remove(tagName)) { // Only look in sub-maps if tag was allowed
230                attributes.remove(tagName);
231                enforcedAttributes.remove(tagName);
232                protocols.remove(tagName);
233            }
234        }
235        return this;
236    }
237
238    /**
239     Add a list of allowed attributes to a tag. (If an attribute is not allowed on an element, it will be removed.)
240     <p>
241     E.g.: <code>addAttributes("a", "href", "class")</code> allows <code>href</code> and <code>class</code> attributes
242     on <code>a</code> tags.
243     </p>
244     <p>
245     To make an attribute valid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g.
246     <code>addAttributes(":all", "class")</code>.
247     </p>
248
249     @param tag  The tag the attributes are for. The tag will be added to the allowed tag list if necessary.
250     @param attributes List of valid attributes for the tag
251     @return this (for chaining)
252     */
253    public Whitelist addAttributes(String tag, String... attributes) {
254        Validate.notEmpty(tag);
255        Validate.notNull(attributes);
256        Validate.isTrue(attributes.length > 0, "No attribute names supplied.");
257
258        TagName tagName = TagName.valueOf(tag);
259        if (!tagNames.contains(tagName))
260            tagNames.add(tagName);
261        Set<AttributeKey> attributeSet = new HashSet<>();
262        for (String key : attributes) {
263            Validate.notEmpty(key);
264            attributeSet.add(AttributeKey.valueOf(key));
265        }
266        if (this.attributes.containsKey(tagName)) {
267            Set<AttributeKey> currentSet = this.attributes.get(tagName);
268            currentSet.addAll(attributeSet);
269        } else {
270            this.attributes.put(tagName, attributeSet);
271        }
272        return this;
273    }
274
275    /**
276     Remove a list of allowed attributes from a tag. (If an attribute is not allowed on an element, it will be removed.)
277     <p>
278     E.g.: <code>removeAttributes("a", "href", "class")</code> disallows <code>href</code> and <code>class</code>
279     attributes on <code>a</code> tags.
280     </p>
281     <p>
282     To make an attribute invalid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g.
283     <code>removeAttributes(":all", "class")</code>.
284     </p>
285
286     @param tag  The tag the attributes are for.
287     @param attributes List of invalid attributes for the tag
288     @return this (for chaining)
289     */
290    public Whitelist removeAttributes(String tag, String... attributes) {
291        Validate.notEmpty(tag);
292        Validate.notNull(attributes);
293        Validate.isTrue(attributes.length > 0, "No attribute names supplied.");
294
295        TagName tagName = TagName.valueOf(tag);
296        Set<AttributeKey> attributeSet = new HashSet<>();
297        for (String key : attributes) {
298            Validate.notEmpty(key);
299            attributeSet.add(AttributeKey.valueOf(key));
300        }
301        if(tagNames.contains(tagName) && this.attributes.containsKey(tagName)) { // Only look in sub-maps if tag was allowed
302            Set<AttributeKey> currentSet = this.attributes.get(tagName);
303            currentSet.removeAll(attributeSet);
304
305            if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag
306                this.attributes.remove(tagName);
307        }
308        if(tag.equals(":all")) // Attribute needs to be removed from all individually set tags
309            for(TagName name: this.attributes.keySet()) {
310                Set<AttributeKey> currentSet = this.attributes.get(name);
311                currentSet.removeAll(attributeSet);
312
313                if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag
314                    this.attributes.remove(name);
315            }
316        return this;
317    }
318
319    /**
320     Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element
321     already has the attribute set, it will be overridden with this value.
322     <p>
323     E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make all <code>a</code> tags output as
324     <code>&lt;a href="..." rel="nofollow"&gt;</code>
325     </p>
326
327     @param tag   The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary.
328     @param attribute   The attribute name
329     @param value The enforced attribute value
330     @return this (for chaining)
331     */
332    public Whitelist addEnforcedAttribute(String tag, String attribute, String value) {
333        Validate.notEmpty(tag);
334        Validate.notEmpty(attribute);
335        Validate.notEmpty(value);
336
337        TagName tagName = TagName.valueOf(tag);
338        if (!tagNames.contains(tagName))
339            tagNames.add(tagName);
340        AttributeKey attrKey = AttributeKey.valueOf(attribute);
341        AttributeValue attrVal = AttributeValue.valueOf(value);
342
343        if (enforcedAttributes.containsKey(tagName)) {
344            enforcedAttributes.get(tagName).put(attrKey, attrVal);
345        } else {
346            Map<AttributeKey, AttributeValue> attrMap = new HashMap<>();
347            attrMap.put(attrKey, attrVal);
348            enforcedAttributes.put(tagName, attrMap);
349        }
350        return this;
351    }
352
353    /**
354     Remove a previously configured enforced attribute from a tag.
355
356     @param tag   The tag the enforced attribute is for.
357     @param attribute   The attribute name
358     @return this (for chaining)
359     */
360    public Whitelist removeEnforcedAttribute(String tag, String attribute) {
361        Validate.notEmpty(tag);
362        Validate.notEmpty(attribute);
363
364        TagName tagName = TagName.valueOf(tag);
365        if(tagNames.contains(tagName) && enforcedAttributes.containsKey(tagName)) {
366            AttributeKey attrKey = AttributeKey.valueOf(attribute);
367            Map<AttributeKey, AttributeValue> attrMap = enforcedAttributes.get(tagName);
368            attrMap.remove(attrKey);
369
370            if(attrMap.isEmpty()) // Remove tag from enforced attribute map if no enforced attributes are present
371                enforcedAttributes.remove(tagName);
372        }
373        return this;
374    }
375
376    /**
377     * Configure this Whitelist to preserve relative links in an element's URL attribute, or convert them to absolute
378     * links. By default, this is <b>false</b>: URLs will be  made absolute (e.g. start with an allowed protocol, like
379     * e.g. {@code http://}.
380     * <p>
381     * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when
382     * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative
383     * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute
384     * will be removed.
385     * </p>
386     *
387     * @param preserve {@code true} to allow relative links, {@code false} (default) to deny
388     * @return this Whitelist, for chaining.
389     * @see #addProtocols
390     */
391    public Whitelist preserveRelativeLinks(boolean preserve) {
392        preserveRelativeLinks = preserve;
393        return this;
394    }
395
396    /**
397     Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to
398     URLs with the defined protocol.
399     <p>
400     E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code>
401     </p>
402     <p>
403     To allow a link to an in-page URL anchor (i.e. <code>&lt;a href="#anchor"&gt;</code>, add a <code>#</code>:<br>
404     E.g.: <code>addProtocols("a", "href", "#")</code>
405     </p>
406
407     @param tag       Tag the URL protocol is for
408     @param attribute       Attribute name
409     @param protocols List of valid protocols
410     @return this, for chaining
411     */
412    public Whitelist addProtocols(String tag, String attribute, String... protocols) {
413        Validate.notEmpty(tag);
414        Validate.notEmpty(attribute);
415        Validate.notNull(protocols);
416
417        TagName tagName = TagName.valueOf(tag);
418        AttributeKey attrKey = AttributeKey.valueOf(attribute);
419        Map<AttributeKey, Set<Protocol>> attrMap;
420        Set<Protocol> protSet;
421
422        if (this.protocols.containsKey(tagName)) {
423            attrMap = this.protocols.get(tagName);
424        } else {
425            attrMap = new HashMap<>();
426            this.protocols.put(tagName, attrMap);
427        }
428        if (attrMap.containsKey(attrKey)) {
429            protSet = attrMap.get(attrKey);
430        } else {
431            protSet = new HashSet<>();
432            attrMap.put(attrKey, protSet);
433        }
434        for (String protocol : protocols) {
435            Validate.notEmpty(protocol);
436            Protocol prot = Protocol.valueOf(protocol);
437            protSet.add(prot);
438        }
439        return this;
440    }
441
442    /**
443     Remove allowed URL protocols for an element's URL attribute. If you remove all protocols for an attribute, that
444     attribute will allow any protocol.
445     <p>
446     E.g.: <code>removeProtocols("a", "href", "ftp")</code>
447     </p>
448
449     @param tag Tag the URL protocol is for
450     @param attribute Attribute name
451     @param removeProtocols List of invalid protocols
452     @return this, for chaining
453     */
454    public Whitelist removeProtocols(String tag, String attribute, String... removeProtocols) {
455        Validate.notEmpty(tag);
456        Validate.notEmpty(attribute);
457        Validate.notNull(removeProtocols);
458
459        TagName tagName = TagName.valueOf(tag);
460        AttributeKey attr = AttributeKey.valueOf(attribute);
461
462        // make sure that what we're removing actually exists; otherwise can open the tag to any data and that can
463        // be surprising
464        Validate.isTrue(protocols.containsKey(tagName), "Cannot remove a protocol that is not set.");
465        Map<AttributeKey, Set<Protocol>> tagProtocols = protocols.get(tagName);
466        Validate.isTrue(tagProtocols.containsKey(attr), "Cannot remove a protocol that is not set.");
467
468        Set<Protocol> attrProtocols = tagProtocols.get(attr);
469        for (String protocol : removeProtocols) {
470            Validate.notEmpty(protocol);
471            attrProtocols.remove(Protocol.valueOf(protocol));
472        }
473
474        if (attrProtocols.isEmpty()) { // Remove protocol set if empty
475            tagProtocols.remove(attr);
476            if (tagProtocols.isEmpty()) // Remove entry for tag if empty
477                protocols.remove(tagName);
478        }
479        return this;
480    }
481
482    /**
483     * Test if the supplied tag is allowed by this whitelist
484     * @param tag test tag
485     * @return true if allowed
486     */
487    protected boolean isSafeTag(String tag) {
488        return tagNames.contains(TagName.valueOf(tag));
489    }
490
491    /**
492     * Test if the supplied attribute is allowed by this whitelist for this tag
493     * @param tagName tag to consider allowing the attribute in
494     * @param el element under test, to confirm protocol
495     * @param attr attribute under test
496     * @return true if allowed
497     */
498    protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
499        TagName tag = TagName.valueOf(tagName);
500        AttributeKey key = AttributeKey.valueOf(attr.getKey());
501
502        Set<AttributeKey> okSet = attributes.get(tag);
503        if (okSet != null && okSet.contains(key)) {
504            if (protocols.containsKey(tag)) {
505                Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag);
506                // ok if not defined protocol; otherwise test
507                return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key));
508            } else { // attribute found, no protocols defined, so OK
509                return true;
510            }
511        }
512        // might be an enforced attribute?
513        Map<AttributeKey, AttributeValue> enforcedSet = enforcedAttributes.get(tag);
514        if (enforcedSet != null) {
515            Attributes expect = getEnforcedAttributes(tagName);
516            String attrKey = attr.getKey();
517            if (expect.hasKeyIgnoreCase(attrKey)) {
518                return expect.getIgnoreCase(attrKey).equals(attr.getValue());
519            }
520        }
521        // no attributes defined for tag, try :all tag
522        return !tagName.equals(":all") && isSafeAttribute(":all", el, attr);
523    }
524
525    private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
526        // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
527        // rels without a baseuri get removed
528        String value = el.absUrl(attr.getKey());
529        if (value.length() == 0)
530            value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols
531        if (!preserveRelativeLinks)
532            attr.setValue(value);
533        
534        for (Protocol protocol : protocols) {
535            String prot = protocol.toString();
536
537            if (prot.equals("#")) { // allows anchor links
538                if (isValidAnchor(value)) {
539                    return true;
540                } else {
541                    continue;
542                }
543            }
544
545            prot += ":";
546
547            if (lowerCase(value).startsWith(prot)) {
548                return true;
549            }
550        }
551        return false;
552    }
553
554    private boolean isValidAnchor(String value) {
555        return value.startsWith("#") && !value.matches(".*\\s.*");
556    }
557
558    Attributes getEnforcedAttributes(String tagName) {
559        Attributes attrs = new Attributes();
560        TagName tag = TagName.valueOf(tagName);
561        if (enforcedAttributes.containsKey(tag)) {
562            Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes.get(tag);
563            for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals.entrySet()) {
564                attrs.put(entry.getKey().toString(), entry.getValue().toString());
565            }
566        }
567        return attrs;
568    }
569    
570    // named types for config. All just hold strings, but here for my sanity.
571
572    static class TagName extends TypedValue {
573        TagName(String value) {
574            super(value);
575        }
576
577        static TagName valueOf(String value) {
578            return new TagName(value);
579        }
580    }
581
582    static class AttributeKey extends TypedValue {
583        AttributeKey(String value) {
584            super(value);
585        }
586
587        static AttributeKey valueOf(String value) {
588            return new AttributeKey(value);
589        }
590    }
591
592    static class AttributeValue extends TypedValue {
593        AttributeValue(String value) {
594            super(value);
595        }
596
597        static AttributeValue valueOf(String value) {
598            return new AttributeValue(value);
599        }
600    }
601
602    static class Protocol extends TypedValue {
603        Protocol(String value) {
604            super(value);
605        }
606
607        static Protocol valueOf(String value) {
608            return new Protocol(value);
609        }
610    }
611
612    abstract static class TypedValue {
613        private String value;
614
615        TypedValue(String value) {
616            Validate.notNull(value);
617            this.value = value;
618        }
619
620        @Override
621        public int hashCode() {
622            final int prime = 31;
623            int result = 1;
624            result = prime * result + ((value == null) ? 0 : value.hashCode());
625            return result;
626        }
627
628        @Override
629        public boolean equals(Object obj) {
630            if (this == obj) return true;
631            if (obj == null) return false;
632            if (getClass() != obj.getClass()) return false;
633            TypedValue other = (TypedValue) obj;
634            if (value == null) {
635                if (other.value != null) return false;
636            } else if (!value.equals(other.value)) return false;
637            return true;
638        }
639
640        @Override
641        public String toString() {
642            return value;
643        }
644    }
645}
646