001package org.jsoup.nodes;
002
003import org.jsoup.SerializationException;
004import org.jsoup.helper.StringUtil;
005import org.jsoup.helper.Validate;
006import org.jsoup.parser.CharacterReader;
007import org.jsoup.parser.Parser;
008
009import java.io.IOException;
010import java.nio.charset.CharsetEncoder;
011import java.util.Arrays;
012import java.util.HashMap;
013
014import static org.jsoup.nodes.Entities.EscapeMode.base;
015import static org.jsoup.nodes.Entities.EscapeMode.extended;
016
017/**
018 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
019 * HTML named character references</a>.
020 */
021public class Entities {
    // Sentinel codepoint returned by EscapeMode.codepointForName when a name has no mapping.
    private static final int empty = -1;
    // Sentinel name returned by EscapeMode.nameForCodepoint when a codepoint has no named entity.
    // appendEncoded compares against this exact instance (identity, not equals).
    private static final String emptyName = "";
    // Radix of the numbers (codepoints and table indices) parsed out of the packed entity data in load().
    static final int codepointRadix = 36;
    // Characters that may terminate the first codepoint of a packed entity record (see load()).
    private static final char[] codeDelims = {',', ';'};
    // NOTE(review): load() writes into this map while the EscapeMode constants are being constructed; presumably
    // creating DefaultOutput below can trigger that, so this declaration should stay above it -- confirm.
    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
    // Settings used by escape(String) when the caller supplies none.
    private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
028
029    public enum EscapeMode {
030        /**
031         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
032         */
033        xhtml(EntitiesData.xmlPoints, 4),
034        /**
035         * Default HTML output entities.
036         */
037        base(EntitiesData.basePoints, 106),
038        /**
039         * Complete HTML entities.
040         */
041        extended(EntitiesData.fullPoints, 2125);
042
043        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
044        private String[] nameKeys;
045        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
046
047        // table of codepoints to named entities.
048        private int[] codeKeys; // we don' support multicodepoints to single named value currently
049        private String[] nameVals;
050
051        EscapeMode(String file, int size) {
052            load(this, file, size);
053        }
054
055        int codepointForName(final String name) {
056            int index = Arrays.binarySearch(nameKeys, name);
057            return index >= 0 ? codeVals[index] : empty;
058        }
059
060        String nameForCodepoint(final int codepoint) {
061            final int index = Arrays.binarySearch(codeKeys, codepoint);
062            if (index >= 0) {
063                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
064                // (and binary search for same item with multi results is undefined
065                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
066                    nameVals[index + 1] : nameVals[index];
067            }
068            return emptyName;
069        }
070
071        private int size() {
072            return nameKeys.length;
073        }
074    }
075
076    private Entities() {
077    }
078
079    /**
080     * Check if the input is a known named entity
081     *
082     * @param name the possible entity name (e.g. "lt" or "amp")
083     * @return true if a known named entity
084     */
085    public static boolean isNamedEntity(final String name) {
086        return extended.codepointForName(name) != empty;
087    }
088
089    /**
090     * Check if the input is a known named entity in the base entity set.
091     *
092     * @param name the possible entity name (e.g. "lt" or "amp")
093     * @return true if a known named entity in the base set
094     * @see #isNamedEntity(String)
095     */
096    public static boolean isBaseNamedEntity(final String name) {
097        return base.codepointForName(name) != empty;
098    }
099
100    /**
101     * Get the Character value of the named entity
102     *
103     * @param name named entity (e.g. "lt" or "amp")
104     * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
105     * @deprecated does not support characters outside the BMP or multiple character names
106     */
107    public static Character getCharacterByName(String name) {
108        return (char) extended.codepointForName(name);
109    }
110
111    /**
112     * Get the character(s) represented by the named entity
113     *
114     * @param name entity (e.g. "lt" or "amp")
115     * @return the string value of the character(s) represented by this entity, or "" if not defined
116     */
117    public static String getByName(String name) {
118        String val = multipoints.get(name);
119        if (val != null)
120            return val;
121        int codepoint = extended.codepointForName(name);
122        if (codepoint != empty)
123            return new String(new int[]{codepoint}, 0, 1);
124        return emptyName;
125    }
126
127    public static int codepointsForName(final String name, final int[] codepoints) {
128        String val = multipoints.get(name);
129        if (val != null) {
130            codepoints[0] = val.codePointAt(0);
131            codepoints[1] = val.codePointAt(1);
132            return 2;
133        }
134        int codepoint = extended.codepointForName(name);
135        if (codepoint != empty) {
136            codepoints[0] = codepoint;
137            return 1;
138        }
139        return 0;
140    }
141
142    /**
143     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
144     *
145     * @param string the un-escaped string to escape
146     * @param out the output settings to use
147     * @return the escaped string
148     */
149    public static String escape(String string, Document.OutputSettings out) {
150        if (string == null)
151            return "";
152        StringBuilder accum = new StringBuilder(string.length() * 2);
153        try {
154            escape(accum, string, out, false, false, false);
155        } catch (IOException e) {
156            throw new SerializationException(e); // doesn't happen
157        }
158        return accum.toString();
159    }
160
161    /**
162     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
163     * {@code &lt;}
164     *
165     * @param string the un-escaped string to escape
166     * @return the escaped string
167     */
168    public static String escape(String string) {
169        return escape(string, DefaultOutput);
170    }
171
172    // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
173    static void escape(Appendable accum, String string, Document.OutputSettings out,
174                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
175
176        boolean lastWasWhite = false;
177        boolean reachedNonWhite = false;
178        final EscapeMode escapeMode = out.escapeMode();
179        final CharsetEncoder encoder = out.encoder != null ? out.encoder : out.prepareEncoder();
180        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
181        final int length = string.length();
182
183        int codePoint;
184        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
185            codePoint = string.codePointAt(offset);
186
187            if (normaliseWhite) {
188                if (StringUtil.isWhitespace(codePoint)) {
189                    if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
190                        continue;
191                    accum.append(' ');
192                    lastWasWhite = true;
193                    continue;
194                } else {
195                    lastWasWhite = false;
196                    reachedNonWhite = true;
197                }
198            }
199            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
200            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
201                final char c = (char) codePoint;
202                // html specific and required escapes:
203                switch (c) {
204                    case '&':
205                        accum.append("&amp;");
206                        break;
207                    case 0xA0:
208                        if (escapeMode != EscapeMode.xhtml)
209                            accum.append("&nbsp;");
210                        else
211                            accum.append("&#xa0;");
212                        break;
213                    case '<':
214                        // escape when in character data or when in a xml attribue val; not needed in html attr val
215                        if (!inAttribute || escapeMode == EscapeMode.xhtml)
216                            accum.append("&lt;");
217                        else
218                            accum.append(c);
219                        break;
220                    case '>':
221                        if (!inAttribute)
222                            accum.append("&gt;");
223                        else
224                            accum.append(c);
225                        break;
226                    case '"':
227                        if (inAttribute)
228                            accum.append("&quot;");
229                        else
230                            accum.append(c);
231                        break;
232                    default:
233                        if (canEncode(coreCharset, c, encoder))
234                            accum.append(c);
235                        else
236                            appendEncoded(accum, escapeMode, codePoint);
237                }
238            } else {
239                final String c = new String(Character.toChars(codePoint));
240                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
241                    accum.append(c);
242                else
243                    appendEncoded(accum, escapeMode, codePoint);
244            }
245        }
246    }
247
248    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
249        final String name = escapeMode.nameForCodepoint(codePoint);
250        if (name != emptyName) // ok for identity check
251            accum.append('&').append(name).append(';');
252        else
253            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
254    }
255
256    /**
257     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
258     *
259     * @param string the HTML string to un-escape
260     * @return the unescaped string
261     */
262    public static String unescape(String string) {
263        return unescape(string, false);
264    }
265
266    /**
267     * Unescape the input string.
268     *
269     * @param string to un-HTML-escape
270     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
271     * @return unescaped string
272     */
273    static String unescape(String string, boolean strict) {
274        return Parser.unescapeEntities(string, strict);
275    }
276
277    /*
278     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
279     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
280     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
281     * issues on Android if required.
282     *
283     * Benchmarks:     *
284     * OLD toHtml() impl v New (fastpath) in millis
285     * Wiki: 1895, 16
286     * CNN: 6378, 55
287     * Alterslash: 3013, 28
288     * Jsoup: 167, 2
289     */
290    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
291        // todo add more charset tests if impacted by Android's bad perf in canEncode
292        switch (charset) {
293            case ascii:
294                return c < 0x80;
295            case utf:
296                return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
297            default:
298                return fallback.canEncode(c);
299        }
300    }
301
302    enum CoreCharset {
303        ascii, utf, fallback;
304
305        static CoreCharset byName(final String name) {
306            if (name.equals("US-ASCII"))
307                return ascii;
308            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
309                return utf;
310            return fallback;
311        }
312    }
313
314    private static void load(EscapeMode e, String pointsData, int size) {
315        e.nameKeys = new String[size];
316        e.codeVals = new int[size];
317        e.codeKeys = new int[size];
318        e.nameVals = new String[size];
319
320        int i = 0;
321        CharacterReader reader = new CharacterReader(pointsData);
322
323        while (!reader.isEmpty()) {
324            // NotNestedLessLess=10913,824;1887&
325
326            final String name = reader.consumeTo('=');
327            reader.advance();
328            final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
329            final char codeDelim = reader.current();
330            reader.advance();
331            final int cp2;
332            if (codeDelim == ',') {
333                cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
334                reader.advance();
335            } else {
336                cp2 = empty;
337            }
338            final String indexS = reader.consumeTo('&');
339            final int index = Integer.parseInt(indexS, codepointRadix);
340            reader.advance();
341
342            e.nameKeys[i] = name;
343            e.codeVals[i] = cp1;
344            e.codeKeys[index] = cp1;
345            e.nameVals[index] = name;
346
347            if (cp2 != empty) {
348                multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
349            }
350            i++;
351        }
352
353        Validate.isTrue(i == size, "Unexpected count of entities loaded");
354    }
355}