001package org.jsoup.nodes; 002 003import org.jsoup.SerializationException; 004import org.jsoup.helper.StringUtil; 005import org.jsoup.helper.Validate; 006import org.jsoup.parser.CharacterReader; 007import org.jsoup.parser.Parser; 008 009import java.io.IOException; 010import java.nio.charset.CharsetEncoder; 011import java.util.Arrays; 012import java.util.HashMap; 013 014import static org.jsoup.nodes.Entities.EscapeMode.base; 015import static org.jsoup.nodes.Entities.EscapeMode.extended; 016 017/** 018 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C 019 * HTML named character references</a>. 020 */ 021public class Entities { 022 private static final int empty = -1; 023 private static final String emptyName = ""; 024 static final int codepointRadix = 36; 025 private static final char[] codeDelims = {',', ';'}; 026 private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references 027 private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings(); 028 029 public enum EscapeMode { 030 /** 031 * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. 032 */ 033 xhtml(EntitiesData.xmlPoints, 4), 034 /** 035 * Default HTML output entities. 036 */ 037 base(EntitiesData.basePoints, 106), 038 /** 039 * Complete HTML entities. 040 */ 041 extended(EntitiesData.fullPoints, 2125); 042 043 // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. 044 private String[] nameKeys; 045 private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 046 047 // table of codepoints to named entities. 048 private int[] codeKeys; // we don' support multicodepoints to single named value currently 049 private String[] nameVals; 050 051 EscapeMode(String file, int size) { 052 load(this, file, size); 053 } 054 055 int codepointForName(final String name) { 056 int index = Arrays.binarySearch(nameKeys, name); 057 return index >= 0 ? codeVals[index] : empty; 058 } 059 060 String nameForCodepoint(final int codepoint) { 061 final int index = Arrays.binarySearch(codeKeys, codepoint); 062 if (index >= 0) { 063 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower 064 // (and binary search for same item with multi results is undefined 065 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? 066 nameVals[index + 1] : nameVals[index]; 067 } 068 return emptyName; 069 } 070 071 private int size() { 072 return nameKeys.length; 073 } 074 } 075 076 private Entities() { 077 } 078 079 /** 080 * Check if the input is a known named entity 081 * 082 * @param name the possible entity name (e.g. "lt" or "amp") 083 * @return true if a known named entity 084 */ 085 public static boolean isNamedEntity(final String name) { 086 return extended.codepointForName(name) != empty; 087 } 088 089 /** 090 * Check if the input is a known named entity in the base entity set. 091 * 092 * @param name the possible entity name (e.g. "lt" or "amp") 093 * @return true if a known named entity in the base set 094 * @see #isNamedEntity(String) 095 */ 096 public static boolean isBaseNamedEntity(final String name) { 097 return base.codepointForName(name) != empty; 098 } 099 100 /** 101 * Get the Character value of the named entity 102 * 103 * @param name named entity (e.g. "lt" or "amp") 104 * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}') 105 * @deprecated does not support characters outside the BMP or multiple character names 106 */ 107 public static Character getCharacterByName(String name) { 108 return (char) extended.codepointForName(name); 109 } 110 111 /** 112 * Get the character(s) represented by the named entity 113 * 114 * @param name entity (e.g. "lt" or "amp") 115 * @return the string value of the character(s) represented by this entity, or "" if not defined 116 */ 117 public static String getByName(String name) { 118 String val = multipoints.get(name); 119 if (val != null) 120 return val; 121 int codepoint = extended.codepointForName(name); 122 if (codepoint != empty) 123 return new String(new int[]{codepoint}, 0, 1); 124 return emptyName; 125 } 126 127 public static int codepointsForName(final String name, final int[] codepoints) { 128 String val = multipoints.get(name); 129 if (val != null) { 130 codepoints[0] = val.codePointAt(0); 131 codepoints[1] = val.codePointAt(1); 132 return 2; 133 } 134 int codepoint = extended.codepointForName(name); 135 if (codepoint != empty) { 136 codepoints[0] = codepoint; 137 return 1; 138 } 139 return 0; 140 } 141 142 /** 143 * HTML escape an input string. That is, {@code <} is returned as {@code <} 144 * 145 * @param string the un-escaped string to escape 146 * @param out the output settings to use 147 * @return the escaped string 148 */ 149 public static String escape(String string, Document.OutputSettings out) { 150 if (string == null) 151 return ""; 152 StringBuilder accum = new StringBuilder(string.length() * 2); 153 try { 154 escape(accum, string, out, false, false, false); 155 } catch (IOException e) { 156 throw new SerializationException(e); // doesn't happen 157 } 158 return accum.toString(); 159 } 160 161 /** 162 * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as 163 * {@code <} 164 * 165 * @param string the un-escaped string to escape 166 * @return the escaped string 167 */ 168 public static String escape(String string) { 169 return escape(string, DefaultOutput); 170 } 171 172 // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations 173 static void escape(Appendable accum, String string, Document.OutputSettings out, 174 boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException { 175 176 boolean lastWasWhite = false; 177 boolean reachedNonWhite = false; 178 final EscapeMode escapeMode = out.escapeMode(); 179 final CharsetEncoder encoder = out.encoder != null ? out.encoder : out.prepareEncoder(); 180 final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() 181 final int length = string.length(); 182 183 int codePoint; 184 for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { 185 codePoint = string.codePointAt(offset); 186 187 if (normaliseWhite) { 188 if (StringUtil.isWhitespace(codePoint)) { 189 if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite) 190 continue; 191 accum.append(' '); 192 lastWasWhite = true; 193 continue; 194 } else { 195 lastWasWhite = false; 196 reachedNonWhite = true; 197 } 198 } 199 // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): 200 if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 201 final char c = (char) codePoint; 202 // html specific and required escapes: 203 switch (c) { 204 case '&': 205 accum.append("&"); 206 break; 207 case 0xA0: 208 if (escapeMode != EscapeMode.xhtml) 209 accum.append(" "); 210 else 211 accum.append(" "); 212 break; 213 case '<': 214 // escape when in character data or when in a xml attribue val; not needed in html attr val 215 if (!inAttribute || escapeMode == EscapeMode.xhtml) 216 accum.append("<"); 217 else 218 accum.append(c); 219 break; 220 case '>': 221 if (!inAttribute) 222 accum.append(">"); 223 else 224 accum.append(c); 225 break; 226 case '"': 227 if (inAttribute) 228 accum.append("""); 229 else 230 accum.append(c); 231 break; 232 default: 233 if (canEncode(coreCharset, c, encoder)) 234 accum.append(c); 235 else 236 appendEncoded(accum, escapeMode, codePoint); 237 } 238 } else { 239 final String c = new String(Character.toChars(codePoint)); 240 if (encoder.canEncode(c)) // uses fallback encoder for simplicity 241 accum.append(c); 242 else 243 appendEncoded(accum, escapeMode, codePoint); 244 } 245 } 246 } 247 248 private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { 249 final String name = escapeMode.nameForCodepoint(codePoint); 250 if (name != emptyName) // ok for identity check 251 accum.append('&').append(name).append(';'); 252 else 253 accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); 254 } 255 256 /** 257 * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. 258 * 259 * @param string the HTML string to un-escape 260 * @return the unescaped string 261 */ 262 public static String unescape(String string) { 263 return unescape(string, false); 264 } 265 266 /** 267 * Unescape the input string. 268 * 269 * @param string to un-HTML-escape 270 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) 271 * @return unescaped string 272 */ 273 static String unescape(String string, boolean strict) { 274 return Parser.unescapeEntities(string, strict); 275 } 276 277 /* 278 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. 279 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, 280 * performance may be bad. We can add more encoders for common character sets that are impacted by performance 281 * issues on Android if required. 282 * 283 * Benchmarks: * 284 * OLD toHtml() impl v New (fastpath) in millis 285 * Wiki: 1895, 16 286 * CNN: 6378, 55 287 * Alterslash: 3013, 28 288 * Jsoup: 167, 2 289 */ 290 private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { 291 // todo add more charset tests if impacted by Android's bad perf in canEncode 292 switch (charset) { 293 case ascii: 294 return c < 0x80; 295 case utf: 296 return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above 297 default: 298 return fallback.canEncode(c); 299 } 300 } 301 302 enum CoreCharset { 303 ascii, utf, fallback; 304 305 static CoreCharset byName(final String name) { 306 if (name.equals("US-ASCII")) 307 return ascii; 308 if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al 309 return utf; 310 return fallback; 311 } 312 } 313 314 private static void load(EscapeMode e, String pointsData, int size) { 315 e.nameKeys = new String[size]; 316 e.codeVals = new int[size]; 317 e.codeKeys = new int[size]; 318 e.nameVals = new String[size]; 319 320 int i = 0; 321 CharacterReader reader = new CharacterReader(pointsData); 322 323 while (!reader.isEmpty()) { 324 // NotNestedLessLess=10913,824;1887& 325 326 final String name = reader.consumeTo('='); 327 reader.advance(); 328 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); 329 final char codeDelim = reader.current(); 330 reader.advance(); 331 final int cp2; 332 if (codeDelim == ',') { 333 cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); 334 reader.advance(); 335 } else { 336 cp2 = empty; 337 } 338 final String indexS = reader.consumeTo('&'); 339 final int index = Integer.parseInt(indexS, codepointRadix); 340 reader.advance(); 341 342 e.nameKeys[i] = name; 343 e.codeVals[i] = cp1; 344 e.codeKeys[index] = cp1; 345 e.nameVals[index] = name; 346 347 if (cp2 != empty) { 348 multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); 349 } 350 i++; 351 } 352 353 Validate.isTrue(i == size, "Unexpected count of entities loaded"); 354 } 355}