001package org.jsoup.parser; 002 003import org.jsoup.UncheckedIOException; 004import org.jsoup.helper.Validate; 005 006import java.io.IOException; 007import java.io.Reader; 008import java.io.StringReader; 009import java.util.Arrays; 010import java.util.Locale; 011 012/** 013 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. 014 */ 015public final class CharacterReader { 016 static final char EOF = (char) -1; 017 private static final int maxStringCacheLen = 12; 018 private static final int maxBufferLen = 1024 * 32; 019 private static final int readAheadLimit = (int) (maxBufferLen * 0.75); 020 021 private final char[] charBuf; 022 private final Reader reader; 023 private int bufLength; 024 private int bufSplitPoint; 025 private int bufPos; 026 private int readerPos; 027 private int bufMark; 028 private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage 029 030 public CharacterReader(Reader input, int sz) { 031 Validate.notNull(input); 032 Validate.isTrue(input.markSupported()); 033 reader = input; 034 charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz]; 035 bufferUp(); 036 } 037 038 public CharacterReader(Reader input) { 039 this(input, maxBufferLen); 040 } 041 042 public CharacterReader(String input) { 043 this(new StringReader(input), input.length()); 044 } 045 046 private void bufferUp() { 047 if (bufPos < bufSplitPoint) 048 return; 049 050 try { 051 readerPos += bufPos; 052 reader.skip(bufPos); 053 reader.mark(maxBufferLen); 054 bufLength = reader.read(charBuf); 055 reader.reset(); 056 bufPos = 0; 057 bufMark = 0; 058 bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength; 059 } catch (IOException e) { 060 throw new UncheckedIOException(e); 061 } 062 } 063 064 /** 065 * Gets the current cursor position in the content. 066 * @return current position 067 */ 068 public int pos() { 069 return readerPos + bufPos; 070 } 071 072 /** 073 * Tests if all the content has been read. 074 * @return true if nothing left to read. 075 */ 076 public boolean isEmpty() { 077 return bufPos >= bufLength; 078 } 079 080 /** 081 * Get the char at the current position. 082 * @return char 083 */ 084 public char current() { 085 bufferUp(); 086 return isEmpty() ? EOF : charBuf[bufPos]; 087 } 088 089 char consume() { 090 bufferUp(); 091 char val = isEmpty() ? EOF : charBuf[bufPos]; 092 bufPos++; 093 return val; 094 } 095 096 void unconsume() { 097 bufPos--; 098 } 099 100 /** 101 * Moves the current position by one. 102 */ 103 public void advance() { 104 bufPos++; 105 } 106 107 void mark() { 108 bufMark = bufPos; 109 } 110 111 void rewindToMark() { 112 bufPos = bufMark; 113 } 114 115 /** 116 * Returns the number of characters between the current position and the next instance of the input char 117 * @param c scan target 118 * @return offset between current position and next instance of target. -1 if not found. 119 */ 120 int nextIndexOf(char c) { 121 // doesn't handle scanning for surrogates 122 bufferUp(); 123 for (int i = bufPos; i < bufLength; i++) { 124 if (c == charBuf[i]) 125 return i - bufPos; 126 } 127 return -1; 128 } 129 130 /** 131 * Returns the number of characters between the current position and the next instance of the input sequence 132 * 133 * @param seq scan target 134 * @return offset between current position and next instance of target. -1 if not found. 135 */ 136 int nextIndexOf(CharSequence seq) { 137 bufferUp(); 138 // doesn't handle scanning for surrogates 139 char startChar = seq.charAt(0); 140 for (int offset = bufPos; offset < bufLength; offset++) { 141 // scan to first instance of startchar: 142 if (startChar != charBuf[offset]) 143 while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } 144 int i = offset + 1; 145 int last = i + seq.length()-1; 146 if (offset < bufLength && last <= bufLength) { 147 for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } 148 if (i == last) // found full sequence 149 return offset - bufPos; 150 } 151 } 152 return -1; 153 } 154 155 /** 156 * Reads characters up to the specific char. 157 * @param c the delimiter 158 * @return the chars read 159 */ 160 public String consumeTo(char c) { 161 int offset = nextIndexOf(c); 162 if (offset != -1) { 163 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 164 bufPos += offset; 165 return consumed; 166 } else { 167 return consumeToEnd(); 168 } 169 } 170 171 String consumeTo(String seq) { 172 int offset = nextIndexOf(seq); 173 if (offset != -1) { 174 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 175 bufPos += offset; 176 return consumed; 177 } else { 178 return consumeToEnd(); 179 } 180 } 181 182 /** 183 * Read characters until the first of any delimiters is found. 184 * @param chars delimiters to scan for 185 * @return characters read up to the matched delimiter. 186 */ 187 public String consumeToAny(final char... chars) { 188 bufferUp(); 189 final int start = bufPos; 190 final int remaining = bufLength; 191 final char[] val = charBuf; 192 193 OUTER: while (bufPos < remaining) { 194 for (char c : chars) { 195 if (val[bufPos] == c) 196 break OUTER; 197 } 198 bufPos++; 199 } 200 201 return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; 202 } 203 204 String consumeToAnySorted(final char... chars) { 205 bufferUp(); 206 final int start = bufPos; 207 final int remaining = bufLength; 208 final char[] val = charBuf; 209 210 while (bufPos < remaining) { 211 if (Arrays.binarySearch(chars, val[bufPos]) >= 0) 212 break; 213 bufPos++; 214 } 215 216 return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; 217 } 218 219 String consumeData() { 220 // &, <, null 221 bufferUp(); 222 final int start = bufPos; 223 final int remaining = bufLength; 224 final char[] val = charBuf; 225 226 while (bufPos < remaining) { 227 final char c = val[bufPos]; 228 if (c == '&'|| c == '<' || c == TokeniserState.nullChar) 229 break; 230 bufPos++; 231 } 232 233 return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; 234 } 235 236 String consumeTagName() { 237 // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar 238 bufferUp(); 239 final int start = bufPos; 240 final int remaining = bufLength; 241 final char[] val = charBuf; 242 243 while (bufPos < remaining) { 244 final char c = val[bufPos]; 245 if (c == '\t'|| c == '\n'|| c == '\r'|| c == '\f'|| c == ' '|| c == '/'|| c == '>'|| c == TokeniserState.nullChar) 246 break; 247 bufPos++; 248 } 249 250 return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; 251 } 252 253 String consumeToEnd() { 254 bufferUp(); 255 String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos); 256 bufPos = bufLength; 257 return data; 258 } 259 260 String consumeLetterSequence() { 261 bufferUp(); 262 int start = bufPos; 263 while (bufPos < bufLength) { 264 char c = charBuf[bufPos]; 265 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) 266 bufPos++; 267 else 268 break; 269 } 270 271 return cacheString(charBuf, stringCache, start, bufPos - start); 272 } 273 274 String consumeLetterThenDigitSequence() { 275 bufferUp(); 276 int start = bufPos; 277 while (bufPos < bufLength) { 278 char c = charBuf[bufPos]; 279 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) 280 bufPos++; 281 else 282 break; 283 } 284 while (!isEmpty()) { 285 char c = charBuf[bufPos]; 286 if (c >= '0' && c <= '9') 287 bufPos++; 288 else 289 break; 290 } 291 292 return cacheString(charBuf, stringCache, start, bufPos - start); 293 } 294 295 String consumeHexSequence() { 296 bufferUp(); 297 int start = bufPos; 298 while (bufPos < bufLength) { 299 char c = charBuf[bufPos]; 300 if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) 301 bufPos++; 302 else 303 break; 304 } 305 return cacheString(charBuf, stringCache, start, bufPos - start); 306 } 307 308 String consumeDigitSequence() { 309 bufferUp(); 310 int start = bufPos; 311 while (bufPos < bufLength) { 312 char c = charBuf[bufPos]; 313 if (c >= '0' && c <= '9') 314 bufPos++; 315 else 316 break; 317 } 318 return cacheString(charBuf, stringCache, start, bufPos - start); 319 } 320 321 boolean matches(char c) { 322 return !isEmpty() && charBuf[bufPos] == c; 323 324 } 325 326 boolean matches(String seq) { 327 bufferUp(); 328 int scanLength = seq.length(); 329 if (scanLength > bufLength - bufPos) 330 return false; 331 332 for (int offset = 0; offset < scanLength; offset++) 333 if (seq.charAt(offset) != charBuf[bufPos +offset]) 334 return false; 335 return true; 336 } 337 338 boolean matchesIgnoreCase(String seq) { 339 bufferUp(); 340 int scanLength = seq.length(); 341 if (scanLength > bufLength - bufPos) 342 return false; 343 344 for (int offset = 0; offset < scanLength; offset++) { 345 char upScan = Character.toUpperCase(seq.charAt(offset)); 346 char upTarget = Character.toUpperCase(charBuf[bufPos + offset]); 347 if (upScan != upTarget) 348 return false; 349 } 350 return true; 351 } 352 353 boolean matchesAny(char... seq) { 354 if (isEmpty()) 355 return false; 356 357 bufferUp(); 358 char c = charBuf[bufPos]; 359 for (char seek : seq) { 360 if (seek == c) 361 return true; 362 } 363 return false; 364 } 365 366 boolean matchesAnySorted(char[] seq) { 367 bufferUp(); 368 return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; 369 } 370 371 boolean matchesLetter() { 372 if (isEmpty()) 373 return false; 374 char c = charBuf[bufPos]; 375 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c); 376 } 377 378 boolean matchesDigit() { 379 if (isEmpty()) 380 return false; 381 char c = charBuf[bufPos]; 382 return (c >= '0' && c <= '9'); 383 } 384 385 boolean matchConsume(String seq) { 386 bufferUp(); 387 if (matches(seq)) { 388 bufPos += seq.length(); 389 return true; 390 } else { 391 return false; 392 } 393 } 394 395 boolean matchConsumeIgnoreCase(String seq) { 396 if (matchesIgnoreCase(seq)) { 397 bufPos += seq.length(); 398 return true; 399 } else { 400 return false; 401 } 402 } 403 404 boolean containsIgnoreCase(String seq) { 405 // used to check presence of </title>, </style>. only finds consistent case. 406 String loScan = seq.toLowerCase(Locale.ENGLISH); 407 String hiScan = seq.toUpperCase(Locale.ENGLISH); 408 return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1); 409 } 410 411 @Override 412 public String toString() { 413 return new String(charBuf, bufPos, bufLength - bufPos); 414 } 415 416 /** 417 * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks. 418 * <p /> 419 * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. 420 * That saves both having to create objects as hash keys, and running through the entry list, at the expense of 421 * some more duplicates. 422 */ 423 private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { 424 // limit (no cache): 425 if (count > maxStringCacheLen) 426 return new String(charBuf, start, count); 427 428 // calculate hash: 429 int hash = 0; 430 int offset = start; 431 for (int i = 0; i < count; i++) { 432 hash = 31 * hash + charBuf[offset++]; 433 } 434 435 // get from cache 436 final int index = hash & stringCache.length - 1; 437 String cached = stringCache[index]; 438 439 if (cached == null) { // miss, add 440 cached = new String(charBuf, start, count); 441 stringCache[index] = cached; 442 } else { // hashcode hit, check equality 443 if (rangeEquals(charBuf, start, count, cached)) { // hit 444 return cached; 445 } else { // hashcode conflict 446 cached = new String(charBuf, start, count); 447 stringCache[index] = cached; // update the cache, as recently used strings are more likely to show up again 448 } 449 } 450 return cached; 451 } 452 453 /** 454 * Check if the value of the provided range equals the string. 455 */ 456 static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) { 457 if (count == cached.length()) { 458 int i = start; 459 int j = 0; 460 while (count-- != 0) { 461 if (charBuf[i++] != cached.charAt(j++)) 462 return false; 463 } 464 return true; 465 } 466 return false; 467 } 468 469 // just used for testing 470 boolean rangeEquals(final int start, final int count, final String cached) { 471 return rangeEquals(charBuf, start, count, cached); 472 } 473}