001package org.jsoup.parser;
002
003import org.jsoup.UncheckedIOException;
004import org.jsoup.helper.Validate;
005
006import java.io.IOException;
007import java.io.Reader;
008import java.io.StringReader;
009import java.util.Arrays;
010import java.util.Locale;
011
012/**
013 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
014 */
015public final class CharacterReader {
016    static final char EOF = (char) -1;
017    private static final int maxStringCacheLen = 12;
018    private static final int maxBufferLen = 1024 * 32;
019    private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
020
021    private final char[] charBuf;
022    private final Reader reader;
023    private int bufLength;
024    private int bufSplitPoint;
025    private int bufPos;
026    private int readerPos;
027    private int bufMark;
028    private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
029
030    public CharacterReader(Reader input, int sz) {
031        Validate.notNull(input);
032        Validate.isTrue(input.markSupported());
033        reader = input;
034        charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
035        bufferUp();
036    }
037
038    public CharacterReader(Reader input) {
039        this(input, maxBufferLen);
040    }
041
042    public CharacterReader(String input) {
043        this(new StringReader(input), input.length());
044    }
045
046    private void bufferUp() {
047        if (bufPos < bufSplitPoint)
048            return;
049
050        try {
051            readerPos += bufPos;
052            reader.skip(bufPos);
053            reader.mark(maxBufferLen);
054            bufLength = reader.read(charBuf);
055            reader.reset();
056            bufPos = 0;
057            bufMark = 0;
058            bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength;
059        } catch (IOException e) {
060            throw new UncheckedIOException(e);
061        }
062    }
063
064    /**
065     * Gets the current cursor position in the content.
066     * @return current position
067     */
068    public int pos() {
069        return readerPos + bufPos;
070    }
071
072    /**
073     * Tests if all the content has been read.
074     * @return true if nothing left to read.
075     */
076    public boolean isEmpty() {
077        return bufPos >= bufLength;
078    }
079
080    /**
081     * Get the char at the current position.
082     * @return char
083     */
084    public char current() {
085        bufferUp();
086        return isEmpty() ? EOF : charBuf[bufPos];
087    }
088
089    char consume() {
090        bufferUp();
091        char val = isEmpty() ? EOF : charBuf[bufPos];
092        bufPos++;
093        return val;
094    }
095
096    void unconsume() {
097        bufPos--;
098    }
099
100    /**
101     * Moves the current position by one.
102     */
103    public void advance() {
104        bufPos++;
105    }
106
107    void mark() {
108        bufMark = bufPos;
109    }
110
111    void rewindToMark() {
112        bufPos = bufMark;
113    }
114
115    /**
116     * Returns the number of characters between the current position and the next instance of the input char
117     * @param c scan target
118     * @return offset between current position and next instance of target. -1 if not found.
119     */
120    int nextIndexOf(char c) {
121        // doesn't handle scanning for surrogates
122        bufferUp();
123        for (int i = bufPos; i < bufLength; i++) {
124            if (c == charBuf[i])
125                return i - bufPos;
126        }
127        return -1;
128    }
129
130    /**
131     * Returns the number of characters between the current position and the next instance of the input sequence
132     *
133     * @param seq scan target
134     * @return offset between current position and next instance of target. -1 if not found.
135     */
136    int nextIndexOf(CharSequence seq) {
137        bufferUp();
138        // doesn't handle scanning for surrogates
139        char startChar = seq.charAt(0);
140        for (int offset = bufPos; offset < bufLength; offset++) {
141            // scan to first instance of startchar:
142            if (startChar != charBuf[offset])
143                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
144            int i = offset + 1;
145            int last = i + seq.length()-1;
146            if (offset < bufLength && last <= bufLength) {
147                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
148                if (i == last) // found full sequence
149                    return offset - bufPos;
150            }
151        }
152        return -1;
153    }
154
155    /**
156     * Reads characters up to the specific char.
157     * @param c the delimiter
158     * @return the chars read
159     */
160    public String consumeTo(char c) {
161        int offset = nextIndexOf(c);
162        if (offset != -1) {
163            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
164            bufPos += offset;
165            return consumed;
166        } else {
167            return consumeToEnd();
168        }
169    }
170
171    String consumeTo(String seq) {
172        int offset = nextIndexOf(seq);
173        if (offset != -1) {
174            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
175            bufPos += offset;
176            return consumed;
177        } else {
178            return consumeToEnd();
179        }
180    }
181
182    /**
183     * Read characters until the first of any delimiters is found.
184     * @param chars delimiters to scan for
185     * @return characters read up to the matched delimiter.
186     */
187    public String consumeToAny(final char... chars) {
188        bufferUp();
189        final int start = bufPos;
190        final int remaining = bufLength;
191        final char[] val = charBuf;
192
193        OUTER: while (bufPos < remaining) {
194            for (char c : chars) {
195                if (val[bufPos] == c)
196                    break OUTER;
197            }
198            bufPos++;
199        }
200
201        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
202    }
203
204    String consumeToAnySorted(final char... chars) {
205        bufferUp();
206        final int start = bufPos;
207        final int remaining = bufLength;
208        final char[] val = charBuf;
209
210        while (bufPos < remaining) {
211            if (Arrays.binarySearch(chars, val[bufPos]) >= 0)
212                break;
213            bufPos++;
214        }
215
216        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
217    }
218
219    String consumeData() {
220        // &, <, null
221        bufferUp();
222        final int start = bufPos;
223        final int remaining = bufLength;
224        final char[] val = charBuf;
225
226        while (bufPos < remaining) {
227            final char c = val[bufPos];
228            if (c == '&'|| c ==  '<' || c ==  TokeniserState.nullChar)
229                break;
230            bufPos++;
231        }
232
233        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
234    }
235
236    String consumeTagName() {
237        // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
238        bufferUp();
239        final int start = bufPos;
240        final int remaining = bufLength;
241        final char[] val = charBuf;
242
243        while (bufPos < remaining) {
244            final char c = val[bufPos];
245            if (c == '\t'|| c ==  '\n'|| c ==  '\r'|| c ==  '\f'|| c ==  ' '|| c ==  '/'|| c ==  '>'|| c ==  TokeniserState.nullChar)
246                break;
247            bufPos++;
248        }
249
250        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
251    }
252
253    String consumeToEnd() {
254        bufferUp();
255        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
256        bufPos = bufLength;
257        return data;
258    }
259
260    String consumeLetterSequence() {
261        bufferUp();
262        int start = bufPos;
263        while (bufPos < bufLength) {
264            char c = charBuf[bufPos];
265            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
266                bufPos++;
267            else
268                break;
269        }
270
271        return cacheString(charBuf, stringCache, start, bufPos - start);
272    }
273
274    String consumeLetterThenDigitSequence() {
275        bufferUp();
276        int start = bufPos;
277        while (bufPos < bufLength) {
278            char c = charBuf[bufPos];
279            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
280                bufPos++;
281            else
282                break;
283        }
284        while (!isEmpty()) {
285            char c = charBuf[bufPos];
286            if (c >= '0' && c <= '9')
287                bufPos++;
288            else
289                break;
290        }
291
292        return cacheString(charBuf, stringCache, start, bufPos - start);
293    }
294
295    String consumeHexSequence() {
296        bufferUp();
297        int start = bufPos;
298        while (bufPos < bufLength) {
299            char c = charBuf[bufPos];
300            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
301                bufPos++;
302            else
303                break;
304        }
305        return cacheString(charBuf, stringCache, start, bufPos - start);
306    }
307
308    String consumeDigitSequence() {
309        bufferUp();
310        int start = bufPos;
311        while (bufPos < bufLength) {
312            char c = charBuf[bufPos];
313            if (c >= '0' && c <= '9')
314                bufPos++;
315            else
316                break;
317        }
318        return cacheString(charBuf, stringCache, start, bufPos - start);
319    }
320
321    boolean matches(char c) {
322        return !isEmpty() && charBuf[bufPos] == c;
323
324    }
325
326    boolean matches(String seq) {
327        bufferUp();
328        int scanLength = seq.length();
329        if (scanLength > bufLength - bufPos)
330            return false;
331
332        for (int offset = 0; offset < scanLength; offset++)
333            if (seq.charAt(offset) != charBuf[bufPos +offset])
334                return false;
335        return true;
336    }
337
338    boolean matchesIgnoreCase(String seq) {
339        bufferUp();
340        int scanLength = seq.length();
341        if (scanLength > bufLength - bufPos)
342            return false;
343
344        for (int offset = 0; offset < scanLength; offset++) {
345            char upScan = Character.toUpperCase(seq.charAt(offset));
346            char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
347            if (upScan != upTarget)
348                return false;
349        }
350        return true;
351    }
352
353    boolean matchesAny(char... seq) {
354        if (isEmpty())
355            return false;
356
357        bufferUp();
358        char c = charBuf[bufPos];
359        for (char seek : seq) {
360            if (seek == c)
361                return true;
362        }
363        return false;
364    }
365
366    boolean matchesAnySorted(char[] seq) {
367        bufferUp();
368        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
369    }
370
371    boolean matchesLetter() {
372        if (isEmpty())
373            return false;
374        char c = charBuf[bufPos];
375        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
376    }
377
378    boolean matchesDigit() {
379        if (isEmpty())
380            return false;
381        char c = charBuf[bufPos];
382        return (c >= '0' && c <= '9');
383    }
384
385    boolean matchConsume(String seq) {
386        bufferUp();
387        if (matches(seq)) {
388            bufPos += seq.length();
389            return true;
390        } else {
391            return false;
392        }
393    }
394
395    boolean matchConsumeIgnoreCase(String seq) {
396        if (matchesIgnoreCase(seq)) {
397            bufPos += seq.length();
398            return true;
399        } else {
400            return false;
401        }
402    }
403
404    boolean containsIgnoreCase(String seq) {
405        // used to check presence of </title>, </style>. only finds consistent case.
406        String loScan = seq.toLowerCase(Locale.ENGLISH);
407        String hiScan = seq.toUpperCase(Locale.ENGLISH);
408        return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
409    }
410
411    @Override
412    public String toString() {
413        return new String(charBuf, bufPos, bufLength - bufPos);
414    }
415
416    /**
417     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
418     * <p />
419     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
420     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
421     * some more duplicates.
422     */
423    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
424        // limit (no cache):
425        if (count > maxStringCacheLen)
426            return new String(charBuf, start, count);
427
428        // calculate hash:
429        int hash = 0;
430        int offset = start;
431        for (int i = 0; i < count; i++) {
432            hash = 31 * hash + charBuf[offset++];
433        }
434
435        // get from cache
436        final int index = hash & stringCache.length - 1;
437        String cached = stringCache[index];
438
439        if (cached == null) { // miss, add
440            cached = new String(charBuf, start, count);
441            stringCache[index] = cached;
442        } else { // hashcode hit, check equality
443            if (rangeEquals(charBuf, start, count, cached)) { // hit
444                return cached;
445            } else { // hashcode conflict
446                cached = new String(charBuf, start, count);
447                stringCache[index] = cached; // update the cache, as recently used strings are more likely to show up again
448            }
449        }
450        return cached;
451    }
452
453    /**
454     * Check if the value of the provided range equals the string.
455     */
456    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
457        if (count == cached.length()) {
458            int i = start;
459            int j = 0;
460            while (count-- != 0) {
461                if (charBuf[i++] != cached.charAt(j++))
462                    return false;
463            }
464            return true;
465        }
466        return false;
467    }
468
469    // just used for testing
470    boolean rangeEquals(final int start, final int count, final String cached) {
471        return rangeEquals(charBuf, start, count, cached);
472    }
473}