001package org.jsoup.parser;
002
003import org.jsoup.helper.StringUtil;
004import org.jsoup.helper.Validate;
005
006/**
007 * A character queue with parsing helpers.
008 *
009 * @author Jonathan Hedley
010 */
011public class TokenQueue {
012    private String queue;
013    private int pos = 0;
014    
015    private static final char ESC = '\\'; // escape char for chomp balanced.
016
017    /**
018     Create a new TokenQueue.
019     @param data string of data to back queue.
020     */
021    public TokenQueue(String data) {
022        Validate.notNull(data);
023        queue = data;
024    }
025
026    /**
027     * Is the queue empty?
028     * @return true if no data left in queue.
029     */
030    public boolean isEmpty() {
031        return remainingLength() == 0;
032    }
033    
034    private int remainingLength() {
035        return queue.length() - pos;
036    }
037
038    /**
039     * Retrieves but does not remove the first character from the queue.
040     * @return First character, or 0 if empty.
041     */
042    public char peek() {
043        return isEmpty() ? 0 : queue.charAt(pos);
044    }
045
046    /**
047     Add a character to the start of the queue (will be the next character retrieved).
048     @param c character to add
049     */
050    public void addFirst(Character c) {
051        addFirst(c.toString());
052    }
053
054    /**
055     Add a string to the start of the queue.
056     @param seq string to add.
057     */
058    public void addFirst(String seq) {
059        // not very performant, but an edge case
060        queue = seq + queue.substring(pos);
061        pos = 0;
062    }
063
064    /**
065     * Tests if the next characters on the queue match the sequence. Case insensitive.
066     * @param seq String to check queue for.
067     * @return true if the next characters match.
068     */
069    public boolean matches(String seq) {
070        return queue.regionMatches(true, pos, seq, 0, seq.length());
071    }
072
073    /**
074     * Case sensitive match test.
075     * @param seq string to case sensitively check for
076     * @return true if matched, false if not
077     */
078    public boolean matchesCS(String seq) {
079        return queue.startsWith(seq, pos);
080    }
081    
082
083    /**
084     Tests if the next characters match any of the sequences. Case insensitive.
085     @param seq list of strings to case insensitively check for
086     @return true of any matched, false if none did
087     */
088    public boolean matchesAny(String... seq) {
089        for (String s : seq) {
090            if (matches(s))
091                return true;
092        }
093        return false;
094    }
095
096    public boolean matchesAny(char... seq) {
097        if (isEmpty())
098            return false;
099
100        for (char c: seq) {
101            if (queue.charAt(pos) == c)
102                return true;
103        }
104        return false;
105    }
106
107    public boolean matchesStartTag() {
108        // micro opt for matching "<x"
109        return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1)));
110    }
111
112    /**
113     * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the
114     * queue.
115     * @param seq String to search for, and if found, remove from queue.
116     * @return true if found and removed, false if not found.
117     */
118    public boolean matchChomp(String seq) {
119        if (matches(seq)) {
120            pos += seq.length();
121            return true;
122        } else {
123            return false;
124        }
125    }
126
127    /**
128     Tests if queue starts with a whitespace character.
129     @return if starts with whitespace
130     */
131    public boolean matchesWhitespace() {
132        return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos));
133    }
134
135    /**
136     Test if the queue matches a word character (letter or digit).
137     @return if matches a word character
138     */
139    public boolean matchesWord() {
140        return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos));
141    }
142
143    /**
144     * Drops the next character off the queue.
145     */
146    public void advance() {
147        if (!isEmpty()) pos++;
148    }
149
150    /**
151     * Consume one character off queue.
152     * @return first character on queue.
153     */
154    public char consume() {
155        return queue.charAt(pos++);
156    }
157
158    /**
159     * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will
160     * throw an illegal state exception -- but you should be running match() against that condition.
161     <p>
162     Case insensitive.
163     * @param seq sequence to remove from head of queue.
164     */
165    public void consume(String seq) {
166        if (!matches(seq))
167            throw new IllegalStateException("Queue did not match expected sequence");
168        int len = seq.length();
169        if (len > remainingLength())
170            throw new IllegalStateException("Queue not long enough to consume sequence");
171        
172        pos += len;
173    }
174
175    /**
176     * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
177     * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
178     * @return The matched data consumed from queue.
179     */
180    public String consumeTo(String seq) {
181        int offset = queue.indexOf(seq, pos);
182        if (offset != -1) {
183            String consumed = queue.substring(pos, offset);
184            pos += consumed.length();
185            return consumed;
186        } else {
187            return remainder();
188        }
189    }
190    
191    public String consumeToIgnoreCase(String seq) {
192        int start = pos;
193        String first = seq.substring(0, 1);
194        boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of
195        while (!isEmpty()) {
196            if (matches(seq))
197                break;
198            
199            if (canScan) {
200                int skip = queue.indexOf(first, pos) - pos;
201                if (skip == 0) // this char is the skip char, but not match, so force advance of pos
202                    pos++;
203                else if (skip < 0) // no chance of finding, grab to end
204                    pos = queue.length();
205                else
206                    pos += skip;
207            }
208            else
209                pos++;
210        }
211
212        return queue.substring(start, pos);
213    }
214
215    /**
216     Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
217     @param seq any number of terminators to consume to. <b>Case insensitive.</b>
218     @return consumed string   
219     */
220    // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
221    // is is a case sensitive time...
222    public String consumeToAny(String... seq) {
223        int start = pos;
224        while (!isEmpty() && !matchesAny(seq)) {
225            pos++;
226        }
227
228        return queue.substring(start, pos);
229    }
230
231    /**
232     * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
233     * <p>
234     * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
235     * isEmpty() == true).
236     * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b>
237     * @return Data matched from queue.
238     */
239    public String chompTo(String seq) {
240        String data = consumeTo(seq);
241        matchChomp(seq);
242        return data;
243    }
244    
245    public String chompToIgnoreCase(String seq) {
246        String data = consumeToIgnoreCase(seq); // case insensitive scan
247        matchChomp(seq);
248        return data;
249    }
250
251    /**
252     * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
253     * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left
254     * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
255     * contains text strings; use unescape for that.
256     * @param open opener
257     * @param close closer
258     * @return data matched from the queue
259     */
260    public String chompBalanced(char open, char close) {
261        int start = -1;
262        int end = -1;
263        int depth = 0;
264        char last = 0;
265        boolean inQuote = false;
266
267        do {
268            if (isEmpty()) break;
269            Character c = consume();
270            if (last == 0 || last != ESC) {
271                if ((c.equals('\'') || c.equals('"')) && c != open)
272                    inQuote = !inQuote;
273                if (inQuote)
274                    continue;
275                if (c.equals(open)) {
276                    depth++;
277                    if (start == -1)
278                        start = pos;
279                }
280                else if (c.equals(close))
281                    depth--;
282            }
283
284            if (depth > 0 && last != 0)
285                end = pos; // don't include the outer match pair in the return
286            last = c;
287        } while (depth > 0);
288        final String out = (end >= 0) ? queue.substring(start, end) : "";
289        if (depth > 0) {// ran out of queue before seeing enough )
290            Validate.fail("Did not find balanced marker at '" + out + "'");
291        }
292        return out;
293    }
294    
295    /**
296     * Unescape a \ escaped string.
297     * @param in backslash escaped string
298     * @return unescaped string
299     */
300    public static String unescape(String in) {
301        StringBuilder out = StringUtil.stringBuilder();
302        char last = 0;
303        for (char c : in.toCharArray()) {
304            if (c == ESC) {
305                if (last != 0 && last == ESC)
306                    out.append(c);
307            }
308            else 
309                out.append(c);
310            last = c;
311        }
312        return out.toString();
313    }
314
315    /**
316     * Pulls the next run of whitespace characters of the queue.
317     * @return Whether consuming whitespace or not
318     */
319    public boolean consumeWhitespace() {
320        boolean seen = false;
321        while (matchesWhitespace()) {
322            pos++;
323            seen = true;
324        }
325        return seen;
326    }
327
328    /**
329     * Retrieves the next run of word type (letter or digit) off the queue.
330     * @return String of word characters from queue, or empty string if none.
331     */
332    public String consumeWord() {
333        int start = pos;
334        while (matchesWord())
335            pos++;
336        return queue.substring(start, pos);
337    }
338    
339    /**
340     * Consume an tag name off the queue (word or :, _, -)
341     * 
342     * @return tag name
343     */
344    public String consumeTagName() {
345        int start = pos;
346        while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-')))
347            pos++;
348        
349        return queue.substring(start, pos);
350    }
351    
352    /**
353     * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
354     * 
355     * @return tag name
356     */
357    public String consumeElementSelector() {
358        int start = pos;
359        while (!isEmpty() && (matchesWord() || matchesAny("*|","|", "_", "-")))
360            pos++;
361        
362        return queue.substring(start, pos);
363    }
364
365    /**
366     Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
367     http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
368     @return identifier
369     */
370    public String consumeCssIdentifier() {
371        int start = pos;
372        while (!isEmpty() && (matchesWord() || matchesAny('-', '_')))
373            pos++;
374
375        return queue.substring(start, pos);
376    }
377
378    /**
379     Consume an attribute key off the queue (letter, digit, -, _, :")
380     @return attribute key
381     */
382    public String consumeAttributeKey() {
383        int start = pos;
384        while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':')))
385            pos++;
386        
387        return queue.substring(start, pos);
388    }
389
390    /**
391     Consume and return whatever is left on the queue.
392     @return remained of queue.
393     */
394    public String remainder() {
395        final String remainder = queue.substring(pos, queue.length());
396        pos = queue.length();
397        return remainder;
398    }
399    
400    @Override
401    public String toString() {
402        return queue.substring(pos);
403    }
404}