001package org.jsoup.select;
002
003import java.util.ArrayList;
004import java.util.List;
005import java.util.regex.Matcher;
006import java.util.regex.Pattern;
007
008import org.jsoup.helper.StringUtil;
009import org.jsoup.helper.Validate;
010import org.jsoup.parser.TokenQueue;
011
012import static org.jsoup.internal.Normalizer.normalize;
013
014/**
015 * Parses a CSS selector into an Evaluator tree.
016 */
017public class QueryParser {
018    private final static String[] combinators = {",", ">", "+", "~", " "};
019    private static final String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
020
021    private TokenQueue tq;
022    private String query;
023    private List<Evaluator> evals = new ArrayList<>();
024
025    /**
026     * Create a new QueryParser.
027     * @param query CSS query
028     */
029    private QueryParser(String query) {
030        this.query = query;
031        this.tq = new TokenQueue(query);
032    }
033
034    /**
035     * Parse a CSS query into an Evaluator.
036     * @param query CSS query
037     * @return Evaluator
038     */
039    public static Evaluator parse(String query) {
040        try {
041            QueryParser p = new QueryParser(query);
042            return p.parse();
043        } catch (IllegalArgumentException e) {
044            throw new Selector.SelectorParseException(e.getMessage());
045        }
046    }
047
048    /**
049     * Parse the query
050     * @return Evaluator
051     */
052    Evaluator parse() {
053        tq.consumeWhitespace();
054
055        if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements
056            evals.add(new StructuralEvaluator.Root());
057            combinator(tq.consume());
058        } else {
059            findElements();
060        }
061
062        while (!tq.isEmpty()) {
063            // hierarchy and extras
064            boolean seenWhite = tq.consumeWhitespace();
065
066            if (tq.matchesAny(combinators)) {
067                combinator(tq.consume());
068            } else if (seenWhite) {
069                combinator(' ');
070            } else { // E.class, E#id, E[attr] etc. AND
071                findElements(); // take next el, #. etc off queue
072            }
073        }
074
075        if (evals.size() == 1)
076            return evals.get(0);
077
078        return new CombiningEvaluator.And(evals);
079    }
080
081    private void combinator(char combinator) {
082        tq.consumeWhitespace();
083        String subQuery = consumeSubQuery(); // support multi > childs
084
085        Evaluator rootEval; // the new topmost evaluator
086        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
087        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
088        boolean replaceRightMost = false;
089
090        if (evals.size() == 1) {
091            rootEval = currentEval = evals.get(0);
092            // make sure OR (,) has precedence:
093            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
094                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
095                replaceRightMost = true;
096            }
097        }
098        else {
099            rootEval = currentEval = new CombiningEvaluator.And(evals);
100        }
101        evals.clear();
102
103        // for most combinators: change the current eval into an AND of the current eval and the new eval
104        if (combinator == '>')
105            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval));
106        else if (combinator == ' ')
107            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval));
108        else if (combinator == '+')
109            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval));
110        else if (combinator == '~')
111            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval));
112        else if (combinator == ',') { // group or.
113            CombiningEvaluator.Or or;
114            if (currentEval instanceof CombiningEvaluator.Or) {
115                or = (CombiningEvaluator.Or) currentEval;
116                or.add(newEval);
117            } else {
118                or = new CombiningEvaluator.Or();
119                or.add(currentEval);
120                or.add(newEval);
121            }
122            currentEval = or;
123        }
124        else
125            throw new Selector.SelectorParseException("Unknown combinator: " + combinator);
126
127        if (replaceRightMost)
128            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
129        else rootEval = currentEval;
130        evals.add(rootEval);
131    }
132
133    private String consumeSubQuery() {
134        StringBuilder sq = new StringBuilder();
135        while (!tq.isEmpty()) {
136            if (tq.matches("("))
137                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
138            else if (tq.matches("["))
139                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
140            else if (tq.matchesAny(combinators))
141                break;
142            else
143                sq.append(tq.consume());
144        }
145        return sq.toString();
146    }
147
148    private void findElements() {
149        if (tq.matchChomp("#"))
150            byId();
151        else if (tq.matchChomp("."))
152            byClass();
153        else if (tq.matchesWord() || tq.matches("*|"))
154            byTag();
155        else if (tq.matches("["))
156            byAttribute();
157        else if (tq.matchChomp("*"))
158            allElements();
159        else if (tq.matchChomp(":lt("))
160            indexLessThan();
161        else if (tq.matchChomp(":gt("))
162            indexGreaterThan();
163        else if (tq.matchChomp(":eq("))
164            indexEquals();
165        else if (tq.matches(":has("))
166            has();
167        else if (tq.matches(":contains("))
168            contains(false);
169        else if (tq.matches(":containsOwn("))
170            contains(true);
171        else if (tq.matches(":containsData("))
172            containsData();
173        else if (tq.matches(":matches("))
174            matches(false);
175        else if (tq.matches(":matchesOwn("))
176            matches(true);
177        else if (tq.matches(":not("))
178            not();
179                else if (tq.matchChomp(":nth-child("))
180                cssNthChild(false, false);
181        else if (tq.matchChomp(":nth-last-child("))
182                cssNthChild(true, false);
183        else if (tq.matchChomp(":nth-of-type("))
184                cssNthChild(false, true);
185        else if (tq.matchChomp(":nth-last-of-type("))
186                cssNthChild(true, true);
187        else if (tq.matchChomp(":first-child"))
188                evals.add(new Evaluator.IsFirstChild());
189        else if (tq.matchChomp(":last-child"))
190                evals.add(new Evaluator.IsLastChild());
191        else if (tq.matchChomp(":first-of-type"))
192                evals.add(new Evaluator.IsFirstOfType());
193        else if (tq.matchChomp(":last-of-type"))
194                evals.add(new Evaluator.IsLastOfType());
195        else if (tq.matchChomp(":only-child"))
196                evals.add(new Evaluator.IsOnlyChild());
197        else if (tq.matchChomp(":only-of-type"))
198                evals.add(new Evaluator.IsOnlyOfType());
199        else if (tq.matchChomp(":empty"))
200                evals.add(new Evaluator.IsEmpty());
201        else if (tq.matchChomp(":root"))
202                evals.add(new Evaluator.IsRoot());
203                else // unhandled
204            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
205
206    }
207
208    private void byId() {
209        String id = tq.consumeCssIdentifier();
210        Validate.notEmpty(id);
211        evals.add(new Evaluator.Id(id));
212    }
213
214    private void byClass() {
215        String className = tq.consumeCssIdentifier();
216        Validate.notEmpty(className);
217        evals.add(new Evaluator.Class(className.trim()));
218    }
219
220    private void byTag() {
221        String tagName = tq.consumeElementSelector();
222
223        Validate.notEmpty(tagName);
224
225        // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
226        if (tagName.startsWith("*|")) {
227            evals.add(new CombiningEvaluator.Or(new Evaluator.Tag(normalize(tagName)), new Evaluator.TagEndsWith(normalize(tagName.replace("*|", ":")))));
228        } else {
229            // namespaces: if element name is "abc:def", selector must be "abc|def", so flip:
230            if (tagName.contains("|"))
231                tagName = tagName.replace("|", ":");
232
233            evals.add(new Evaluator.Tag(tagName.trim()));
234        }
235    }
236
237    private void byAttribute() {
238        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
239        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
240        Validate.notEmpty(key);
241        cq.consumeWhitespace();
242
243        if (cq.isEmpty()) {
244            if (key.startsWith("^"))
245                evals.add(new Evaluator.AttributeStarting(key.substring(1)));
246            else
247                evals.add(new Evaluator.Attribute(key));
248        } else {
249            if (cq.matchChomp("="))
250                evals.add(new Evaluator.AttributeWithValue(key, cq.remainder()));
251
252            else if (cq.matchChomp("!="))
253                evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder()));
254
255            else if (cq.matchChomp("^="))
256                evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder()));
257
258            else if (cq.matchChomp("$="))
259                evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder()));
260
261            else if (cq.matchChomp("*="))
262                evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder()));
263
264            else if (cq.matchChomp("~="))
265                evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder())));
266            else
267                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
268        }
269    }
270
271    private void allElements() {
272        evals.add(new Evaluator.AllElements());
273    }
274
275    // pseudo selectors :lt, :gt, :eq
276    private void indexLessThan() {
277        evals.add(new Evaluator.IndexLessThan(consumeIndex()));
278    }
279
280    private void indexGreaterThan() {
281        evals.add(new Evaluator.IndexGreaterThan(consumeIndex()));
282    }
283
284    private void indexEquals() {
285        evals.add(new Evaluator.IndexEquals(consumeIndex()));
286    }
287    
288    //pseudo selectors :first-child, :last-child, :nth-child, ...
289    private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
290    private static final Pattern NTH_B  = Pattern.compile("([+-])?(\\d+)");
291
292        private void cssNthChild(boolean backwards, boolean ofType) {
293                String argS = normalize(tq.chompTo(")"));
294                Matcher mAB = NTH_AB.matcher(argS);
295                Matcher mB = NTH_B.matcher(argS);
296                final int a, b;
297                if ("odd".equals(argS)) {
298                        a = 2;
299                        b = 1;
300                } else if ("even".equals(argS)) {
301                        a = 2;
302                        b = 0;
303                } else if (mAB.matches()) {
304                        a = mAB.group(3) != null ? Integer.parseInt(mAB.group(1).replaceFirst("^\\+", "")) : 1;
305                        b = mAB.group(4) != null ? Integer.parseInt(mAB.group(4).replaceFirst("^\\+", "")) : 0;
306                } else if (mB.matches()) {
307                        a = 0;
308                        b = Integer.parseInt(mB.group().replaceFirst("^\\+", ""));
309                } else {
310                        throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", argS);
311                }
312                if (ofType)
313                        if (backwards)
314                                evals.add(new Evaluator.IsNthLastOfType(a, b));
315                        else
316                                evals.add(new Evaluator.IsNthOfType(a, b));
317                else {
318                        if (backwards)
319                                evals.add(new Evaluator.IsNthLastChild(a, b));
320                        else
321                                evals.add(new Evaluator.IsNthChild(a, b));
322                }
323        }
324
325    private int consumeIndex() {
326        String indexS = tq.chompTo(")").trim();
327        Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric");
328        return Integer.parseInt(indexS);
329    }
330
331    // pseudo selector :has(el)
332    private void has() {
333        tq.consume(":has");
334        String subQuery = tq.chompBalanced('(', ')');
335        Validate.notEmpty(subQuery, ":has(el) subselect must not be empty");
336        evals.add(new StructuralEvaluator.Has(parse(subQuery)));
337    }
338
339    // pseudo selector :contains(text), containsOwn(text)
340    private void contains(boolean own) {
341        tq.consume(own ? ":containsOwn" : ":contains");
342        String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')'));
343        Validate.notEmpty(searchText, ":contains(text) query must not be empty");
344        if (own)
345            evals.add(new Evaluator.ContainsOwnText(searchText));
346        else
347            evals.add(new Evaluator.ContainsText(searchText));
348    }
349
350    // pseudo selector :containsData(data)
351    private void containsData() {
352        tq.consume(":containsData");
353        String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')'));
354        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
355        evals.add(new Evaluator.ContainsData(searchText));
356    }
357
358    // :matches(regex), matchesOwn(regex)
359    private void matches(boolean own) {
360        tq.consume(own ? ":matchesOwn" : ":matches");
361        String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
362        Validate.notEmpty(regex, ":matches(regex) query must not be empty");
363
364        if (own)
365            evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex)));
366        else
367            evals.add(new Evaluator.Matches(Pattern.compile(regex)));
368    }
369
370    // :not(selector)
371    private void not() {
372        tq.consume(":not");
373        String subQuery = tq.chompBalanced('(', ')');
374        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
375
376        evals.add(new StructuralEvaluator.Not(parse(subQuery)));
377    }
378}