001package org.jsoup.select; 002 003import java.util.ArrayList; 004import java.util.List; 005import java.util.regex.Matcher; 006import java.util.regex.Pattern; 007 008import org.jsoup.helper.StringUtil; 009import org.jsoup.helper.Validate; 010import org.jsoup.parser.TokenQueue; 011 012import static org.jsoup.internal.Normalizer.normalize; 013 014/** 015 * Parses a CSS selector into an Evaluator tree. 016 */ 017public class QueryParser { 018 private final static String[] combinators = {",", ">", "+", "~", " "}; 019 private static final String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="}; 020 021 private TokenQueue tq; 022 private String query; 023 private List<Evaluator> evals = new ArrayList<>(); 024 025 /** 026 * Create a new QueryParser. 027 * @param query CSS query 028 */ 029 private QueryParser(String query) { 030 this.query = query; 031 this.tq = new TokenQueue(query); 032 } 033 034 /** 035 * Parse a CSS query into an Evaluator. 036 * @param query CSS query 037 * @return Evaluator 038 */ 039 public static Evaluator parse(String query) { 040 try { 041 QueryParser p = new QueryParser(query); 042 return p.parse(); 043 } catch (IllegalArgumentException e) { 044 throw new Selector.SelectorParseException(e.getMessage()); 045 } 046 } 047 048 /** 049 * Parse the query 050 * @return Evaluator 051 */ 052 Evaluator parse() { 053 tq.consumeWhitespace(); 054 055 if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements 056 evals.add(new StructuralEvaluator.Root()); 057 combinator(tq.consume()); 058 } else { 059 findElements(); 060 } 061 062 while (!tq.isEmpty()) { 063 // hierarchy and extras 064 boolean seenWhite = tq.consumeWhitespace(); 065 066 if (tq.matchesAny(combinators)) { 067 combinator(tq.consume()); 068 } else if (seenWhite) { 069 combinator(' '); 070 } else { // E.class, E#id, E[attr] etc. AND 071 findElements(); // take next el, #. etc off queue 072 } 073 } 074 075 if (evals.size() == 1) 076 return evals.get(0); 077 078 return new CombiningEvaluator.And(evals); 079 } 080 081 private void combinator(char combinator) { 082 tq.consumeWhitespace(); 083 String subQuery = consumeSubQuery(); // support multi > childs 084 085 Evaluator rootEval; // the new topmost evaluator 086 Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or. 087 Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator 088 boolean replaceRightMost = false; 089 090 if (evals.size() == 1) { 091 rootEval = currentEval = evals.get(0); 092 // make sure OR (,) has precedence: 093 if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { 094 currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator(); 095 replaceRightMost = true; 096 } 097 } 098 else { 099 rootEval = currentEval = new CombiningEvaluator.And(evals); 100 } 101 evals.clear(); 102 103 // for most combinators: change the current eval into an AND of the current eval and the new eval 104 if (combinator == '>') 105 currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval)); 106 else if (combinator == ' ') 107 currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval)); 108 else if (combinator == '+') 109 currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval)); 110 else if (combinator == '~') 111 currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval)); 112 else if (combinator == ',') { // group or. 113 CombiningEvaluator.Or or; 114 if (currentEval instanceof CombiningEvaluator.Or) { 115 or = (CombiningEvaluator.Or) currentEval; 116 or.add(newEval); 117 } else { 118 or = new CombiningEvaluator.Or(); 119 or.add(currentEval); 120 or.add(newEval); 121 } 122 currentEval = or; 123 } 124 else 125 throw new Selector.SelectorParseException("Unknown combinator: " + combinator); 126 127 if (replaceRightMost) 128 ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval); 129 else rootEval = currentEval; 130 evals.add(rootEval); 131 } 132 133 private String consumeSubQuery() { 134 StringBuilder sq = new StringBuilder(); 135 while (!tq.isEmpty()) { 136 if (tq.matches("(")) 137 sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); 138 else if (tq.matches("[")) 139 sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); 140 else if (tq.matchesAny(combinators)) 141 break; 142 else 143 sq.append(tq.consume()); 144 } 145 return sq.toString(); 146 } 147 148 private void findElements() { 149 if (tq.matchChomp("#")) 150 byId(); 151 else if (tq.matchChomp(".")) 152 byClass(); 153 else if (tq.matchesWord() || tq.matches("*|")) 154 byTag(); 155 else if (tq.matches("[")) 156 byAttribute(); 157 else if (tq.matchChomp("*")) 158 allElements(); 159 else if (tq.matchChomp(":lt(")) 160 indexLessThan(); 161 else if (tq.matchChomp(":gt(")) 162 indexGreaterThan(); 163 else if (tq.matchChomp(":eq(")) 164 indexEquals(); 165 else if (tq.matches(":has(")) 166 has(); 167 else if (tq.matches(":contains(")) 168 contains(false); 169 else if (tq.matches(":containsOwn(")) 170 contains(true); 171 else if (tq.matches(":containsData(")) 172 containsData(); 173 else if (tq.matches(":matches(")) 174 matches(false); 175 else if (tq.matches(":matchesOwn(")) 176 matches(true); 177 else if (tq.matches(":not(")) 178 not(); 179 else if (tq.matchChomp(":nth-child(")) 180 cssNthChild(false, false); 181 else if (tq.matchChomp(":nth-last-child(")) 182 cssNthChild(true, false); 183 else if (tq.matchChomp(":nth-of-type(")) 184 cssNthChild(false, true); 185 else if (tq.matchChomp(":nth-last-of-type(")) 186 cssNthChild(true, true); 187 else if (tq.matchChomp(":first-child")) 188 evals.add(new Evaluator.IsFirstChild()); 189 else if (tq.matchChomp(":last-child")) 190 evals.add(new Evaluator.IsLastChild()); 191 else if (tq.matchChomp(":first-of-type")) 192 evals.add(new Evaluator.IsFirstOfType()); 193 else if (tq.matchChomp(":last-of-type")) 194 evals.add(new Evaluator.IsLastOfType()); 195 else if (tq.matchChomp(":only-child")) 196 evals.add(new Evaluator.IsOnlyChild()); 197 else if (tq.matchChomp(":only-of-type")) 198 evals.add(new Evaluator.IsOnlyOfType()); 199 else if (tq.matchChomp(":empty")) 200 evals.add(new Evaluator.IsEmpty()); 201 else if (tq.matchChomp(":root")) 202 evals.add(new Evaluator.IsRoot()); 203 else // unhandled 204 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 205 206 } 207 208 private void byId() { 209 String id = tq.consumeCssIdentifier(); 210 Validate.notEmpty(id); 211 evals.add(new Evaluator.Id(id)); 212 } 213 214 private void byClass() { 215 String className = tq.consumeCssIdentifier(); 216 Validate.notEmpty(className); 217 evals.add(new Evaluator.Class(className.trim())); 218 } 219 220 private void byTag() { 221 String tagName = tq.consumeElementSelector(); 222 223 Validate.notEmpty(tagName); 224 225 // namespaces: wildcard match equals(tagName) or ending in ":"+tagName 226 if (tagName.startsWith("*|")) { 227 evals.add(new CombiningEvaluator.Or(new Evaluator.Tag(normalize(tagName)), new Evaluator.TagEndsWith(normalize(tagName.replace("*|", ":"))))); 228 } else { 229 // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: 230 if (tagName.contains("|")) 231 tagName = tagName.replace("|", ":"); 232 233 evals.add(new Evaluator.Tag(tagName.trim())); 234 } 235 } 236 237 private void byAttribute() { 238 TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue 239 String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val) 240 Validate.notEmpty(key); 241 cq.consumeWhitespace(); 242 243 if (cq.isEmpty()) { 244 if (key.startsWith("^")) 245 evals.add(new Evaluator.AttributeStarting(key.substring(1))); 246 else 247 evals.add(new Evaluator.Attribute(key)); 248 } else { 249 if (cq.matchChomp("=")) 250 evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); 251 252 else if (cq.matchChomp("!=")) 253 evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder())); 254 255 else if (cq.matchChomp("^=")) 256 evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder())); 257 258 else if (cq.matchChomp("$=")) 259 evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder())); 260 261 else if (cq.matchChomp("*=")) 262 evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder())); 263 264 else if (cq.matchChomp("~=")) 265 evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()))); 266 else 267 throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); 268 } 269 } 270 271 private void allElements() { 272 evals.add(new Evaluator.AllElements()); 273 } 274 275 // pseudo selectors :lt, :gt, :eq 276 private void indexLessThan() { 277 evals.add(new Evaluator.IndexLessThan(consumeIndex())); 278 } 279 280 private void indexGreaterThan() { 281 evals.add(new Evaluator.IndexGreaterThan(consumeIndex())); 282 } 283 284 private void indexEquals() { 285 evals.add(new Evaluator.IndexEquals(consumeIndex())); 286 } 287 288 //pseudo selectors :first-child, :last-child, :nth-child, ... 289 private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE); 290 private static final Pattern NTH_B = Pattern.compile("([+-])?(\\d+)"); 291 292 private void cssNthChild(boolean backwards, boolean ofType) { 293 String argS = normalize(tq.chompTo(")")); 294 Matcher mAB = NTH_AB.matcher(argS); 295 Matcher mB = NTH_B.matcher(argS); 296 final int a, b; 297 if ("odd".equals(argS)) { 298 a = 2; 299 b = 1; 300 } else if ("even".equals(argS)) { 301 a = 2; 302 b = 0; 303 } else if (mAB.matches()) { 304 a = mAB.group(3) != null ? Integer.parseInt(mAB.group(1).replaceFirst("^\\+", "")) : 1; 305 b = mAB.group(4) != null ? Integer.parseInt(mAB.group(4).replaceFirst("^\\+", "")) : 0; 306 } else if (mB.matches()) { 307 a = 0; 308 b = Integer.parseInt(mB.group().replaceFirst("^\\+", "")); 309 } else { 310 throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", argS); 311 } 312 if (ofType) 313 if (backwards) 314 evals.add(new Evaluator.IsNthLastOfType(a, b)); 315 else 316 evals.add(new Evaluator.IsNthOfType(a, b)); 317 else { 318 if (backwards) 319 evals.add(new Evaluator.IsNthLastChild(a, b)); 320 else 321 evals.add(new Evaluator.IsNthChild(a, b)); 322 } 323 } 324 325 private int consumeIndex() { 326 String indexS = tq.chompTo(")").trim(); 327 Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric"); 328 return Integer.parseInt(indexS); 329 } 330 331 // pseudo selector :has(el) 332 private void has() { 333 tq.consume(":has"); 334 String subQuery = tq.chompBalanced('(', ')'); 335 Validate.notEmpty(subQuery, ":has(el) subselect must not be empty"); 336 evals.add(new StructuralEvaluator.Has(parse(subQuery))); 337 } 338 339 // pseudo selector :contains(text), containsOwn(text) 340 private void contains(boolean own) { 341 tq.consume(own ? ":containsOwn" : ":contains"); 342 String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); 343 Validate.notEmpty(searchText, ":contains(text) query must not be empty"); 344 if (own) 345 evals.add(new Evaluator.ContainsOwnText(searchText)); 346 else 347 evals.add(new Evaluator.ContainsText(searchText)); 348 } 349 350 // pseudo selector :containsData(data) 351 private void containsData() { 352 tq.consume(":containsData"); 353 String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); 354 Validate.notEmpty(searchText, ":containsData(text) query must not be empty"); 355 evals.add(new Evaluator.ContainsData(searchText)); 356 } 357 358 // :matches(regex), matchesOwn(regex) 359 private void matches(boolean own) { 360 tq.consume(own ? ":matchesOwn" : ":matches"); 361 String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped 362 Validate.notEmpty(regex, ":matches(regex) query must not be empty"); 363 364 if (own) 365 evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); 366 else 367 evals.add(new Evaluator.Matches(Pattern.compile(regex))); 368 } 369 370 // :not(selector) 371 private void not() { 372 tq.consume(":not"); 373 String subQuery = tq.chompBalanced('(', ')'); 374 Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); 375 376 evals.add(new StructuralEvaluator.Not(parse(subQuery))); 377 } 378}