001/* 002 * gnu/regexp/RE.java 003 * Copyright (C) 1998-2001 Wes Biggs 004 * 005 * This library is free software; you can redistribute it and/or modify 006 * it under the terms of the GNU Lesser General Public License as published 007 * by the Free Software Foundation; either version 2.1 of the License, or 008 * (at your option) any later version. 009 * 010 * This library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU Lesser General Public License for more details. 014 * 015 * You should have received a copy of the GNU Lesser General Public License 016 * along with this program; if not, write to the Free Software 017 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 018 */ 019 020package gnu.regexp; 021import java.io.InputStream; 022import java.io.Reader; 023import java.io.Serializable; 024import java.util.Locale; 025import java.util.PropertyResourceBundle; 026import java.util.ResourceBundle; 027import java.util.Vector; 028 029class IntPair implements Serializable { 030 public int first, second; 031} 032 033class CharUnit implements Serializable { 034 public char ch; 035 public boolean bk; 036} 037 038/** 039 * RE provides the user interface for compiling and matching regular 040 * expressions. 041 * <P> 042 * A regular expression object (class RE) is compiled by constructing it 043 * from a String, StringBuffer or character array, with optional 044 * compilation flags (below) 045 * and an optional syntax specification (see RESyntax; if not specified, 046 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used). 047 * <P> 048 * Various methods attempt to match input text against a compiled 049 * regular expression. These methods are: 050 * <LI><code>isMatch</code>: returns true if the input text in its entirety 051 * matches the regular expression pattern. 052 * <LI><code>getMatch</code>: returns the first match found in the input text, 053 * or null if no match is found. 054 * <LI><code>getAllMatches</code>: returns an array of all non-overlapping 055 * matches found in the input text. If no matches are found, the array is 056 * zero-length. 057 * <LI><code>substitute</code>: substitute the first occurence of the pattern 058 * in the input text with a replacement string (which may include 059 * metacharacters $0-$9, see REMatch.substituteInto). 060 * <LI><code>substituteAll</code>: same as above, but repeat for each match 061 * before returning. 062 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object 063 * that allows iteration over the matches (see REMatchEnumeration for some 064 * reasons why you may want to do this instead of using <code>getAllMatches</code>. 065 * <P> 066 * 067 * These methods all have similar argument lists. The input can be a 068 * String, a character array, a StringBuffer, a Reader or an 069 * InputStream of some sort. Note that when using a Reader or 070 * InputStream, the stream read position cannot be guaranteed after 071 * attempting a match (this is not a bug, but a consequence of the way 072 * regular expressions work). Using an REMatchEnumeration can 073 * eliminate most positioning problems. 074 * 075 * <P> 076 * 077 * The optional index argument specifies the offset from the beginning 078 * of the text at which the search should start (see the descriptions 079 * of some of the execution flags for how this can affect positional 080 * pattern operators). For a Reader or InputStream, this means an 081 * offset from the current read position, so subsequent calls with the 082 * same index argument on a Reader or an InputStream will not 083 * necessarily access the same position on the stream, whereas 084 * repeated searches at a given index in a fixed string will return 085 * consistent results. 086 * 087 * <P> 088 * You can optionally affect the execution environment by using a 089 * combination of execution flags (constants listed below). 090 * 091 * <P> 092 * All operations on a regular expression are performed in a 093 * thread-safe manner. 094 * 095 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 096 * @version 1.1.4-dev, to be released 097 */ 098 099public class RE extends REToken { 100 // This String will be returned by getVersion() 101 private static final String VERSION = "1.1.4-dev"; 102 103 // The localized strings are kept in a separate file 104 private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault()); 105 106 // These are, respectively, the first and last tokens in our linked list 107 // If there is only one token, firstToken == lastToken 108 private REToken firstToken, lastToken; 109 110 // This is the number of subexpressions in this regular expression, 111 // with a minimum value of zero. Returned by getNumSubs() 112 private int numSubs; 113 114 /** Minimum length, in characters, of any possible match. */ 115 private int minimumLength; 116 117 /** 118 * Compilation flag. Do not differentiate case. Subsequent 119 * searches using this RE will be case insensitive. 120 */ 121 public static final int REG_ICASE = 2; 122 123 /** 124 * Compilation flag. The match-any-character operator (dot) 125 * will match a newline character. When set this overrides the syntax 126 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to 127 * the "/s" operator in Perl. 128 */ 129 public static final int REG_DOT_NEWLINE = 4; 130 131 /** 132 * Compilation flag. Use multiline mode. In this mode, the ^ and $ 133 * anchors will match based on newlines within the input. This is 134 * equivalent to the "/m" operator in Perl. 135 */ 136 public static final int REG_MULTILINE = 8; 137 138 /** 139 * Execution flag. 140 * The match-beginning operator (^) will not match at the beginning 141 * of the input string. Useful for matching on a substring when you 142 * know the context of the input is such that position zero of the 143 * input to the match test is not actually position zero of the text. 144 * <P> 145 * This example demonstrates the results of various ways of matching on 146 * a substring. 147 * <P> 148 * <CODE> 149 * String s = "food bar fool";<BR> 150 * RE exp = new RE("^foo.");<BR> 151 * REMatch m0 = exp.getMatch(s);<BR> 152 * REMatch m1 = exp.getMatch(s.substring(8));<BR> 153 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR> 154 * REMatch m3 = exp.getMatch(s,8); <BR> 155 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR> 156 * <P> 157 * // Results:<BR> 158 * // m0 = "food"<BR> 159 * // m1 = "fool"<BR> 160 * // m2 = null<BR> 161 * // m3 = null<BR> 162 * // m4 = "fool"<BR> 163 * </CODE> 164 */ 165 public static final int REG_NOTBOL = 16; 166 167 /** 168 * Execution flag. 169 * The match-end operator ($) does not match at the end 170 * of the input string. Useful for matching on substrings. 171 */ 172 public static final int REG_NOTEOL = 32; 173 174 /** 175 * Execution flag. 176 * When a match method is invoked that starts matching at a non-zero 177 * index into the input, treat the input as if it begins at the index 178 * given. The effect of this flag is that the engine does not "see" 179 * any text in the input before the given index. This is useful so 180 * that the match-beginning operator (^) matches not at position 0 181 * in the input string, but at the position the search started at 182 * (based on the index input given to the getMatch function). See 183 * the example under REG_NOTBOL. It also affects the use of the \< 184 * and \b operators. 185 */ 186 public static final int REG_ANCHORINDEX = 64; 187 188 /** 189 * Execution flag. 190 * The substitute and substituteAll methods will not attempt to 191 * interpolate occurrences of $1-$9 in the replacement text with 192 * the corresponding subexpressions. For example, you may want to 193 * replace all matches of "one dollar" with "$1". 194 */ 195 public static final int REG_NO_INTERPOLATE = 128; 196 197 /** Returns a string representing the version of the gnu.regexp package. */ 198 public static final String version() { 199 return VERSION; 200 } 201 202 // Retrieves a message from the ResourceBundle 203 static final String getLocalizedMessage(String key) { 204 return messages.getString(key); 205 } 206 207 /** 208 * Constructs a regular expression pattern buffer without any compilation 209 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5). 210 * 211 * @param pattern A regular expression pattern, in the form of a String, 212 * StringBuffer or char[]. Other input types will be converted to 213 * strings using the toString() method. 214 * @exception REException The input pattern could not be parsed. 215 * @exception NullPointerException The pattern was null. 216 */ 217 public RE(Object pattern) throws REException { 218 this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0); 219 } 220 221 /** 222 * Constructs a regular expression pattern buffer using the specified 223 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5). 224 * 225 * @param pattern A regular expression pattern, in the form of a String, 226 * StringBuffer, or char[]. Other input types will be converted to 227 * strings using the toString() method. 228 * @param cflags The logical OR of any combination of the compilation flags listed above. 229 * @exception REException The input pattern could not be parsed. 230 * @exception NullPointerException The pattern was null. 231 */ 232 public RE(Object pattern, int cflags) throws REException { 233 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0); 234 } 235 236 /** 237 * Constructs a regular expression pattern buffer using the specified 238 * compilation flags and regular expression syntax. 239 * 240 * @param pattern A regular expression pattern, in the form of a String, 241 * StringBuffer, or char[]. Other input types will be converted to 242 * strings using the toString() method. 243 * @param cflags The logical OR of any combination of the compilation flags listed above. 244 * @param syntax The type of regular expression syntax to use. 245 * @exception REException The input pattern could not be parsed. 246 * @exception NullPointerException The pattern was null. 247 */ 248 public RE(Object pattern, int cflags, RESyntax syntax) throws REException { 249 this(pattern,cflags,syntax,0,0); 250 } 251 252 // internal constructor used for alternation 253 private RE(REToken first, REToken last,int subs, int subIndex, int minLength) { 254 super(subIndex); 255 firstToken = first; 256 lastToken = last; 257 numSubs = subs; 258 minimumLength = minLength; 259 addToken(new RETokenEndSub(subIndex)); 260 } 261 262 private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { 263 super(myIndex); // Subexpression index of this token. 264 initialize(patternObj, cflags, syntax, myIndex, nextSub); 265 } 266 267 // For use by subclasses 268 protected RE() { super(0); } 269 270 // The meat of construction 271 protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { 272 char[] pattern; 273 if (patternObj instanceof String) { 274 pattern = ((String) patternObj).toCharArray(); 275 } else if (patternObj instanceof char[]) { 276 pattern = (char[]) patternObj; 277 } else if (patternObj instanceof StringBuffer) { 278 pattern = new char [((StringBuffer) patternObj).length()]; 279 ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0); 280 } else { 281 pattern = patternObj.toString().toCharArray(); 282 } 283 284 int pLength = pattern.length; 285 286 numSubs = 0; // Number of subexpressions in this token. 287 Vector branches = null; 288 289 // linked list of tokens (sort of -- some closed loops can exist) 290 firstToken = lastToken = null; 291 292 // Precalculate these so we don't pay for the math every time we 293 // need to access them. 294 boolean insens = ((cflags & REG_ICASE) > 0); 295 296 // Parse pattern into tokens. Does anyone know if it's more efficient 297 // to use char[] than a String.charAt()? I'm assuming so. 298 299 // index tracks the position in the char array 300 int index = 0; 301 302 // this will be the current parse character (pattern[index]) 303 CharUnit unit = new CharUnit(); 304 305 // This is used for {x,y} calculations 306 IntPair minMax = new IntPair(); 307 308 // Buffer a token so we can create a TokenRepeated, etc. 309 REToken currentToken = null; 310 char ch; 311 312 while (index < pLength) { 313 // read the next character unit (including backslash escapes) 314 index = getCharUnit(pattern,index,unit); 315 316 // ALTERNATION OPERATOR 317 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) 318 // not available if RE_LIMITED_OPS is set 319 320 // TODO: the '\n' literal here should be a test against REToken.newline, 321 // which unfortunately may be more than a single character. 322 if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk)) 323 || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) ) 324 && !syntax.get(RESyntax.RE_LIMITED_OPS)) { 325 // make everything up to here be a branch. create vector if nec. 326 addToken(currentToken); 327 RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength); 328 minimumLength = 0; 329 if (branches == null) { 330 branches = new Vector(); 331 } 332 branches.addElement(theBranch); 333 firstToken = lastToken = currentToken = null; 334 } 335 336 // INTERVAL OPERATOR: 337 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) 338 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) 339 // 340 // OPEN QUESTION: 341 // what is proper interpretation of '{' at start of string? 342 343 else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) { 344 int newIndex = getMinMax(pattern,index,minMax,syntax); 345 if (newIndex > index) { 346 if (minMax.first > minMax.second) 347 throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex); 348 if (currentToken == null) 349 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex); 350 if (currentToken instanceof RETokenRepeated) 351 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex); 352 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 353 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex); 354 if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE)) 355 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex); 356 index = newIndex; 357 currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 358 } 359 else { 360 addToken(currentToken); 361 currentToken = new RETokenChar(subIndex,unit.ch,insens); 362 } 363 } 364 365 // LIST OPERATOR: 366 // [...] | [^...] 367 368 else if ((unit.ch == '[') && !unit.bk) { 369 Vector options = new Vector(); 370 boolean negative = false; 371 char lastChar = 0; 372 if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); 373 374 // Check for initial caret, negation 375 if ((ch = pattern[index]) == '^') { 376 negative = true; 377 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 378 ch = pattern[index]; 379 } 380 381 // Check for leading right bracket literal 382 if (ch == ']') { 383 lastChar = ch; 384 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 385 } 386 387 while ((ch = pattern[index++]) != ']') { 388 if ((ch == '-') && (lastChar != 0)) { 389 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 390 if ((ch = pattern[index]) == ']') { 391 options.addElement(new RETokenChar(subIndex,lastChar,insens)); 392 lastChar = '-'; 393 } else { 394 options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); 395 lastChar = 0; 396 index++; 397 } 398 } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { 399 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 400 int posixID = -1; 401 boolean negate = false; 402 char asciiEsc = 0; 403 if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { 404 switch (pattern[index]) { 405 case 'D': 406 negate = true; 407 case 'd': 408 posixID = RETokenPOSIX.DIGIT; 409 break; 410 case 'S': 411 negate = true; 412 case 's': 413 posixID = RETokenPOSIX.SPACE; 414 break; 415 case 'W': 416 negate = true; 417 case 'w': 418 posixID = RETokenPOSIX.ALNUM; 419 break; 420 } 421 } 422 else if ("nrt".indexOf(pattern[index]) != -1) { 423 switch (pattern[index]) { 424 case 'n': 425 asciiEsc = '\n'; 426 break; 427 case 't': 428 asciiEsc = '\t'; 429 break; 430 case 'r': 431 asciiEsc = '\r'; 432 break; 433 } 434 } 435 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 436 437 if (posixID != -1) { 438 options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); 439 } else if (asciiEsc != 0) { 440 lastChar = asciiEsc; 441 } else { 442 lastChar = pattern[index]; 443 } 444 ++index; 445 } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { 446 StringBuffer posixSet = new StringBuffer(); 447 index = getPosixSet(pattern,index+1,posixSet); 448 int posixId = RETokenPOSIX.intValue(posixSet.toString()); 449 if (posixId != -1) 450 options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); 451 } else { 452 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 453 lastChar = ch; 454 } 455 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 456 } // while in list 457 // Out of list, index is one past ']' 458 459 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 460 461 // Create a new RETokenOneOf 462 addToken(currentToken); 463 options.trimToSize(); 464 currentToken = new RETokenOneOf(subIndex,options,negative); 465 } 466 467 // SUBEXPRESSIONS 468 // (...) | \(...\) depending on RE_NO_BK_PARENS 469 470 else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) { 471 boolean pure = false; 472 boolean comment = false; 473 boolean lookAhead = false; 474 boolean negativelh = false; 475 if ((index+1 < pLength) && (pattern[index] == '?')) { 476 switch (pattern[index+1]) { 477 case '!': 478 if (syntax.get(RESyntax.RE_LOOKAHEAD)) { 479 pure = true; 480 negativelh = true; 481 lookAhead = true; 482 index += 2; 483 } 484 break; 485 case '=': 486 if (syntax.get(RESyntax.RE_LOOKAHEAD)) { 487 pure = true; 488 lookAhead = true; 489 index += 2; 490 } 491 break; 492 case ':': 493 if (syntax.get(RESyntax.RE_PURE_GROUPING)) { 494 pure = true; 495 index += 2; 496 } 497 break; 498 case '#': 499 if (syntax.get(RESyntax.RE_COMMENTS)) { 500 comment = true; 501 } 502 break; 503 default: 504 throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); 505 } 506 } 507 508 if (index >= pLength) { 509 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); 510 } 511 512 // find end of subexpression 513 int endIndex = index; 514 int nextIndex = index; 515 int nested = 0; 516 517 while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0) 518 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) ) 519 if ((endIndex = nextIndex) >= pLength) 520 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex); 521 else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) 522 nested++; 523 else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) 524 nested--; 525 526 // endIndex is now position at a ')','\)' 527 // nextIndex is end of string or position after ')' or '\)' 528 529 if (comment) index = nextIndex; 530 else { // not a comment 531 // create RE subexpression as token. 532 addToken(currentToken); 533 if (!pure) { 534 numSubs++; 535 } 536 537 int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs; 538 currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs); 539 numSubs += ((RE) currentToken).getNumSubs(); 540 541 if (lookAhead) { 542 currentToken = new RETokenLookAhead(currentToken,negativelh); 543 } 544 545 index = nextIndex; 546 } // not a comment 547 } // subexpression 548 549 // UNMATCHED RIGHT PAREN 550 // ) or \) throw exception if 551 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) 552 else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) { 553 throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index); 554 } 555 556 // START OF LINE OPERATOR 557 // ^ 558 559 else if ((unit.ch == '^') && !unit.bk) { 560 addToken(currentToken); 561 currentToken = null; 562 addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); 563 } 564 565 // END OF LINE OPERATOR 566 // $ 567 568 else if ((unit.ch == '$') && !unit.bk) { 569 addToken(currentToken); 570 currentToken = null; 571 addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); 572 } 573 574 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) 575 // . 576 577 else if ((unit.ch == '.') && !unit.bk) { 578 addToken(currentToken); 579 currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL)); 580 } 581 582 // ZERO-OR-MORE REPEAT OPERATOR 583 // * 584 585 else if ((unit.ch == '*') && !unit.bk) { 586 if (currentToken == null) 587 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 588 if (currentToken instanceof RETokenRepeated) 589 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 590 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 591 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 592 if (currentToken.getMinimumLength() == 0) 593 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index); 594 currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index); 595 } 596 597 // ONE-OR-MORE REPEAT OPERATOR 598 // + | \+ depending on RE_BK_PLUS_QM 599 // not available if RE_LIMITED_OPS is set 600 601 else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) { 602 if (currentToken == null) 603 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 604 if (currentToken instanceof RETokenRepeated) 605 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 606 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 607 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 608 if (currentToken.getMinimumLength() == 0) 609 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index); 610 currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index); 611 } 612 613 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR 614 // ? | \? depending on RE_BK_PLUS_QM 615 // not available if RE_LIMITED_OPS is set 616 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier 617 618 else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) { 619 if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 620 621 // Check for stingy matching on RETokenRepeated 622 if (currentToken instanceof RETokenRepeated) { 623 if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy()) 624 ((RETokenRepeated)currentToken).makeStingy(); 625 else 626 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 627 } 628 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 629 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 630 else 631 currentToken = setRepeated(currentToken,0,1,index); 632 } 633 634 // BACKREFERENCE OPERATOR 635 // \1 \2 ... \9 636 // not available if RE_NO_BK_REFS is set 637 638 else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) { 639 addToken(currentToken); 640 currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens); 641 } 642 643 // START OF STRING OPERATOR 644 // \A if RE_STRING_ANCHORS is set 645 646 else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 647 addToken(currentToken); 648 currentToken = new RETokenStart(subIndex,null); 649 } 650 651 // WORD BREAK OPERATOR 652 // \b if ???? 653 654 else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 655 addToken(currentToken); 656 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false); 657 } 658 659 // WORD BEGIN OPERATOR 660 // \< if ???? 661 else if (unit.bk && (unit.ch == '<')) { 662 addToken(currentToken); 663 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false); 664 } 665 666 // WORD END OPERATOR 667 // \> if ???? 668 else if (unit.bk && (unit.ch == '>')) { 669 addToken(currentToken); 670 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false); 671 } 672 673 // NON-WORD BREAK OPERATOR 674 // \B if ???? 675 676 else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 677 addToken(currentToken); 678 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true); 679 } 680 681 682 // DIGIT OPERATOR 683 // \d if RE_CHAR_CLASS_ESCAPES is set 684 685 else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 686 addToken(currentToken); 687 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false); 688 } 689 690 // NON-DIGIT OPERATOR 691 // \D 692 693 else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 694 addToken(currentToken); 695 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true); 696 } 697 698 // NEWLINE ESCAPE 699 // \n 700 701 else if (unit.bk && (unit.ch == 'n')) { 702 addToken(currentToken); 703 currentToken = new RETokenChar(subIndex,'\n',false); 704 } 705 706 // RETURN ESCAPE 707 // \r 708 709 else if (unit.bk && (unit.ch == 'r')) { 710 addToken(currentToken); 711 currentToken = new RETokenChar(subIndex,'\r',false); 712 } 713 714 // WHITESPACE OPERATOR 715 // \s if RE_CHAR_CLASS_ESCAPES is set 716 717 else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 718 addToken(currentToken); 719 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false); 720 } 721 722 // NON-WHITESPACE OPERATOR 723 // \S 724 725 else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 726 addToken(currentToken); 727 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true); 728 } 729 730 // TAB ESCAPE 731 // \t 732 733 else if (unit.bk && (unit.ch == 't')) { 734 addToken(currentToken); 735 currentToken = new RETokenChar(subIndex,'\t',false); 736 } 737 738 // ALPHANUMERIC OPERATOR 739 // \w 740 741 else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 742 addToken(currentToken); 743 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false); 744 } 745 746 // NON-ALPHANUMERIC OPERATOR 747 // \W 748 749 else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 750 addToken(currentToken); 751 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true); 752 } 753 754 // END OF STRING OPERATOR 755 // \Z 756 757 else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 758 addToken(currentToken); 759 currentToken = new RETokenEnd(subIndex,null); 760 } 761 762 // NON-SPECIAL CHARACTER (or escape to make literal) 763 // c | \* for example 764 765 else { // not a special character 766 addToken(currentToken); 767 currentToken = new RETokenChar(subIndex,unit.ch,insens); 768 } 769 } // end while 770 771 // Add final buffered token and an EndSub marker 772 addToken(currentToken); 773 774 if (branches != null) { 775 branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength)); 776 branches.trimToSize(); // compact the Vector 777 minimumLength = 0; 778 firstToken = lastToken = null; 779 addToken(new RETokenOneOf(subIndex,branches,false)); 780 } 781 else addToken(new RETokenEndSub(subIndex)); 782 783 } 784 785 private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException { 786 unit.ch = input[index++]; 787 if (unit.bk = (unit.ch == '\\')) 788 if (index < input.length) 789 unit.ch = input[index++]; 790 else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index); 791 return index; 792 } 793 794 /** 795 * Checks if the regular expression matches the input in its entirety. 796 * 797 * @param input The input text. 798 */ 799 public boolean isMatch(Object input) { 800 return isMatch(input,0,0); 801 } 802 803 /** 804 * Checks if the input string, starting from index, is an exact match of 805 * this regular expression. 806 * 807 * @param input The input text. 808 * @param index The offset index at which the search should be begin. 809 */ 810 public boolean isMatch(Object input,int index) { 811 return isMatch(input,index,0); 812 } 813 814 815 /** 816 * Checks if the input, starting from index and using the specified 817 * execution flags, is an exact match of this regular expression. 818 * 819 * @param input The input text. 820 * @param index The offset index at which the search should be begin. 821 * @param eflags The logical OR of any execution flags above. 822 */ 823 public boolean isMatch(Object input,int index,int eflags) { 824 return isMatchImpl(makeCharIndexed(input,index),index,eflags); 825 } 826 827 private boolean isMatchImpl(CharIndexed input, int index, int eflags) { 828 if (firstToken == null) // Trivial case 829 return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS); 830 REMatch m = new REMatch(numSubs, index, eflags); 831 if (firstToken.match(input, m)) { 832 while (m != null) { 833 if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) { 834 return true; 835 } 836 m = m.next; 837 } 838 } 839 return false; 840 } 841 842 /** 843 * Returns the maximum number of subexpressions in this regular expression. 844 * If the expression contains branches, the value returned will be the 845 * maximum subexpressions in any of the branches. 846 */ 847 public int getNumSubs() { 848 return numSubs; 849 } 850 851 // Overrides REToken.setUncle 852 void setUncle(REToken uncle) { 853 if (lastToken != null) { 854 lastToken.setUncle(uncle); 855 } else super.setUncle(uncle); // to deal with empty subexpressions 856 } 857 858 // Overrides REToken.chain 859 860 boolean chain(REToken next) { 861 super.chain(next); 862 setUncle(next); 863 return true; 864 } 865 866 /** 867 * Returns the minimum number of characters that could possibly 868 * constitute a match of this regular expression. 869 */ 870 public int getMinimumLength() { 871 return minimumLength; 872 } 873 874 /** 875 * Returns an array of all matches found in the input. 876 * 877 * If the regular expression allows the empty string to match, it will 878 * substitute matches at all positions except the end of the input. 879 * 880 * @param input The input text. 881 * @return a non-null (but possibly zero-length) array of matches 882 */ 883 public REMatch[] getAllMatches(Object input) { 884 return getAllMatches(input,0,0); 885 } 886 887 /** 888 * Returns an array of all matches found in the input, 889 * beginning at the specified index position. 890 * 891 * If the regular expression allows the empty string to match, it will 892 * substitute matches at all positions except the end of the input. 893 * 894 * @param input The input text. 895 * @param index The offset index at which the search should be begin. 896 * @return a non-null (but possibly zero-length) array of matches 897 */ 898 public REMatch[] getAllMatches(Object input, int index) { 899 return getAllMatches(input,index,0); 900 } 901 902 /** 903 * Returns an array of all matches found in the input string, 904 * beginning at the specified index position and using the specified 905 * execution flags. 906 * 907 * If the regular expression allows the empty string to match, it will 908 * substitute matches at all positions except the end of the input. 909 * 910 * @param input The input text. 911 * @param index The offset index at which the search should be begin. 912 * @param eflags The logical OR of any execution flags above. 913 * @return a non-null (but possibly zero-length) array of matches 914 */ 915 public REMatch[] getAllMatches(Object input, int index, int eflags) { 916 return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags); 917 } 918 919 // this has been changed since 1.03 to be non-overlapping matches 920 private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) { 921 Vector all = new Vector(); 922 REMatch m = null; 923 while ((m = getMatchImpl(input,index,eflags,null)) != null) { 924 all.addElement(m); 925 index = m.getEndIndex(); 926 if (m.end[0] == 0) { // handle pathological case of zero-length match 927 index++; 928 input.move(1); 929 } else { 930 input.move(m.end[0]); 931 } 932 if (!input.isValid()) break; 933 } 934 REMatch[] mset = new REMatch[all.size()]; 935 all.copyInto(mset); 936 return mset; 937 } 938 939 /* Implements abstract method REToken.match() */ 940 boolean match(CharIndexed input, REMatch mymatch) { 941 if (firstToken == null) return next(input, mymatch); 942 943 // Note the start of this subexpression 944 mymatch.start[subIndex] = mymatch.index; 945 946 return firstToken.match(input, mymatch); 947 } 948 949 /** 950 * Returns the first match found in the input. If no match is found, 951 * null is returned. 952 * 953 * @param input The input text. 954 * @return An REMatch instance referencing the match, or null if none. 955 */ 956 public REMatch getMatch(Object input) { 957 return getMatch(input,0,0); 958 } 959 960 /** 961 * Returns the first match found in the input, beginning 962 * the search at the specified index. If no match is found, 963 * returns null. 964 * 965 * @param input The input text. 966 * @param index The offset within the text to begin looking for a match. 967 * @return An REMatch instance referencing the match, or null if none. 968 */ 969 public REMatch getMatch(Object input, int index) { 970 return getMatch(input,index,0); 971 } 972 973 /** 974 * Returns the first match found in the input, beginning 975 * the search at the specified index, and using the specified 976 * execution flags. If no match is found, returns null. 977 * 978 * @param input The input text. 979 * @param index The offset index at which the search should be begin. 980 * @param eflags The logical OR of any execution flags above. 981 * @return An REMatch instance referencing the match, or null if none. 982 */ 983 public REMatch getMatch(Object input, int index, int eflags) { 984 return getMatch(input,index,eflags,null); 985 } 986 987 /** 988 * Returns the first match found in the input, beginning the search 989 * at the specified index, and using the specified execution flags. 990 * If no match is found, returns null. If a StringBuffer is 991 * provided and is non-null, the contents of the input text from the 992 * index to the beginning of the match (or to the end of the input, 993 * if there is no match) are appended to the StringBuffer. 994 * 995 * @param input The input text. 996 * @param index The offset index at which the search should be begin. 997 * @param eflags The logical OR of any execution flags above. 998 * @param buffer The StringBuffer to save pre-match text in. 999 * @return An REMatch instance referencing the match, or null if none. */ 1000 public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) { 1001 return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer); 1002 } 1003 1004 REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { 1005 // Create a new REMatch to hold results 1006 REMatch mymatch = new REMatch(numSubs, anchor, eflags); 1007 do { 1008 // Optimization: check if anchor + minimumLength > length 1009 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { 1010 if (match(input, mymatch)) { 1011 // Find longest match of them all to observe leftmost longest 1012 REMatch longest = mymatch; 1013 while ((mymatch = mymatch.next) != null) { 1014 if (mymatch.index > longest.index) { 1015 longest = mymatch; 1016 } 1017 } 1018 1019 longest.end[0] = longest.index; 1020 longest.finish(input); 1021 return longest; 1022 } 1023 } 1024 mymatch.clear(++anchor); 1025 // Append character to buffer if needed 1026 if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) { 1027 buffer.append(input.charAt(0)); 1028 } 1029 } while (input.move(1)); 1030 1031 return null; 1032 } 1033 1034 /** 1035 * Returns an REMatchEnumeration that can be used to iterate over the 1036 * matches found in the input text. 1037 * 1038 * @param input The input text. 1039 * @return A non-null REMatchEnumeration instance. 1040 */ 1041 public REMatchEnumeration getMatchEnumeration(Object input) { 1042 return getMatchEnumeration(input,0,0); 1043 } 1044 1045 1046 /** 1047 * Returns an REMatchEnumeration that can be used to iterate over the 1048 * matches found in the input text. 1049 * 1050 * @param input The input text. 1051 * @param index The offset index at which the search should be begin. 1052 * @return A non-null REMatchEnumeration instance, with its input cursor 1053 * set to the index position specified. 1054 */ 1055 public REMatchEnumeration getMatchEnumeration(Object input, int index) { 1056 return getMatchEnumeration(input,index,0); 1057 } 1058 1059 /** 1060 * Returns an REMatchEnumeration that can be used to iterate over the 1061 * matches found in the input text. 1062 * 1063 * @param input The input text. 1064 * @param index The offset index at which the search should be begin. 1065 * @param eflags The logical OR of any execution flags above. 1066 * @return A non-null REMatchEnumeration instance, with its input cursor 1067 * set to the index position specified. 1068 */ 1069 public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) { 1070 return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags); 1071 } 1072 1073 1074 /** 1075 * Substitutes the replacement text for the first match found in the input. 1076 * 1077 * @param input The input text. 1078 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1079 * @return A String interpolating the substituted text. 1080 * @see REMatch#substituteInto 1081 */ 1082 public String substitute(Object input,String replace) { 1083 return substitute(input,replace,0,0); 1084 } 1085 1086 /** 1087 * Substitutes the replacement text for the first match found in the input 1088 * beginning at the specified index position. Specifying an index 1089 * effectively causes the regular expression engine to throw away the 1090 * specified number of characters. 1091 * 1092 * @param input The input text. 1093 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1094 * @param index The offset index at which the search should be begin. 1095 * @return A String containing the substring of the input, starting 1096 * at the index position, and interpolating the substituted text. 1097 * @see REMatch#substituteInto 1098 */ 1099 public String substitute(Object input,String replace,int index) { 1100 return substitute(input,replace,index,0); 1101 } 1102 1103 /** 1104 * Substitutes the replacement text for the first match found in the input 1105 * string, beginning at the specified index position and using the 1106 * specified execution flags. 1107 * 1108 * @param input The input text. 1109 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1110 * @param index The offset index at which the search should be begin. 1111 * @param eflags The logical OR of any execution flags above. 1112 * @return A String containing the substring of the input, starting 1113 * at the index position, and interpolating the substituted text. 1114 * @see REMatch#substituteInto 1115 */ 1116 public String substitute(Object input,String replace,int index,int eflags) { 1117 return substituteImpl(makeCharIndexed(input,index),replace,index,eflags); 1118 } 1119 1120 private String substituteImpl(CharIndexed input,String replace,int index,int eflags) { 1121 StringBuffer buffer = new StringBuffer(); 1122 REMatch m = getMatchImpl(input,index,eflags,buffer); 1123 if (m==null) return buffer.toString(); 1124 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? 1125 replace : m.substituteInto(replace) ); 1126 if (input.move(m.end[0])) { 1127 do { 1128 buffer.append(input.charAt(0)); 1129 } while (input.move(1)); 1130 } 1131 return buffer.toString(); 1132 } 1133 1134 /** 1135 * Substitutes the replacement text for each non-overlapping match found 1136 * in the input text. 1137 * 1138 * @param input The input text. 1139 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1140 * @return A String interpolating the substituted text. 1141 * @see REMatch#substituteInto 1142 */ 1143 public String substituteAll(Object input,String replace) { 1144 return substituteAll(input,replace,0,0); 1145 } 1146 1147 /** 1148 * Substitutes the replacement text for each non-overlapping match found 1149 * in the input text, starting at the specified index. 1150 * 1151 * If the regular expression allows the empty string to match, it will 1152 * substitute matches at all positions except the end of the input. 1153 * 1154 * @param input The input text. 1155 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1156 * @param index The offset index at which the search should be begin. 1157 * @return A String containing the substring of the input, starting 1158 * at the index position, and interpolating the substituted text. 1159 * @see REMatch#substituteInto 1160 */ 1161 public String substituteAll(Object input,String replace,int index) { 1162 return substituteAll(input,replace,index,0); 1163 } 1164 1165 /** 1166 * Substitutes the replacement text for each non-overlapping match found 1167 * in the input text, starting at the specified index and using the 1168 * specified execution flags. 1169 * 1170 * @param input The input text. 1171 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1172 * @param index The offset index at which the search should be begin. 1173 * @param eflags The logical OR of any execution flags above. 1174 * @return A String containing the substring of the input, starting 1175 * at the index position, and interpolating the substituted text. 1176 * @see REMatch#substituteInto 1177 */ 1178 public String substituteAll(Object input,String replace,int index,int eflags) { 1179 return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags); 1180 } 1181 1182 private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) { 1183 StringBuffer buffer = new StringBuffer(); 1184 REMatch m; 1185 while ((m = getMatchImpl(input,index,eflags,buffer)) != null) { 1186 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? 1187 replace : m.substituteInto(replace) ); 1188 index = m.getEndIndex(); 1189 if (m.end[0] == 0) { 1190 char ch = input.charAt(0); 1191 if (ch != CharIndexed.OUT_OF_BOUNDS) 1192 buffer.append(ch); 1193 input.move(1); 1194 } else { 1195 input.move(m.end[0]); 1196 } 1197 1198 if (!input.isValid()) break; 1199 } 1200 return buffer.toString(); 1201 } 1202 1203 /* Helper function for constructor */ 1204 private void addToken(REToken next) { 1205 if (next == null) return; 1206 minimumLength += next.getMinimumLength(); 1207 if (firstToken == null) { 1208 lastToken = firstToken = next; 1209 } else { 1210 // if chain returns false, it "rejected" the token due to 1211 // an optimization, and next was combined with lastToken 1212 if (lastToken.chain(next)) { 1213 lastToken = next; 1214 } 1215 } 1216 } 1217 1218 private static REToken setRepeated(REToken current, int min, int max, int index) throws REException { 1219 if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 1220 return new RETokenRepeated(current.subIndex,current,min,max); 1221 } 1222 1223 private static int getPosixSet(char[] pattern,int index,StringBuffer buf) { 1224 // Precondition: pattern[index-1] == ':' 1225 // we will return pos of closing ']'. 1226 int i; 1227 for (i=index; i<(pattern.length-1); i++) { 1228 if ((pattern[i] == ':') && (pattern[i+1] == ']')) 1229 return i+2; 1230 buf.append(pattern[i]); 1231 } 1232 return index; // didn't match up 1233 } 1234 1235 private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException { 1236 // Precondition: input[index-1] == '{', minMax != null 1237 1238 boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES); 1239 int startIndex = index; 1240 if (index == input.length) { 1241 if (mustMatch) 1242 throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index); 1243 else 1244 return startIndex; 1245 } 1246 1247 int min,max=0; 1248 CharUnit unit = new CharUnit(); 1249 StringBuffer buf = new StringBuffer(); 1250 1251 // Read string of digits 1252 do { 1253 index = getCharUnit(input,index,unit); 1254 if (Character.isDigit(unit.ch)) 1255 buf.append(unit.ch); 1256 } while ((index != input.length) && Character.isDigit(unit.ch)); 1257 1258 // Check for {} tomfoolery 1259 if (buf.length() == 0) { 1260 if (mustMatch) 1261 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1262 else 1263 return startIndex; 1264 } 1265 1266 min = Integer.parseInt(buf.toString()); 1267 1268 if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) 1269 max = min; 1270 else if (index == input.length) 1271 if (mustMatch) 1272 throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index); 1273 else 1274 return startIndex; 1275 else if ((unit.ch == ',') && !unit.bk) { 1276 buf = new StringBuffer(); 1277 // Read string of digits 1278 while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch)) 1279 buf.append(unit.ch); 1280 1281 if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))) 1282 if (mustMatch) 1283 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1284 else 1285 return startIndex; 1286 1287 // This is the case of {x,} 1288 if (buf.length() == 0) max = Integer.MAX_VALUE; 1289 else max = Integer.parseInt(buf.toString()); 1290 } else 1291 if (mustMatch) 1292 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1293 else 1294 return startIndex; 1295 1296 // We know min and max now, and they are valid. 1297 1298 minMax.first = min; 1299 minMax.second = max; 1300 1301 // return the index following the '}' 1302 return index; 1303 } 1304 1305 /** 1306 * Return a human readable form of the compiled regular expression, 1307 * useful for debugging. 1308 */ 1309 public String toString() { 1310 StringBuffer sb = new StringBuffer(); 1311 dump(sb); 1312 return sb.toString(); 1313 } 1314 1315 void dump(StringBuffer os) { 1316 os.append('('); 1317 if (subIndex == 0) 1318 os.append("?:"); 1319 if (firstToken != null) 1320 firstToken.dumpAll(os); 1321 os.append(')'); 1322 } 1323 1324 // Cast input appropriately or throw exception 1325 private static CharIndexed makeCharIndexed(Object input, int index) { 1326 // We could let a String fall through to final input, but since 1327 // it's the most likely input type, we check it first. 1328 if (input instanceof String) 1329 return new CharIndexedString((String) input,index); 1330 else if (input instanceof char[]) 1331 return new CharIndexedCharArray((char[]) input,index); 1332 else if (input instanceof StringBuffer) 1333 return new CharIndexedStringBuffer((StringBuffer) input,index); 1334 else if (input instanceof InputStream) 1335 return new CharIndexedInputStream((InputStream) input,index); 1336 else if (input instanceof Reader) 1337 return new CharIndexedReader((Reader) input, index); 1338 else if (input instanceof CharIndexed) 1339 return (CharIndexed) input; // do we lose index info? 1340 else 1341 return new CharIndexedString(input.toString(), index); 1342 } 1343}