001/* 002 * gnu/regexp/RESyntax.java 003 * Copyright (C) 1998-2001 Wes Biggs 004 * 005 * This library is free software; you can redistribute it and/or modify 006 * it under the terms of the GNU Lesser General Public License as published 007 * by the Free Software Foundation; either version 2.1 of the License, or 008 * (at your option) any later version. 009 * 010 * This library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU Lesser General Public License for more details. 014 * 015 * You should have received a copy of the GNU Lesser General Public License 016 * along with this program; if not, write to the Free Software 017 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 018 */ 019 020package gnu.regexp; 021import java.io.Serializable; 022import java.util.BitSet; 023 024/** 025 * An RESyntax specifies the way a regular expression will be compiled. 026 * This class provides a number of predefined useful constants for 027 * emulating popular regular expression syntaxes. Additionally the 028 * user may construct his or her own syntax, using any combination of the 029 * syntax bit constants. The syntax is an optional argument to any of the 030 * matching methods on class RE. 031 * 032 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 033 */ 034 035public final class RESyntax implements Serializable { 036 static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator"); 037 038 private static final String SYNTAX_IS_FINAL = RE.getLocalizedMessage("syntax.final"); 039 040 private BitSet bits; 041 042 // true for the constant defined syntaxes 043 private boolean isFinal = false; 044 045 private String lineSeparator = DEFAULT_LINE_SEPARATOR; 046 047 // Values for constants are bit indexes 048 049 /** 050 * Syntax bit. Backslash is an escape character in lists. 051 */ 052 public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0; 053 054 /** 055 * Syntax bit. Use \? instead of ? and \+ instead of +. 056 */ 057 public static final int RE_BK_PLUS_QM = 1; 058 059 /** 060 * Syntax bit. POSIX character classes ([:...:]) in lists are allowed. 061 */ 062 public static final int RE_CHAR_CLASSES = 2; 063 064 /** 065 * Syntax bit. ^ and $ are special everywhere. 066 * <B>Not implemented.</B> 067 */ 068 public static final int RE_CONTEXT_INDEP_ANCHORS = 3; 069 070 /** 071 * Syntax bit. Repetition operators are only special in valid positions. 072 * <B>Not implemented.</B> 073 */ 074 public static final int RE_CONTEXT_INDEP_OPS = 4; 075 076 /** 077 * Syntax bit. Repetition and alternation operators are invalid 078 * at start and end of pattern and other places. 079 * <B>Not implemented</B>. 080 */ 081 public static final int RE_CONTEXT_INVALID_OPS = 5; 082 083 /** 084 * Syntax bit. Match-any-character operator (.) matches a newline. 085 */ 086 public static final int RE_DOT_NEWLINE = 6; 087 088 /** 089 * Syntax bit. Match-any-character operator (.) does not match a null. 090 */ 091 public static final int RE_DOT_NOT_NULL = 7; 092 093 /** 094 * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed. 095 */ 096 public static final int RE_INTERVALS = 8; 097 098 /** 099 * Syntax bit. No alternation (|), match one-or-more (+), or 100 * match zero-or-one (?) operators. 101 */ 102 public static final int RE_LIMITED_OPS = 9; 103 104 /** 105 * Syntax bit. Newline is an alternation operator. 106 */ 107 public static final int RE_NEWLINE_ALT = 10; // impl. 108 109 /** 110 * Syntax bit. Intervals use { } instead of \{ \} 111 */ 112 public static final int RE_NO_BK_BRACES = 11; 113 114 /** 115 * Syntax bit. Grouping uses ( ) instead of \( \). 116 */ 117 public static final int RE_NO_BK_PARENS = 12; 118 119 /** 120 * Syntax bit. Backreferences not allowed. 121 */ 122 public static final int RE_NO_BK_REFS = 13; 123 124 /** 125 * Syntax bit. Alternation uses | instead of \| 126 */ 127 public static final int RE_NO_BK_VBAR = 14; 128 129 /** 130 * Syntax bit. <B>Not implemented</B>. 131 */ 132 public static final int RE_NO_EMPTY_RANGES = 15; 133 134 /** 135 * Syntax bit. An unmatched right parenthesis (')' or '\)', depending 136 * on RE_NO_BK_PARENS) will throw an exception when compiling. 137 */ 138 public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16; 139 140 /** 141 * Syntax bit. <B>Not implemented.</B> 142 */ 143 public static final int RE_HAT_LISTS_NOT_NEWLINE = 17; 144 145 /** 146 * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?). 147 */ 148 public static final int RE_STINGY_OPS = 18; 149 150 /** 151 * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W). 152 */ 153 public static final int RE_CHAR_CLASS_ESCAPES = 19; 154 155 /** 156 * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved). 157 */ 158 public static final int RE_PURE_GROUPING = 20; 159 160 /** 161 * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression 162 * to the text following the current position without consuming that text. 163 */ 164 public static final int RE_LOOKAHEAD = 21; 165 166 /** 167 * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z). 168 */ 169 public static final int RE_STRING_ANCHORS = 22; 170 171 /** 172 * Syntax bit. Allow embedded comments, (?#comment), as in Perl5. 173 */ 174 public static final int RE_COMMENTS = 23; 175 176 /** 177 * Syntax bit. Allow character class escapes within lists, as in Perl5. 178 */ 179 public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24; 180 181 private static final int BIT_TOTAL = 25; 182 183 /** 184 * Predefined syntax. 185 * Emulates regular expression support in the awk utility. 186 */ 187 public static final RESyntax RE_SYNTAX_AWK; 188 189 /** 190 * Predefined syntax. 191 * Emulates regular expression support in the ed utility. 192 */ 193 public static final RESyntax RE_SYNTAX_ED; 194 195 /** 196 * Predefined syntax. 197 * Emulates regular expression support in the egrep utility. 198 */ 199 public static final RESyntax RE_SYNTAX_EGREP; 200 201 /** 202 * Predefined syntax. 203 * Emulates regular expression support in the GNU Emacs editor. 204 */ 205 public static final RESyntax RE_SYNTAX_EMACS; 206 207 /** 208 * Predefined syntax. 209 * Emulates regular expression support in the grep utility. 210 */ 211 public static final RESyntax RE_SYNTAX_GREP; 212 213 /** 214 * Predefined syntax. 215 * Emulates regular expression support in the POSIX awk specification. 216 */ 217 public static final RESyntax RE_SYNTAX_POSIX_AWK; 218 219 /** 220 * Predefined syntax. 221 * Emulates POSIX basic regular expression support. 222 */ 223 public static final RESyntax RE_SYNTAX_POSIX_BASIC; 224 225 /** 226 * Predefined syntax. 227 * Emulates regular expression support in the POSIX egrep specification. 228 */ 229 public static final RESyntax RE_SYNTAX_POSIX_EGREP; 230 231 /** 232 * Predefined syntax. 233 * Emulates POSIX extended regular expression support. 234 */ 235 public static final RESyntax RE_SYNTAX_POSIX_EXTENDED; 236 237 /** 238 * Predefined syntax. 239 * Emulates POSIX basic minimal regular expressions. 240 */ 241 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC; 242 243 /** 244 * Predefined syntax. 245 * Emulates POSIX extended minimal regular expressions. 246 */ 247 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED; 248 249 /** 250 * Predefined syntax. 251 * Emulates regular expression support in the sed utility. 252 */ 253 public static final RESyntax RE_SYNTAX_SED; 254 255 /** 256 * Predefined syntax. 257 * Emulates regular expression support in Larry Wall's perl, version 4, 258 */ 259 public static final RESyntax RE_SYNTAX_PERL4; 260 261 /** 262 * Predefined syntax. 263 * Emulates regular expression support in Larry Wall's perl, version 4, 264 * using single line mode (/s modifier). 265 */ 266 public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s) 267 268 /** 269 * Predefined syntax. 270 * Emulates regular expression support in Larry Wall's perl, version 5. 271 */ 272 public static final RESyntax RE_SYNTAX_PERL5; 273 274 /** 275 * Predefined syntax. 276 * Emulates regular expression support in Larry Wall's perl, version 5, 277 * using single line mode (/s modifier). 278 */ 279 public static final RESyntax RE_SYNTAX_PERL5_S; 280 281 static { 282 // Define syntaxes 283 284 RE_SYNTAX_EMACS = new RESyntax().makeFinal(); 285 286 RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax() 287 .set(RE_CHAR_CLASSES) 288 .set(RE_DOT_NEWLINE) 289 .set(RE_DOT_NOT_NULL) 290 .set(RE_INTERVALS) 291 .set(RE_NO_EMPTY_RANGES) 292 .makeFinal(); 293 294 RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 295 .set(RE_BK_PLUS_QM) 296 .makeFinal(); 297 298 RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 299 .set(RE_CONTEXT_INDEP_ANCHORS) 300 .set(RE_CONTEXT_INDEP_OPS) 301 .set(RE_NO_BK_BRACES) 302 .set(RE_NO_BK_PARENS) 303 .set(RE_NO_BK_VBAR) 304 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 305 .makeFinal(); 306 307 RE_SYNTAX_AWK = new RESyntax() 308 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 309 .set(RE_DOT_NOT_NULL) 310 .set(RE_NO_BK_PARENS) 311 .set(RE_NO_BK_REFS) 312 .set(RE_NO_BK_VBAR) 313 .set(RE_NO_EMPTY_RANGES) 314 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 315 .makeFinal(); 316 317 RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED) 318 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 319 .makeFinal(); 320 321 RE_SYNTAX_GREP = new RESyntax() 322 .set(RE_BK_PLUS_QM) 323 .set(RE_CHAR_CLASSES) 324 .set(RE_HAT_LISTS_NOT_NEWLINE) 325 .set(RE_INTERVALS) 326 .set(RE_NEWLINE_ALT) 327 .makeFinal(); 328 329 RE_SYNTAX_EGREP = new RESyntax() 330 .set(RE_CHAR_CLASSES) 331 .set(RE_CONTEXT_INDEP_ANCHORS) 332 .set(RE_CONTEXT_INDEP_OPS) 333 .set(RE_HAT_LISTS_NOT_NEWLINE) 334 .set(RE_NEWLINE_ALT) 335 .set(RE_NO_BK_PARENS) 336 .set(RE_NO_BK_VBAR) 337 .makeFinal(); 338 339 RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP) 340 .set(RE_INTERVALS) 341 .set(RE_NO_BK_BRACES) 342 .makeFinal(); 343 344 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 345 346 RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 347 .makeFinal(); 348 349 RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 350 .makeFinal(); 351 352 RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 353 .set(RE_LIMITED_OPS) 354 .makeFinal(); 355 356 /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 357 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 358 359 RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 360 .set(RE_CONTEXT_INDEP_ANCHORS) 361 .set(RE_CONTEXT_INVALID_OPS) 362 .set(RE_NO_BK_BRACES) 363 .set(RE_NO_BK_PARENS) 364 .set(RE_NO_BK_REFS) 365 .set(RE_NO_BK_VBAR) 366 .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 367 .makeFinal(); 368 369 /* There is no official Perl spec, but here's a "best guess" */ 370 371 RE_SYNTAX_PERL4 = new RESyntax() 372 .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 373 .set(RE_CONTEXT_INDEP_ANCHORS) 374 .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently 375 .set(RE_INTERVALS) 376 .set(RE_NO_BK_BRACES) 377 .set(RE_NO_BK_PARENS) 378 .set(RE_NO_BK_VBAR) 379 .set(RE_NO_EMPTY_RANGES) 380 .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S 381 .makeFinal(); 382 383 RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4) 384 .set(RE_DOT_NEWLINE) 385 .makeFinal(); 386 387 RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4) 388 .set(RE_PURE_GROUPING) // (?:) 389 .set(RE_STINGY_OPS) // *?,??,+?,{}? 390 .set(RE_LOOKAHEAD) // (?=)(?!) 391 .set(RE_STRING_ANCHORS) // \A,\Z 392 .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within [] 393 .set(RE_COMMENTS) // (?#) 394 .makeFinal(); 395 396 RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5) 397 .set(RE_DOT_NEWLINE) 398 .makeFinal(); 399 } 400 401 /** 402 * Construct a new syntax object with all bits turned off. 403 * This is equivalent to RE_SYNTAX_EMACS. 404 */ 405 public RESyntax() { 406 bits = new BitSet(BIT_TOTAL); 407 } 408 409 /** 410 * Called internally when constructing predefined syntaxes 411 * so their interpretation cannot vary. Conceivably useful 412 * for your syntaxes as well. Causes IllegalAccessError to 413 * be thrown if any attempt to modify the syntax is made. 414 * 415 * @return this object for convenient chaining 416 */ 417 public RESyntax makeFinal() { 418 isFinal = true; 419 return this; 420 } 421 422 /** 423 * Construct a new syntax object with all bits set the same 424 * as the other syntax. 425 */ 426 public RESyntax(RESyntax other) { 427 bits = (BitSet) other.bits.clone(); 428 } 429 430 /** 431 * Check if a given bit is set in this syntax. 432 */ 433 public boolean get(int index) { 434 return bits.get(index); 435 } 436 437 /** 438 * Set a given bit in this syntax. 439 * 440 * @param index the constant (RESyntax.RE_xxx) bit to set. 441 * @return a reference to this object for easy chaining. 442 */ 443 public RESyntax set(int index) { 444 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 445 bits.set(index); 446 return this; 447 } 448 449 /** 450 * Clear a given bit in this syntax. 451 * 452 * @param index the constant (RESyntax.RE_xxx) bit to clear. 453 * @return a reference to this object for easy chaining. 454 */ 455 public RESyntax clear(int index) { 456 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 457 bits.clear(index); 458 return this; 459 } 460 461 /** 462 * Changes the line separator string for regular expressions 463 * created using this RESyntax. The default separator is the 464 * value returned by the system property "line.separator", which 465 * should be correct when reading platform-specific files from a 466 * filesystem. However, many programs may collect input from 467 * sources where the line separator is differently specified (for 468 * example, in the applet environment, the text box widget 469 * interprets line breaks as single-character newlines, 470 * regardless of the host platform. 471 * 472 * Note that setting the line separator to a character or 473 * characters that have specific meaning within the current syntax 474 * can cause unexpected chronosynclastic infundibula. 475 * 476 * @return this object for convenient chaining 477 */ 478 public RESyntax setLineSeparator(String aSeparator) { 479 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL); 480 lineSeparator = aSeparator; 481 return this; 482 } 483 484 /** 485 * Returns the currently active line separator string. The default 486 * is the platform-dependent system property "line.separator". 487 */ 488 public String getLineSeparator() { 489 return lineSeparator; 490 } 491}