Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migrated to Confluence 5.3

Matching Strings to Patterns

We can define string patterns, aka "Regular Expressions" or "Regexes", and see if a String matches it:

Code Block
assert 'abc' ==~ /abc/ //pattern on righthand side between single-slashes
assert ! ( 'abc' ==~ /ace/ )
assert ! ( 'abc' ==~ /ab/ )

assert 'abc' ==~ /a.c/
    //the . in the pattern matches any character, except \n (or \r\n on Windows)
assert 'abc'.matches( /a.c/ ) //alternative method name
assert java.util.regex.Pattern.matches( /a.c/, 'abc' ) //alternative syntax
assert java.util.regex.Pattern.compile( /a.c/ ).matcher( 'abc' ).matches()
                                                       //alternative syntax

assert '\t\n\f\r' ==~ /\t\n\f\r/
    //some control chars have same notation as in strings
assert '\t\n\f\r' ==~ /\x09\x0a\x0c\x0D/
    //alternatively use hex codes (leading zero required to make 2 digits)
assert '\t\n\f\r' ==~ /\011\012\014\015/
    //alternatively use octal codes (leading zero required)
assert '\b' ==~ /\x08/ && ! ( '\b' ==~ /\b/ )
    // \b has different meaning in regex than in string
assert '\07\013\033' ==~ /\a\v\e/
    //regex-only notation: bell \a, vertical tab \v, escape \e

Twelve characters that are special syntax for regexes need to be quoted:

Code Block
assert 'a.c' ==~ /a\.c/ //backslash before . to quote it
assert '.{[()\\^$|?*+' ==~ /\.\{\[\(\)\\\^\$\|\?\*\+/
    //the 12 chars that need quoting
assert '.{[()\\^$|?*+' ==~ /\Q.{[()\^$|?*+\E/
    //another way to quote text is to bracket with \Q and \E
import java.util.regex.Pattern
assert Pattern.quote( /.{[()\^$|?*+/ ) == /\Q.{[()\^$|?*+\E/
    //a special method to quote text in this way

The chars \c@, \cA, \cB, ..., \cZ, \c[, \c], \c^, and \c_ map to the special characters 0x0 to 0x1f, except 0x1c:

Code Block
//each \cX escape in a pattern stands for one control character;
//every assert below pairs a character code with the escape that matches it
assert "${0x0 as char}" =~ /\c@/
('A'..'Z').eachWithIndex{ letter, i->
  assert "${(i + 1) as char}" =~ /\c${letter}/
      // \cA is 0x1, \cB is 0x2, ..., \cZ is 0x1a: letters and codes in step
}
assert "${0x1b as char}" =~ /\c[/
assert "${0x1d as char}" =~ /\c]/
assert "${0x1e as char}" =~ /\c^/
assert "${0x1f as char}" =~ /\c_/

We have special pattern syntax for whitespace \s, word characters \w, digits \d, and their complements:

Code Block
assert (0x0..0x7F).findAll{ (it as char) ==~ /\s/ } ==
    ['\t', '\n', '\013', '\f', '\r', ' '].collect{it as int}
assert (0x0..0x7F).findAll{ (it as char) ==~ /\w/ } ==
    [*'0'..'9', *'A'..'Z', '_', *'a'..'z'].collect{it as int}
assert (0x0..0x7F).findAll{ (it as char) ==~ /\d/ } ==
    ('0'..'9').collect{it as int}

[ [/\w/, /\W/], [/\d/, /\D/], [/\s/, /\S/] ].each{ pair->
  assert (0x0..0x7F).findAll{ (it as char) ==~ pair[0] &&
         (it as char) ==~ pair[1] }.size() == 0
} // \S means not \s; \W means not \w; \D means not \d

There are certain characters that the dot . doesn't match, except when (?s) is used:

Code Block
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /./ ) } ==
  ['\n' as int, '\r' as int]
    //chars that . doesn't match //also: 0x85, 0x2028, 0x2029
assert 'abc\ndef' ==~ /a.c\ndef/
assert !( 'abc\ndef' ==~ /abc.def/ ) //the . doesn't match \n

assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?s)./ ) } == []
    //when (?s) used, . matches every character
assert 'abc\r\ndef' ==~ /(?s)abc..def/ && !( 'abc\r\ndef' ==~ /(?s)abc.def/ )
    //on Windows, \r\n needs .. for match

assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?d)./ ) } == ['\n' as int]
    //only char that . doesn't match for (?d) flag
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?sd)./ ) } == []
    // (?sd) together same as (?s) alone

Some other flags:

Code Block
assert ( 'gOoDbYe' ==~ /(?i)goodbye/ )
    //when (?i) used, case-insensitive matching for ASCII characters

assert 'an ace' ==~ /(?x) an\ ace #comment here after hash/
    //quote the space, ignore unquoted whitespace and comments

Some other ways to use flags:

Code Block
assert 'abcDEFG' ==~ /abc(?i)defg/
    //turn on flag halfway thru pattern
assert 'abCDefg' ==~ /ab(?i)cd(?-i)efg/
    //turn flag on, then off again
assert 'abCDEfg' ==~ /ab(?i:cde)fg/
    //turn flag on for only a certain span of text
assert 'ABcdeFG' ==~ /(?i)ab(?-i:cde)fg/
    //turn flag on, but off for only a certain span

assert 'abcdefg' ==~ /abc(?ix) d e f g #comment here/
    //turn more than one flag on together
assert 'abcdefg' ==~ /(?ix) a b c (?-ix)defg/
    //turn more than one flag off together
assert 'abcdefg' ==~ /(?ix) a b c (?s-ix)defg/
    //turn some flag(s) on and other flag(s) off together

import java.util.regex.Pattern
assert Pattern.compile(/abc.def/, Pattern.DOTALL).matcher('abc\ndef').matches()
    //alternative to (?s)
assert ! Pattern.compile(/abc.def/, Pattern.UNIX_LINES).
    matcher('abc\ndef').matches() //alternative to (?d)
assert Pattern.compile(/goodbye/, Pattern.CASE_INSENSITIVE).
    matcher('gOoDbYe').matches() //alternative to (?i)
assert Pattern.compile(/ an\ ace #comment here/, Pattern.COMMENTS).
    matcher('an ace').matches() //alternative to (?x)

//we can enquire the flags set at the end-point of a pattern...
import java.util.regex.Pattern
assert Pattern.compile(/ab(?i)c.def/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL + Pattern.CASE_INSENSITIVE
assert Pattern.compile(/ab(?i)c.d(?-i)ef/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL
assert Pattern.compile(/ab(?i:c.d)ef/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL

A character class is a set of characters, one of which may be matched. We've already seen the predefined character classes \s, \w, \d, \S, \W, \D. We can also define our own:

Code Block
['bat', 'bet', 'bit', 'bot', 'but'].each{ assert it ==~ /b[aeiou]t/ }
    //[aeiou] matches one of a,e,i,o,u
assert ! ('bnt' ==~ /b[aeiou]t/)

['bat', 'bet', 'bit', 'bot', 'but'].each{ assert ! (it ==~ /b[^aeiou]t/) }
    //[^aeiou] matches anything except a,e,i,o,u...
['bbt', 'bxt', 'b%t', 'b)t', 'b*t', 'b\nt'].each{ assert it ==~ /b[^aeiou]t/ }
    //...even newlines

assert 'b' ==~ /[abbbc]/ //duplicate chars in character class have no effect
assert '&' ==~ /[a&]/ &&
       !('&' ==~ /[a&&z]/) &&
       '&' ==~ /[a&&&]/ &&
       !('&' ==~ /[a&&]/) &&
       '&' ==~ /[a&\&]/ //all legal syntax

[ /[a-j]/: [*'a'..'j'],
     //we can specify a range of characters inside a class using hyphen -
  /[_a-zA-Z]/: [*'A'..'Z', '_', *'a'..'z'],
     //we can have many ranges mixed with single characters
  /[_a-z[A-Z]]/: [*'A'..'Z', '_', *'a'..'z'],
     //same effect as [_a-zA-Z]
  /[a-m&&g-z]/: [*'g'..'m'],
     //&& is intersection operator
  /[a-z&&[^bc]]/: ['a', *'d'..'z'],
     //^ means 'not' everything in the character class
  /[a-z&&[^m-p]]/: [*'a'..'l', *'q'..'z'],
     //&& with ^ works like subtraction
  /[^\d\s]/: [*0x0..0x7F].collect{ it as char } - [*'\t'..'\r', ' ', *'0'..'9'],
     //not digit AND not whitespace
  /[\D\S]/: [*0x0..0x7F].collect{ it as char },
     //not equal to above, but means: not digit OR not whitespace
].each{ regex, validVals->
  assert (0x0..0x7F).findAll{ (it as char) ==~ regex } ==
    validVals.collect{ it as int }
}

The only meta-characters inside a character class are \, [, ^ (in the first position), ] (not in the first position or after the ^), - (not in the first position, after the ^, or before the ]), and &&. Quote them with a backslash \ to get the literal character. The other usual meta-characters are normal characters inside a character class, and do not need to be quoted with a backslash, though they can be. Character class precedences are, from highest: literal escapes (eg \s), grouping (eg [abc]), ranges (eg a-g), unions (eg [abc][xyz]), then intersections ([a-z&&[gjpqy]]).

We can use the alternation operator | to give some options:

Code Block
['abc', 'def', 'xyz'].each{ assert it ==~ /abc|def|xyz/ }
['abcz', 'aijz', 'axyz'].each{ assert it ==~ /a(bc|ij|xy)z/ }
    //we delimit the alternation with parentheses

//when using longhand syntax, we can see what option was matched, using groups,
//which we'll meet soon:
def m= java.util.regex.Pattern.compile( /a(bc|ij|xy)z/ ).matcher( 'abcz' )
m.matches()
assert m.group(1) == 'bc' //whatever was matched between the parens

We use ? to indicate optional character/s:

Code Block
['0 days', '1 day', '2 days'].each{ assert it ==~ /. days?/ }
['Mon', 'Monday'].each{ assert it ==~ /Mon(day)?/ }

Use {n} to match a character exactly n times:

Code Block
//a {n} quantifier repeats the token just before it exactly n times
assert 'aaab' ==~ /a{3}b/
assert 'abcabc' ==~ /(abc){2}/ // {n} can apply to a multi-character sequence
['ab', 'ba', 'bb', 'aa'].each{ assert it ==~ /[ab]{2}/ }
    // {n} can apply to a character class
['abab', '%&@b'].each{ assert it ==~ /.{3}b/ }
    // {n} can apply to the dot, ie, to any character

We can match a character a variable number of times. Use the * operator to match any number of a character:

Code Block
//Use * to match zero or more occurrences of the preceding token:
['aaab', 'aab', 'ab', 'b'].each{ assert it ==~ /a*b/ }
    //even zero occurrences of the character are matched
['abcabc', 'abc', ''].each{ assert it ==~ /(abc)*/ }
    // * can apply to a multi-character sequence
['abbacb', 'acaba', 'cbbbac', 'c', ''].each{ assert it ==~ /[abc]*/ }
    // * can apply to a character class
['aaab', 'b', 'abab'].each{ assert it ==~ /.*b/ }
    // * is greedy: in 'abab' .* matches 'aba'

//Use + to match at least one occurrence of a character:
['aaab', 'aab', 'ab'].each{ assert it ==~ /a+b/ }
assert !( 'b' ==~ /a+b/ ) //at least one 'a' is required
assert 'abcabcxz' ==~ /(abc)+[xyz]+/
    // + can apply to character class or multi-character sequence

//Other variable-length repetition operators:
assert 'aaaab' ==~ /a{3,}b/ // {n,} matches at least n characters
assert 'aaaab' ==~ /a{3,5}b/ // {n1,n2} matches between n1 and n2 characters
assert 'abaxyzxyz' ==~ /[ab]{2,}(xyz){2,4}/
    //these also can apply to multi-character sequences or character classes

By using longhand syntax, we see that * operator is greedy, repeating the preceding token as often as possible, returning the leftmost longest match:

Code Block
def m= java.util.regex.Pattern.compile( /(.*),(.*)/ ).matcher( 'one,two,three' )
m.matches()
assert m.group(1) == 'one,two' //what was matched between the first parens
assert m.group(2) == 'three'   //what was matched between the second parens

assert m.hasGroup() //misc method to check whether the pattern has groups
assert m.groupCount() == 2 //misc method to count them

Anything between parentheses is a capturing group, whose matched values can be accessed later:

Code Block
//we can access matched values in groups outside the pattern using 
//longhand syntax...
def m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5
assert m.group(0) == 'aaabb' //group(0) is the entire string
assert m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
    //parameters default to 0

//...or outside the pattern using indexing (don't forget the first [0] index)...
m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m[0][0] == 'aaabb' //the entire string
assert m[0][1] == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m[0][2] == 'bb' && m.start(2) == 3 && m.end(2) == 5

//...or within the pattern using \n notation:
assert 'aaabb,aaa,bb' ==~ /(a*)(b*),\1,\2/
    // \1 is the first group matched, \2 the second matched

assert 'abbcc,abb,bb,cc' ==~ /(a(b*))(c*),\1,\2,\3/
    //groups numbered by sequence of their opening parens from left to right
assert 'abcddd,ab,ddd' ==~ /(a(?:b))(?>c)(d*),\1,\2/
    //groups beginning with ?: or ?> aren't numbered
assert 'aba,a,b' ==~ /(a(b)?)+,\1,\2/
   //second match for \1 has no match for \2, so \2 keeps value from first match

assert 'abc,bc' ==~ /a(bc)?,\1/
assert !( 'a,' ==~ /a(bc)?,\1/ )
    //referencing \1 causes entire match to fail if it hasn't already matched
assert !( 'a' ==~ /([abc]\1)/ )
    //referencing a group within itself causes entire match to fail

\1 through \9 in patterns are always interpreted as group references, and a backslash-escaped number greater than 9 is treated as a group reference if at least that many groups exist at that point in the string pattern. Otherwise digits are dropped until either the number is smaller or equal to the existing number of groups or it is one digit. Grouping parentheses and group references cannot be used inside character classes.

Some miscellaneous methods:

Code Block
def m= ( ~/(a*)|bc/ ).matcher( 'bc' ) //another longhand syntax
m.matches()
assert m.group(1) == null && m.start(1) == -1 && m.end(1) == -1
                         //if match successful but group didn't match anything

def p= java.util.regex.Pattern.compile( /ab*c/ )
assert p.pattern() == /ab*c/ //retrieve the definition from a compiled pattern

Finding Patterns in Strings

As well as matching an entire string to a pattern, we can also find a pattern within a string using =~ syntax:

Code Block
assert 'abcdefg' =~ /cde/ //is 'cde' within 'abcdefg'?
assert ! ( 'abcdefg' =~ /ace/ )
assert java.util.regex.Pattern.compile( /cde/ ).matcher( 'abcdefg' ).find()
                                                          //alternative syntax

assert 'xxx z9g\t\nxxx' =~ /\s\w\d.\t\n/
    //special characters work the same as with ==~ matching
assert ( 'xxxgOoDbYexxx' =~ /(?i)goodbye/ )
    //flags also work the same as with ==~
assert 'xxxbatxxx' =~ /b[aeiou]t/
    //character classes also work the same as with ==~

There can be more than one occurrence of the pattern:

Code Block
//the same pattern occurs twice in s: as 'horse' and as 'house'
def s= 'horse house'
assert s =~ /ho.se/ //to check whether there is at least one occurrence
def m= (s =~ /ho.se/)
assert m.size() == 2 && m[0] == 'horse' && m[1] == 'house'
    //to retrieve all occurrences

def l= []
s.eachMatch( /ho.se/ ){ l << it[0] } //alternative syntax, be sure to use it[0]
assert l == ['horse', 'house']
def l2= []
s.eachMatch( /abc/ ){ l2 << it[0] } //no matches
assert l2 == []
def l3= []
s.eachMatch( /hor./ ){ l3 << it[0] } //one match only
assert l3 == ['hors']

Some longhand syntax, with various methods:

Code Block
import java.util.regex.Pattern
//s holds three occurrences of the pattern: 'hoose', 'horse' and 'house'
def s= 'hoose horse house'
def m= Pattern.compile(/ho.se/).matcher(s)
//each find() moves on to the next occurrence; start() and end() delimit it
assert m.find() && s[m.start()..<m.end()] == 'hoose'
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find() && s[m.start()..<m.end()] == 'house'
assert ! m.find()
assert m.reset() && s[m.start()..<m.end()] == 'hoose'
    //use reset() to find from beginning
    //(the matcher returned by reset() is used as a boolean here; start() works
    //straight afterwards, so that evaluation presumably performs a find())
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find(1) && s[m.start()..<m.end()] == 'horse'
    //giving a parameter to find() starts finding from that index
m.setIndex(1)
    //alternatively, calling setIndex() resets from that index, without finding
    //until find() called
    //NOTE(review): setIndex() looks like it takes a match index rather than a
    //character offset (unlike find(int)) -- confirm against the Groovy GDK
    //docs; with this input either reading makes the next find() give 'horse'

assert m.find() && s[m.start()..<m.end()] == 'horse'

We can group when finding with =~ just as we do when matching with ==~:

Code Block
def m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m.size() == 2
assert m.count == 2 //alternative to size()
assert m[0] == ['mistle', 'i', 't']
assert m[0].size() == 3 && m[0][0] == 'mistle' &&
       m[0][1] == 'i' && m[0][2] == 't'
assert m[1] == ['muscle', 'u', 'c']
assert m[1].size() == 3 && m[1][0] == 'muscle' &&
       m[1][1] == 'u' && m[1][2] == 'c'

//using the eachMatch() method...
def l= []
'mistlemuscle'.eachMatch( /m(.)s(.)le/ ){ l << it }
assert l*.toList() == [['mistle', 'i', 't'], ['muscle', 'u', 'c']]
def l2= []
'mistle'.eachMatch( /m(.)s(.)le/ ){ l2 << it }
assert l2*.toList() == [['mistle', 'i', 't']]
def l3= []
'practical'.eachMatch( /m(.)s(.)le/ ){ l3 << it }
assert l3*.toList() == []

//using longhand notation...
import java.util.regex.Pattern
m= Pattern.compile( /(a+)(b+)/ ).matcher( 'aaabbcccaabbb' )
m.find()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3 &&
       m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5 &&
       m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
m.find()
assert m.group(1) == 'aa' && m.start(1) == 8 && m.end(1) == 10 &&
       m.group(2) == 'bbb' && m.start(2) == 10 && m.end(2) == 13 &&
       m.group() == 'aabbb' && m.start() == 8 && m.end() == 13

Calling collect() and each() requires some special tricks to work:

Code Block
def m= ('redeem coffee' =~ /ee/)
assert m.collect{it} == ['ee', 'ee']
    //when calling collect() on a pattern with no groups...
assert m.collect{it} == []
    //...we must call reset() if we want to access the found matches again
m.reset()
assert m.collect{it} == ['ee', 'ee']

def l= [] //ditto for each()
m.each{ l << it }
assert l == []
m.reset()
l= []
m.each{ l << it }
assert l == ['ee', 'ee']

l= [] //ditto for eachWithIndex
m.eachWithIndex{it, i-> l << it+i }
assert l == []
m.reset()
l= []
m.eachWithIndex{it, i-> l << it+i }
assert l == ['ee0', 'ee1']

m= ('play the game\nfollow the rules' =~ /(?m)^(.*?) the (.*?)$/)
    //for a pattern with groups...
l= []
m.each{g0, g1, g2-> l << [g0, g1, g2] }
    //...we must pass the groups separately to the closure of each()
assert l == [['play the game', 'play', 'game'],
             ['follow the rules', 'follow', 'rules']]

m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m[1] == ['muscle', 'u', 'c']
assert m.group(0) == 'muscle' && m.group(1) == 'u' && m.group(2) == 'c'
    //only call group() after using subscripting first

Aggregate functions we can use are:

Code Block
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    findAll{ it[1] == 'a' } == ['tame', 'tape', 'take']
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    find{ it[1] == 'a' } == 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    findIndexOf{ it[1] == 'a' } == 2 //index of 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    any{ it[1] == 'a' }
assert ! ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    every{ it[1] == 'a' }

The sequence of text joined by operators such as | ? * + {} has no effect on the success of the ==~ matcher, but does affect what's found with the =~ finder. The first choice of the | is found first, and backtracking to the second choice is only tried if necessary. The choice of the ? is tried first, and backtracking to ignore the choice only tried if necessary. As much as possible of the * + {} is found first, and backtracking to find less text only tried if necessary.

Code Block
assert ('abcdefg' =~ /bcd|bcdef/)[0] == 'bcd'
assert ('abcdefg' =~ /bcdef|bcd/)[0] == 'bcdef'
    //first choice always tried first

assert ('Friday 13th' =~ /Fri(day)?/)[0][0] == 'Friday'

assert ('Say "hello" and "goodbye" to the world!' =~ /".*"/)[0] ==
    '"hello" and "goodbye"'
l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /"[^"]*"/ ){ l << it }
    //use NOT DOUBLE-QUOTES instead of ANY CHARACTER
assert l*.toList() == [['"hello"'], ['"goodbye"']]

Because the ? and * operators can match nothing, they may not always be intuitive to understand:

Code Block
def m= ('grgggr'=~/g?/)
def l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', '', 'g', 'g', 'g', '', '']
    // ? also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g*/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', '', 'ggg', '', '']
    // * also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g+/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', 'ggg'] // + repetition is the most intuitive to use

By putting a ? after the operators ? * + {}, we can make them "lazy" instead of "greedy", that is, as little as possible is found first, and backtracking to find MORE text is tried if necessary:

Code Block
assert ('Friday 13th' =~ /Fri(day)??/)[0][0] == 'Fri' //instead of 'Friday'

def l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /".*?"/ ){ l << it }
assert l*.toList() == [['"hello"'], ['"goodbye"']]

We've seen some longhand methods such as 'find', 'matches', 'start', and 'end'. There are many more such methods:

Code Block
def s= 'a quick quick dog'
def m= (s =~ /a.*k/)
    //starts at the beginning, but doesn't try to match the entire string
assert m.lookingAt() && s[m.start()..<m.end()] == 'a quick quick'

//replaceFirst...
assert (s =~ /quick/).replaceFirst('fast') == 'a fast quick dog'
assert (s =~ /qu(ick)/).replaceFirst('kw$1') == 'a kwick quick dog'
    //can reference groups in pattern using $
assert (s =~ /qu(ick)/).replaceFirst('kw\\$1') == 'a kw$1 quick dog'
    //include literal $ by writing \$, escaping \ as \\

//utility method to create a literal replacement String for the given String...
import java.util.regex.Matcher
assert Matcher.quoteReplacement( 'kw$1' ) == 'kw\\$1'
assert (s =~ /qu(ick)/).replaceFirst( Matcher.quoteReplacement( 'kw$1' ) ) ==
    'a kw$1 quick dog'

//we can mix GStrings and replacement group refs by mixing single-quoted and
//double-quoted strings...
def ice= 'ice cream'
assert ('some malting beer' =~ /a(lting ).*/).replaceFirst('e$1' + "$ice") ==
    'some melting ice cream'

//replaceAll...
assert (s =~ /quick/).replaceAll('fast') == 'a fast fast dog'
s= 'a quickly quacking duck'
assert (s =~ /qu(.)ck/).replaceAll('kw$1ck') == 'a kwickly kwacking duck'

//another shorthand...
assert 'a quick quick dog'.replaceFirst(/qu(ick)/, 'kw\\$1') ==
   'a kw$1 quick dog'
assert 'a quickly quacking duck'.replaceAll(/qu(.)ck/, 'kw$1ck') ==
   'a kwickly kwacking duck'

//'appendReplacement' and 'appendTail' should be used together for more
//complex replacements...
m= 'one banana two havana three matana four' =~ /(.a.)ana/
def i=0, sb= new StringBuffer()
while( m.find() ) m.appendReplacement(sb, '$1a' + 'na'*i++)
m.appendTail(sb)
assert sb.toString() == 'one bana two havana three matanana four'

Similarly to back-references in patterns, $1 through $9 in replacement strings are always interpreted as group references, and a dollar-escaped number greater than 9 is treated as a group reference if at least that many groups exist in the string pattern. Otherwise digits are dropped until either the number is smaller or equal to the existing number of groups or it is one digit.

We've already seen the greedy and lazy operators. There are also possessive operators, which act like greedy operators, except they never backtrack. Whereas choosing between greedy and lazy operators affects the efficiency of a match and which text each group captures, it doesn't affect whether the match succeeds. However, possessive operators can affect whether a match succeeds:

Code Block
//the greedy * operator, with backwards backtracking...
def m= ( ~/(.*),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one,two' && m.group(2) == 'three'

//the lazy *? operator, with forwards backtracking...
m= ( ~/(.*?),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one' && m.group(2) == 'two,three'

//the possessive *+ operator, with no backtracking at all, even when doing so
//would cause a match...
assert ! ( ~/(.*+),(.*)/ ).matcher( 'one,two,three' ).matches()

//we can qualify other operators with possessiveness, such as ++, ?+, {m,n}+...
m= ( ~/([abc,]*+),(.*)/ ).matcher( 'abba,and,beegees' )
assert ! m.matches()
    //greedily matches 'abba,a', but doesn't backtrack to 'abba'

Atomic grouping, a more general form of possessiveness, enables everything in the atom group to be considered as one token. No backtracking occurs within the group, only outside of it:

Code Block
assert ! ( 'abbbc' ==~ /a(?>b*)bc/ )
    //after 'bbb' matched, no backtracking to 'bb' within atomic group

Atomic grouping and possessiveness are handy with nested repetition, allowing much faster match failures.

Finding Positions in Strings

We can use ^ and $ to match the beginning and end of each line using flag m:

Code Block
def s= 'an apple\nthe lime\na banana'
assert ! (s =~ /^a.{7}$/)
    //normally, ^ matches the beginning of the entire input,
    //and $ matches its end

def m= (s =~ /(?m)^a.{7}$/)
    //in multi-line mode, ^ matches the beginning of each line,
    //and $ matches each line's end

assert m.size() == 2 && m[0] == 'an apple' && m[1] == 'a banana'
assert m.toString() ==
  'java.util.regex.Matcher[pattern=(?m)^a.{7}$ region=0,26 lastmatch=a banana]'
    //some technical info

assert ((s+'\n') =~ /(?m)^a.{7}$/) // $ ignores any \n at the end of the string

import java.util.regex.Pattern
m= Pattern.compile(/^a.{7}$/, Pattern.MULTILINE).matcher(s)
    //alternative to (?m) in longhand syntax
assert m.find() && s[m.start()..<m.end()] == 'an apple'
assert m.find() && s[m.start()..<m.end()] == 'a banana'
assert ! m.find()

At the end of strings with \n at the end, $ matches twice:

Code Block
m= ( 'nine\nlives' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10
    // $ matches at end of string once only
assert ! m.find()

m= ( 'nine\nlives\n' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10
    // $ matches just before \n ...
assert m.find() && m.start() == 11 && m.end() == 11
    //...and again, $ matches after the \n
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)$/ )
assert m.find() && m.start() == 4 && m.end() == 4
    //in multiline mode, $ matches at end of each line
assert m.find() && m.start() == 10 && m.end() == 10
assert m.find() && m.start() == 11 && m.end() == 11
    // $ also always matches after the \n in multiline mode
assert ! m.find()

m= ( 'nine\nlives\n' =~ /^/ )
    // ^ matches at beginning of string once only,
    //even if there's an \n at the end
assert m.find() && m.start() == 0 && m.end() == 0
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)^/ )
assert m.find() && m.start() == 0 && m.end() == 0
assert m.find() && m.start() == 5 && m.end() == 5
    //in multiline mode, ^ matches at beginning of each line
assert ! m.find()
    // ^ also never matches after the \n in multiline mode

We can use \A \Z and \z to match the beginning and end of input, even in multiline mode:

Code Block
def s1= 'an apple\na banana'
assert (s1 =~ /\A.{8}\n.{8}\Z/)
    // \A always matches the beginning of the entire input, and \Z its end
assert (s1 =~ /\A.{8}\n.{8}\z/) // \z also matches its end

assert (s1 =~ /(?m)\A.{8}\n.{8}\Z/)
    // ?m flag has no effect on meaning of \A \Z and \z

def s2= s1 + '\n'
assert (s2 =~ /(?m)\A.{8}\n.{8}\Z/)
    // \Z ignores an extra \n when matching the end of input...
assert ! (s2 =~ /(?m)\A.{8}\n.{8}\z/) // ...but \z is fussy

We can match at word boundaries:

Code Block
// \b matches a position where either the preceding or the following character,
//but not both, is a word character (matched by \w)
(0x20..0x7F).each{it1->
  (0x20..0x7F).each{it2->
    def s= "${it1 as char}${it2 as char}"
    if( s ==~ /.\b./ ) assert (s[0] ==~ /\w/) ^ (s[1] ==~ /\w/)
                                                  // ^ means xor (exclusive or)
    assert (s ==~ /.\b./) ^ (s ==~ /.\B./)
        // \B matches where \b doesn't: between any two characters, exactly one
        //of them matches. Both are zero-width, so they must be tested between
        //characters; a one-character string can never ==~ either of them
} }

We can look behind or ahead of a position, ie, find a position based on text that precedes or follows it, but without matching that text itself. We can only use fixed-length strings when looking behind, ie, literal text, character classes, finite repetition ( {length} and ? ), and alternation where each string in it is also of fixed length, because the length of the match must be able to be predetermined:

Code Block
//use (?=) to find the position just in front of all 'qu'...
assert 'the queen quietly quacked'.replaceAll( /(?=qu)/, 'we' ) ==
    'the wequeen wequietly wequacked'

//use (?!) to find all 'c' not followed by 'a'...
assert 'clever cats can count mice'.replaceAll( /c(?!a)/, 'k' ) ==
    'klever cats can kount mike'

//use (?<=) to find all words ending in '-gry'...
assert 'The angry, hungry boy gried out.'.
    replaceAll( /\b\w+?(?<=gry)\b/, 'naughty' ) ==
  'The naughty, naughty boy gried out.'

//use (?<!) to find 3-letter words not ending with 'e'...
assert 'The spy saw seven spuds.'.replaceAll( /\b\w{3}(?<!e)\b/, 'hid' ) ==
    'The hid hid seven spuds.'

//lookaheads and lookbehinds can contain capturing groups...
assert 'the landlord dared band led not'.
    replaceAll( /\b\w{4,}(?<=(\w{3})d)\b/, '$1' ) ==
  'the lor are ban led not'

Matching positions in a string is useful for splitting the string, and for inserting text:

Code Block
//splitting at, and inserting text at, zero-width positions found by \b
assert 'The leaky cauldron.'.split(/\b/).toList() ==
  ['', 'The', ' ', 'leaky', ' ', 'cauldron', '.']
    //note that an empty string is prepended
    //NOTE(review): the leading '' is the behaviour of Java 7 and earlier; from
    //Java 8, String.split() no longer produces an empty leading substring for
    //a zero-width match at the start -- confirm on the target JVM
assert 'Hi, my, bye.'.split( /\b(?=\w)/ ).toList() ==
  ['', 'Hi, ', 'my, ', 'bye.']
assert 'The leaky cauldron.'.replaceAll(/\b/, '*') ==
  '*The* *leaky* *cauldron*.'
     //note that text inserted at beginning but not at end
     //(the string ends with '.', which is not a word character, so there is
     //no word boundary after it)

We can split a string in many ways:

Code Block
//s ends with two commas, ie, with two trailing empty fields
def s= 'hi,my,spy,tie,bye,,'
assert s.split( /,/ ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //with no extra argument, trailing empty strings are dropped
assert s.split( /,/, 1 ).toList() == ['hi,my,spy,tie,bye,,']
    //extra argument gives max number of pieces returned, so 1 means no split
assert s.split( /,/, 2 ).toList() == ['hi', 'my,spy,tie,bye,,']
assert s.split( /,/, 3 ).toList() == ['hi', 'my', 'spy,tie,bye,,']
assert s.split( /,/, 0 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //any number of pieces; same as no arg
assert s.split( /,/, -1 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye', '', '']
    //a negative arg doesn't remove trailing empty strings

assert ( ~/,/ ).split(s).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //alternative syntax
assert ( ~/,/ ).split(s, 2).toList() == ['hi', 'my,spy,tie,bye,,']

Restricting a String to a Region for a Pattern

We can set the limit of the part of the input string that will be searched to find a match:

Code Block
import java.util.regex.Pattern
def m= Pattern.compile( /abc+/ ).matcher( 'aaabc' )
assert m.find()
m.region(1, 4) //restrict string 'aaabc' to a region within, ie, 'aab'
assert ! m.find()
assert m.regionStart() == 1 && m.regionEnd() == 4
assert ! m.region(1, 4).find() //alternative syntax

//we can make a region's boundaries transparent to lookaround and boundary
//matching constructs...
m= Pattern.compile( /abc\b/ ).matcher( 'aaabcdef' )
m.region(1, 5)
assert m.find() //doesn't consider whether there's a word boundary (\b) after
                //'aabc' in full string
assert ! m.hasTransparentBounds()
m.region(1, 5)
m.useTransparentBounds(true)
assert ! m.find() //doesn't find anything because the \b doesn't match
assert m.hasTransparentBounds()
assert ! m.region(1, 5).useTransparentBounds(true).find() //alternative syntax

//we can decide whether to match anchors such as ^ and $ at the boundaries of
//the region...
m= Pattern.compile( /^abc$/ ).matcher( 'aaabcdef' )
m.region(2, 5)
assert m.find()
assert m.hasAnchoringBounds() //match such anchors by default
m.region(2, 5)
m.useAnchoringBounds(false)
assert ! m.find() //the ^ and $ no longer match
assert ! m.region(2, 5).useAnchoringBounds(false).find() //alternative syntax