Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

Matching Strings to Patterns

We can define a string pattern, aka a "Regular Expression" or "Regex", and see whether a String matches it:

Code Block
assert 'abc' ==~ /abc/ //pattern on righthand side between single-slashes
assert ! ( 'abc' ==~ /ace/ )
assert ! ( 'abc' ==~ /ab/ )

assert 'abc' ==~ /a.c/
    //the . in the pattern matches any character, except \n (or \r\n on Windows)
assert 'abc'.matches( /a.c/ ) //alternative method name
assert java.util.regex.Pattern.matches( /a.c/, 'abc' ) //alternative syntax
assert java.util.regex.Pattern.compile( /a.c/ ).matcher( 'abc' ).matches()
                                                       //alternative syntax

assert '\t\n\f\r' ==~ /\t\n\f\r/
    //some control chars have same notation as in strings
assert '\t\n\f\r' ==~ /\x09\x0a\x0c\x0D/
    //alternatively use hex codes (leading zero required to make 2 digits)
assert '\t\n\f\r' ==~ /\011\012\014\015/
    //alternatively use octal codes (leading zero required)
assert '\b' ==~ /\x08/ && ! ( '\b' ==~ /\b/ )
    // \b has different meaning in regex than in string
assert '\07\013\033' ==~ /\a\v\e/
    //regex-only notation: bell \a, vertical tab \v, escape \e

Twelve characters that are special syntax for regexes need to be quoted:

Code Block
//Quoting the twelve characters that have special meaning in a regex, so that
//they match themselves literally.
import java.util.regex.Pattern

assert 'a.c' ==~ /a\.c/ //backslash before . to quote it
assert ! ( 'abc' ==~ /a\.c/ )
    //once quoted, the . only matches a literal dot, not any character

assert '.{[()\\^$|?*+' ==~ /\.\{\[\(\)\\\^\$\|\?\*\+/
    //the 12 chars that need quoting
assert '.{[()\\^$|?*+' ==~ /\Q.{[()\^$|?*+\E/
    //another way to quote text is to bracket with \Q and \E
assert Pattern.quote( /.{[()\^$|?*+/ ) == /\Q.{[()\^$|?*+\E/
    //a special method to quote text in this way

The chars \c@, \cA, \cB, ..., \cZ, \c[, \c], \c^, and \c_ map to the special characters 0x0 to 0x1f, except 0x1c:

...

Code Block
assert (0x0..0x7F).findAll{ (it as char) ==~ /\s/ } ==
    ['\t', '\n', '\013', '\f', '\r', ' '].collect{it as int}
assert (0x0..0x7F).findAll{ (it as char) ==~ /\w/ } ==
    [*'0'..'9', *'A'..'Z', '_', *'a'..'z'].collect{it as int}
assert (0x0..0x7F).findAll{ (it as char) ==~ /\d/ } ==
    ('0'..'9').collect{it as int}

[ [/\w/, /\W/], [/\d/, /\D/], [/\s/, /\S/] ].each{ pair->
  assert (0x0..0x7F).findAll{ (it as char) ==~ pair[0] &&
         (it as char) ==~ pair[1] }.size() == 0
} // \S means not \s; \W means not \w; \D means not \d

...

Code Block
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /./ ) } ==
  ['\n' as int, '\r' as int]
    //chars that . doesn't match //also: 0x85, 0x2028, 0x2029
assert 'abc\ndef' ==~ /a.c\ndef/
assert !( 'abc\ndef' ==~ /abc.def/ ) //the . doesn't match \n

assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?s)./ ) } == []
    //when (?s) used, . matches every character
assert 'abc\r\ndef' ==~ /(?s)abc..def/ && !( 'abc\r\ndef' ==~ /(?s)abc.def/ )
    //on Windows, \r\n needs .. for match

assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?d)./ ) } == ['\n' as int]
    //only char that . doesn't match for (?d) flag
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?sd)./ ) } == []
    // (?sd) together same as (?s) alone

...

Code Block
assert ( 'gOoDbYe' ==~ /(?i)goodbye/ )
    //when (?i) used, case-insensitive matching for ASCII characters

assert 'an ace' ==~ /(?x) an\ ace #comment here after hash/
    //quote the space, ignore unquoted whitespace and comments

...

Code Block
assert 'abcDEFG' ==~ /abc(?i)defg/
    //turn on flag halfway thru pattern
assert 'abCDefg' ==~ /ab(?i)cd(?-i)efg/
    //turn flag on, then off again
assert 'abCDEfg' ==~ /ab(?i:cde)fg/
    //turn flag on for only a certain span of text
assert 'ABcdeFG' ==~ /(?i)ab(?-i:cde)fg/
    //turn flag on, but off for only a certain span

assert 'abcdefg' ==~ /abc(?ix) d e f g #comment here/
    //turn more than one flag on together
assert 'abcdefg' ==~ /(?ix) a b c (?-ix)defg/
    //turn more than one flag off together
assert 'abcdefg' ==~ /(?ix) a b c (?s-ix)defg/
    //turn some flag(s) on and other flag(s) off together

import java.util.regex.Pattern
assert Pattern.compile(/abc.def/, Pattern.DOTALL).matcher('abc\ndef').matches()
    //alternative to (?s)
assert ! Pattern.compile(/abc.def/, Pattern.UNIX_LINES).
    matcher('abc\ndef').matches() //alternative to (?d)
assert Pattern.compile(/goodbye/, Pattern.CASE_INSENSITIVE).
    matcher('gOoDbYe').matches() //alternative to (?i)
assert Pattern.compile(/ an\ ace #comment here/, Pattern.COMMENTS).
    matcher('an ace').matches() //alternative to (?x)

//we can enquire the flags set at the end-point of a pattern...
import java.util.regex.Pattern
assert Pattern.compile(/ab(?i)c.def/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL + Pattern.CASE_INSENSITIVE
assert Pattern.compile(/ab(?i)c.d(?-i)ef/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL
assert Pattern.compile(/ab(?i:c.d)ef/, Pattern.DOTALL).flags() ==
  Pattern.DOTALL

...

Code Block
['bat', 'bet', 'bit', 'bot', 'but'].each{ assert it ==~ /b[aeiou]t/ }
    //[aeiou] matches one of a,e,i,o,u
assert ! ('bnt' ==~ /b[aeiou]t/)

['bat', 'bet', 'bit', 'bot', 'but'].each{ assert ! (it ==~ /b[^aeiou]t/) }
    //[^aeiou] matches anything except a,e,i,o,u...
['bbt', 'bxt', 'b%t', 'b)t', 'b*t', 'b\nt'].each{ assert it ==~ /b[^aeiou]t/ }
    //...even newlines

assert 'b' ==~ /[abbbc]/ //duplicate chars in character class have no effect
assert '&' ==~ /[a&]/ &&
       !('&' ==~ /[a&&z]/) &&
       '&' ==~ /[a&&&]/ &&
       !('&' ==~ /[a&&]/) &&
       '&' ==~ /[a&\&]/   //all legal syntax

[ /[a-j]/: [*'a'..'j'],
     //we can specify a range of characters inside a class using hyphen -
  /[_a-zA-Z]/: [*'A'..'Z', '_', *'a'..'z'],
     //we can have many ranges mixed with single characters
  /[_a-z[A-Z]]/: [*'A'..'Z', '_', *'a'..'z'],
     //same effect as [_a-zA-Z]
  /[a-m&&g-z]/: [*'g'..'m'],
     //&& is intersection operator
  /[a-z&&[^bc]]/: ['a', *'d'..'z'],
     //^ means 'not' everything in the character class
  /[a-z&&[^m-p]]/: [*'a'..'l', *'q'..'z'],
     //&& with ^ works like subtraction
  /[^\d\s]/: [*0x0..0x7F].collect{ it as char } - [*'\t'..'\r', ' ', *'0'..'9' ],
     //not digit AND not whitespace
  /[\D\S]/: [*0x0..0x7F].collect{ it as char },
     //not equal to above, but means: not digit OR not whitespace
].each{ regex, validVals->
  assert (0x0..0x7F).findAll{ (it as char) ==~ regex } ==
    validVals.collect{ it as int }
}

...

Code Block
['abc', 'def', 'xyz'].each{ assert it ==~ /abc|def|xyz/ }
['abcz', 'aijz', 'axyz'].each{ assert it ==~ /a(bc|ij|xy)z/ }
    //we delimit the alternation with parentheses

//when using longhand syntax, we can see what option was matched, using groups,
//which we'll meet soon:
def m= java.util.regex.Pattern.compile( /a(bc|ij|xy)z/ ).matcher( 'abcz' )
m.matches()
assert m.group(1) == 'bc' //whatever was matched between the parens

...

Code Block
//{n} requires exactly n repetitions of the preceding element
assert 'aaab' ==~ /a{3}b/
assert 'abcabc' ==~ /(abc){2}/ // {n} can apply to a multi-character sequence
['ab', 'ba', 'bb', 'aa'].each{ assert it ==~ /[ab]{2}/ }
    // {n} can apply to a character class
    //(the assert inside the closure is required, otherwise nothing is checked)
['abab', '%&@b'].each{ assert it ==~ /.{3}b/ }

...

Code Block
//Use * to match any number of repetitions (including none) of an element:
['aaab', 'aab', 'ab', 'b'].each{ assert it ==~ /a*b/ }
    //even zero occurrences of the character are matched
['abcabc', 'abc', ''].each{ assert it ==~ /(abc)*/ }
    // * can apply to a multi-character sequence
['abbacb', 'acaba', 'cbbbac', 'c', ''].each{ assert it ==~ /[abc]*/ }
    // * can apply to a character class
['aaab', 'b', 'abab'].each{ assert it ==~ /.*b/ }
    // * is greedy: in 'abab' .* matches 'aba'

//Use + to match at least one occurrence of a character:
['aaab', 'aab', 'ab'].each{ assert it ==~ /a+b/ }
assert !( 'b' ==~ /a+b/ ) //at least one 'a' is required
assert 'abcabcxz' ==~ /(abc)+[xyz]+/
    // + can apply to character class or multi-character sequence

//Other variable-length repetition operators:
assert 'aaaab' ==~ /a{3,}b/ // {n,} matches at least n repetitions
assert 'aaaab' ==~ /a{3,5}b/ // {n1,n2} matches between n1 and n2 repetitions
assert 'abaxyzxyz' ==~ /[ab]{2,}(xyz){2,4}/
    //these also can apply to multi-character sequences or character classes

...

Code Block
//we can access matched values in groups outside the pattern using 
//longhand syntax...
def m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5
assert m.group(0) == 'aaabb' //group(0) is the entire string
assert m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
    //parameters default to 0

//...or outside the pattern using indexing syntax (don't forget the first [0] index)...
m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m[0][0] == 'aaabb' //the entire string
assert m[0][1] == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m[0][2] == 'bb' && m.start(2) == 3 && m.end(2) == 5

//...or within the pattern using \n notation:
assert 'aaabb,aaa,bb' ==~ /(a*)(b*),\1,\2/
    // \1 is the first group matched, \2 the second matched

assert 'abbcc,abb,bb,cc' ==~ /(a(b*))(c*),\1,\2,\3/
    //groups numbered by sequence of their opening parens from left to right
assert 'abcddd,ab,ddd' ==~ /(a(?:b))(?>c)(d*),\1,\2/
    //groups beginning with ?: or ?> aren't numbered
assert 'aba,a,b' ==~ /(a(b)?)+,\1,\2/
   //second match for \1 has no match for \2, so \2 keeps value from its first match

//a backreference only succeeds when its group has actually captured something
assert 'abc,bc' ==~ /a(bc)?,\1/
assert !( 'a,' ==~ /a(bc)?,\1/ )
    //referencing \1 causes entire match to fail if it hasn't already matched
    //a value
assert !( 'a' ==~ /([abc]\1)/ )
    //referencing a group within itself causes entire match to fail

...

Code Block
def m= ( ~/(a*)|bc/ ).matcher( 'bc' ) //another longhand syntax
m.matches()
assert m.group(1) == null && m.start(1) == -1 && m.end(1) == -1
                         //if match successful but group didn't match anything

def p= java.util.regex.Pattern.compile( /ab*c/ )
assert p.pattern() == /ab*c/ //retrieve the definition from a compiled pattern

Finding Patterns in Strings

As well as matching an entire string to a pattern, we can also find a pattern within a string using =~ syntax:

Code Block
assert 'abcdefg' =~ /cde/ //is 'cde' within 'abcdefg'?
assert ! ( 'abcdefg' =~ /ace/ )
assert java.util.regex.Pattern.compile( /cde/ ).matcher( 'abcdefg' ).find()
                                                          //alternative syntax

assert 'xxx z9g\t\nxxx' =~ /\s\w\d.\t\n/
    //special characters work the same as with ==~ matching
assert ( 'xxxgOoDbYexxx' =~ /(?i)goodbye/ )
    //flags also work the same as with ==~
assert 'xxxbatxxx' =~ /b[aeiou]t/
    //character classes also work the same as with ==~

...

Code Block
//Finding every occurrence of a pattern in a string, not just the first one
def s= 'horse house'
assert s =~ /ho.se/ //to check for the first occurrence only
def m= (s =~ /ho.se/)
assert m.size() == 2 && m[0] == 'horse' && m[1] == 'house'
    //to retrieve all occurrences

def l= []
s.eachMatch( /ho.se/ ){ l << it[0] } //alternative syntax, be sure to use it[0]
assert l == ['horse', 'house']
def l2= []
s.eachMatch( /abc/ ){ l2 << it[0] } //no matches
assert l2 == []
def l3= []
s.eachMatch( /hor./ ){ l3 << it[0] } //one match only
assert l3 == ['hors']

...

Code Block
import java.util.regex.Pattern
def s= 'hoose horse house'
def m= Pattern.compile(/ho.se/).matcher(s)
assert m.find() && s[m.start()..<m.end()] == 'hoose'
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find() && s[m.start()..<m.end()] == 'house'
assert ! m.find()
assert m.reset() && s[m.start()..<m.end()] == 'hoose'
    //use reset() to find from beginning
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find(1) && s[m.start()..<m.end()] == 'horse'
    //giving a parameter to find() starts finding from that index
m.setIndex(1)
    //alternatively, calling setIndex() resets from that index, without finding
    //until find() called

assert m.find() && s[m.start()..<m.end()] == 'horse'

We can group when finding with =~ just as we do when matching with ==~:

Code Block
def m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m.size() == 2
assert m.count == 2 //alternative to size()
assert m[0] == ['mistle', 'i', 't']
assert m[0].size() == 3 && m[0][0] == 'mistle' &&
       m[0][1] == 'i' && m[0][2] == 't'
assert m[1] == ['muscle', 'u', 'c']
assert m[1].size() == 3 && m[1][0] == 'muscle' &&
       m[1][1] == 'u' && m[1][2] == 'c'

//using the eachMatch() method...
def l= []
'mistlemuscle'.eachMatch( /m(.)s(.)le/ ){ l << it }
assert l*.toList() == [['mistle', 'i', 't'], ['muscle', 'u', 'c']]
def l2= []
'mistle'.eachMatch( /m(.)s(.)le/ ){ l2 << it }
assert l2*.toList() == [['mistle', 'i', 't']]
def l3= []
'practical'.eachMatch( /m(.)s(.)le/ ){ l3 << it }
assert l3*.toList() == []

//using longhand notation...
import java.util.regex.Pattern
m= Pattern.compile( /(a+)(b+)/ ).matcher( 'aaabbcccaabbb' )
m.find()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3 &&
       m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5 &&
       m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
m.find()
assert m.group(1) == 'aa' && m.start(1) == 8 && m.end(1) == 10 &&
       m.group(2) == 'bbb' && m.start(2) == 10 && m.end(2) == 13 &&
       m.group() == 'aabbb' && m.start() == 8 && m.end() == 13

...

Code Block
def m= ('redeem coffee' =~ /ee/)
assert m.collect{it} == ['ee', 'ee']
    //when calling collect() on a pattern with no groups...
assert m.collect{it} == []
    //...we must call reset() if we want to access the found matches again
m.reset()
assert m.collect{it} == ['ee', 'ee']

def l= [] //ditto for each()
m.each{ l << it }
assert l == []
m.reset()
l= []
m.each{ l << it }
assert l == ['ee', 'ee']

l= [] //ditto for eachWithIndex
m.eachWithIndex{it, i-> l << it+i }
assert l == []
m.reset()
l= []
m.eachWithIndex{it, i-> l << it+i }
assert l == ['ee0', 'ee1']

m= ('play the game\nfollow the rules' =~ /(?m)^(.*?) the (.*?)$/)
    //for a pattern with groups...
l= []
m.each{g0, g1, g2-> l << [g0, g1, g2] }
    //...we must pass the groups separately to the closure of each()
assert l == [['play the game', 'play', 'game'],
             ['follow the rules', 'follow', 'rules']]

m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m[1] == ['muscle', 'u', 'c']
assert m.group(0) == 'muscle' && m.group(1) == 'u' && m.group(2) == 'c'
    //only call group() after using subscripting first

...

Code Block
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    findAll{ it[1] == 'a' } == ['tame', 'tape', 'take']
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    find{ it[1] == 'a' } == 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    findIndexOf{ it[1] == 'a' } == 2 //index of 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    any{ it[1] == 'a' }
assert ! ('tone, true, tame, tape, take, tile, time' =~ /t..e/).
    every{ it[1] == 'a' }

The sequence of text joined by operators such as | ? * + {} has no effect on the success of the ==~ matcher, but does affect what's found with the =~ finder. The first choice of the | is found first, and backtracking to the second choice is only tried if necessary. The choice of the ? is tried first, and backtracking to ignore the choice only tried if necessary. As much as possible of the * + {} is found first, and backtracking to find less text only tried if necessary.

Code Block
assert ('abcdefg' =~ /bcd|bcdef/)[0] == 'bcd'
assert ('abcdefg' =~ /bcdef|bcd/)[0] == 'bcdef'
    //first choice always tried first

assert ('Friday 13th' =~ /Fri(day)?/)[0][0] == 'Friday'

assert ('Say "hello" and "goodbye" to the world!' =~ /".*"/)[0] ==
    '"hello" and "goodbye"'
l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /"[^"]*"/ ){ l << it }
    //use NOT DOUBLE-QUOTES instead of ANY CHARACTER
assert l*.toList() == [['"hello"'], ['"goodbye"']]

...

Code Block
def m= ('grgggr'=~/g?/)
def l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', '', 'g', 'g', 'g', '', '']
    // ? option also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g*/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', '', 'ggg', '', '']
    // * repetition also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g+/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', 'ggg'] // + repetition is the most intuitive to use

...

Code Block
def s= 'a quick quick dog'
def m= (s =~ /a.*k/)
    //starts at the beginning, but doesn't try to match the entire string
assert m.lookingAt() && s[m.start()..<m.end()] == 'a quick quick'

//replaceFirst...
assert (s =~ /quick/).replaceFirst('fast') == 'a fast quick dog'
assert (s =~ /qu(ick)/).replaceFirst('kw$1') == 'a kwick quick dog'
    //can reference groups in pattern using $
assert (s =~ /qu(ick)/).replaceFirst('kw\\$1') == 'a kw$1 quick dog'
    //include literal $ by writing \$, escaping \ as \\

//utility method to create a literal replacement String for the given String...
import java.util.regex.Matcher
assert Matcher.quoteReplacement( 'kw$1' ) == 'kw\\$1'
assert (s =~ /qu(ick)/).replaceFirst( Matcher.quoteReplacement( 'kw$1' ) ) ==
    'a kw$1 quick dog'

//we can mix GStrings and replacement group refs by mixing single-quoted and
//double-quoted strings...
def ice= 'ice cream'
assert ('some malting beer' =~ /a(lting ).*/).replaceFirst('e$1' + "$ice") ==
    'some melting ice cream'

//replaceAll...
assert (s =~ /quick/).replaceAll('fast') == 'a fast fast dog'
s= 'a quickly quacking duck'
assert (s =~ /qu(.)ck/).replaceAll('kw$1ck') == 'a kwickly kwacking duck'

//another shorthand...
assert 'a quick quick dog'.replaceFirst(/qu(ick)/, 'kw\\$1') ==
   'a kw$1 quick dog'
assert 'a quickly quacking duck'.replaceAll(/qu(.)ck/, 'kw$1ck') ==
   'a kwickly kwacking duck'

//'appendReplacement' and 'appendTail' should be used together for more
//complex replacements...
m= 'one banana two havana three matana four' =~ /(.a.)ana/
def i=0, sb= new StringBuffer()
while( m.find() ) m.appendReplacement(sb, '$1a' + 'na'*i++)
m.appendTail(sb)
assert sb.toString() == 'one bana two havana three matanana four'

...

Code Block
//the greedy * operator, with backwards backtracking...
def m= ( ~/(.*),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one,two' && m.group(2) == 'three'

//the lazy *? operator, with forwards backtracking...
m= ( ~/(.*?),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one' && m.group(2) == 'two,three'

//the possessive *+ operator, with no backtracking at all, even when doing so
//would cause a match...
assert ! ( ~/(.*+),(.*)/ ).matcher( 'one,two,three' ).matches()

//we can qualify other operators with possessiveness, such as ++, ?+, {m,n}+...
m= ( ~/([abc,]*+),(.*)/ ).matcher( 'abba,and,beegees' )
assert ! m.matches()
    //greedily matches 'abba,a', but doesn't backtrack to 'abba'

...

Code Block
assert ! ( 'abbbc' ==~ /a(?>b*)bc/ )
    //after 'bbb' matched, no backtracking to 'bb' within atomic group

Atomic grouping and possessiveness are handy with nested repetition, allowing much faster match failures.

Finding Positions in Strings

We can use ^ and $ to match the beginning and end of each line using flag m:

Code Block
def s= 'an apple\nthe lime\na banana'
assert ! (s =~ /^a.{7}$/)
    //normally, ^ matches the beginning of the entire input,
    //and $ matches its end

def m= (s =~ /(?m)^a.{7}$/)
    //in multi-line mode, ^ matches the beginning of each line,
    //and $ matches each line's end

assert m.size() == 2 && m[0] == 'an apple' && m[1] == 'a banana'
assert m.toString() ==
  'java.util.regex.Matcher[pattern=(?m)^a.{7}$ region=0,26 lastmatch=a banana]'
    //some technical info

assert ((s+'\n') =~ /(?m)^a.{7}$/) // $ ignores any \n at the end of the string

import java.util.regex.Pattern
m= Pattern.compile(/^a.{7}$/, Pattern.MULTILINE).matcher(s)
    //alternative to (?m) in longhand syntax
assert m.find() && s[m.start()..<m.end()] == 'an apple'
assert m.find() && s[m.start()..<m.end()] == 'a banana'
assert ! m.find()

...

Code Block
m= ( 'nine\nlives' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10
    // $ matches at end of string once only
assert ! m.find()

m= ( 'nine\nlives\n' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10
    // $ matches just before \n ...
assert m.find() && m.start() == 11 && m.end() == 11
    //...and again, $ matches after the \n
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)$/ )
assert m.find() && m.start() == 4 && m.end() == 4
    //in multiline mode, $ matches at end of each line
assert m.find() && m.start() == 10 && m.end() == 10
assert m.find() && m.start() == 11 && m.end() == 11
    // $ also always matches after the \n in multiline mode
assert ! m.find()

m= ( 'nine\nlives\n' =~ /^/ )
    // ^ matches at beginning of string once only,
    //even if there's an \n at the end
assert m.find() && m.start() == 0 && m.end() == 0
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)^/ )
assert m.find() && m.start() == 0 && m.end() == 0
assert m.find() && m.start() == 5 && m.end() == 5
    //in multiline mode, ^ matches at beginning of each line
assert ! m.find()
    // ^ also never matches after the \n in multiline mode

...

Code Block
def s1= 'an apple\na banana'
assert (s1 =~ /\A.{8}\n.{8}\Z/)
    // \A always matches the beginning of the entire input, and \Z its end
assert (s1 =~ /\A.{8}\n.{8}\z/) // \z also matches its end

assert (s1 =~ /(?m)\A.{8}\n.{8}\Z/)
    // ?m flag has no effect on meaning of \A \Z and \z

def s2= s1 + '\n'
assert (s2 =~ /(?m)\A.{8}\n.{8}\Z/)
    // \Z ignores an extra \n when matching the end of input...
assert ! (s2 =~ /(?m)\A.{8}\n.{8}\z/) // ...but \z is fussy

...

Code Block
// \b matches a position where either the preceding or the following character,
//but not both, is a word character (matched by \w)
(0x20..0x7F).each{it1->
  (0x20..0x7F).each{it2->
    def s= "${it1 as char}${it2 as char}"
    if( s ==~ /.\b./ ) assert (s[0] ==~ /\w/) ^ (s[1] ==~ /\w/)
                                                  // ^ means xor (exclusive or)

    // \B matches where \b doesn't: between any two characters exactly one of
    //them matches. (Both are zero-width, so they must be tested at a position
    //inside a string; a lone character can never ==~ either of them.)
    assert (s ==~ /.\b./) ^ (s ==~ /.\B./)
  } }

We can look behind or ahead of a position, ie, find a position based on the text that precedes or follows it, but without matching that text itself. We can only use fixed-length strings when looking behind, ie, literal text, character classes, finite repetition ( {length} and ? ), and alternation where each string in it is also of fixed length, because the length of the match must be able to be predetermined:

Code Block
//use (?=) to find the position just in front of all 'qu'...
assert 'the queen quietly quacked'.replaceAll( /(?=qu)/, 'we' ) ==
    'the wequeen wequietly wequacked'

//use (?!) to find all 'c' not followed by 'a'...
assert 'clever cats can count mice'.replaceAll( /c(?!a)/, 'k' ) ==
    'klever cats can kount mike'

//use (?<=) to find all words ending in '-gry'...
assert 'The angry, hungry boy gried out.'.
    replaceAll( /\b\w+?(?<=gry)\b/, 'naughty' ) ==
  'The naughty, naughty boy gried out.'

//use (?<!) to find 3-letter words not ending with 'e'...
assert 'The spy saw seven spuds.'.replaceAll( /\b\w{3}(?<!e)\b/, 'hid' ) ==
    'The hid hid seven spuds.'

//lookaheads and lookbehinds can contain capturing groups...
assert 'the landlord dared band led not'.
    replaceAll( /\b\w{4,}(?<=(\w{3})d)\b/, '$1' ) ==
  'the lor are ban led not'

...

Code Block
//Splitting and replacing on zero-width positions such as word boundaries
assert 'The leaky cauldron.'.split(/\b/).toList() ==
  ['The', ' ', 'leaky', ' ', 'cauldron', '.']
    //from Java 8 on, a zero-width match at the very beginning of the input
    //no longer produces an empty leading string (earlier versions prepended '')
assert 'Hi, my, bye.'.split( /\b(?=\w)/ ).toList() ==
  ['Hi, ', 'my, ', 'bye.']
assert 'The leaky cauldron.'.replaceAll(/\b/, '*') ==
  '*The* *leaky* *cauldron*.'
     //note that text inserted at beginning but not at end

...

Code Block
//Splitting a string on a delimiter pattern, with and without a limit argument
def s= 'hi,my,spy,tie,bye,,'
assert s.split( /,/ ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //with no limit, trailing empty strings are dropped
assert s.split( /,/, 1 ).toList() == ['hi,my,spy,tie,bye,,']
    //extra argument gives max number of resulting pieces
    //(so the pattern is applied at most limit-1 times)
assert s.split( /,/, 2 ).toList() == ['hi', 'my,spy,tie,bye,,']
assert s.split( /,/, 3 ).toList() == ['hi', 'my', 'spy,tie,bye,,']
assert s.split( /,/, 0 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //any number of splits; same as no arg
assert s.split( /,/, -1 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye', '', '']
    //a negative arg doesn't remove trailing empty strings

assert ( ~/,/ ).split(s).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //alternative syntax
assert ( ~/,/ ).split(s, 2).toList() == ['hi', 'my,spy,tie,bye,,']

Restricting a String to a Region for a Pattern

We can set the limit of the part of the input string that will be searched to find a match:

Code Block
import java.util.regex.Pattern
def m= Pattern.compile( /abc+/ ).matcher( 'aaabc' )
assert m.find()
m.region(1, 4) //restrict string 'aaabc' to a region within, ie, 'aab'
assert ! m.find()
assert m.regionStart() == 1 && m.regionEnd() == 4
assert ! m.region(1, 4).find() //alternative syntax

//we can make a region's boundaries transparent to lookaround and boundary
//matching constructs...
m= Pattern.compile( /abc\b/ ).matcher( 'aaabcdef' )
m.region(1, 5)
assert m.find() //doesn't consider whether there's a word boundary (\b) after
                //'aabc' in full string
assert ! m.hasTransparentBounds()
m.region(1, 5)
m.useTransparentBounds(true)
assert ! m.find() //doesn't find anything because the \b doesn't match
assert m.hasTransparentBounds()
assert ! m.region(1, 5).useTransparentBounds(true).find() //alternative syntax

//we can decide whether to match anchors such as ^ and $ at the boundaries of
//the region...
m= Pattern.compile( /^abc$/ ).matcher( 'aaabcdef' )
m.region(2, 5)
assert m.find()
assert m.hasAnchoringBounds() //match such anchors by default
m.region(2, 5)
m.useAnchoringBounds(false)
assert ! m.find() //the ^ and $ no longer match
assert ! m.region(2, 5).useAnchoringBounds(false).find() //alternative syntax