Matching Strings to Patterns
We can define string patterns, aka "Regular Expressions" or "Regexes", and see whether a String matches one:
| Code Block |
|---|
//matching a whole String against a pattern with the ==~ operator, and the
//equivalent java.util.regex longhand forms
assert 'abc' ==~ /abc/ //pattern on righthand side between single-slashes
assert ! ( 'abc' ==~ /ace/ )
assert ! ( 'abc' ==~ /ab/ )
assert 'abc' ==~ /a.c/ //the . in the pattern matches any character, except \n (or \r\n on Windows)
assert 'abc'.matches( /a.c/ ) //alternative method name
assert java.util.regex.Pattern.matches( /a.c/, 'abc' ) //alternative syntax
assert java.util.regex.Pattern.compile( /a.c/ ).matcher( 'abc' ).matches() //alternative syntax
assert '\t\n\f\r' ==~ /\t\n\f\r/ //some control chars have same notation as in strings
assert '\t\n\f\r' ==~ /\x09\x0a\x0c\x0D/
    //alternatively use hex codes (leading zero required to make 2 digits)
assert '\t\n\f\r' ==~ /\011\012\014\015/
    //alternatively use octal codes (leading zero required)
assert '\b' ==~ /\x08/ && ! ( '\b' ==~ /\b/ )
    // \b has different meaning in regex than in string
assert '\07\013\033' ==~ /\a\v\e/
    //regex-only notation: bell \a, vertical tab \v, escape \e
|
Twelve characters that are special syntax for regexes need to be quoted:
| Code Block |
|---|
//the 12 regex metacharacters . { [ ( ) \ ^ $ | ? * + must be quoted to be
//matched literally
assert 'a.c' ==~ /a\.c/ //backslash before . to quote it
assert '.{[()\\^$|?*+' ==~ /\.\{\[\(\)\\\^\$\|\?\*\+/
    //the 12 chars that need quoting
assert '.{[()\\^$|?*+' ==~ /\Q.{[()\^$|?*+\E/
    //another way to quote text is to bracket with \Q and \E
import java.util.regex.Pattern
assert Pattern.quote( /.{[()\^$|?*+/ ) == /\Q.{[()\^$|?*+\E/
    //a special method to quote text in this way
|
The chars \c@, \cA, \cB, ..., \cZ, \c[, \c], \c^, and \c_ map to the special characters 0x0 to 0x1f, except 0x1c:
...
| Code Block |
|---|
//the predefined classes \s, \w and \d, checked against every 7-bit ASCII code
def asciiCodes= 0x0..0x7F
def codesMatching= { pattern-> asciiCodes.findAll{ code-> (code as char) ==~ pattern } }
// \s is whitespace
assert codesMatching( /\s/ ) ==
    ['\t', '\n', '\013', '\f', '\r', ' '].collect{it as int}
// \w is a word character: digit, letter or underscore
assert codesMatching( /\w/ ) ==
    [*'0'..'9', *'A'..'Z', '_', *'a'..'z'].collect{it as int}
// \d is a digit
assert codesMatching( /\d/ ) ==
    ('0'..'9').collect{it as int}
// \S means not \s; \W means not \w; \D means not \d, so no character can
//match both members of a pair
[ [/\w/, /\W/], [/\d/, /\D/], [/\s/, /\S/] ].each{ plain, negated->
  def matchingBoth= asciiCodes.findAll{ code->
    (code as char) ==~ plain && (code as char) ==~ negated
  }
  assert matchingBoth.size() == 0
}
|
...
| Code Block |
|---|
//which 7-bit ASCII characters the . (dot) matches: by default, and under the
//(?s) and (?d) inline flags
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /./ ) } ==
['\n' as int, '\r' as int]
//ASCII chars that . doesn't match; the original note also lists 0x85, 0x2028, 0x2029
assert 'abc\ndef' ==~ /a.c\ndef/
assert !( 'abc\ndef' ==~ /abc.def/ ) //the . doesn't match \n
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?s)./ ) } == []
//when (?s) used, . matches every character
assert 'abc\r\ndef' ==~ /(?s)abc..def/ && !( 'abc\r\ndef' ==~ /(?s)abc.def/ )
//on Windows, \r\n needs .. for match
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?d)./ ) } == ['\n' as int]
//only char that . doesn't match for (?d) flag
assert (0x0..0x7F).findAll{ !( (it as char) ==~ /(?sd)./ ) } == []
// (?sd) together same as (?s) alone
|
...
| Code Block |
|---|
//the (?i) and (?x) inline flags
assert ( 'gOoDbYe' ==~ /(?i)goodbye/ )
    //when (?i) used, case-insensitive matching for ASCII characters
assert 'an ace' ==~ /(?x) an\ ace #comment here after hash/
    //quote the space, ignore unquoted whitespace and comments
|
...
| Code Block |
|---|
//inline flags can be switched on and off part-way through a pattern, or
//supplied as constants to Pattern.compile()
assert 'abcDEFG' ==~ /abc(?i)defg/ //turn on flag halfway thru pattern
assert 'abCDefg' ==~ /ab(?i)cd(?-i)efg/ //turn flag on, then off again
assert 'abCDEfg' ==~ /ab(?i:cde)fg/ //turn flag on for only a certain span of text
assert 'ABcdeFG' ==~ /(?i)ab(?-i:cde)fg/ //turn flag on, but off for only a certain span
assert 'abcdefg' ==~ /abc(?ix) d e f g #comment here/ //turn more than one flag on together
assert 'abcdefg' ==~ /(?ix) a b c (?-ix)defg/ //turn more than one flag off together
assert 'abcdefg' ==~ /(?ix) a b c (?s-ix)defg/
    //turn some flag(s) on and other flag(s) off together
import java.util.regex.Pattern
assert Pattern.compile(/abc.def/, Pattern.DOTALL).matcher('abc\ndef').matches()
    //alternative to (?s)
assert ! Pattern.compile(/abc.def/, Pattern.UNIX_LINES).
    matcher('abc\ndef').matches() //alternative to (?d)
assert Pattern.compile(/goodbye/, Pattern.CASE_INSENSITIVE).
    matcher('gOoDbYe').matches() //alternative to (?i)
assert Pattern.compile(/ an\ ace #comment here/, Pattern.COMMENTS).
    matcher('an ace').matches() //alternative to (?x)
//we can enquire the flags set at the end-point of a pattern...
//NOTE(review): whether flags() reflects inline flags may depend on the JDK
//version in use -- confirm against the target JDK
assert Pattern.compile(/ab(?i)c.def/, Pattern.DOTALL).flags() ==
    Pattern.DOTALL + Pattern.CASE_INSENSITIVE
assert Pattern.compile(/ab(?i)c.d(?-i)ef/, Pattern.DOTALL).flags() == Pattern.DOTALL
assert Pattern.compile(/ab(?i:c.d)ef/, Pattern.DOTALL).flags() == Pattern.DOTALL
|
...
| Code Block |
|---|
//character classes: [...] matches exactly one character out of those it lists
['bat', 'bet', 'bit', 'bot', 'but'].each{ assert it ==~ /b[aeiou]t/ }
//[aeiou] matches one of a,e,i,o,u
assert ! ('bnt' ==~ /b[aeiou]t/)
['bat', 'bet', 'bit', 'bot', 'but'].each{ assert ! (it ==~ /b[^aeiou]t/) }
//[^aeiou] matches anything except a,e,i,o,u...
['bbt', 'bxt', 'b%t', 'b)t', 'b*t', 'b\nt'].each{ assert it ==~ /b[^aeiou]t/ }
//...even newlines
assert 'b' ==~ /[abbbc]/ //duplicate chars in character class have no effect
assert '&' ==~ /[a&]/ &&
!('&' ==~ /[a&&z]/) &&
'&' ==~ /[a&&&]/ &&
!('&' ==~ /[a&&]/) &&
'&' ==~ /[a&\&]/ //all legal syntax
//each regex below is paired with the 7-bit ASCII characters it is expected to
//match; the closure at the end checks every pair
[ /[a-j]/: [*'a'..'j'],
//we can specify a range of characters inside a class using hyphen -
/[_a-zA-Z]/: [*'A'..'Z', '_', *'a'..'z'],
//we can have many ranges mixed with single characters
/[_a-z[A-Z]]/: [*'A'..'Z', '_', *'a'..'z'],
//same effect as [_a-zA-Z]
/[a-m&&g-z]/: [*'g'..'m'],
//&& is intersection operator
/[a-z&&[^bc]]/: ['a', *'d'..'z'],
//^ means 'not' everything in the character class
/[a-z&&[^m-p]]/: [*'a'..'l', *'q'..'z'],
//&& with ^ works like subtraction
/[^\d\s]/: [*0x0..0x7F].collect{ it as char } - [*'\t'..'\r', ' ', *'0'..'9' ],
//not digit AND not whitespace
/[\D\S]/: [*0x0..0x7F].collect{ it as char },
//not equal to above, but means: not digit OR not whitespace
].each{ regex, validVals->
//the codes in 0x0..0x7F that match the regex must equal the expected list
assert (0x0..0x7F).findAll{ (it as char) ==~ regex } ==
validVals.collect{ it as int }
}
|
...
| Code Block |
|---|
//alternation: each option separated by | can match on its own
for( word in ['abc', 'def', 'xyz'] ) assert word ==~ /abc|def|xyz/
//parentheses delimit how far the alternation extends
for( word in ['abcz', 'aijz', 'axyz'] ) assert word ==~ /a(bc|ij|xy)z/
//with the longhand syntax, the option that was matched can be read back as a
//group (groups are introduced further on):
def alternation= java.util.regex.Pattern.compile( /a(bc|ij|xy)z/ )
def matcher= alternation.matcher( 'abcz' )
matcher.matches()
assert matcher.group(1) == 'bc' //the text matched between the parens
|
...
| Code Block |
|---|
// {n} repeats the preceding element exactly n times
assert 'aaab' ==~ /a{3}b/
assert 'abcabc' ==~ /(abc){2}/ // {n} can apply to a multi-character sequence
//the closure must assert, otherwise the match result is computed and discarded
['ab', 'ba', 'bb', 'aa'].each{ assert it ==~ /[ab]{2}/ }
// {n} can apply to a character class
['abab', '%&@b'].each{ assert it ==~ /.{3}b/ }
|
...
| Code Block |
|---|
//variable-length repetition with * + {n,} and {n1,n2}
['aaab', 'aab', 'ab', 'b'].each{ assert it ==~ /a*b/ }
//even zero occurrences of the character is matched
['abcabc', 'abc', ''].each{ assert it ==~ /(abc)*/ }
// * can apply to a multi-character sequence
['abbacb', 'acaba', 'cbbbac', 'c', ''].each{ assert it ==~ /[abc]*/ }
// * can apply to a character class
['aaab', 'b', 'abab'].each{ assert it ==~ /.*b/ }
// * is greedy: in 'abab' .* matches 'aba'
//Use + to match at least one occurrence of a character:
['aaab', 'aab', 'ab'].each{ assert it ==~ /a+b/ }
assert !( 'b' ==~ /a+b/ ) //at least one 'a' is required
assert 'abcabcxz' ==~ /(abc)+[xyz]+/
// + can apply to character class or multi-character sequence
//Other variable-length repetition operators:
assert 'aaaab' ==~ /a{3,}b/ // {n,} matches at least n characters
assert 'aaaab' ==~ /a{3,5}b/ // {n1,n2} matches between n1 and n2 characters
assert 'abaxyzxyz' ==~ /[ab]{2,}(xyz){2,4}/
//these also can apply to multi-character sequences or character classes
|
...
| Code Block |
|---|
//we can access matched values in groups outside the pattern using
//longhand syntax...
def m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5
assert m.group(0) == 'aaabb' //group(0) is the entire string
assert m.group() == 'aaabb' && m.start() == 0 && m.end() == 5 //parameters default to 0
//...or outside the pattern using indexing syntax (don't forget the first [0] index)...
m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
assert m[0][0] == 'aaabb' //the entire string
assert m[0][1] == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m[0][2] == 'bb' && m.start(2) == 3 && m.end(2) == 5
//...or within the pattern using \n notation:
assert 'aaabb,aaa,bb' ==~ /(a*)(b*),\1,\2/
    // \1 is the first group matched, \2 the second matched
assert 'abbcc,abb,bb,cc' ==~ /(a(b*))(c*),\1,\2,\3/
    //groups numbered by sequence of their opening parens from left to right
assert 'abcddd,ab,ddd' ==~ /(a(?:b))(?>c)(d*),\1,\2/
    //groups beginning with ?: or ?> aren't numbered
assert 'aba,a,b' ==~ /(a(b)?)+,\1,\2/
    //second match for \1 has no match for \2, so \2 keeps value from its first match
assert 'abc,bc' ==~ /a(bc)?,\1/
assert !( 'a,' ==~ /a(bc)?,\1/ )
    //referencing \1 causes entire match to fail if it hasn't already matched a value
assert !( 'a' ==~ /([abc]\1)/ )
    //referencing a group within itself causes entire match to fail
|
...
| Code Block |
|---|
//the ~ operator turns a slashy string into a compiled Pattern (another
//longhand syntax)
def alternatives= ~/(a*)|bc/
def matcher= alternatives.matcher( 'bc' )
matcher.matches()
//the match succeeded but group 1 didn't match anything, so the group's text
//is null and its offsets are -1
assert matcher.group(1) == null
assert matcher.start(1) == -1
assert matcher.end(1) == -1
//retrieve the definition from a compiled pattern
def compiled= java.util.regex.Pattern.compile( /ab*c/ )
assert compiled.pattern() == /ab*c/
|
Finding Patterns in Strings
As well as matching an entire string to a pattern, we can also find a pattern within a string using =~ syntax:
| Code Block |
|---|
//finding a pattern anywhere inside a String with the =~ operator
assert 'abcdefg' =~ /cde/ //is 'cde' within 'abcdefg'?
assert ! ( 'abcdefg' =~ /ace/ )
assert java.util.regex.Pattern.compile( /cde/ ).matcher( 'abcdefg' ).find()
    //alternative syntax
assert 'xxx z9g\t\nxxx' =~ /\s\w\d.\t\n/
    //special characters work the same as with ==~ matching
assert ( 'xxxgOoDbYexxx' =~ /(?i)goodbye/ )
    //flags also work the same as with ==~
assert 'xxxbatxxx' =~ /b[aeiou]t/
    //character classes also work the same as with ==~
|
...
| Code Block |
|---|
//retrieving every occurrence of a pattern, by indexing the Matcher or with eachMatch()
//NOTE(review): the it[0] results below presume eachMatch() passes an indexable
//match to the closure for a pattern with no groups -- confirm against the
//Groovy version in use
def s= 'horse house'
assert s =~ /ho.se/ //to check for the first occurrence only
def m= (s =~ /ho.se/)
assert m.size() == 2 && m[0] == 'horse' && m[1] == 'house'
//to retrieve all occurrences
def l= []
s.eachMatch( /ho.se/ ){ l << it[0] } //alternative syntax, be sure to use it[0]
assert l == ['horse', 'house']
def l2= []
s.eachMatch( /abc/ ){ l2 << it[0] } //no matches
assert l2 == []
def l3= []
s.eachMatch( /hor./ ){ l3 << it[0] } //one match only
assert l3 == ['hors']
|
...
| Code Block |
|---|
//stepping through successive matches with find(), and restarting the search
//with reset(), find(int) or setIndex()
import java.util.regex.Pattern
def s= 'hoose horse house'
def m= Pattern.compile(/ho.se/).matcher(s)
assert m.find() && s[m.start()..<m.end()] == 'hoose'
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find() && s[m.start()..<m.end()] == 'house'
assert ! m.find()
assert m.reset() && s[m.start()..<m.end()] == 'hoose'
    //use reset() to find from beginning
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find(1) && s[m.start()..<m.end()] == 'horse'
    //giving a parameter to find() starts finding from that index
m.setIndex(1)
    //alternatively, calling setIndex() resets from that index, without finding
    //until find() called
assert m.find() && s[m.start()..<m.end()] == 'horse'
|
We can group when finding with =~ just as we do when matching with ==~:
| Code Block |
|---|
//groups are available for every occurrence found with =~
def m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m.size() == 2
assert m.count == 2 //alternative to size()
assert m[0] == ['mistle', 'i', 't']
assert m[0].size() == 3 && m[0][0] == 'mistle' && m[0][1] == 'i' && m[0][2] == 't'
assert m[1] == ['muscle', 'u', 'c']
assert m[1].size() == 3 && m[1][0] == 'muscle' && m[1][1] == 'u' && m[1][2] == 'c'
//using the eachMatch() method...
def l= []
'mistlemuscle'.eachMatch( /m(.)s(.)le/ ){ l << it }
assert l*.toList() == [['mistle', 'i', 't'], ['muscle', 'u', 'c']]
def l2= []
'mistle'.eachMatch( /m(.)s(.)le/ ){ l2 << it }
assert l2*.toList() == [['mistle', 'i', 't']]
def l3= []
'practical'.eachMatch( /m(.)s(.)le/ ){ l3 << it }
assert l3*.toList() == []
//using longhand notation...
import java.util.regex.Pattern
m= Pattern.compile( /(a+)(b+)/ ).matcher( 'aaabbcccaabbb' )
m.find()
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3 &&
    m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5 &&
    m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
m.find()
assert m.group(1) == 'aa' && m.start(1) == 8 && m.end(1) == 10 &&
    m.group(2) == 'bbb' && m.start(2) == 10 && m.end(2) == 13 &&
    m.group() == 'aabbb' && m.start() == 8 && m.end() == 13
|
...
| Code Block |
|---|
//iterating over a Matcher with collect(), each() and eachWithIndex()
//NOTE(review): the empty results asserted for a second pass without reset()
//depend on how the Groovy version in use iterates a Matcher -- confirm
//against the target version
def m= ('redeem coffee' =~ /ee/)
assert m.collect{it} == ['ee', 'ee']
//when calling collect() on a pattern with no groups...
assert m.collect{it} == []
//...we must call reset() if we want to access the found matches again
m.reset()
assert m.collect{it} == ['ee', 'ee']
def l= [] //ditto for each()
m.each{ l << it }
assert l == []
m.reset()
l= []
m.each{ l << it }
assert l == ['ee', 'ee']
l= [] //ditto for eachWithIndex
m.eachWithIndex{it, i-> l << it+i }
assert l == []
m.reset()
l= []
m.eachWithIndex{it, i-> l << it+i }
assert l == ['ee0', 'ee1']
m= ('play the game\nfollow the rules' =~ /(?m)^(.*?) the (.*?)$/)
//for a pattern with groups...
l= []
m.each{g0, g1, g2-> l << [g0, g1, g2] }
//...we must pass the groups separately to the closure of each()
assert l == [['play the game', 'play', 'game'],
['follow the rules', 'follow', 'rules']]
m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m[1] == ['muscle', 'u', 'c']
assert m.group(0) == 'muscle' && m.group(1) == 'u' && m.group(2) == 'c'
//only call group() after using subscripting first
|
...
| Code Block |
|---|
//the usual collection methods can be called on a Matcher; each one is given
//the found text of every occurrence in turn
def words= 'tone, true, tame, tape, take, tile, time'
def finder= { words =~ /t..e/ } //a fresh Matcher for every check
def secondLetterIsA= { it[1] == 'a' }
assert finder().findAll( secondLetterIsA ) == ['tame', 'tape', 'take']
assert finder().find( secondLetterIsA ) == 'tame'
assert finder().findIndexOf( secondLetterIsA ) == 2 //index of 'tame'
assert finder().any( secondLetterIsA )
assert ! finder().every( secondLetterIsA )
|
The sequence of text joined by operators such as | ? * + {} has no effect on the success of the ==~ matcher, but does affect what's found with the =~ finder. The first choice of the | is found first, and backtracking to the second choice is only tried if necessary. The choice of the ? is tried first, and backtracking to ignore the choice only tried if necessary. As much as possible of the * + {} is found first, and backtracking to find less text only tried if necessary.
| Code Block |
|---|
//what =~ finds depends on the order of alternatives and on greediness
assert ('abcdefg' =~ /bcd|bcdef/)[0] == 'bcd'
assert ('abcdefg' =~ /bcdef|bcd/)[0] == 'bcdef'
//first choice always tried first
assert ('Friday 13th' =~ /Fri(day)?/)[0][0] == 'Friday'
//the optional group is taken when it can be
assert ('Say "hello" and "goodbye" to the world!' =~ /".*"/)[0] ==
'"hello" and "goodbye"'
//greedy .* runs on to the last double-quote
l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /"[^"]*"/ ){ l << it }
//use NOT DOUBLE-QUOTES instead of ANY CHARACTER
assert l*.toList() == [['"hello"'], ['"goodbye"']]
|
...
| Code Block |
|---|
//lists every occurrence that the given pattern finds in 'grgggr'
def occurrencesOf= { pattern->
  def finder= ('grgggr' =~ pattern)
  (0..<(finder.size() as int)).collect{ finder[it] }
}
// ? option also matches the empty space before each 'r', and the end of string
assert occurrencesOf( /g?/ ) == ['g', '', 'g', 'g', 'g', '', '']
// * repetition also matches the empty space before each 'r', and the end of string
assert occurrencesOf( /g*/ ) == ['g', '', 'ggg', '', '']
// + repetition is the most intuitive to use
assert occurrencesOf( /g+/ ) == ['g', 'ggg']
|
...
| Code Block |
|---|
//lookingAt(), and replacing found text with replaceFirst(), replaceAll(),
//appendReplacement() and appendTail()
def s= 'a quick quick dog'
def m= (s =~ /a.*k/)
//starts at the beginning, but doesn't try to match the entire string
assert m.lookingAt() && s[m.start()..<m.end()] == 'a quick quick'
//replaceFirst...
assert (s =~ /quick/).replaceFirst('fast') == 'a fast quick dog'
assert (s =~ /qu(ick)/).replaceFirst('kw$1') == 'a kwick quick dog'
    //can reference groups in pattern using $
assert (s =~ /qu(ick)/).replaceFirst('kw\\$1') == 'a kw$1 quick dog'
    //include literal $ by writing \$, escaping \ as \\
//utility method to create a literal replacement String for the given String...
import java.util.regex.Matcher
assert Matcher.quoteReplacement( 'kw$1' ) == 'kw\\$1'
assert (s =~ /qu(ick)/).replaceFirst( Matcher.quoteReplacement( 'kw$1' ) ) ==
    'a kw$1 quick dog'
//we can mix GStrings and replacement group refs by mixing single-quoted and
//double-quoted strings...
def ice= 'ice cream'
assert ('some malting beer' =~ /a(lting ).*/).replaceFirst('e$1' + "$ice") ==
    'some melting ice cream'
//replaceAll...
assert (s =~ /quick/).replaceAll('fast') == 'a fast fast dog'
s= 'a quickly quacking duck'
assert (s =~ /qu(.)ck/).replaceAll('kw$1ck') == 'a kwickly kwacking duck'
//another shorthand...
assert 'a quick quick dog'.replaceFirst(/qu(ick)/, 'kw\\$1') == 'a kw$1 quick dog'
assert 'a quickly quacking duck'.replaceAll(/qu(.)ck/, 'kw$1ck') ==
    'a kwickly kwacking duck'
//'appendReplacement' and 'appendTail' should be used together for more
//complex replacements...
m= 'one banana two havana three matana four' =~ /(.a.)ana/
def i=0, sb= new StringBuffer()
while( m.find() ) m.appendReplacement(sb, '$1a' + 'na'*i++)
m.appendTail(sb)
assert sb.toString() == 'one bana two havana three matanana four'
|
...
| Code Block |
|---|
//how the greedy, lazy and possessive forms of * divide up the same input
//the greedy * operator, with backwards backtracking...
def m= ( ~/(.*),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one,two' && m.group(2) == 'three'
//the lazy *? operator, with forwards backtracking...
m= ( ~/(.*?),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one' && m.group(2) == 'two,three'
//the possessive *+ operator, with no backtracking at all, even when doing so
//would cause a match...
assert ! ( ~/(.*+),(.*)/ ).matcher( 'one,two,three' ).matches()
//we can qualify other operators with possessiveness, such as ++, ?+, {m,n}+...
m= ( ~/([abc,]*+),(.*)/ ).matcher( 'abba,and,beegees' )
assert ! m.matches() //greedily matches 'abba,a', but doesn't backtrack to 'abba'
|
...
| Code Block |
|---|
//an atomic group (?>...) keeps whatever it matched: once b* has taken 'bbb'
//it will not backtrack to 'bb', so the following 'b' cannot match
assert ! 'abbbc'.matches( /a(?>b*)bc/ )
|
Atomic grouping and possessiveness are handy with nested repetition, allowing much faster match failures.
Finding Positions in Strings
We can use ^ and $ to match the beginning and end of each line using flag m:
| Code Block |
|---|
//the ^ and $ anchors, with and without multi-line mode
def s= 'an apple\nthe lime\na banana'
assert ! (s =~ /^a.{7}$/)
//normally, ^ matches the beginning of the entire input,
//and $ matches its end
def m= (s =~ /(?m)^a.{7}$/)
//in multi-line mode, ^ matches the beginning of each line,
//and $ matches each line's end
assert m.size() == 2 && m[0] == 'an apple' && m[1] == 'a banana'
assert m.toString() ==
'java.util.regex.Matcher[pattern=(?m)^a.{7}$ region=0,26 lastmatch=a banana]'
//some technical info
assert ((s+'\n') =~ /(?m)^a.{7}$/) // $ ignores any \n at the end of the string
import java.util.regex.Pattern
m= Pattern.compile(/^a.{7}$/, Pattern.MULTILINE).matcher(s)
//alternative to (?m) in longhand syntax
assert m.find() && s[m.start()..<m.end()] == 'an apple'
assert m.find() && s[m.start()..<m.end()] == 'a banana'
assert ! m.find()
|
...
| Code Block |
|---|
//the exact positions at which $ and ^ match, with and without a trailing \n
//and with and without multi-line mode
m= ( 'nine\nlives' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10
    // $ matches at end of string once only
assert ! m.find()
m= ( 'nine\nlives\n' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10 // $ matches just before \n ...
assert m.find() && m.start() == 11 && m.end() == 11 //...and again, $ matches after the \n
assert ! m.find()
m= ( 'nine\nlives\n' =~ /(?m)$/ )
assert m.find() && m.start() == 4 && m.end() == 4
    //in multiline mode, $ matches at end of each line
assert m.find() && m.start() == 10 && m.end() == 10
assert m.find() && m.start() == 11 && m.end() == 11
    // $ also always matches after the \n in multiline mode
assert ! m.find()
m= ( 'nine\nlives\n' =~ /^/ )
    // ^ matches at beginning of string once only,
    //even if there's an \n at the end
assert m.find() && m.start() == 0 && m.end() == 0
assert ! m.find()
m= ( 'nine\nlives\n' =~ /(?m)^/ )
assert m.find() && m.start() == 0 && m.end() == 0
assert m.find() && m.start() == 5 && m.end() == 5
    //in multiline mode, ^ matches at beginning of each line
assert ! m.find() // ^ also never matches after the \n in multiline mode
|
...
| Code Block |
|---|
//the \A, \Z and \z anchors refer to the entire input, whatever the flags
def s1= 'an apple\na banana'
assert (s1 =~ /\A.{8}\n.{8}\Z/)
// \A always matches the beginning of the entire input, and \Z its end
assert (s1 =~ /\A.{8}\n.{8}\z/) // \z also matches its end
assert (s1 =~ /(?m)\A.{8}\n.{8}\Z/)
// ?m flag has no effect on meaning of \A \Z and \z
def s2= s1 + '\n'
assert (s2 =~ /(?m)\A.{8}\n.{8}\Z/)
// \Z ignores an extra \n when matching the end of input...
assert ! (s2 =~ /(?m)\A.{8}\n.{8}\z/) // ...but \z is fussy
|
...
| Code Block |
|---|
// \b matches where either the preceding or the following character, but not
//both, is a word character (matched by \w)
(0x20..0x7F).each{it1->
  (0x20..0x7F).each{it2->
    def s= "${it1 as char}${it2 as char}"
    if( s ==~ /.\b./ )
      assert (s[0] ==~ /\w/) ^ (s[1] ==~ /\w/) // ^ means xor (exclusive or)
  }
}
// \B matches where \b doesn't
assert (0x0..0x7F).findAll{ (it as char) ==~ /\b/ && (it as char) ==~ /\B/ }.
    size() == 0
|
We can look behind or ahead of a position, ie, find a position based on text that precedes or follows it, but without matching that text itself. We can only use fixed-length strings when looking behind, ie, literal text, character classes, finite repetition ( {length} and ? ), and alternation where each string in it is also of fixed length, because the length of the match must be able to be predetermined:
| Code Block |
|---|
//lookahead (?=) (?!) and lookbehind (?<=) (?<!) test the neighbouring text
//without making it part of the match
//use (?=) to find the position just in front of all 'qu'...
assert 'the queen quietly quacked'.replaceAll( /(?=qu)/, 'we' ) ==
    'the wequeen wequietly wequacked'
//use (?!) to find all 'c' not followed by 'a'...
assert 'clever cats can count mice'.replaceAll( /c(?!a)/, 'k' ) ==
    'klever cats can kount mike'
//use (?<=) to find all words ending in '-gry'...
assert 'The angry, hungry boy gried out.'.
    replaceAll( /\b\w+?(?<=gry)\b/, 'naughty' ) ==
    'The naughty, naughty boy gried out.'
//use (?<!) to find 3-letter words not ending with 'e'...
assert 'The spy saw seven spuds.'.replaceAll( /\b\w{3}(?<!e)\b/, 'hid' ) ==
    'The hid hid seven spuds.'
//lookaheads and lookbehinds can contain capturing groups...
assert 'the landlord dared band led not'.
    replaceAll( /\b\w{4,}(?<=(\w{3})d)\b/, '$1' ) ==
    'the lor are ban led not'
|
...
| Code Block |
|---|
//splitting and replacing at zero-width positions such as \b
//NOTE(review): the leading '' in the split() results may depend on the JDK
//version (handling of a zero-width match at index 0) -- confirm against the
//target JDK
assert 'The leaky cauldron.'.split(/\b/).toList() ==
    ['', 'The', ' ', 'leaky', ' ', 'cauldron', '.']
    //note that an empty string is prepended
assert 'Hi, my, bye.'.split( /\b(?=\w)/ ).toList() ==
    ['', 'Hi, ', 'my, ', 'bye.']
assert 'The leaky cauldron.'.replaceAll(/\b/, '*') ==
    '*The* *leaky* *cauldron*.'
    //note that text inserted at beginning but not at end
|
...
| Code Block |
|---|
//split() with and without a limit argument
def s= 'hi,my,spy,tie,bye,,'
assert s.split( /,/ ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
assert s.split( /,/, 1 ).toList() == ['hi,my,spy,tie,bye,,']
    //extra argument gives max number of resulting pieces
assert s.split( /,/, 2 ).toList() == ['hi', 'my,spy,tie,bye,,']
assert s.split( /,/, 3 ).toList() == ['hi', 'my', 'spy,tie,bye,,']
assert s.split( /,/, 0 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //any number of splits; same as no arg
assert s.split( /,/, -1 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye', '', '']
    //a negative arg doesn't remove trailing empty strings
assert ( ~/,/ ).split(s).toList() == ['hi', 'my', 'spy', 'tie', 'bye']
    //alternative syntax
assert ( ~/,/ ).split(s, 2).toList() == ['hi', 'my,spy,tie,bye,,']
|
Restricting a String to a Region for a Pattern
We can set the limit of the part of the input string that will be searched to find a match:
| Code Block |
|---|
//restricting a Matcher to a region of its input, and choosing how the region's
//edges treat boundary constructs and anchors
import java.util.regex.Pattern
def m= Pattern.compile( /abc+/ ).matcher( 'aaabc' )
assert m.find()
m.region(1, 4) //restrict string 'aaabc' to a region within, ie, 'aab'
assert ! m.find()
assert m.regionStart() == 1 && m.regionEnd() == 4
assert ! m.region(1, 4).find() //alternative syntax
//we can make a region's boundaries transparent to lookaround and boundary
//matching constructs...
m= Pattern.compile( /abc\b/ ).matcher( 'aaabcdef' )
m.region(1, 5)
assert m.find()
    //doesn't consider whether there's a word boundary (\b) after
    //'aabc' in full string
assert ! m.hasTransparentBounds()
m.region(1, 5)
m.useTransparentBounds(true)
assert ! m.find() //doesn't find anything because the \b doesn't match
assert m.hasTransparentBounds()
assert ! m.region(1, 5).useTransparentBounds(true).find() //alternative syntax
//we can decide whether to match anchors such as ^ and $ at the boundaries of
//the region...
m= Pattern.compile( /^abc$/ ).matcher( 'aaabcdef' )
m.region(2, 5)
assert m.find()
assert m.hasAnchoringBounds() //match such anchors by default
m.region(2, 5)
m.useAnchoringBounds(false)
assert ! m.find() //the ^ and $ no longer match
assert ! m.region(2, 5).useAnchoringBounds(false).find() //alternative syntax
|