Skip to end of metadata
Go to start of metadata

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

Version 1 Next »

Matching Strings to Patterns

We can define string patterns, aka "Regular Expressions" or "Regexes", and see if a String matches it:

Twelve characters that are special syntax for regexes need to be quoted:

The chars \c@, \cA, \cB, ..., \cZ, \c[, \c], \c^, and \c_ map to the special characters 0x0 to 0x1f, except 0x1c:

We have special pattern syntax for whitespace \s, word characters \w, digits \d, and their complements:

There are certain characters that the dot . doesn't match, except when (?s) is used:

Some other flags:

Some other ways to use flags:

A character class is a set of characters, one of which may be matched. We've already seen the predefined character classes \s, \w, \d, \S, \W, \D. We can also define our own:

The only meta-characters inside a character class are \, [, ^ (in the first position), ] (not in the first position or after the ^), - (not in the first position, after the ^, or before the ]), and &&. Quote them with a \ to get the literal character. The other usual meta-characters are normal characters inside a character class, and do not need to be quoted with a backslash, though they can be. Character class precedences are, from highest: literal escapes (eg \s), grouping (eg [abc]), ranges (eg a-g), unions (eg [abc][xyz]), then intersections ([a-z&&[gjpqy]]).

We can use the alternation operator | to give some options:

We use ? to indicate optional character/s:

Use {n} to match a character exactly n times:

// {n} placed after an item requires exactly n repetitions of that item
assert 'aaab' ==~ /a{3}b/
assert 'abcabc' ==~ /(abc){2}/ // {n} can apply to a multi-character sequence
// the closure must assert, otherwise the result of ==~ is silently discarded
['ab', 'ba', 'bb', 'aa'].each{ assert it ==~ /[ab]{2}/ } // {n} can apply to a character class
['abab', '%&@b'].each{ assert it ==~ /.{3}b/ } // {n} can apply to the dot: any three characters, then 'b'

// * after an item matches any number of repetitions of that item, including none
['aaab', 'aab', 'ab', 'b'].each{ assert it ==~ /a*b/ } //even zero occurrences of the character are matched
['abcabc', 'abc', ''].each{ assert it ==~ /(abc)*/ } // * can apply to a multi-character sequence
['abbacb', 'acaba', 'cbbbac', 'c', ''].each{ assert it ==~ /[abc]*/ } // * can apply to a character class
['aaab', 'b', 'abab'].each{ assert it ==~ /.*b/ } // * is greedy: in 'abab' .* matches 'aba'

//Use + to match at least one occurrence of a character:
['aaab', 'aab', 'ab'].each{ assert it ==~ /a+b/ }
assert !( 'b' ==~ /a+b/ ) //at least one 'a' is required
assert 'abcabcxz' ==~ /(abc)+[xyz]+/ // + can apply to character class or multi-character sequence

//Other variable-length repetition operators:
assert 'aaaab' ==~ /a{3,}b/ // {n,} matches at least n characters
assert 'aaaab' ==~ /a{3,5}b/ // {n1,n2} matches between n1 and n2 characters
assert 'abaxyzxyz' ==~ /[ab]{2,}(xyz){2,4}/ //these also can apply to multi-character sequences or character classes

//each parenthesised part of the pattern is a group whose matched text can be read back afterwards
def m= java.util.regex.Pattern.compile( /(.*),(.*)/ ).matcher( 'one,two,three' )
m.matches()
assert m.group(1) == 'one,two' //what was matched between the first parens (the greedy .* takes as much as it can)
assert m.group(2) == 'three' //what was matched between the second parens

assert m.hasGroup() //misc method to check whether the pattern has groups
assert m.groupCount() == 2 //misc method to count them

//we can access matched values in groups outside the pattern using longhand syntax...
def m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches() //run the whole-string match first; the group queries below read its result
//start(n) and end(n) give the index range of group n, with end(n) one past the last matched character
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5
assert m.group(0) == 'aaabb' //group(0) is the entire string
assert m.group() == 'aaabb' && m.start() == 0 && m.end() == 5 //parameters default to 0

//...or outside the pattern using indexing syntax (don't forget the first [0] index)...
m= java.util.regex.Pattern.compile( /(a*)(b*)/ ).matcher( 'aaabb' )
m.matches()
//the first index selects the match, the second index selects the group within that match
assert m[0][0] == 'aaabb' //the entire string
assert m[0][1] == 'aaa' && m.start(1) == 0 && m.end(1) == 3
assert m[0][2] == 'bb' && m.start(2) == 3 && m.end(2) == 5

//...or within the pattern using \n notation:
assert 'aaabb,aaa,bb' ==~ /(a*)(b*),\1,\2/ // \1 is the first group matched, \2 the second matched

assert 'abbcc,abb,bb,cc' ==~ /(a(b*))(c*),\1,\2,\3/ //groups numbered by sequence of their opening parens from left to right
assert 'abcddd,ab,ddd' ==~ /(a(?:b))(?>c)(d*),\1,\2/ //groups beginning with ?: or ?> aren't numbered
assert 'aba,a,b' ==~ /(a(b)?)+,\1,\2/ //second match for \1 has no match for \2, so \2 keeps value from its first match

//a backreference to an optional group works when the group took part in the match...
assert 'abc,bc' ==~ /a(bc)?,\1/
assert !( 'a,' ==~ /a(bc)?,\1/ ) //referencing \1 causes entire match to fail if it hasn't already matched a value
assert !( 'a' ==~ /([abc]\1)/ ) //referencing a group within itself causes entire match to fail

//the ~ operator turns a slashy string into a pattern object, on which matcher() can be called
def m= ( ~/(a*)|bc/ ).matcher( 'bc' ) //another longhand syntax
m.matches() //succeeds through the second alternative, so group 1 takes no part in the match
assert m.group(1) == null && m.start(1) == -1 && m.end(1) == -1 //if match successful but group didn't match anything

def p= java.util.regex.Pattern.compile( /ab*c/ )
assert p.pattern() == /ab*c/ //retrieve the definition from a compiled pattern

//the =~ operator looks for the pattern anywhere inside the string, rather than matching the whole string
assert 'abcdefg' =~ /cde/ //is 'cde' within 'abcdefg'?
assert ! ( 'abcdefg' =~ /ace/ ) //'a', 'c' and 'e' all occur, but not as one contiguous run
assert java.util.regex.Pattern.compile( /cde/ ).matcher( 'abcdefg' ).find() //alternative syntax

assert 'xxx z9g\t\nxxx' =~ /\s\w\d.\t\n/ //special characters work the same as with ==~ matching
assert ( 'xxxgOoDbYexxx' =~ /(?i)goodbye/ ) //flags also work the same as with ==~
assert 'xxxbatxxx' =~ /b[aeiou]t/ //character classes also work the same as with ==~

def s= 'horse house'
assert s =~ /ho.se/ //to check for the first occurrence only
def m= (s =~ /ho.se/)
assert m.size() == 2 && m[0] == 'horse' && m[1] == 'house' //to retrieve all occurrences

//eachMatch() calls the closure once per occurrence found in the string
def l= []
s.eachMatch( /ho.se/ ){ l << it[0] } //alternative syntax, be sure to use it[0]
assert l == ['horse', 'house']
def l2= []
s.eachMatch( /abc/ ){ l2 << it[0] } //no matches
assert l2 == []
def l3= []
s.eachMatch( /hor./ ){ l3 << it[0] } //one match only
assert l3 == ['hors']

import java.util.regex.Pattern
//stepping through the occurrences one at a time with the longhand Matcher API
def s= 'hoose horse house'
def m= Pattern.compile(/ho.se/).matcher(s)
//each find() moves on to the next occurrence; start()..<end() is the range of the text just found
assert m.find() && s[m.start()..<m.end()] == 'hoose'
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find() && s[m.start()..<m.end()] == 'house'
assert ! m.find() //no occurrences left
assert m.reset() && s[m.start()..<m.end()] == 'hoose' //use reset() to find from beginning
assert m.find() && s[m.start()..<m.end()] == 'horse'
assert m.find(1) && s[m.start()..<m.end()] == 'horse' //giving a parameter to find() starts finding from that index
m.setIndex(1) //alternatively, calling setIndex() resets from that index, without finding until find() called
assert m.find() && s[m.start()..<m.end()] == 'horse'

//when the pattern has groups, each occurrence is a list: the whole match followed by its groups
def m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m.size() == 2
assert m.count == 2 //alternative to size()
assert m[0] == ['mistle', 'i', 't']
assert m[0].size() == 3 && m[0][0] == 'mistle' && m[0][1] == 'i' && m[0][2] == 't'
assert m[1] == ['muscle', 'u', 'c']
assert m[1].size() == 3 && m[1][0] == 'muscle' && m[1][1] == 'u' && m[1][2] == 'c'

//using the eachMatch() method...
def l= []
'mistlemuscle'.eachMatch( /m(.)s(.)le/ ){ l << it }
assert l*.toList() == [['mistle', 'i', 't'], ['muscle', 'u', 'c']]
def l2= []
'mistle'.eachMatch( /m(.)s(.)le/ ){ l2 << it }
assert l2*.toList() == [['mistle', 'i', 't']]
def l3= []
'practical'.eachMatch( /m(.)s(.)le/ ){ l3 << it }
assert l3*.toList() == []

//using longhand notation...
import java.util.regex.Pattern
m= Pattern.compile( /(a+)(b+)/ ).matcher( 'aaabbcccaabbb' )
m.find() //first occurrence: 'aaabb'
assert m.group(1) == 'aaa' && m.start(1) == 0 && m.end(1) == 3 &&
       m.group(2) == 'bb' && m.start(2) == 3 && m.end(2) == 5 &&
       m.group() == 'aaabb' && m.start() == 0 && m.end() == 5
m.find() //second occurrence: 'aabbb'
assert m.group(1) == 'aa' && m.start(1) == 8 && m.end(1) == 10 &&
       m.group(2) == 'bbb' && m.start(2) == 10 && m.end(2) == 13 &&
       m.group() == 'aabbb' && m.start() == 8 && m.end() == 13

//NOTE(review): the "== []" asserts below describe a matcher that stays exhausted after one pass;
//newer Groovy releases may rewind the matcher before each iteration - confirm against the target version
def m= ('redeem coffee' =~ /ee/)
assert m.collect{it} == ['ee', 'ee'] //when calling collect() on a pattern with no groups...
assert m.collect{it} == [] //...we must call reset() if we want to access the found matches again
m.reset()
assert m.collect{it} == ['ee', 'ee']

def l= [] //ditto for each()
m.each{ l << it }
assert l == []
m.reset()
l= []
m.each{ l << it }
assert l == ['ee', 'ee']

l= [] //ditto for eachWithIndex
m.eachWithIndex{it, i-> l << it+i }
assert l == []
m.reset()
l= []
m.eachWithIndex{it, i-> l << it+i }
assert l == ['ee0', 'ee1']

m= ('play the game\nfollow the rules' =~ /(?m)^(.*?) the (.*?)$/) //for a pattern with groups...
l= []
m.each{g0, g1, g2-> l << [g0, g1, g2] } //...we must pass the groups separately to the closure of each()
assert l == [['play the game', 'play', 'game'], ['follow the rules', 'follow', 'rules']]

m= ( 'mistlemuscle' =~ /m(.)s(.)le/ )
assert m[1] == ['muscle', 'u', 'c']
assert m.group(0) == 'muscle' && m.group(1) == 'u' && m.group(2) == 'c' //only call group() after using subscripting first

//the usual collection methods work on the occurrences; here it[1] is the second letter of each word found
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).findAll{ it[1] == 'a' } == ['tame', 'tape', 'take']
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).find{ it[1] == 'a' } == 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).findIndexOf{ it[1] == 'a' } == 2 //index of 'tame'
assert ('tone, true, tame, tape, take, tile, time' =~ /t..e/).any{ it[1] == 'a' }
assert ! ('tone, true, tame, tape, take, tile, time' =~ /t..e/).every{ it[1] == 'a' }

assert ('abcdefg' =~ /bcd|bcdef/)[0] == 'bcd'
assert ('abcdefg' =~ /bcdef|bcd/)[0] == 'bcdef' //first choice always tried first

assert ('Friday 13th' =~ /Fri(day)?/)[0][0] == 'Friday' //the optional group is taken when it can be

//the greedy .* runs on to the last double-quote in the string...
assert ('Say "hello" and "goodbye" to the world!' =~ /".*"/)[0] == '"hello" and "goodbye"'
l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /"[^"]*"/ ){ l << it } //use NOT DOUBLE-QUOTES instead of ANY CHARACTER
assert l*.toList() == [['"hello"'], ['"goodbye"']]

//operators that can match nothing at all also report empty matches when finding occurrences
def m= ('grgggr'=~/g?/)
def l= []
for( int i in 0..<(m.size() as int) ) l << m[i] //collect every occurrence by index
assert l == ['g', '', 'g', 'g', 'g', '', ''] // ? option also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g*/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', '', 'ggg', '', ''] // * repetition also matches the empty space before each 'r', and the end of string

m= ('grgggr'=~/g+/)
l= []
for( int i in 0..<(m.size() as int) ) l << m[i]
assert l == ['g', 'ggg'] // + repetition is the most intuitive to use

//a second ? after the option makes it prefer the shorter alternative
assert ('Friday 13th' =~ /Fri(day)??/)[0][0] == 'Fri' //instead of 'Friday'

//the lazy .*? stops at the nearest closing double-quote, so each quoted word is found separately
def l= []
'Say "hello" and "goodbye" to the world!'.eachMatch( /".*?"/ ){ l << it }
assert l*.toList() == [['"hello"'], ['"goodbye"']]

def s= 'a quick quick dog'
def m= (s =~ /a.*k/) //starts at the beginning, but doesn't try to match the entire string
assert m.lookingAt() && s[m.start()..<m.end()] == 'a quick quick'

//replaceFirst...
assert (s =~ /quick/).replaceFirst('fast') == 'a fast quick dog'
assert (s =~ /qu(ick)/).replaceFirst('kw$1') == 'a kwick quick dog' //can reference groups in pattern using $
assert (s =~ /qu(ick)/).replaceFirst('kw\\$1') == 'a kw$1 quick dog' //include literal $ by writing \$, escaping \ as \\

//utility method to create a literal replacement String for the given String...
import java.util.regex.Matcher
assert Matcher.quoteReplacement( 'kw$1' ) == 'kw\\$1'
assert (s =~ /qu(ick)/).replaceFirst( Matcher.quoteReplacement( 'kw$1' ) ) == 'a kw$1 quick dog'

//we can mix GStrings and replacement group refs by mixing single-quoted and double-quoted strings...
def ice= 'ice cream'
assert ('some malting beer' =~ /a(lting ).*/).replaceFirst('e$1' + "$ice") == 'some melting ice cream'

//replaceAll...
assert (s =~ /quick/).replaceAll('fast') == 'a fast fast dog'
s= 'a quickly quacking duck'
assert (s =~ /qu(.)ck/).replaceAll('kw$1ck') == 'a kwickly kwacking duck'

//another shorthand...
assert 'a quick quick dog'.replaceFirst(/qu(ick)/, 'kw\\$1') == 'a kw$1 quick dog'
assert 'a quickly quacking duck'.replaceAll(/qu(.)ck/, 'kw$1ck') == 'a kwickly kwacking duck'

//'appendReplacement' and 'appendTail' should be used together for more complex replacements...
m= 'one banana two havana three matana four' =~ /(.a.)ana/
def i=0, sb= new StringBuffer()
while( m.find() ) m.appendReplacement(sb, '$1a' + 'na'*i++) //each later occurrence gets one more 'na' appended
m.appendTail(sb) //copy across whatever follows the last occurrence
assert sb.toString() == 'one bana two havana three matanana four'

//the greedy * operator, with backwards backtracking...
def m= ( ~/(.*),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one,two' && m.group(2) == 'three'

//the lazy *? operator, with forwards backtracking...
m= ( ~/(.*?),(.*)/ ).matcher( 'one,two,three' )
assert m.matches() && m.group(1) == 'one' && m.group(2) == 'two,three'

//the possessive *+ operator, with no backtracking at all, even when doing so would cause a match...
assert ! ( ~/(.*+),(.*)/ ).matcher( 'one,two,three' ).matches()

//we can qualify other operators with possessiveness, such as ++, ?+, {m,n}+...
m= ( ~/([abc,]*+),(.*)/ ).matcher( 'abba,and,beegees' )
assert ! m.matches() //greedily matches 'abba,a', but doesn't backtrack to 'abba'

assert ! ( 'abbbc' ==~ /a(?>b*)bc/ ) //after 'bbb' matched, no backtracking to 'bb' within atomic group

def s= 'an apple\nthe lime\na banana'
assert ! (s =~ /^a.{7}$/) //normally, ^ matches the beginning of the entire input, and $ matches its end
def m= (s =~ /(?m)^a.{7}$/) //in multi-line mode, ^ matches the beginning of each line, $ matches each line's end
assert m.size() == 2 && m[0] == 'an apple' && m[1] == 'a banana'
assert m.toString() ==
  'java.util.regex.Matcher[pattern=(?m)^a.{7}$ region=0,26 lastmatch=a banana]' //some technical info

assert ((s+'\n') =~ /(?m)^a.{7}$/) // $ ignores any \n at the end of the string

import java.util.regex.Pattern
m= Pattern.compile(/^a.{7}$/, Pattern.MULTILINE).matcher(s) //alternative to (?m) in longhand syntax
assert m.find() && s[m.start()..<m.end()] == 'an apple'
assert m.find() && s[m.start()..<m.end()] == 'a banana'
assert ! m.find()

//the anchors match positions rather than characters, so start() and end() are equal for every match below
m= ( 'nine\nlives' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10 // $ matches at end of string once only
assert ! m.find()

m= ( 'nine\nlives\n' =~ /$/ )
assert m.find() && m.start() == 10 && m.end() == 10 // $ matches just before \n ...
assert m.find() && m.start() == 11 && m.end() == 11 //...and again, $ matches after the \n
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)$/ )
assert m.find() && m.start() == 4 && m.end() == 4 //in multiline mode, $ matches at end of each line
assert m.find() && m.start() == 10 && m.end() == 10
assert m.find() && m.start() == 11 && m.end() == 11 // $ also always matches after the \n in multiline mode
assert ! m.find()

m= ( 'nine\nlives\n' =~ /^/ ) // ^ matches at beginning of string once only, even if there's an \n at the end
assert m.find() && m.start() == 0 && m.end() == 0
assert ! m.find()

m= ( 'nine\nlives\n' =~ /(?m)^/ )
assert m.find() && m.start() == 0 && m.end() == 0
assert m.find() && m.start() == 5 && m.end() == 5 //in multiline mode, ^ matches at beginning of each line
assert ! m.find() // ^ also never matches after the \n in multiline mode

def s1= 'an apple\na banana'
assert (s1 =~ /\A.{8}\n.{8}\Z/) // \A always matches the beginning of the entire input, and \Z its end
assert (s1 =~ /\A.{8}\n.{8}\z/) // \z also matches its end

assert (s1 =~ /(?m)\A.{8}\n.{8}\Z/) // ?m flag has no effect on meaning of \A \Z and \z

def s2= s1 + '\n'
assert (s2 =~ /(?m)\A.{8}\n.{8}\Z/) // \Z ignores an extra \n when matching the end of input...
assert ! (s2 =~ /(?m)\A.{8}\n.{8}\z/) // ...but \z is fussy

// \b matches where exactly one of the preceding and following characters is a word character (matched by \w)
(0x20..0x7F).each{it1->
  (0x20..0x7F).each{it2->
    def s= "${it1 as char}${it2 as char}" //every two-character string over this range
    if( s ==~ /.\b./ ) assert (s[0] ==~ /\w/) ^ (s[1] ==~ /\w/) // ^ means xor (exclusive or)
  }
}
// \B matches where \b doesn't
// NOTE(review): the backslashes in \b and \B below are restored from context - confirm against the current page
assert (0x0..0x7F).findAll{ (it as char) ==~ /\b/ && (it as char) ==~ /\B/ }.size() == 0

//use (?=) to find the position just in front of all 'qu'...
assert 'the queen quietly quacked'.replaceAll( /(?=qu)/, 'we' ) == 'the wequeen wequietly wequacked'

//use (?!) to find all 'c' not followed by 'a'...
assert 'clever cats can count mice'.replaceAll( /c(?!a)/, 'k' ) == 'klever cats can kount mike'

//use (?<=) to find all words ending in '-gry'...
assert 'The angry, hungry boy gried out.'.replaceAll( /\b\w+?(?<=gry)\b/, 'naughty' ) ==
  'The naughty, naughty boy gried out.'

//use (?<!) to find 3-letter words not ending with 'e'...
assert 'The spy saw seven spuds.'.replaceAll( /\b\w{3}(?<!e)\b/, 'hid' ) == 'The hid hid seven spuds.'

//lookaheads and lookbehinds can contain capturing groups...
assert 'the landlord dared band led not'.replaceAll( /\b\w{4,}(?<=(\w{3})d)\b/, '$1' ) ==
  'the lor are ban led not'

//NOTE(review): the leading '' in the two split() results reflects older Java runtimes; newer ones
//appear to drop a zero-width match at the very start of the input - confirm on the target JVM
assert 'The leaky cauldron.'.split(/\b/).toList() ==
  ['', 'The', ' ', 'leaky', ' ', 'cauldron', '.'] //note that an empty string is prepended
assert 'Hi, my, bye.'.split( /\b(?=\w)/ ).toList() == ['', 'Hi, ', 'my, ', 'bye.']
assert 'The leaky cauldron.'.replaceAll(/\b/, '*') ==
  '*The* *leaky* *cauldron*.' //note that text inserted at beginning but not at end

//splitting on a separator; the input ends with two separators, so two trailing empty strings are possible
def s= 'hi,my,spy,tie,bye,,'
assert s.split( /,/ ).toList() == ['hi', 'my', 'spy', 'tie', 'bye'] //trailing empty strings are dropped
assert s.split( /,/, 1 ).toList() == ['hi,my,spy,tie,bye,,'] //extra argument gives max number of pieces returned
assert s.split( /,/, 2 ).toList() == ['hi', 'my,spy,tie,bye,,'] //the last piece holds the unsplit remainder
assert s.split( /,/, 3 ).toList() == ['hi', 'my', 'spy,tie,bye,,']
assert s.split( /,/, 0 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye'] //any number of splits; same as no arg
assert s.split( /,/, -1 ).toList() == ['hi', 'my', 'spy', 'tie', 'bye', '', '']
  //a negative arg doesn't remove trailing empty strings

assert ( ~/,/ ).split(s).toList() == ['hi', 'my', 'spy', 'tie', 'bye'] //alternative syntax
assert ( ~/,/ ).split(s, 2).toList() == ['hi', 'my,spy,tie,bye,,']

import java.util.regex.Pattern
//a region restricts matching to part of the input string
def m= Pattern.compile( /abc+/ ).matcher( 'aaabc' )
assert m.find() //'abc' is found when the whole string is searched
m.region(1, 4) //restrict string 'aaabc' to a region within, ie, 'aab'
assert ! m.find()
assert m.regionStart() == 1 && m.regionEnd() == 4
assert ! m.region(1, 4).find() //alternative syntax

//we can make a region's boundaries transparent to lookaround and boundary matching constructs...
m= Pattern.compile( /abc\b/ ).matcher( 'aaabcdef' )
m.region(1, 5) //the region is 'aabc'
assert m.find() //doesn't consider whether there's a word boundary (\b) after 'aabc' in full string
assert ! m.hasTransparentBounds() //bounds are not transparent by default
m.region(1, 5)
m.useTransparentBounds(true) //now \b can see the 'd' that follows the region
assert ! m.find() //doesn't find anything because the \b doesn't match
assert m.hasTransparentBounds()
assert ! m.region(1, 5).useTransparentBounds(true).find() //alternative syntax

//we can decide whether to match anchors such as ^ and $ at the boundaries of the region...
m= Pattern.compile( /^abc$/ ).matcher( 'aaabcdef' )
m.region(2, 5) //the region is exactly 'abc'
assert m.find()
assert m.hasAnchoringBounds() //match such anchors by default
m.region(2, 5)
m.useAnchoringBounds(false)
assert ! m.find() //the ^ and $ no longer match
assert ! m.region(2, 5).useAnchoringBounds(false).find() //alternative syntax

  • No labels