理解完正则学会一半:
示例代码(输出结果见代码之后):
- import re
def test_patterns(text, patterns=None):
    """Print every match of each regular expression found in ``text``.

    A two-row column ruler (tens digit above units digit) and the text
    itself are printed first, so the reported offsets can be read off
    visually. For each pattern, one line per match is printed showing the
    inclusive start and end offsets and the matched substring.

    Args:
        text: The string to search.
        patterns: Iterable of regular-expression strings to try in order.
            ``None`` (the default) means no patterns.

    Returns:
        None. All results are written to stdout.
    """
    # A list literal as the default would be shared between calls.
    if patterns is None:
        patterns = []

    columns = range(len(text))
    # Floor division keeps the tens digit an int on Python 2 and 3 alike;
    # columns 0-9 have no tens digit and are shown as a blank.
    print(''.join(str(i // 10 or ' ') for i in columns))
    print(''.join(str(i % 10) for i in columns))
    print(text)

    # Look for each pattern in the text and print the results.
    for pattern in patterns:
        print('Matching "%s"' % pattern)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            # match.end() is exclusive; report the last matched index.
            print(' %2d : %2d = "%s"' % (s, e - 1, text[s:e]))


# The name starts with "test_" but this is a demo helper, not a unit test;
# keep pytest/nose from collecting it.
test_patterns.__test__ = False
# Demo driver: runs test_patterns over a series of sample texts, one group
# per regular-expression feature, separated by a row of asterisks.
if __name__ == '__main__':
    print("*" * 50)
    # Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])

    print("*" * 50)
    # Repetition
    test_patterns('abbaaabbbbaaaaa', [
        'ab*',      # a followed by zero or more b
        'ab+',      # a followed by one or more b
        'ab?',      # a followed by zero or one b
        'ab{3}',    # a followed by three b
        'ab{2,3}',  # a followed by two to three b
    ])

    print("*" * 50)
    # Character Sets
    test_patterns('abbaaabbbbaaaaa', [
        '[ab]',     # either a or b
        'a[ab]+',   # a followed by one or more a or b
        'a[ab]+?',  # a followed by one or more a or b, not greedy
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[^-. ]+',  # sequences without -, ., or space
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[a-z]+',       # sequences of lower case letters
        '[A-Z]+',       # sequences of upper case letters
        '[a-zA-Z]+',    # sequences of lower or upper case letters
        '[A-Z][a-z]+',  # one upper case letter followed by lower case letters
    ])

    print("*" * 50)
    test_patterns('abbaaabbbbaaaaa', [
        'a.',     # a followed by any one character
        'b.',     # b followed by any one character
        'a.*b',   # a followed by anything, ending in b
        'a.*?b',  # a followed by anything, ending in b (not greedy)
    ])

    print("*" * 50)
    # Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!', [
        r'\d+',  # sequence of digits
        r'\D+',  # sequence of non-digits
        r'\s+',  # sequence of whitespace
        r'\S+',  # sequence of non-whitespace
        r'\w+',  # alphanumeric characters
        r'\W+',  # non-alphanumeric
    ])

    print("*" * 50)
    # The text here consists of escape codes, so each pattern escapes the
    # backslash and the plus sign to match them literally.
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+', [
        r'\\d\+',
        r'\\D\+',
        r'\\s\+',
        r'\\S\+',
        r'\\w\+',
        r'\\W\+',
    ])

    # Anchoring
    # Code  Meaning
    # ^     start of string, or line
    # $     end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        r'^\w+',      # word at start of string
        r'\A\w+',     # word at start of string
        r'\w+\S*$',   # word at end of string, with optional punctuation
        r'\w+\S*\Z',  # word at end of string, with optional punctuation
        r'\w*t\w*',   # word containing 't'
        r'\bt\w+',    # 't' at start of word
        r'\w+t\b',    # 't' at end of word
        r'\Bt\B',     # 't', not start or end of word
    ])
输出结果(待续...):
- **************************************************
- 11111
- 012345678901234
- abbaaabbbbaaaaa
- Matching "ab"
- 0 : 1 = "ab"
- 5 : 6 = "ab"
- **************************************************
- 11111
- 012345678901234
- abbaaabbbbaaaaa
- Matching "ab*"
- 0 : 2 = "abb"
- 3 : 3 = "a"
- 4 : 4 = "a"
- 5 : 9 = "abbbb"
- 10 : 10 = "a"
- 11 : 11 = "a"
- 12 : 12 = "a"
- 13 : 13 = "a"
- 14 : 14 = "a"
- Matching "ab+"
- 0 : 2 = "abb"
- 5 : 9 = "abbbb"
- Matching "ab?"
- 0 : 1 = "ab"
- 3 : 3 = "a"
- 4 : 4 = "a"
- 5 : 6 = "ab"
- 10 : 10 = "a"
- 11 : 11 = "a"
- 12 : 12 = "a"
- 13 : 13 = "a"
- 14 : 14 = "a"
- Matching "ab{3}"
- 5 : 8 = "abbb"
- Matching "ab{2,3}"
- 0 : 2 = "abb"
- 5 : 8 = "abbb"
- **************************************************
- 11111
- 012345678901234
- abbaaabbbbaaaaa
- Matching "[ab]"
- 0 : 0 = "a"
- 1 : 1 = "b"
- 2 : 2 = "b"
- 3 : 3 = "a"
- 4 : 4 = "a"
- 5 : 5 = "a"
- 6 : 6 = "b"
- 7 : 7 = "b"
- 8 : 8 = "b"
- 9 : 9 = "b"
- 10 : 10 = "a"
- 11 : 11 = "a"
- 12 : 12 = "a"
- 13 : 13 = "a"
- 14 : 14 = "a"
- Matching "a[ab]+"
- 0 : 14 = "abbaaabbbbaaaaa"
- Matching "a[ab]+?"
- 0 : 1 = "ab"
- 3 : 4 = "aa"
- 5 : 6 = "ab"
- 10 : 11 = "aa"
- 12 : 13 = "aa"
- **************************************************
- 1111111111222222222233333333
- 01234567890123456789012345678901234567
- This is some text -- with punctuation.
- Matching "[^-. ]+"
- 0 : 3 = "This"
- 5 : 6 = "is"
- 8 : 11 = "some"
- 13 : 16 = "text"
- 21 : 24 = "with"
- 26 : 36 = "punctuation"
- **************************************************
- 1111111111222222222233333333
- 01234567890123456789012345678901234567
- This is some text -- with punctuation.
- Matching "[a-z]+"
- 1 : 3 = "his"
- 5 : 6 = "is"
- 8 : 11 = "some"
- 13 : 16 = "text"
- 21 : 24 = "with"
- 26 : 36 = "punctuation"
- Matching "[A-Z]+"
- 0 : 0 = "T"
- Matching "[a-zA-Z]+"
- 0 : 3 = "This"
- 5 : 6 = "is"
- 8 : 11 = "some"
- 13 : 16 = "text"
- 21 : 24 = "with"
- 26 : 36 = "punctuation"
- Matching "[A-Z][a-z]+"
- 0 : 3 = "This"
- **************************************************
- 11111
- 012345678901234
- abbaaabbbbaaaaa
- Matching "a."
- 0 : 1 = "ab"
- 3 : 4 = "aa"
- 5 : 6 = "ab"
- 10 : 11 = "aa"
- 12 : 13 = "aa"
- Matching "b."
- 1 : 2 = "bb"
- 6 : 7 = "bb"
- 8 : 9 = "bb"
- Matching "a.*b"
- 0 : 9 = "abbaaabbbb"
- Matching "a.*?b"
- 0 : 1 = "ab"
- 3 : 6 = "aaab"
- **************************************************
- 11111111112222222
- 012345678901234567890123456
- This is a prime #1 example!
- Matching "\d+"
- 17 : 17 = "1"
- Matching "\D+"
- 0 : 16 = "This is a prime #"
- 18 : 26 = " example!"
- Matching "\s+"
- 4 : 4 = " "
- 7 : 7 = " "
- 9 : 9 = " "
- 15 : 15 = " "
- 18 : 18 = " "
- Matching "\S+"
- 0 : 3 = "This"
- 5 : 6 = "is"
- 8 : 8 = "a"
- 10 : 14 = "prime"
- 16 : 17 = "#1"
- 19 : 26 = "example!"
- Matching "\w+"
- 0 : 3 = "This"
- 5 : 6 = "is"
- 8 : 8 = "a"
- 10 : 14 = "prime"
- 17 : 17 = "1"
- 19 : 25 = "example"
- Matching "\W+"
- 4 : 4 = " "
- 7 : 7 = " "
- 9 : 9 = " "
- 15 : 16 = " #"
- 18 : 18 = " "
- 26 : 26 = "!"
- **************************************************
- 1111111111222
- 01234567890123456789012
- \d+ \D+ \s+ \S+ \w+ \W+
- Matching "\\d\+"
- 0 : 2 = "\d+"
- Matching "\\D\+"
- 4 : 6 = "\D+"
- Matching "\\s\+"
- 8 : 10 = "\s+"
- Matching "\\S\+"
- 12 : 14 = "\S+"
- Matching "\\w\+"
- 16 : 18 = "\w+"
- Matching "\\W\+"
- 20 : 22 = "\W+"
- **************************************************
- 1111111111222222222233333333
- 01234567890123456789012345678901234567
- This is some text -- with punctuation.
- Matching "^\w+"
- 0 : 3 = "This"
- Matching "\A\w+"
- 0 : 3 = "This"
- Matching "\w+\S*$"
- 26 : 37 = "punctuation."
- Matching "\w+\S*\Z"
- 26 : 37 = "punctuation."
- Matching "\w*t\w*"
- 13 : 16 = "text"
- 21 : 24 = "with"
- 26 : 36 = "punctuation"
- Matching "\bt\w+"
- 13 : 16 = "text"
- Matching "\w+t\b"
- 13 : 16 = "text"
- Matching "\Bt\B"
- 23 : 23 = "t"
- 30 : 30 = "t"
- 33 : 33 = "t"