| prev | Draft Version 560 (Wed Nov 9 17:41:15 2005) | next |
"*" in the shell's *.txtre module, then use re.search(pattern, text)
- pattern is a regular expression that describes what you're looking for
- text is the string you're searching in
- pattern?
| Pattern | Matches | Doesn't Match | Explanation |
|---|---|---|---|
| ⌈a*⌋ | "", "a", "aa", … | "A", "b" | ⌈*⌋ means “zero or more”; matching is case sensitive |
| ⌈b+⌋ | "b", "bb", … | "" | ⌈+⌋ means “one or more” |
| ⌈ab?c⌋ | "ac", "abc" | "a", "abbc" | ⌈?⌋ means “optional” (zero or one) |
| ⌈[abc]⌋ | "a", "b", or "c" | "ab", "d" | ⌈[…]⌋ means “one character from a set” |
| ⌈[a-c]⌋ | "a", "b", or "c" | | Character ranges can be abbreviated |
| ⌈[abc]*⌋ | "", "ac", "baabcab", … | | Operators can be combined: zero or more choices from "a", "b", or "c" |
re.search looks for a match anywhere in the text
import re
# Pattern under test: an "a" followed by any run of "b" or "c" characters.
pattern = 'a[bc]*'
# re.search reports a hit if the pattern matches anywhere inside the text.
for text in ['b', 'ab', 'accb', 'mad']:
    if re.search(pattern, text):
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('"%s" matches "%s"' % (pattern, text))
    else:
        print('"%s" does not match "%s"' % (pattern, text))
"a[bc]*" does not match "b"
"a[bc]*" matches "ab"
"a[bc]*" matches "accb"
"a[bc]*" matches "mad"
⌈a[bc]*⌋ matches an "a", followed by zero or more of either "b" or "c"
- It doesn't match "b" because there's no leading "a"
- It matches "ab" and "accb"
- But why does it match "mad"?
- re.search looks for a match anywhere in text
- It skips the "m", then ⌈a⌋ matches "a", and ⌈[bc]*⌋ matches the empty string
- If re.search looks anywhere in the line, how to find blank lines?
- Lines such as "x \n" or " x\n" are not blank
- ⌈^⌋ matches the beginning of the string
- ⌈$⌋ matches the end
import sys, re
# Nothing but space, tab, carriage return, newline from start to end
pattern = '^[ \t\r\n]*$'


# Count matches in one file/stream.
def count(filename, instream):
    """Print how many lines of ``instream`` are blank.

    A line counts as blank when the module-level ``pattern`` matches it,
    i.e. it holds nothing but spaces, tabs, carriage returns and newlines.

    filename -- label printed in front of the total
    instream -- any iterable that yields lines of text

    Returns None; the result is printed as "<filename> <total>".
    """
    # Tally under its own name: the original reused ``count`` for the
    # local, which shadowed the function inside its own body.
    blank_lines = 0
    for line in instream:
        if re.search(pattern, line):
            blank_lines += 1
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('%s %d' % (filename, blank_lines))
# Only standard instream?
if len(sys.argv) == 1:
    # No file names on the command line: read standard input.
    count('<stdin>', sys.stdin)
else:
    # Otherwise report on each named file in turn.
    for filename in sys.argv[1:]:
        instream = open(filename, 'r')
        # try/finally so the file is closed even if counting raises.
        try:
            count(filename, instream)
        finally:
            instream.close()
- How do you match a literal "^" or "*"?
- Put a "\" in front of it
- ⌈\$⌋ matches a literal "$", and ⌈\\⌋ matches a literal "\"
- In Python string literals these are written "\\$" and "\\\\"
- "\t" is a tab character, which matches a tab character
- "\\t" is the two-character sequence ⌈\t⌋, which also matches a tab character
- "\" is also used in shorthand notation for common character sets
- ⌈[^abc]⌋ means “anything except the characters in this set”
- ⌈.⌋ means “any character except the end of line”
- Equivalent to ⌈[^\n]⌋
- ⌈\b⌋ anchors the match to a break between word and non-word characters
- Like ⌈^⌋ and ⌈$⌋, it doesn't consume any actual characters
 load D 1
10 : sub A B
jlt A 20
import sys, re
# start of line, optional spaces, digits, more optional spaces, colon
numbered = '^\\s*\\d+\\s*:'
# Line-number tokens seen so far (dict used as a set).
seen = {}
for line in sys.stdin:
    if re.search(numbered, line):
        # Take the first whitespace-separated token as the line number.
        # NOTE: split() keeps a colon that touches the digits, so "2:"
        # stays "2:" while "2 :" yields "2".
        num = line.split()[0]
        if num in seen:
            # Repeated number: report it.
            print(num)
        else:
            seen[num] = True
- '2 :'.split() gives ['2', ':'], but '2:'.split() gives ['2:']
- The result of re.search is actually a match object that records what matched, and where
- mo.group() returns the whole string that matched the RE
- mo.start() and mo.end() are the indices of the match's location
import re
text = 'abbcb'
for pattern in ['b+', 'bc*', 'b+c+']:
    # re.search returns a match object describing the first match found.
    mo = re.search(pattern, text)
    # group() is the matched text; start() and end() are its indices in text.
    # Parentheses replace the backslash continuation, and single-argument
    # print(...) gives the same output on Python 2 and 3.
    print('%s / %s => "%s" (%d, %d)' %
          (pattern, text, mo.group(), mo.start(), mo.end()))
b+ / abbcb => "bb" (1, 3)
bc* / abbcb => "b" (1, 2)
b+c+ / abbcb => "bbc" (1, 4)
- mo.group(3) is the text that matched the third subexpression, mo.start(3) is where it started
import sys, re
# start of line, optional spaces, digits, more optional spaces, colon
numbered = '^\\s*(\\d+)\\s*:'
# Line numbers seen so far (dict used as a set).
seen = {}
for line in sys.stdin:
    mo = re.search(numbered, line)
    if mo:
        # Group 1 is just the digits, whether or not a space precedes the colon.
        num = mo.group(1)
        if num in seen:
            # Repeated number: report it.
            print(num)
        else:
            seen[num] = True
# optional spaces, number, required spaces, number, optional spaces
def reverse(instream, outstream):
    """Copy ``instream`` to ``outstream``, swapping two-number lines.

    A line made up of exactly two runs of digits separated by whitespace
    (with optional whitespace around them) is written out as
    "<second>\\t<first>" followed by a newline.  Every other line is
    copied through unchanged.

    instream  -- iterable yielding lines of text
    outstream -- object with a write() method that receives the result
    """
    # Every backslash is doubled: the original had a lone "\s" before the
    # "$", which is not a valid string escape and was inconsistent with the
    # rest of the pattern.
    cols = '^\\s*(\\d+)\\s+(\\d+)\\s*$'
    for line in instream:
        mo = re.match(cols, line)
        # If match, reverse numbers
        if mo:
            a, b = mo.group(1), mo.group(2)
            outstream.write('%s\t%s\n' % (b, a))
        # If no match, echo line (without adding extra newline at end)
        else:
            outstream.write(line)
if __name__ == '__main__':
    # Sample input: a comment, two-number lines (space- and tab-separated),
    # and lines with one or three numbers that must pass through untouched.
    fixture = '''\
# Leading comment followed by blank line
10 20
30\t40\t
50
60 70 80
\t90 100
'''
    # What reverse() should produce for the fixture above.
    expected = '''\
# Leading comment followed by blank line
20\t10
40\t30
50
60 70 80
100\t90
'''
    # cStringIO exists only on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    reverse(instream, outstream)
    assert outstream.getvalue() == expected
- Use re.compile(pattern) to get the compiled RE
- re module
- matcher.search(text) searches text for matches to the RE that was compiled to create matcher
def findAll(instream, outstream):
matcher = re.compile('\\b([A-Z][a-z]*)\\b(.*)')
for line in instream:
mo = matcher.search(line)
while mo:
print >> outstream, mo.group(1)
mo = matcher.search(mo.group(2))
if __name__ == '__main__':
    # Two lines of text containing six capitalized words.
    fixture = '''\
This has several "Title Case" words
on Each Line (Some in parentheses).
'''
    # The capitalized words in order of appearance, one per line.
    expected = '''\
This
Title
Case
Each
Line
Some
'''
    # cStringIO exists only on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    findAll(instream, outstream)
    assert outstream.getvalue() == expected
    # Show the input and the expected result side by side for the reader.
    print('INPUT')
    print(fixture)
    print('OUTPUT')
    print(expected)
import re
#- start:findAll
def findAll(instream, outstream):
    """Write every capitalized word found in ``instream``, one per line.

    A word here is one uppercase letter followed by zero or more lowercase
    letters, with a word break on each side.

    instream  -- iterable yielding lines of text
    outstream -- object with a write() method that receives the words
    """
    # Group 1 is the word; group 2 is the rest of the line after it.
    matcher = re.compile('\\b([A-Z][a-z]*)\\b(.*)')
    for line in instream:
        mo = matcher.search(line)
        while mo:
            outstream.write(mo.group(1) + '\n')
            # Search the remainder of the line for the next word.
            mo = matcher.search(mo.group(2))
#- end:findAll
# start:test
if __name__ == '__main__':
    # Two lines of text containing six capitalized words.
    fixture = '''\
This has several "Title Case" words
on Each Line (Some in parentheses).
'''
    # The capitalized words in order of appearance, one per line.
    expected = '''\
This
Title
Case
Each
Line
Some
'''
    # cStringIO exists only on Python 2; fall back to io.StringIO elsewhere.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO
    instream = StringIO(fixture)
    outstream = StringIO()
    findAll(instream, outstream)
    assert outstream.getvalue() == expected
    # Show the input and the expected result side by side for the reader.
    print('INPUT')
    print(fixture)
    print('OUTPUT')
    print(expected)
# end:test
re module

| Method | Purpose | Example | Result |
|---|---|---|---|
| split | Split a string on a pattern. | re.split('\\s*,\\s*', 'a, b ,c , d') | ['a', 'b', 'c', 'd'] |
| findall | Find all matches for a pattern. | re.findall('\\b[A-Z][a-z]*', 'Some words in Title Case.') | ['Some', 'Title', 'Case'] |
| sub | Replace matches with new text. | re.sub('\\d+', 'NUM', 'If 123 is 456') | "If NUM is NUM" |
java.util.regex package contains two classes:
- Pattern: a compiled regular expression
- Matcher: the result of a match
public static String matchMiddle(String data) {
String result = null;
Pattern p = Pattern.compile("a(b|c)d");
Matcher m = p.matcher(data);
if (m.matches()) {
result = m.group(1);
}
return result;
}
# Print every "Name: value" header line found in mail.txt.
# The open statement needs its terminating semicolon, and a failed open is
# reported instead of silently reading from an unopened handle.
open(MAIL, '<', 'mail.txt') or die "Cannot open mail.txt: $!";
while (<MAIL>) {
    # $name is everything before the first colon; $value is the rest of the
    # line after the colon and one optional space.
    if (($name, $value) = /^([^:]+): ?(.+)$/) {
        print "Message header $name is $value\n";
    }
}
close(MAIL);
- ⌈|⌋ for either/or
- ⌈ab|cd⌋ matches either "ab" or "cd"
- ⌈a(b|c)d⌋ matches either "abd" or "acd"
- ⌈pat{N}⌋ to match exactly N occurrences of a pattern
- ⌈pat{M,N}⌋ matches between M and N occurrences
- ⌈\d{2,3}⌋ matches "19" or "207", but not "3" or "4567"
- "456"
- ⌈^\d{2,3}⌋ won't
- Andrew Kuchling's Python Regular Expression HOWTO

Exercise 17.1:
By default, regular expression matches are
greedy: the first term in the RE
matches as much as it can, then the second part, and so on. As a
result, if you apply the RE ⌈X(.*)X(.*)⌋ to the string
"XaX and XbX", the first group will contain "aX and Xb",
and the second group will be empty.
It's also possible to make REs match
reluctantly, i.e., to have the
parts match as little as possible, rather than as much. Find out
how to do this, and then modify the RE in the previous paragraph
so that the first group winds up containing "a", and the
second group " and XbX".
| prev | Copyright © 2005, Python Software Foundation. See License for details. | next |