Python 正则表达式
In [17]:
from nltk.book import *
import nltk
import re
# Keep only the entries of the 'en' word list for which str.islower() is true.
wordlist = list(filter(str.islower, nltk.corpus.words.words('en')))
In [10]:
# "$" anchors the pattern at the end of the string: words ending in "ed".
regexp = r'ed$'
# Show only the first ten matches.
list(filter(re.compile(regexp).search, wordlist))[:10]
Out[10]:
或者匹配以 aa 开头的单词:
In [11]:
# "^" anchors the pattern at the start of the string: words beginning with "aa".
regexp = r'^aa'
list(filter(re.compile(regexp).search, wordlist))
Out[11]:
In [12]:
# "." matches any single character, so this selects three-character
# words of the form b?t.
regexp = r'^b.t$'
matcher = re.compile(regexp).search
[word for word in wordlist if matcher(word)]
Out[12]:
In [19]:
# "-?" makes the hyphen optional, so both "email" and "e-mail" are counted.
regexp = r'e-?mail'
# Count the tokens of text5 that contain a match.
len(list(filter(re.compile(regexp).search, nltk.book.text5)))
Out[19]:
In [39]:
# Sorted list of the distinct tokens in the nps_chat corpus.
chatWords = sorted(set(nltk.corpus.nps_chat.words()))
# "+" means one or more repetitions: tokens built only from "h" and "a".
regexp = r'^[ha]+$'
list(filter(re.compile(regexp).search, chatWords))[:10]
Out[39]:
In [31]:
# Each letter may repeat one or more times, e.g. "mine", "miiine", "mmminee".
regexp1 = r'^m+i+n+e+$'
list(filter(re.compile(regexp1).search, chatWords))
Out[31]:
In [32]:
# Without "+", only the exact token "mine" matches.
regexp2 = r'^mine$'
list(filter(re.compile(regexp2).search, chatWords))
Out[32]:
`[^...]`:方括号内开头的 ^ 表示取反,匹配所有不在括号中的字符
比如匹配所有不含元音字母的单词:
In [38]:
# A leading "^" inside [...] negates the class: every character of the
# word must be something other than a vowel (either case).
regexp2 = r'^[^aeiouAEIOU]+$'
list(filter(re.compile(regexp2).search, wordlist))[:10]
Out[38]:
In [41]:
# Sorted list of the distinct tokens in nltk's treebank corpus.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Decimal numbers: digits, a literal ".", digits. The dot must be escaped;
# an unescaped "." matches any character, so plain integers such as "1980"
# and tokens like "1,000" would match as well.
# NOTE(review): the slice starts at index 1 (skipping the first match),
# unlike the [0:10] used in the other cells — confirm this is intended.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][1:10]
Out[41]:
In [42]:
# Tokens made of capital letters followed by a literal "$" (the final "$" is
# the end-of-string anchor). A raw string is required here: in a normal string
# literal "\$" is an invalid escape sequence and triggers a warning.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
Out[42]:
In [43]:
# "{4}" means exactly four repetitions: tokens that start with four digits
# (no end anchor, so longer tokens match too). First ten matches only.
list(filter(re.compile(r'^[0-9]{4}').search, wsj))[:10]
Out[43]:
In [44]:
# Tokens starting with digits, a hyphen, then three to five lowercase letters
# ("{3,5}" bounds the repetition count). First ten matches only.
list(filter(re.compile(r'^[0-9]+-[a-z]{3,5}').search, wsj))[:10]
Out[44]:
In [45]:
# Tokens containing: five or more lowercase letters, "-", two or three
# lowercase letters, "-", and at least one more lowercase letter.
# "{5,}" means "at least five". First ten matches only.
list(filter(re.compile(r'[a-z]{5,}-[a-z]{2,3}-[a-z]').search, wsj))[:10]
Out[45]:
In [46]:
# "|" is alternation inside the group: tokens ending in "ed" or "ing".
# First ten matches only.
list(filter(re.compile(r'(ed|ing)$').search, wsj))[:10]
Out[46]: