python正则模块 re
正则:描述匹配的规则正则就是用一些具有特殊含义的符号组合到一起(正则表达式)来描述字符或者字符串的方法。或者说:正则就是用来描述一类事物的规则import re#####方法################re.findall('匹配规则','匹配对象'):匹配返回所有满足匹配条件的结果,放在列表里;re下的最常用方法print(re.findall('c','chencs sfd d c dsfc'))#===>['c','c', 'c', 'c']
#####规则#####正则表达式######################匹配字符####\w匹配字母数字下划线print(re.findall("\w","qwa123_\n\t\r&^%@!#$%^&"))#===>['q','w','a','1','2','3','_']#\W匹配非字母数字下划线print(re.findall("\W","qwa123_\n\t\r&^%@!#$%^&"))#====>['\n','\t','\r','&','^','%','@','!','#','$','%','^','&']#\s匹配任意空白字符(\n\t\r\f)print(re.findall("\s","qwa123_\n\t\r&^%@!#$%^&"))#====>['\n','\t','\r']#\S匹配任意非空白字符print(re.findall("\S","qwa123_\n\t\r&^%@!#$%^&"))#===>['q','w','a','1','2','3','_','&','^','%','@','!','#','$','%','^','&']#\d匹配任意数字print(re.findall("\d","qwa123_\n\t\r&^%@!#$%^&"))#===>['1','2','3']#\D匹配任意非数字print(re.findall("\D","qwa123_\n\t\r&^%@!#$%^&"))#===>['q','w','a','_','\n','\t','\r','&','^','%','@','!','#','$','%','^','&']#\A匹配以指定字符串开头=====^print(re.findall('\Achen','mynameischen'))print(re.findall('\Achen','chenismyname'))print(re.findall('^chen','chenismyname'))#===>[]#===>['chen']#===>['chen']
#\Z匹配以指定字符结尾 ====$print(re.findall('chen\Z','chenismyname'))print(re.findall('chen\Z','mynameischen'))print(re.findall('chen$','mynameischen'))#==>[]#==>['chen']#==>['chen']
#^chen$以chen开头,并且以chen结尾,即只匹配整个字符串恰好是chen的情况print(re.findall('^chen$','chen'))#===>['chen']
#####重复匹配#####指定字符的匹配次数################## .?* .* .*? + {m,n} |()
.默认表示除了换行符之外的任意一个字符,在findall中使用re.DOTALL可以让.匹配包括\n的任意字符
print(re.findall('a.c','abca1caAcaaaaaca\nc'))#====>['abc','a1c','aAc','aac']print(re.findall('a.c','abca1caAcaaaaaca\nc',re.DOTALL))#====>['abc','a1c','aAc','aac','a\nc']
?:代表?左边的第一个字符重复0次或1次print(re.findall('ab?','aababbabbbabbbbabbbb'))#===>['a','ab','ab','ab','ab','ab']
*:代表*左边的第一个字符重复0次或无穷次print(re.findall('ab*','aababbabbbabbbbabbbba1bbbbbbb'))#===>['a','ab','abb','abbb','abbbb','abbbb','a']
+:代表+左边的第一个字符重复1次或无穷次print(re.findall('ab+','aababbabbbabbbbabbbba1bbbbbbb'))#==>['ab','abb','abbb','abbbb','abbbb']
{m,n}:代表左边的第一个字符重复m次到n次#{m}:取m次#{0,}:0次到无穷次#{0,1}:0次到1次#{1,}:1次到无穷次#{1,3}:1次到3次print(re.findall('ab{1,3}','aababbaababbb'))#===>['ab','abb','ab','abbb']
.*:匹配任意长度,任意的字符,贪婪匹配,尽可能长的匹配(引号内的是一个字符串,空格也是字符串)print(re.findall('a.*c','acasfca123c1c44c25c6casdfgdghgfdf'))#====>['acasfca123c1c44c25c6c']
.*?非贪婪匹配print(re.findall('a.*?c','acasfca123c1c44c25c6casdfgdghgfdf'))#==>['ac','asfc','a123c']尽可能短的匹配
():分组。例如my_(Joy)_chen以my_Joy_chen进行匹配,匹配成功后只保留括号内的内容;(?:...)中的?:用于取消分组效果,返回整个匹配到的内容
print(re.findall('(alex)_sb','alex_sbasdfsafdafdaalex_sb'))#===>['alex','alex']
#<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>print(re.findall('href="(.*?)"','<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>'))#===>['http://www.cnblogs.com/']匹配href="xxxxxxx",只显示xxxxx;虽然.*?是尽可能短的匹配,但必须匹配""
print(re.findall('href="(.*)"','<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>'))#===>['http://www.cnblogs.com/']匹配href="xxxxxxx",只显示xxxxx;虽然.*是尽可能长的匹配,但必须匹配""print(re.findall('href="(.*?)','<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>'))#====>['']匹配href="xxxxxxx只显示xxxxx,尽可能短的匹配,所以匹配空print(re.findall('href="(.*)','<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>'))#===>['http://www.cnblogs.com/">博客园</a></li>']匹配href="xxxxxxx只显示xxxxx,尽可能长的匹配
print(re.findall('href="(?:.*?)"','<li><aid="blog_nav_sitehome"class="menu"href="http://www.cnblogs.com/">博客园</a></li>'))#==>['href="http://www.cnblogs.com/"']
[]:匹配一个指定范围内的字符(这一个字符来自于括号内定义的)
print(re.findall('a[0-9][0-9]c','a1ca+ca2ca9ca11ca-caccaAca98c'))#==>['a11c','a98c']匹配:a+两个数字+c
print(re.findall('([a-z]+)_user','enon_useralex_u3rwsdfs_userjiec_user'))#==>['enon','rwsdfs','jiec']匹配到了enon_user、rwsdfs_user、jiec_user,只显示分组内的enon、rwsdfs、jiec
当-需要被当作普通符号匹配时,只能放到[]的最左边或最右边print(re.findall('a[-+*]c','a1ca+ca2ca9ca*ca11ca-caccaAc'))#===>['a+c','a*c','a-c']
print(re.findall('a[a-zA-Z]c','a1ca+ca2ca9ca*ca11ca-caccaAc'))#===>['acc','aAc']
#[]内的^代表取反的意思print(re.findall('a[^a-zA-Z]c','a c a1c a+c a2c a9c a*c a11c a-c acc aAc'))#==>['a c','a1c','a+c','a2c','a9c','a*c','a-c']
print(re.findall('a[^0-9]c','a c a1c a+c a2c a9c a*c a11c a-c acc aAc'))#==>['a c','a+c','a*c','a-c','acc','aAc']
print(re.findall('([a-z]+)_sb','egon alex_sb123123wxxxxxxxxxxxxx_sb,lxx_sb'))#===>['alex','wxxxxxxxxxxxxx','lxx']
#|:或者print(re.findall('compan(ies|y)','Toomanycompanieshavegonebankrupt,andthenextoneismycompany'))#===>['ies','y']
#(?:):代表取匹配成功的所有内容,而不仅仅只是括号内的内容print(re.findall('compan(?:ies|y)','Toomanycompanieshavegonebankrupt,andthenextoneismycompany'))#==>['companies','company']#print(re.findall('alex|sb','alexsbsadfsadfasdfegonalexsbegon'))
#re模块的其他方法:
#re.findall("正则表达式","字符串")返回所有满足匹配条件的结果,放在列表里#re.findall("正则表达式","字符串",re.DOTALL)print(re.findall('alex|sb','123123alexsbsadfsadfasdfegonalexsbegon'))#==>['alex','sb','alex','sb']
#re.search("正则表达式","字符串")直到找到第一个匹配然后返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串,如果字符串没有匹配,则返回None。print(re.search('alex|sb','123213alexsbsadfsadfasdfegonalexsbegon').group())#====>alexprint(re.search('^alex','123213alexsbsadfsadfasdfegonalexsbegon'))#===>None^匹配开头print(re.search('^alex','alexsbsadfsadfasdfegonalexsbegon').group())#===>alex
#re.match("正则表达式","字符串")#从字符串开始处进行匹配,完全可以用search+^代替match返回值同search,print(re.match('alex','alexsbsadfsadfasdfegonalexsbegon').group())#====>alexprint(re.match('alex','123213alexsbsadfsadfasdfegonalexsbegon'))#====>None#字符串开头匹配不上,则不往后匹配
#re.split("分隔符","字符串")按分隔符分割字符串,放入列表中;特殊符号需要转义后才能当作分隔符info='a:b:c:d'print(info.split(':'))#==>['a','b','c','d']print(re.split(':',info))#==>['a','b','c','d']
#re.split("[多个分隔符]","字符串")特殊符号需要转义后才能当作分隔符,依次按分隔符分字符串,放入列表中info=r'get:a.txt\3333/rwx'print(re.split('[:\\\/.]',info))#==>['get','a','txt','3333','rwx']
#替换re.sub("需要替换的字符","替换后的字符","字符串",替换次数)print('egonisbeutifullegon'.replace('egon','EGON'))#==>EGONisbeutifullEGONprint('egonisbeutifullegon'.replace('egon','EGON',1))#==>EGONisbeutifullegonprint(re.sub('egon','EGON',"egonisbeutifullegon"))#==>EGONisbeutifullEGONprint(re.sub('egon','EGON',"egonisbeutifullegon",1))#==>EGONisbeutifullegon
#替换与分组\1对应着第一个分组,\2对应着第二个分组,\3对应着第三个分组,\4对应着第四个分组\5对应着第五个分组print(re.sub('(Joy)(.*?)(good)',r'\3\2\1',r'Joyisgood'))#===>goodisJoy
print(re.sub('(.*?)(egon)(.*?)(egon)(.*?)',r'\1\2\3EGON\5','123egonisbeutifullegon123'))#(123)--->\1#(egon)--->\2#(isbeutifull)-->\3#(egon)--->\4#(123)--->\5#====>123egonisbeutifullEGON123
print(re.sub('([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)',r'\5\2\3\4\1',r'lqzzzz123+ is SB'))#(lqzzzz)(123+ )(is)( )(SB)#===>SB123+ is lqzzzz
#re.subn("需要替换的字符","替换后的字符","字符串")返回替换结果和总共替换的个数print(re.subn('a','A','alexmakelove'))#===>('AlexmAkelove',2)
#re.compile("正则表达式")指定匹配规则,后续使用中可以在多个方法中都使用这个正则表达式,不用重复写pattern=re.compile('alex')print(pattern.findall('alexisalexalex'))#===>['alex','alex','alex']print(pattern.findall('alexasdfsadfsadfasdfasdfasfdisalexalex'))#===>['alex','alex','alex']
页:
[1]