Python基础语法 - 正则表达式

ibaobei · 发表于 2018-8-8 13:48:03

概述
　　正则表达式是一个特殊的字符序列，它常常用于检查是否与某种模式匹配。第八节课主要从以下几方面介绍了Python正则表达式的用法。
　　（1）re模块的使用
　　（2）字符匹配、数量表示、边界表示
　　（3）正则表达式的高级用法
　　（4）贪婪与非贪婪模式

re模块：Regular Expression
　　（一）match（正则表达式，待匹配字符串）
　　（1）采用从左向右逐项匹配，从起始位置起。
　　（2）用于正则匹配检查，如果“待匹配字符串”能够匹配“正则表达式”，则match方法返回匹配对象，否则返回None
　　

import re #导入re模块　　

　　
#从起始位置开始匹配
　　
rs = re.match("51cto", "51cto.com")
　　
print(rs.group())
　　

　　
#没有从起始位置开始匹配。没有字符被匹配上。
　　
rs = re.match("com", "51cto.com")
　　
print(rs)
　　

　　

　　运行结果：
　　

51cto　　
None
　　

　　（二）group(num = 0)方法：
　　（1）默认用来返回字符串的匹配部分
　　（2）不带参数（或组号为 0）时返回整个表达式匹配到的字符串；group() 也可以一次传入多个组号，在这种情况下它将返回一个包含那些组所对应值的元组。
　　

#(163|outlook|qq)是第一组，索引为1；(com|cn)是第二组，索引为2　　
rs = re.match("\w{3,10}@(163|outlook|qq)\.(com|cn)$","hello_124@163.com")
　　
print(rs.group()) #默认返回字符串的匹配部分
　　
print(rs.group(1))
　　
print(rs.group(2))
　　
print(rs.groups()) #返回一个包含所有小组字符串的元组，从 1 到所含的小组号
　　

　　运行结果：
　　

hello_124@163.com　　
163
　　
com
　　
('163', 'com')
　　

字符匹配、数量表示、边界表示
　　（一）单字符匹配

　　（1）. 匹配除"\n"之外的任意单个字符
　　

import re　　

　　
rs = re.match(".", "a")
　　
print(rs.group())
　　

　　
rs = re.match(".", "1")
　　
print(rs.group())
　　

　　
rs = re.match("...", "abc") #多个字符
　　
print(rs.group())
　　

　　
rs = re.match(".", "\n")
　　
print(rs)
　　

　　运行结果：
　　

a　　
1
　　
abc
　　
None
　　

　　（2）\s：匹配任意空白字符，如空格，制表符“\t”，换行符“\n”
　　

import re　　
rs = re.match("\s", "\t")
　　
print(rs)
　　
rs = re.match("\s", "\n")
　　
print(rs)
　　
rs = re.match("\s", " ")
　　
print(rs)
　　

　　（3）\S：匹配任意非空白字符；和\s模式相反
　　

rs = re.match("\S", "\t")　　
print(rs)
　　
rs = re.match("\S", "abc") #匹配单个字符，从起始位置
　　
print(rs.group())
　　

　　运行结果：
　　

None　　
a
　　

　　（4）[ ]匹配[ ]中列举的字符
　　

rs = re.match("[Hh]", "hello")　　
print(rs.group())
　　
rs = re.match("[Hh]", "Hello")
　　
print(rs.group())
　　

　　
rs = re.match("[0123456789]", "32")
　　
print(rs.group())
　　
rs = re.match("[0-9]", "3")
　　
print(rs.group())
　　

　　运行返回结果：
　　

h　　
H
　　
3
　　
3
　　

　　其他单字符匹配用法不一一列举。
　　（二）数量表示

　　（1）* 出现次数 n >= 0
　　

import re　　

　　
rs = re.match("1\d*", "1234567")    #匹配规则：起始是1，接着数字[0-9]出现任意次
　　
print(rs.group())
　　

　　
rs = re.match("1\d*", "1234567abc")
　　
print(rs.group())
　　

　　运行结果：
　　

1234567　　
1234567
　　

　　（2）+ 出现次数n >=1
　　

rs = re.match("\d+", "abc")       #\d ：起始是数字[0-9]，数字出现至少一次　　
print(rs)
　　
rs = re.match("\d+", "1abc")
　　
print(rs.group())
　　

　　运行结果：
　　

None　　
1
　　

　　（3）{m}， {m,} 和 {m, n}
　　

#{m} ：一个字符出现m次　　
rs = re.match("\d{3}", "123abc")
　　
print(rs.group())
　　

　　
#{m,} ：一个字符至少出现m次
　　
rs = re.match("\d{1,}", "123467abc")    #等价于+至少一次
　　
print(rs.group())
　　

　　
#{m,n} ：一个字符出现m到n次
　　
rs = re.match("\d{0,1}", "1abc")          #等价于?至多一次
　　
print(rs.group())
　　

　　运行结果：
　　

123　　
123467
　　
1
　　

　　（4）\转义字符
　　在Python里，\是转义字符，在很多其他语言里也是如此。如果希望反斜杠\后面的特殊字符（比如换行符\n中的n）或正则表达式中的元字符（比如单字符匹配符.）按原样输出或按普通字符匹配，就需要在它前面再加一个反斜杠进行转义。
　　

str1 = "hello\\world"　　
print(str1)             #仅仅打印了一个反斜杠
　　

　　#打印两个反斜杠，如果是三个反斜杠，也是打印出两个反斜杠
　　
str2 = "hello\\\\world"
　　
print(str2)
　　

　　
str3 = r"hello\\world"    #原生字符：r"str"
　　
print(str3)
　　

　　
#在正则表达式里，如果要匹配字符串中的反斜杠：用普通（非原生）字符串书写模式时，待匹配字符串中的一个反斜杠需要四个反斜杠来匹配；用原生字符串 r"..." 书写模式时，只需要两个反斜杠。
　　
rs = re.match("\w{5}\\\\\\\\\w{5}", str3)
　　
print(rs.group())
　　

　　
rs = re.match(r"\w{5}\\\\\w{5}",str3)
　　
print(rs.group())
　　

　　运行结果：
　　

hello\world　　
hello\\world
　　
hello\\world
　　
hello\\world
　　
hello\\world
　　

　　（三）边界表示
　　（1）字符串与单词边界：$结尾
　　

#字符.没有被认为是一般字符　　
rs = re.match("\w{3,10}@163.com","hello_124@163mcom")
　　
print(rs.group())
　　

　　
#转义字符对符号.起作用
　　
rs = re.match("\w{3,10}@163\.com$","hello_124@163.com")
　　
print(rs.group())
　　

　　运行结果：
　　

hello_124@163mcom　　
hello_124@163.com
　　

　　注意：第一个邮箱匹配实际是我们不期望的。但是它仍然被匹配成功，是因为字符. 被当成单字符匹配了。所以我们需要加上转义字符，让. 被当成正常字符。
　　（2）匹配分组：()分组
　　

rs = re.match("\w{3,10}@(163|outlook|qq)\.com$","hello_124@163.com")　　
print(rs.group())
　　

　　输出结果：
　　

hello_124@163.com　　

　　分组的名称可由自己指定，比如用 (?P<g1>...) 定义名为 g1 的分组，之后用 (?P=g1) 引用它；未命名的分组则用 \1、\2 这样的数字索引引用。
　　

html_str = "<head><title>python</title></head>"　　
rs = re.match(r"<.+><.+>.+</.+></.+>",html_str)
　　
print(rs.group())
　　
html_str2 = "<head><title>python</head></title>"
　　
rs = re.match(r"<.+><.+>.+</.+></.+>",html_str2) #wrong to match
　　
print(rs.group())
　　
rs = re.match(r"<(.+)><(.+)>.+</\2></\1>",html_str) #\2 and \1 is an index
　　
print(rs.group())
　　
rs = re.match(r"<(?P<g1>.+)><(?P<g2>.+)>.+</(?P=g2)></(?P=g1)>",html_str)
　　
print(rs.group())
　　

　　运行结果：
　　

<head><title>python</title></head>　　
<head><title>python</head></title>
　　
<head><title>python</title></head>
　　
<head><title>python</title></head>
　　

正则表达式的高级用法：
　　（1）search()：从左到右在字符串的任意位置搜索，返回第一个与给定正则表达式匹配的子串所对应的匹配对象；没有匹配时返回None
　　

#search()　　
rs = re.search("car","haha car carbal abcar carbal")
　　
print(rs.group())
　　

　　输出结果:
　　

car　　

　　（2）findall()：在字符串中查找所有匹配成功的组，返回匹配成功的结果列表。
　　

rs = re.findall("car","haha car carbal abcar carbal")　　
print(rs)
　　

　　
mail_str = "zhangsan:helloworld@163.com,li:123456@qq.cn"
# The pattern contains capture groups, so findall returns one tuple per match:
# (whole address, provider, top-level domain).
# The result is not named `list`, because that would shadow the builtin type.
mail_list = re.findall(r"(\w{3,20}@(163|qq)\.(com|cn))", mail_str)
print(mail_list)
　　

　　输出结果：
　　

['car', 'car', 'car', 'car']　　
[('helloworld@163.com', '163', 'com'), ('123456@qq.cn', 'qq', 'cn')]
　　

　　（3）finditer()：在字符串中查找所有正则表达式匹配成功的字符串，返回iterator迭代器。
　　

mail_str = "zhangsan:helloworld@163.com,li:123456@qq.cn"
itor = re.finditer(r"\w{3,20}@(163|qq)\.(com|cn)", mail_str)
# finditer returns an iterator of match objects; group() with no argument is
# the full matched address. The loop body is indented with ASCII spaces,
# because full-width spaces are not valid Python indentation.
for it in itor:
    print(it.group())
　　

　　输出结果：
　　

helloworld@163.com　　
123456@qq.cn
　　

　　（4）sub()方法：将匹配到的数据使用新的数据替换
　　

# The subject string is not named `str`, because that would shadow the builtin type.
source_text = "java python c cpp java"
# sub() replaces every non-overlapping occurrence of the pattern.
rs = re.sub(r"java", "python", source_text)
print(rs)
　　

　　输出结果：
　　

python python c cpp python　　

贪婪与非贪婪模式
　　（1）贪婪模式：尽可能的匹配更多的字符
　　（2）非贪婪模式：与贪婪模式相反，尽可能少地匹配字符；在数量限定符（*、+、?、{m,n}）后面加上 ? 即可启用
　　

rs = re.findall(r"hello\d*","hello12345")　　
print(rs)
　　
rs = re.findall(r"hello\d+","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d?","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d{2,}","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d{1,3}","hello12345")
　　
print(rs)
　　
print("----------------------------------")
　　
rs = re.findall(r"hello\d*?","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d+?","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d??","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d{2,}?","hello12345")
　　
print(rs)
　　
rs = re.findall(r"hello\d{1,3}?","hello12345")
　　
print(rs)
　　

　　运行结果：
　　

['hello12345']　　
['hello12345']
　　
['hello1']
　　
['hello12345']
　　
['hello123']
　　
----------------------------------
　　
['hello']
　　
['hello1']
　　
['hello']
　　
['hello12']
　　
['hello1']
　　

作业题
　　奶茶馆价格结算系统优化：
　　1、使用正则表达式判断顾客输入的手机号是否符合手机号设置规则：
　　1）以数字 1 开头
　　2）第二位为 3578 中的任意数字
　　3）其余 9 位为 0-9 任意数字
　　2、输出手机号运营商，判断规则：
　　 移动运营商：手机号前三位为 134、135、136、137、138、139
　　 联通运营商：手机号前三位为 130、131、132、155、156、176
　　 电信运营商：手机号前三位为 133、153、173、177、180、181
　　

def OutputPhoneProvider(str):
    """Print the carrier name for a phone number based on its leading digits.

    Args:
        str: The phone number as a string. The parameter name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        None. Prints "China Mobile", "China Unicom" or "ChinaTelecom" when the
        first three digits are in the matching prefix table, and "others" when
        they are in none of them.
    """
    # Prefix alternation -> carrier label; the tables are tried in this order.
    cell_phone_info = {
        "(134|135|136|137|138|139)": "China Mobile",
        "(130|131|132|155|156|176)": "China Unicom",
        "(133|153|173|177|180|181)": "ChinaTelecom",
    }
    for pattern, provider in cell_phone_info.items():
        # re.match anchors at the start, so only the leading digits are tested.
        if re.match(pattern, str) is not None:
            print(provider)
            return
    # No prefix table matched: say so instead of printing nothing.
    print("others")
　　

　　
# Validate the number before looking up its carrier: it must start with 1,
# have 3, 5, 7 or 8 as its second digit, and end after nine more digits.
phoneNum = input("Please input the cell phone number:")
rs = re.match(r"1[3578]\d{9}$", phoneNum)
if rs is not None:
    OutputPhoneProvider(phoneNum)
else:
    # The input does not have the shape of a valid mobile number.
    print("others")
　　

　　3、使用正则表达式判断输入奶茶编号，如果不在 1-5 范围内，输出： Woops!我们只售卖以上五种奶茶哦！新口味敬请期待！
　　

import re　　
teaNum = input("Please input the tea number:")

# re.match returns None when the input is not exactly one digit from 1 to 5.
# Test the match object itself: calling .group() on None raises
# AttributeError, and on a successful match .group() is never None.
rs = re.match(r"[1-5]$", teaNum)
if rs is None:
    print("Woops! Please re-input the tea number!")

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] Python基础语法 - 正则表达式

浏览过的版块

扫码加入运维网微信交流群