handle_decl 处理<!开头的,比如<!DOCTYPE html PUBLIC “-//W3C//DTD HTML 4.01 Transitional//EN”
handle_pi 处理形如<?instruction>的东西
1. 基本解析,找到开始和结束标签
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print("Encountered the beginning of a %s tag" %(tag))
def handle_endtag(self, tag):
print ("Encountered the end of a %s tag" %(tag))
if __name__ == '__main__':
a = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">链接到163</a></body></html>'
m = MyHTMLParser()
#传入要分析的html模块
m.feed(a)
运行结果
Encountered the beginning of a html tag
Encountered the beginning of a head tag
Encountered the beginning of a title tag
Encountered the end of a title tag
Encountered the beginning of a body tag
Encountered the beginning of a a tag
Encountered the end of a a tag
Encountered the end of a body tag
Encountered the end of a html tag 2. 解析html的超链接和链接显示的内容
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.flag=None
def handle_starttag(self, tag, attrs):
# 这里重新定义了处理开始标签的函数
if tag == 'a':
# 判断标签<a>的属性
self.flag='a'
for name,value in attrs:
if name == 'href':
print("href:"+value)
def handle_data(self,data):
if self.flag == 'a':
print("data:"+data)
if __name__ == '__main__':
a = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">链接到163</a></body></html>'
my = MyHTMLParser()
my.feed(a)
运行结果
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><!--insert javaScript here!--><title>test</title><body><a href="http: //www.163.com">链接到163</a></body></html>