Python 网页列表爬虫

gbless 发表于 2017-4-27 09:51:54

　　#-*- encoding: utf-8 -*-
　　import htmllib,urllib,formatter,string
　　class GetLinks(htmllib.HTMLParser,str):
　　def __init__(self,str):
　　self.str=str
　　self.links = {}
　　f = formatter.NullFormatter()
　　htmllib.HTMLParser.__init__(self, f)
　　def anchor_bgn(self, href, name, type):
　　self.save_bgn()
　　self.link = href
　　def anchor_end(self):
　　text = string.strip(self.save_end())
　　if text.find(self.str)!=-1 :
　　if self.link and text:
　　self.links = self.link
　　def findall(str1,strfront,i,strlat):
　　fp = urllib.urlopen(strfront+str(i)+strlat)
　　data = fp.read()
　　fp.close()
　　linkdemo = GetLinks(str1)
　　linkdemo.feed(data)
　　linkdemo.close()
　　for href, link in linkdemo.links.items():
　　print href, "=>", link
# Driver: call findall for page numbers 1 through 19, building each URL as
# strfront + <number> + strlat and passing 'ex' as the search string.
search = 'ex'
strlat = '.html'
strfront = 'http://readthedocs.org/docs/learn-python-the-hard-way-zh_cn-translation/en/latest/ex'

i = 1
while i < 20:
    findall(search, strfront, i, strlat)
    i += 1

页: [1]

运维网's Archiver

Python 网页列表爬虫