gbless 发表于 2017-4-27 09:51:54

python网页列表爬虫

  #-*- encoding: utf-8 -*-
  import htmllib,urllib,formatter,string
  class GetLinks(htmllib.HTMLParser,str):
  def __init__(self,str): 
  self.str=str
  self.links = {}
  f = formatter.NullFormatter()
  htmllib.HTMLParser.__init__(self, f)
  def anchor_bgn(self, href, name, type):
  self.save_bgn()
  self.link = href
  def anchor_end(self): 
  text = string.strip(self.save_end()) 
  if text.find(self.str)!=-1 :
  if self.link and text:
  self.links = self.link
  def findall(str1,strfront,i,strlat):
      """Fetch one numbered page and print the links whose text contains *str1*.

      The page URL is built as ``strfront + str(i) + strlat``.  The page is
      parsed with ``GetLinks(str1)`` and every key/value pair of the parser's
      ``links`` mapping is printed as ``key => value``.

      Parameters:
          str1: substring an anchor's text must contain to be reported.
          strfront: URL prefix placed before the page number.
          i: page number inserted into the URL.
          strlat: URL suffix placed after the page number.
      """
      fp = urllib.urlopen(strfront+str(i)+strlat)
      try:
          data = fp.read()
      finally:
          # Release the connection even when reading the response fails.
          fp.close()
      linkdemo = GetLinks(str1)
      linkdemo.feed(data)
      linkdemo.close()
      for text, href in linkdemo.links.items():
          # Same output as ``print text, "=>", href``, written so the line
          # also parses when print is a function.
          print("%s => %s" % (text, href))
  strfront='http://readthedocs.org/docs/learn-python-the-hard-way-zh_cn-translation/en/latest/ex'
  strlat='.html'
  search='ex'
  # Crawl pages ex1.html through ex19.html and print, for each page, the
  # links reported by findall for the search string.
  for i in range(1, 20):
      findall(search,strfront,i,strlat)
页: [1]
查看完整版本: python网页列表爬虫