西大 posted on 2018-8-10 12:04:27

python-ovens

#coding:utf-8
import urllib2
  
import time
  
page = 1          # start from the first index page of the article list

url = ['']*350    # storage for the article urls
  
i = 0             # index into the url list

while page < 8:
    buf = urllib2.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()    # open the list page and read its contents

    title = buf.find(r'<a title=')
    href = buf.find(r'href=', title)
    html = buf.find(r'.html', href)

    while title != -1 and href != -1 and html != -1 and i < 350:
        url[i] = buf[href+6:html+5]    # slice out a well-formed article url
        print url[i]

        title = buf.find(r'<a title=', html)
        href = buf.find(r'href=', title)
        html = buf.find(r'.html', href)
        i = i + 1
    else:
        print page, "find end"
    page = page + 1
else:
    print 'all down'
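To see what the find/slice logic above actually pulls out, here is a minimal standalone check against a made-up fragment of the article-list HTML (the anchor tag and the blog_000... URL are hypothetical, only shaped like the real markup):

# hypothetical snippet of one entry in the article list
buf = '<a title="Example post" target="_blank" href="http://blog.sina.com.cn/s/blog_0000000000000000.html">Example post</a>'
title = buf.find(r'<a title=')
href = buf.find(r'href=', title)
html = buf.find(r'.html', href)
print(buf[href+6:html+5])    # -> http://blog.sina.com.cn/s/blog_0000000000000000.html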
  
j = 0

while j < 350 and url[j] != '':    # download each article url, stop at the first empty slot
    content = urllib2.urlopen(url[j]).read()

    titname = content.find(r'SG_txta')                    # locate the title element
    end = content.find(r'</h', titname)
    biaoti = content[content.find('>', titname)+1:end]    # slice out the title text
    print biaoti

    open(r'hanhan/'+url[j][-26:], 'w+').write(content)    # save the page, named after the last 26 characters of the url (the hanhan/ directory must already exist)
    print 'downing ', url[j]
    j = j + 1
    time.sleep(4)    # pause between requests
else:
    print 'down finished'
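
One caveat: urllib2 only exists in Python 2. On Python 3 the same fetch can be sketched with urllib.request from the standard library; the decode below assumes the page is UTF-8, which may need adjusting:

from urllib.request import urlopen

page = 1
buf = urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
text = buf.decode('utf-8', errors='ignore')   # assumed encoding; try 'gbk' if the page is not UTF-8
print(text[:200])                             # first 200 characters of the page source

The rest of the script ports over once print becomes a function call and the fetched bytes are written with open(..., 'wb').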