#coding:utf-8
import urllib2
import time
page = 1           # start downloading from the first index page
url = [''] * 350   # storage for the article urls
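# Phase 1: walk the 7 article-list pages. Each list page is assumed to hold
# up to 50 article links (hence the 350 slots in url and the i < 50 guard
# below); links are located by plain string searching, not an HTML parser.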
while page < 8:
    # open the article-list url and read its contents
    buf = urllib2.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
    i = 0
    title = buf.find(r'<a title=')
    href = buf.find(r'href=', title)
    html = buf.find(r'.html', href)
    while title != -1 and href != -1 and html != -1 and i < 50:
        # slice out a clean url: href+6 skips the 6 characters of 'href="',
        # html+5 keeps the 5 characters of '.html'
        url[(page - 1) * 50 + i] = buf[href + 6:html + 5]
        print url[(page - 1) * 50 + i]
        title = buf.find(r'<a title=', html)
        href = buf.find(r'href=', title)
        html = buf.find(r'.html', href)
        i = i + 1
    else:
        print page, "find end"
    page = page + 1
else:
    print 'all pages done'
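# Phase 2: fetch every collected url, pull out the article title (on Sina
# blog pages the 'SG_txta' marker appears just before the <h*> title tag,
# which is why the search below ends at '</h'), and save the raw page.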
j = 0
biaoti = [''] * 350   # storage for the article titles
while j < 350:        # download each url
    content = urllib2.urlopen(url[j]).read()
    # read the title: the text between the 'SG_txta' marker and the closing </h tag
    titname = content.find(r'SG_txta')
    end = content.find(r'</h', titname)
    biaoti[j] = content[titname + 9:end]
    print biaoti[j]
    # save the content, using the last 26 characters of the url as the file name and extension
    open(r'hanhan/' + url[j][-26:], 'w+').write(content)
    print 'downloading ', url[j]
    j = j + 1
    time.sleep(4)
else:
    print 'download finished'
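# Note: this is Python 2 code (urllib2, print statements), and the hanhan/
# directory must exist beforehand; open() does not create directories.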