#!/usr/bin/env python
# Print the URL of the first article linked from page 1 of a Sina blog
# article list.  Written for Python 2: it relies on urllib.urlopen.
import contextlib
import sys
import urllib

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'


def first_article_url(page):
    """Return the first article URL found in ``page``, or None.

    A link is located by the ``<a title=`` marker, the next ``href=``
    after it, and the next ``.html`` after that.  None is returned as
    soon as any of the three markers is missing.
    """
    title = page.find('<a title=')
    if title == -1:
        return None
    href = page.find('href=', title)
    if href == -1:
        return None
    html = page.find('.html', href)
    if html == -1:
        return None
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote -- confirm against the page markup); +5 keeps '.html'.
    return page[href + 6:html + 5]


def print_first_article_url():
    """Fetch the list page and print its first article URL."""
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urllib.urlopen(LIST_URL)) as response:
        con = response.read()
    url = first_article_url(con)
    if url is None:
        # Report the miss instead of printing an empty slice.
        sys.stderr.write('no article link found in %s\n' % LIST_URL)
    else:
        print(url)


if __name__ == '__main__':
    print_first_article_url()
二、查看博文目录第一页所有文章的URL
A:
#!/usr/bin/env python
# Print the URL of every article linked from page 1 of a Sina blog
# article list (at most 40).  Written for Python 2: it relies on
# urllib.urlopen.
import contextlib
import urllib

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'


def article_urls(page, limit=40):
    """Return up to ``limit`` article URLs from ``page``, in page order.

    Each link is located by the ``<a title=`` marker, the next ``href=``
    after it, and the next ``.html`` after that.  Scanning stops at the
    first link for which any marker is missing.
    """
    found = []
    cursor = 0
    while len(found) < limit:
        title = page.find('<a title=', cursor)
        if title == -1:
            break
        href = page.find('href=', title)
        if href == -1:
            break
        html = page.find('.html', href)
        if html == -1:
            break
        # +6 skips 'href=' plus one more character (presumably the
        # opening quote -- confirm); +5 keeps the '.html' suffix.
        found.append(page[href + 6:html + 5])
        # Resume the search from the end of the link just consumed.
        cursor = html
    return found


def print_article_urls():
    """Fetch the list page and print each article URL on its own line."""
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urllib.urlopen(LIST_URL)) as response:
        con = response.read()
    # The URLs are kept in a real list and each one is printed exactly
    # once; the earlier version overwrote its list with a string and
    # emitted the first URL twice.
    for url in article_urls(con):
        print(url)


if __name__ == '__main__':
    print_article_urls()
或者B:
#!/usr/bin/env python
# Variant B: print the URL of every article linked from page 1 of a Sina
# blog article list (at most 50).  Written for Python 2: it relies on
# urllib.urlopen.
import contextlib
import urllib

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'


def iter_article_urls(page, limit=50):
    """Yield up to ``limit`` article URLs from ``page``, in page order.

    Each link is located by the ``<a title=`` marker, the next ``href=``
    after it, and the next ``.html`` after that.  Iteration ends at the
    first link for which any marker is missing.
    """
    cursor = 0
    for _ in range(limit):
        title = page.find('<a title=', cursor)
        if title == -1:
            return
        href = page.find('href=', title)
        if href == -1:
            return
        html = page.find('.html', href)
        if html == -1:
            return
        # Yield before advancing, so the first link is not skipped and
        # no empty slice is produced after the last one.
        # +6 skips 'href=' plus one more character (presumably the
        # opening quote -- confirm); +5 keeps the '.html' suffix.
        yield page[href + 6:html + 5]
        cursor = html


def print_article_urls_b():
    """Fetch the list page and print each article URL on its own line."""
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urllib.urlopen(LIST_URL)) as response:
        con = response.read()
    for url in iter_article_urls(con):
        print(url)


if __name__ == '__main__':
    print_article_urls_b()
#!/usr/bin/env python
# Download every article linked from list pages 1-7 of a Sina blog into
# /tmp/sina/, pausing 5 seconds between downloads and stopping after 350
# articles.  Written for Python 2: it relies on urllib.urlopen.
import contextlib
import os
import time
import urllib

LIST_URL_TEMPLATE = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html'
SAVE_DIR = r'/tmp/sina/'
LAST_PAGE = 7
MAX_ARTICLES = 350
DELAY_SECONDS = 5


def listed_article_urls(page, limit):
    """Return up to ``limit`` article URLs from ``page``, in page order.

    Each link is located by the ``<a title=`` marker, the next ``href=``
    after it, and the next ``.html`` after that.  Scanning stops at the
    first link for which any marker is missing.
    """
    found = []
    cursor = 0
    while len(found) < limit:
        title = page.find('<a title=', cursor)
        if title == -1:
            break
        href = page.find('href=', title)
        if href == -1:
            break
        html = page.find('.html', href)
        if html == -1:
            break
        # +6 skips 'href=' plus one more character (presumably the
        # opening quote -- confirm); +5 keeps the '.html' suffix.
        found.append(page[href + 6:html + 5])
        cursor = html
    return found


def article_filename(url):
    """Return the last 26 characters of ``url``, used as the file name.

    NOTE(review): presumably this is the ``blog_<16-char id>.html`` tail
    of an article URL -- confirm that every article URL has that shape.
    """
    return url[-26:]


def fetch(url):
    """Return the body served at ``url``, closing the connection afterwards."""
    with contextlib.closing(urllib.urlopen(url)) as response:
        return response.read()


def download_articles():
    """Walk the list pages, saving each linked article under SAVE_DIR."""
    # Create the target directory up front rather than failing on the
    # first write.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    link = 1
    for page in range(1, LAST_PAGE + 1):
        listing = fetch(LIST_URL_TEMPLATE % page)
        # The cap is shared by all pages, so only the unused remainder
        # is available to this page.
        remaining = MAX_ARTICLES - (link - 1)
        for url in listed_article_urls(listing, remaining):
            print('%d %s' % (link, url))
            content = fetch(url)
            # Binary mode stores the downloaded bytes untouched; the
            # with-block guarantees the file is flushed and closed.
            with open(SAVE_DIR + article_filename(url), 'wb') as out:
                out.write(content)
            # Pause between requests, as the original script did.
            time.sleep(DELAY_SECONDS)
            link += 1
    print('Download Over!')


if __name__ == '__main__':
    download_articles()