Python 简单爬虫功能实现
当 Google 创始人用 Python 写下他们第一个简陋的爬虫,运行在同样简陋的服务器上的时候,很少有人能够想象,在接下来的数十年间,他们是怎样地颠覆了互联网乃至于人类的世界。有网络的地方就有爬虫,爬虫的英文名称是 spider。它是用来抓取网站数据的程序。比如:我们通过一段程序,定期去抓取类似百度糯米、大众点评上的数据,将这些信息存储到数据库里,然后加上展示页面,一个团购导航站就问世了。毫无疑问,爬虫是很多网站的初期数据来源。
一、第一个爬虫功能的实现
——查看博文目录第一篇文章的URL
首先需要引入 urllib 模块,使用字符串的 find 方法依次定位链接标记的位置,再对页面内容做字符串切片处理,就得到了需要的 URL。
1
2
3
4
5
6
7
8
9
10
#!/usr/bin/env python
import urllib
url = ['']*40
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con
print url
二、查看博文目录第一页所有文章的URL
A:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#!/usr/bin/env python
import urllib
url = ['']*40
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con
print url
while title != -1 and href != -1 and html != -1 and i < 40:
url = con
print url
title = con.find(r'<a title=',html)
href = con.find(r'href=',title)
html = con.find(r'.html',href)
i = i +1
或者B:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!/usr/bin/env python
import urllib
i = 0
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con
while title != -1 and href != -1 and html != -1 and i < 50:
title = con.find(r'<a title=',html)
href = con.find(r'href=',title)
html = con.find(r'.html',href)
url = con
print url
i = i + 1
三、下载博文目录第一页所有的文章
A:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/env python
import urllib
i = 0
url = ['']*40
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()
target = con.find(r'<a target="_blank')
base = con.find(r'href=',target)
end = con.find('>',base)
url = 'http://www.zhihu.com' + con
print url
while i < 20:
url = 'http://www.zhihu.com' + con
print url
target = con.find(r'<a target="_blank',end)
base = con.find(r'href=',target)
end = con.find('>',base)
i = i + 1
while j < 30:
content = urllib.urlopen(url).read()
print url
open(r'zhihu/'+url,'w+').write(content)
print 'downloading',
j = j + 1
time.sleep(15)
或者B:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/usr/bin/env python
import time
import urllib
i = 0
j = 0
url = ['']*30
name = ['']*30
con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()
target = con.find(r'<a target="_blank')
base = con.find(r'href=',target)
end = con.find('>',base)
url = 'http://www.zhihu.com' + con
while target != -1 and base != -1 and end != -1 and i < 30:
url = 'http://www.zhihu.com' + con
name =con
target = con.find(r'<a target="_blank',end)
base = con.find(r'href=',target)
end = con.find('>',base)
content = urllib.urlopen(url).read()
open(r'zhihu/'+name+'.html','w+').write(content)
print 'downloading',name
time.sleep(5)
i = i + 1
四、下载所有文章
A:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import time
import urllib
page = 1
url = ['']*350
i = 0
link = 1
while page <= 7:
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
while title != -1 and href != -1 and html != -1 and i < 350:
url = con
print link,url
title = con.find(r'<a title=',html)
href = con.find(r'href=',title)
html = con.find(r'.html',href)
link = link + 1
i = i +1
else:
print 'find end!'
page = page + 1
else:
print 'all find end'
j = 0
while j < 50:
content = urllib.urlopen(url).read()
open(r'tmp/'+url[-26:],'w+').write(content)
j = j + 1
time.sleep(5)
else:
print 'Download over!'
或者B:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python
import time
import urllib
i = 0
link = 1
page = 1
url = ['']*350
while page <= 7:
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
while title != -1 and href != -1 and html != -1 and i < 350:
url = con
print link,url
title = con.find(r'<a title=',html)
href = con.find(r'href=',title)
html = con.find(r'.html',href)
content = urllib.urlopen(url).read()
open(r'/tmp/sina/'+url[-26:],'w+').write(content)
time.sleep(5)
link = link + 1
i = i +1
page = page + 1
else:
print 'Download Over!'
运行结果:
页:
[1]