Python爬虫多线程版

23rfe · 发表于 2015-12-21 08:37:26

XPath提取内容
// 定位根节点
/ 往下层寻找
提取文本内容：/text()
提取属性内容：/@XXXX

常规匹配

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

#-*-coding:utf8-*-
from lxml import etree
# Sample page for the basic XPath examples: two <ul> lists (ids "useful" and
# "useless") plus a <div id="url"> holding two links, of which only the second
# has a title attribute.
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
      <li>这是第一条信息</li>
      <li>这是第二条信息</li>
      <li>这是第三条信息</li>
</ul>
<ul id="useless">
      <li>不需要的信息1</li>
      <li>不需要的信息2</li>
      <li>不需要的信息3</li>
</ul>

<div id="url">
      <a href="http://jikexueyuan.com">极客学院</a>
      <a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>

</body>
</html>
'''

# Parse the sample markup once; every query below runs against this tree.
selector = etree.HTML(html)

# Extract text: the text of each <li> under the <ul> whose id is "useful".
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print(each)

# Extract attributes: the href value of every <a> element in the document.
link = selector.xpath('//a/@href')
for each in link:
    print(each)

# Only one <a> in the sample carries a title attribute, so the result list
# has a single entry and index 0 is safe for this fixed input.
title = selector.xpath('//a/@title')
print(title[0])

特殊匹配

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

#-*-coding:utf8-*-
from lxml import etree

# Sample page for the starts-with() example: three <div> elements whose ids
# ("test-1", "test-2", "testfault") all begin with "test".
html1 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test-1">需要的内容1</div>
<div id="test-2">需要的内容2</div>
<div id="testfault">需要的内容3</div>
</body>
</html>
'''

# Sample page for the string(.) example: a <div id="test3"> whose text is
# interleaved with a nested <span>, which itself contains a <ul>/<li>.
html2 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test3">
      我左青龙，
      <span id="tiger">
         右白虎，
         <ul>上朱雀，
            <li>下玄武。</li>
         </ul>
         老牛在当中，
      </span>
      龙头在胸口。
</div>
</body>
</html>
'''

# starts-with(@id, "test") selects every <div> whose id begins with "test",
# which covers all three divs in html1 (test-1, test-2, testfault).
selector = etree.HTML(html1)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for each in content:
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print(each)

# selector = etree.HTML(html2)
# content_1 = selector.xpath('//div[@id="test3"]/text()')
# for each in content_1:
#    print each
#
#
# data = selector.xpath('//div[@id="test3"]')[0]
# info = data.xpath('string(.)')
# content_2 = info.replace('\n','').replace(' ','')
# print content_2

单线程与多线程耗时比较

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

#-*-coding:utf8-*-

from multiprocessing.dummy import Pool as ThreadPool
import requests
import time

def getsource(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response object.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        The object returned by ``requests.get``. Callers that only measure
        elapsed time may ignore it.
    """
    return requests.get(url, timeout=timeout)

urls = []

# Build the addresses of pages 1 through 20 of the same thread.
for i in range(1, 21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)

# Baseline: download the pages one after another and time the whole run.
time1 = time.time()
for i in urls:
    print(i)
    getsource(i)
time2 = time.time()
print(u'单线程耗时：' + str(time2 - time1))

# Same downloads spread across a pool of 4 workers. The pool is created
# before the clock starts, so its construction is not part of the measurement.
pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsource, urls)
pool.close()
pool.join()
time4 = time.time()
print(u'并行耗时：' + str(time4 - time3))

多线程爬取百度贴吧

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import sys

# NOTE(review): the reload() builtin and sys.setdefaultencoding() are Python 2
# only. Presumably this makes the implicit str/unicode conversions performed
# when writing to content.txt use UTF-8 - confirm before porting.
reload(sys)

sys.setdefaultencoding('utf-8')

'''重新运行之前请删除content.txt，因为文件操作使用追加方式，会导致内容太多。'''

def towrite(contentdict):
    """Append one reply record to the module-level output file ``f``.

    Args:
        contentdict: mapping that provides the keys ``topic_reply_time``,
            ``topic_reply_content`` and ``user_name``.

    The record consists of three labelled lines followed by a blank line.
    """
    record = (
        u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n'
        + u'回帖内容:' + unicode(contentdict['topic_reply_content']) + '\n'
        + u'回帖人:' + contentdict['user_name'] + '\n\n'
    )
    # One write() per record instead of three writelines() calls:
    # writelines() on a plain string iterates it character by character, and
    # separate calls give concurrently running workers more opportunities to
    # interleave the lines of different records.
    f.write(record)

def spider(url, timeout=10):
    """Download one thread page and record every reply found on it.

    For each matched post container the author name and reply date are read
    from the JSON held in its ``data-field`` attribute, the first text node of
    the post body is taken as the content, and the three values are printed
    and handed to ``towrite``.

    Args:
        url: address of the thread page to fetch.
        timeout: seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block this worker indefinitely.
    """
    html = requests.get(url, timeout=timeout)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post l_post_bright "]')
    for each in content_field:
        # NOTE(review): the literal text '&quot' is stripped before decoding;
        # presumably a leftover from handling raw entities - confirm it is
        # still required.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        text_nodes = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')
        # A post body with no direct text node yields an empty list; fall back
        # to an empty string instead of letting [0] raise IndexError and abort
        # the whole page.
        content = text_nodes[0] if text_nodes else u''
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)
    # towrite() reads this module-level handle. Append mode means output from
    # earlier runs stays in the file.
    f = open('content.txt', 'a')
    try:
        # Build the addresses of pages 1 through 20 of the thread.
        page = []
        for i in range(1, 21):
            newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
            page.append(newpage)

        results = pool.map(spider, page)
    finally:
        # Release the workers and the file even when a page fails, so that
        # records already written are not left in an unclosed file.
        pool.close()
        pool.join()
        f.close()

账号		自动登录	找回密码
密码			立即注册

Centos6.5×64安装配置openmeetings3.0.3详

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

[经验分享] Python爬虫多线程版

相关帖子

浏览过的版块

扫码加入运维网微信交流群