用python+selenium抓取知乎今日最热和本月最热的前三个问题...

iyufygfd · 发表于 2016-12-26 10:50:28

用python+selenium抓取知乎今日最热和本月最热的前三个问题及每个问题的首个回答并保存至html文件

抓取知乎今日最热和本月最热的前三个问题及每个问题的首个回答，并保存至html文件。该html文件的文件名应为“日期+_zhihu_today_hot.html”的形式，例如20160228_zhihu_today_hot.html。

代码如下：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

from selenium import webdriver
from time import sleep
import time

class ZhiHu():
    """Scrape Zhihu's explore page and save the hottest questions as HTML.

    Constructing an instance starts a Chrome session and immediately fetches
    the top questions of today and of this month, each paired with the
    innerHTML of the first answer on the question page.  The results are
    kept in ``today_hot_list`` and ``month_hot_list`` as lists of
    ``(question_title, first_answer_inner_html)`` tuples.
    ``write_today_data`` / ``write_month_data`` dump them to dated ``.html``
    files in the current working directory.

    NOTE(review): ``find_element(s)_by_css_selector`` is the legacy Selenium
    locator API -- confirm the installed Selenium version still provides it.
    """

    # Listing pages and CSS selectors used on them.
    EXPLORE_URL = 'https://www.zhihu.com/explore'
    MONTH_URL = 'https://www.zhihu.com/explore#monthly-hot'
    QUESTION_LINK_SELECTOR = 'div.explore-feed.feed-item>h2>a.question_link'
    ANSWER_SELECTOR = '.zm-editable-content.clearfix'

    # Fixed waits (seconds) that give the pages time to render.
    PAGE_LOAD_WAIT = 3
    TODAY_ANSWER_WAIT = 10
    MONTH_ANSWER_WAIT = 5

    # Number of questions collected per list.
    TOP_N = 3
    # Five question links precede the "monthly hot" section on the page,
    # so its first question is the link at index 5.
    MONTH_FIRST_INDEX = 5

    def __init__(self):
        """Start Chrome, maximize the window and scrape both hot lists.

        Raises:
            Exception: whatever the driver or the scraping raises; the
                browser is closed before the error is propagated.
        """
        self.dr = webdriver.Chrome()
        try:
            self.dr.maximize_window()
            self.today_hot_list = self.get_today_hot()
            self.month_hot_list = self.get_month_hot()
        except Exception:
            # Do not leave an orphaned browser behind when scraping fails.
            self.dr.quit()
            raise

    def _collect_hot(self, list_url, first_index, answer_wait):
        """Collect ``TOP_N`` questions from *list_url* starting at *first_index*.

        Args:
            list_url: URL of the listing page to load.
            first_index: index of the first wanted question link among all
                elements matching ``QUESTION_LINK_SELECTOR``.
            answer_wait: seconds to wait after opening a question page.

        Returns:
            A list of ``(question_title, first_answer_inner_html)`` tuples.

        Raises:
            IndexError: if the listing has fewer matching links than the
                requested position.
        """
        collected = []
        for position in range(first_index, first_index + self.TOP_N):
            # The listing is reloaded on every pass because the driver
            # navigates away to the question page in between.
            self.dr.get(list_url)
            sleep(self.PAGE_LOAD_WAIT)
            # find_elements_* returns a list; pick the link for this pass.
            link = self.dr.find_elements_by_css_selector(
                self.QUESTION_LINK_SELECTOR)[position]
            title = link.text
            answer_url = link.get_attribute('href')
            self.dr.get(answer_url)
            sleep(answer_wait)
            first_answer = self.dr.find_element_by_css_selector(
                self.ANSWER_SELECTOR).get_attribute('innerHTML')
            collected.append((title, first_answer))
        return collected

    def get_today_hot(self):
        """Return the top 3 hottest Zhihu questions of today.

        Returns:
            A list of ``(question_title, first_answer_inner_html)`` tuples.
        """
        return self._collect_hot(self.EXPLORE_URL, 0, self.TODAY_ANSWER_WAIT)

    def get_month_hot(self):
        """Return the top 3 hottest Zhihu questions of this month.

        Returns:
            A list of ``(question_title, first_answer_inner_html)`` tuples.
        """
        return self._collect_hot(
            self.MONTH_URL, self.MONTH_FIRST_INDEX, self.MONTH_ANSWER_WAIT)

    def _write_hot_file(self, suffix, separator, items):
        """Write *items* to ``<YYYY-MM-DD><suffix>.html`` in the working dir.

        Args:
            suffix: file-name part placed between the date and ``.html``.
            separator: markup written before every question.
            items: iterable of ``(question_title, answer_inner_html)``.
        """
        file_name = time.strftime('%Y-%m-%d') + suffix + '.html'
        chunks = []
        for title, answer_html in items:
            chunks.append(separator)
            chunks.append('问题：' + title + '<br>')
            chunks.append('首个回答：' + answer_html + '<br>')
        # The file stays GBK-encoded; characters GBK cannot represent (for
        # example emoji) become HTML numeric character references instead
        # of raising UnicodeEncodeError.
        payload = ''.join(chunks).encode('gbk', errors='xmlcharrefreplace')
        with open(file_name, 'wb') as out:
            out.write(payload)

    def write_today_data(self):
        """Save ``today_hot_list`` to ``<date>_zhihu_today_hot.html``."""
        # <br> is the HTML line break.
        separator = '**********************************************<br>'
        self._write_hot_file('_zhihu_today_hot', separator, self.today_hot_list)

    def write_month_data(self):
        """Save ``month_hot_list`` to ``<date>_zhihu_month_hot.html``."""
        separator = '--------------------------------------<br>'
        self._write_hot_file('_zhihu_month_hot', separator, self.month_hot_list)

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.dr.quit()

if __name__ == '__main__':
    # Constructing ZhiHu performs the scraping; afterwards both result sets
    # are dumped to dated HTML files.
    zhihu = ZhiHu()
    try:
        zhihu.write_today_data()
        zhihu.write_month_data()
    finally:
        # Always close the browser, even if writing a file fails.
        zhihu.quit()

网页如下：

生成html如下：

嘻嘻，生成的html排版不是很好哈~

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] 用python+selenium抓取知乎今日最热和本月最热的前三个问题...

相关帖子

浏览过的版块

扫码加入运维网微信交流群