如何用python爬取知乎话题？

小时？ · 发表于 2018-8-9 12:08:04

#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
　　from __future__ import division  # 精确除法
　　from Queue import Queue
　　from __builtin__ import False
　　import json
　　import os
　　import re
　　import platform
　　import uuid
　　import urllib
　　import urllib2
　　import sys
　　import time
　　import MySQLdb as mdb
　　from bs4 import BeautifulSoup
　　# Python 2 only: reload sys and set the interpreter-wide default string
　　# encoding to UTF-8.
　　reload(sys)
　　sys.setdefaultencoding( "utf-8" )
　　# HTTP headers sent with the POST request built in get_topis().
　　headers = {
　　'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
　　'Content-Type':'application/x-www-form-urlencoded; ',
　　'X-Requested-With':'XMLHttpRequest',
　　'Referer':'https://www.zhihu.com/topics',
　　# NOTE(review): hard-coded browser session cookie (includes an _xsrf token);
　　# presumably long expired and should not live in source control - confirm.
　　'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
　　}
　　# MySQL connection settings, consumed by mdb.connect() below.
　　# NOTE(review): credentials are hard-coded in the source.
　　DB_HOST = '127.0.0.1'
　　DB_USER = 'root'
　　DB_PASS = 'root'
　　# Shared work queue; the visible producers put 3-tuples on it
　　# ((token, new_node, node_name) in getChildren, (node, name, parent) elsewhere).
　　queue= Queue() #receive queue
　　# NOTE(review): nodeSet is only named in a `global` statement in getChildren;
　　# keywordSet, stop, offset, level, maxLevel and base are not referenced by any
　　# code visible in this listing (get_topis uses its own local `offset`).
　　nodeSet=set()
　　keywordSet=set()
　　stop=0
　　offset=-20
　　level=0
　　maxLevel=7
　　# Incremented by getContent() just before each INSERT INTO rooms.
　　counter=0
　　base=""
　　# One module-level connection and cursor to the 'zhihu' database, shared by
　　# every function. Autocommit is switched off, so writers call conn.commit().
　　conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
　　conn.autocommit(False)
　　curr = conn.cursor()
def get_html(url):
    """Fetch *url* and return the raw response body, or None on failure.

    The request is a plain GET with a 3 second timeout; the module-level
    ``headers`` dict is not sent.

    Args:
        url: Absolute URL to download.

    Returns:
        Whatever ``response.read()`` yields for the URL, or ``None`` when the
        request could not be completed.
    """
    response = None
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        return response.read()
    except Exception:
        # Best-effort fetch: any network/HTTP/URL error simply means "no page".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit stop a long crawl.
        return None
    finally:
        # Release the connection even when read() fails part-way through.
        if response is not None:
            response.close()
　　def getTopics():
　　# Fetch https://www.zhihu.com/topics and walk every <li class="zm-topic-cat-item">,
　　# reading its data-id attribute and its text. For each item a SELECT is run
　　# and, when it returns no row, an INSERT followed by conn.commit().
　　# Takes no arguments and returns nothing; any exception is caught and printed.
　　# NOTE(review): the SQL text of both curr.execute(...) calls is cut off in this
　　# copy, so the table and columns involved cannot be determined from here.
　　url = 'https://www.zhihu.com/topics'
　　print url
　　try:
　　req = urllib2.Request(url)
　　response = urllib2.urlopen(req) #a proxy should be added here
　　html = response.read().decode('utf-8')
　　# Dumps the entire downloaded page to stdout.
　　print html
　　soup = BeautifulSoup(html)
　　lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
　　for li in lis:
　　data_id=li.get('data-id')
　　name=li.text

　　curr.execute('select>　　y= curr.fetchone()
　　if not y:

　　curr.execute('INSERT INTO>　　conn.commit()
　　except Exception as e:
　　print "get topic error",e
def get_extension(name):
    """Return the extension of *name*, including the leading dot.

    Args:
        name: A file name or URL.

    Returns:
        The text from the last "." to the end of *name* (e.g. ".jpg"), or
        ``None`` when *name* has no "." or when the last "." belongs to a
        host or directory component rather than to the final path segment.
    """
    where = name.rfind('.')
    if where == -1:
        return None
    extension = name[where:]
    # A separator after the last dot means the dot was in the host or a
    # directory (e.g. "http://a.b/c"); that tail is not a usable extension
    # and would put "/" into the generated file name.
    if '/' in extension or '\\' in extension:
        return None
    return extension
def which_platform():
    """Return the operating-system name reported by ``platform.system()``."""
    return platform.system()
def GetDateString():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    now = time.localtime(time.time())
    return str(time.strftime('%Y-%m-%d', now))
def makeDateFolder(par, classify):
    """Create (if needed) and return the folder ``<par>/<today>/<classify>``.

    Args:
        par: Parent directory; it must already exist.
        classify: Sub-folder name; converted with ``str()``.

    Returns:
        The folder path as a string, or ``None`` when *par* is not a
        directory or the folder could not be created. The parts are joined
        with "/" when ``which_platform()`` reports "Linux" and with "//"
        otherwise.
    """
    try:
        if not os.path.isdir(par):
            return None
        # Separator choice is kept exactly as before so the returned path
        # strings do not change.
        sep = '/' if which_platform() == "Linux" else '//'
        newFolderName = par + sep + GetDateString() + sep + str(classify)
        if not os.path.isdir(newFolderName):
            os.makedirs(newFolderName)
        return newFolderName
    except Exception as e:
        # Single-argument form prints the same "kk <error>" text under both
        # the Python 2 print statement and the Python 3 print function.
        print("kk %s" % (e,))
    return None
　　def download_img(url,classify):
　　# Download the image at `url` (3 second timeout) and write it to a file named
　　# <uuid1 without dashes>_www.guandn.com<extension> in a folder obtained from
　　# makeDateFolder().
　　# Return values visible in the code:
　　#   True  - the url contains the marker "e82bab09c_m"; no file is written
　　#           (the image bytes have already been downloaded by then);
　　#   str   - '/room/default/<date>/<classify>/<name>' after a new file is written;
　　#   None  - no extension, file already exists, or any error.
　　# NOTE(review): original indentation is lost in this copy, so the exact nesting
　　# of the inner try under `if folder is not None` cannot be confirmed.
　　try:
　　extention=get_extension(url)
　　if(extention is None):
　　return None
　　req = urllib2.Request(url)
　　resp = urllib2.urlopen(req,None,3)
　　dataimg=resp.read()
　　name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
　　# NOTE(review): hard-coded Windows-style root folder.
　　top="E://topic_pic"

　　# NOTE(review): the makeDateFolder(...) call below is truncated in this copy;
　　# its second argument is presumably `classify` - confirm against the original.
　　folder=makeDateFolder(top,>　　filename=None
　　if folder is not None:
　　filename  =folder+"//"+name
　　try:
　　# NOTE(review): presumably the marker of a default/placeholder avatar - confirm.
　　if "e82bab09c_m" in str(url):
　　return True
　　if not os.path.exists(filename):
　　# NOTE(review): the file is not closed if write() raises.
　　file_object = open(filename,'w+b')
　　file_object.write(dataimg)
　　file_object.close()
　　return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
　　else:
　　print "file exist"
　　return None
　　except IOError,e1:
　　print "e1=",e1
　　pass
　　except Exception as e:
　　print "eee",e
　　pass
　　return None #if the download failed, fall back to the original site's link
　　def getChildren(node,name):
　　# Fetch https://www.zhihu.com/topic/<node>/hot and enqueue the topic links found
　　# on it. When the page has a <div class="child-topic"> whose text contains
　　# u'子话题', every <a class="zm-item-tag"> is examined: its data-token and inner
　　# markup are extracted, a SELECT is run, and if no row comes back the tuple
　　# (token, new_node, node_name) is put on the shared queue.
　　# Returns nothing; returns early when the page could not be downloaded.
　　# NOTE(review): the `name` parameter and the global `nodeSet` are not used in
　　# the visible body.
　　global queue,nodeSet
　　try:
　　url="https://www.zhihu.com/topic/"+str(node)+"/hot"
　　html=get_html(url)
　　if html is None:
　　return
　　soup = BeautifulSoup(html)
　　p_ch='父话题'
　　# Title of the page being expanded; stored as the third element of each queued tuple.
　　node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
　　topic_cla=soup.find('div', {'class' : 'child-topic'})
　　if topic_cla is not None:
　　try:
　　p_ch=str(topic_cla.text)
　　aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #get all child nodes
　　if u'子话题' in p_ch:
　　for a in aList:
　　token=a.get('data-token')
　　# Serialise the tag, strip newlines/tabs, then keep what lies between the
　　# first '>' and the last '</a>'.
　　a=str(a).replace('\n','').replace('\t','').replace('\r','')
　　start=str(a).find('>')
　　end=str(a).rfind('</a>')
　　new_node=str(str(a)[start+1:end])

　　# NOTE(review): the SQL text of this query is cut off in this copy.
　　curr.execute('select>　　y= curr.fetchone()
　　if not y:
　　print "y=",y,"new_node=",new_node,"token=",token
　　queue.put((token,new_node,node_name))
　　except Exception as e:
　　print "add queue error",e
　　except Exception as e:
　　print "get html error",e
　　def getContent(n,name,p,top_id):
　　# Scrape https://www.zhihu.com/topic/<n>/hot (title, avatar image, description)
　　# and insert one row into the `rooms` table, committing immediately.
　　# Parameters as used below: `n` is the topic id placed in the URL, `top_id` is
　　# passed to download_img() as the image sub-folder, `name` is overwritten with
　　# the scraped title before the insert.
　　# Returns nothing; every exception is caught and printed.
　　# NOTE(review): four curr.execute('select...') calls are cut off in this copy, so
　　# their SQL and parameters (presumably where `p` is used) cannot be seen here.
　　# NOTE(review): original indentation is lost, so block nesting is inferred.
　　try:
　　global counter

　　curr.execute('select>　　y= curr.fetchone()
　　print "exist?? ",y,"n=",n
　　if not y:
　　url="https://www.zhihu.com/topic/"+str(n)+"/hot"
　　html=get_html(url)
　　if html is None:
　　return
　　soup = BeautifulSoup(html)
　　title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
　　pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
　　description=soup.find('div',{'class':'zm-editable-content'})
　　if description is not None:
　　description=description.text

　　# NOTE(review): the condition on the next line is truncated in this copy.
　　if (u"未归类" in>　　description=None
　　tag_path=download_img(pic_path,top_id)
　　print "tag_path=",tag_path
　　# download_img() returns True for the "e82bab09c_m" marker image; that case is
　　# stored with no avatar (None).
　　if (tag_path is not None) or tag_path==True:
　　if tag_path==True:
　　tag_path=None
　　father_id=2 #defaults to "miscellaneous talk"

　　curr.execute('select>　　results = curr.fetchall()
　　# If several rows come back, the last one wins.
　　for r in results:
　　father_id=r[0]
　　name=title

　　curr.execute('select>　　y= curr.fetchone()
　　print "store see..",y
　　if not y:
　　friends_num=0
　　temp = time.time()
　　x = time.localtime(float(temp))
　　create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
　　# The bare expression below has no effect.
　　create_time
　　creater_id=None
　　room_avatar=tag_path
　　is_pass=1
　　has_index=0
　　reason_id=None
　　#print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
　　######################content that qualifies to be stored
　　counter=counter+1
　　curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
　　conn.commit() #must be committed every time, otherwise the parent node cannot be found
　　# Progress line every 200 inserted rows.
　　if counter % 200==0:
　　print "current node",name,"num",counter
　　except Exception as e:
　　print "get content error",e
　　def work():
　　# For every row returned by a SELECT, seed the shared queue with
　　# (node, name, parent) and drain it, calling getContent() and getChildren()
　　# for each dequeued item, then commit.
　　# NOTE(review): the SQL text of the query is cut off in this copy.
　　# NOTE(review): this function is not called from the __main__ block
　　# (which calls new_work()).
　　global queue

　　curr.execute('select>　　results = curr.fetchall()
　　for r in results:
　　top_id=r[0]
　　node=r[1]
　　parent=r[2]
　　name=r[3]
　　try:
　　queue.put((node,name,parent)) #first put it on the queue
　　while queue.qsize() >0:
　　# NOTE(review): 3-tuples are put on the queue but only two names are unpacked
　　# here, which raises ValueError; get_topis() unpacks three (n,name,p).
　　n,p=queue.get() #dequeue the head node
　　# NOTE(review): getContent is defined as (n,name,p,top_id) but receives only
　　# three arguments here.
　　getContent(n,p,top_id)
　　getChildren(n,name) #child nodes of the dequeued item
　　conn.commit()
　　except Exception as e:
　　print "what's wrong",e
　　def new_work():
　　# Run a SELECT and call get_topis(data_id, name, top_id) once per returned row,
　　# taking top_id, data_id and name from columns 0, 1 and 2.
　　# Takes no arguments and returns nothing.
　　# NOTE(review): the SQL text of the query is cut off in this copy.
　　# NOTE(review): `queue` is declared global but not used in the visible body.
　　global queue

　　curr.execute('select>　　results = curr.fetchall()
　　for r in results:
　　top_id=r[0]
　　data_id=r[1]
　　name=r[2]
　　try:
　　get_topis(data_id,name,top_id)
　　# NOTE(review): bare except silently discards every error, including
　　# KeyboardInterrupt.
　　except:
　　pass
　　def get_topis(data_id,name,top_id):
　　# Page through https://www.zhihu.com/node/TopicsPlazzaListV2 for topic `data_id`,
　　# 20 entries at a time, and crawl every topic found.
　　# Each request POSTs method=next plus a JSON-like params string carrying
　　# topic_id and offset, using the module-level `headers`. From the decoded JSON
　　# only msg[0] is parsed; each <div class="blk"> in it yields a topic id (href
　　# with "/topic/" removed) and a display name (<strong> text). That pair is
　　# queued with `name` as parent and the queue is drained through
　　# getContent()/getChildren(), committing after each item.
　　# Paging stops when the returned msg list has fewer than 5 entries.
　　# Returns nothing.
　　# NOTE(review): original indentation is lost, so block nesting is inferred.
　　global queue
　　url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
　　isGet = True;
　　# Starts at -20 so the first loop iteration requests offset 0.
　　offset = -20;
　　data_id=str(data_id)
　　while isGet:
　　offset = offset + 20
　　values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
　　try:
　　msg=None
　　try:
　　data = urllib.urlencode(values)
　　request = urllib2.Request(url,data,headers)
　　response = urllib2.urlopen(request,None,5)
　　html=response.read().decode('utf-8')
　　json_str = json.loads(html)
　　ms=json_str['msg']
　　if len(ms) <5:
　　break
　　msg=ms[0]
　　except Exception as e:
　　print "eeeee",e
　　#print msg
　　if msg is not None:
　　soup = BeautifulSoup(str(msg))
　　blks = soup.find_all('div', {'class' : 'blk'})
　　for blk in blks:
　　page=blk.find('a').get('href')
　　if page is not None:
　　node=page.replace("/topic/","") #store more seeds
　　parent=name
　　ne=blk.find('strong').text
　　try:
　　queue.put((node,ne,parent)) #first put it on the queue
　　while queue.qsize() >0:
　　# NOTE(review): this rebinds the `name` parameter, so later blks in this call
　　# get the last dequeued name as their parent.
　　n,name,p=queue.get() #dequeue the head node
　　size=queue.qsize()

　　# NOTE(review): the next two statements are truncated in this copy.
　　if>
　　print>　　getContent(n,name,p,top_id)
　　getChildren(n,name) #child nodes of the dequeued item
　　conn.commit()
　　except Exception as e:
　　print "what's wrong",e
　　except urllib2.URLError, e:
　　print "error is",e
　　pass
if __name__ == '__main__':
    # Repeat the complete crawl pass 400 times.
    for i in range(400):
        new_work()

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] 如何用python爬取知乎话题？

浏览过的版块

扫码加入运维网微信交流群