python bs4抓取百度贴吧

542179528 · 发表于 2018-8-14 08:34:57

# -*- coding:utf-8 -*-　　
__author__='fengzhankui'
　　
import io
import urllib2

from bs4 import BeautifulSoup
class Item(object):
    """Plain record holding the fields scraped for one thread.

    Every field defaults to None at class level; GetTiebaInfo.spider
    assigns the per-thread values on each instance it creates.
    """

    title = None        # text of the thread's 'j_th_tit' link
    firstAuthor = None  # link text inside the 'frs-author-name-wrap' span
    firstTime = None    # text of the span titled u'创建时间' (creation time)
    reNum = None        # text of the span titled u'回复' (replies)
    content = None      # text of the thread abstract <div>
    lastAuthor = None   # link text inside the 'tb_icon_author_rely j_replyer' span
    lastTime = None     # text of the span titled u'最后回复时间' (last reply time)

class GetTiebaInfo(object):
    """Scrape thread summaries from Baidu Tieba list pages into a text file.

    Constructing an instance runs the whole job: it builds the list-page
    URLs, downloads and parses each page, and appends one line per thread
    to ``tieba.txt`` in the current working directory.
    """

    # Class attribute value of the abstract <div>, spelled exactly as the
    # original lookup had it (including the trailing space).
    _ABSTRACT_CLASS = 'threadlist_abs threadlist_abs_onlyline '

    def __init__(self, url, pageSum=5):
        """Run the scrape.

        url: list-page URL; the text after its last ``=`` is treated as
            the page offset and replaced for every page.
        pageSum: number of list pages to fetch (defaults to 5, the value
            that used to be hard-coded).
        """
        self.url = url
        self.pageSum = pageSum
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return the URLs of the first ``pageSum`` list pages.

        Whatever follows the last ``=`` in ``self.url`` is replaced by
        0, 50, 100, ... (a step of 50 per page), so the offset parameter
        has to be the last one in the URL.
        """
        head = self.url.split('=')[:-1]
        return ['='.join(head + [str(page * 50)]) for page in range(pageSum)]

    @staticmethod
    def _textOf(node):
        """Return the stripped text of a bs4 node, or u'' when it is None."""
        if node is None:
            return u''
        return node.get_text().strip()

    def _parseThread(self, tag, abstract):
        """Build an Item from one thread ``<li>`` and its abstract ``<div>``.

        Sub-tags that are absent yield an empty string instead of raising
        AttributeError, so one incomplete entry cannot abort the scrape.
        """
        text = self._textOf
        authorWrap = tag.find('span', attrs={'class': 'frs-author-name-wrap'})
        replier = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'})

        item = Item()
        item.title = text(tag.find('a', attrs={'class': 'j_th_tit'}))
        item.firstAuthor = text(authorWrap.a if authorWrap is not None else None)
        # Unicode literals are passed as-is; bs4 attribute values are Unicode.
        item.firstTime = text(tag.find('span', attrs={'title': u'创建时间'}))
        item.reNum = text(tag.find('span', attrs={'title': u'回复'}))
        item.content = text(abstract)
        item.lastAuthor = text(replier.a if replier is not None else None)
        item.lastTime = text(tag.find('span', attrs={'title': u'最后回复时间'}))
        return item

    def spider(self, urls):
        """Download every URL in ``urls`` and return the parsed Item list.

        Pages that fail to download are skipped. Entries without an
        abstract ``<div>`` are skipped as well.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # The download failed and was already reported; parsing
                # None would raise inside BeautifulSoup.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            # A list passed as class_ matches <li> tags carrying either class.
            # The first two matches are dropped, as before -- presumably
            # non-thread rows; TODO confirm against the live markup.
            tagsli = soup.find_all('li', class_=['j_thread_list', 'clearfix'])[2:]
            for tag in tagsli:
                abstract = tag.find('div', attrs={'class': self._ABSTRACT_CLASS})
                if abstract is None:
                    continue
                items.append(self._parseThread(tag, abstract))
        return items

    def pipelines(self, items):
        """Append one tab-separated, UTF-8 encoded line per item to tieba.txt.

        Each line holds title, firstAuthor, reNum, content, lastAuthor and
        lastTime as ``label:value`` pairs, every pair followed by a tab.
        firstTime is collected by spider but is not part of the line.
        """
        with io.open('tieba.txt', 'a', encoding='utf8') as fp:
            for item in items:
                fp.write(u''.join((
                    u'title:', item.title, u'\t',
                    u'firstAuthor:', item.firstAuthor, u'\t',
                    u'reNum:', item.reNum, u'\t',
                    u'content:', item.content, u'\t',
                    u'lastAuthor:', item.lastAuthor, u'\t',
                    u'lastTime:', item.lastTime, u'\t',
                    u'\n',
                )))

    def getResponseContent(self, url):
        """Download ``url`` and return the raw response body.

        Returns None (after printing 'fail') when the request cannot be
        opened. Errors raised while reading the body still propagate.
        """
        # Imported here so the method works on Python 2 and Python 3.
        try:
            from urllib2 import urlopen
            from urllib import quote
        except ImportError:
            from urllib.request import urlopen
            from urllib.parse import quote

        if not isinstance(url, bytes):
            url = url.encode('utf8')
        # Percent-encode non-ASCII bytes (such as a Chinese forum name) and
        # leave URL delimiters and existing %XX escapes untouched.
        target = quote(url, safe="%/:=&?~#+!$,;'@()*[]")
        try:
            response = urlopen(target)
        except Exception:
            # Best effort: report the failure and let the caller skip the
            # page. KeyboardInterrupt and SystemExit are no longer swallowed.
            print('fail')
            return None
        try:
            return response.read()
        finally:
            response.close()

if __name__ == '__main__':
    # Script entry point: constructing GetTiebaInfo runs the whole scrape
    # for the forum named by the kw parameter of this list-page URL.
    url = u'http://tieba.baidu.com/f?kw=战狼2&ie=utf-8&pn=50'
    GetTiebaInfo(url)

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] python bs4抓取百度贴吧

浏览过的版块

扫码加入运维网微信交流群