python bs4抓取百度贴吧

542179528 发表于 2018-8-14 08:34:57

# -*- coding:utf-8 -*-　　
__author__='fengzhankui'

import urllib2

from bs4 import BeautifulSoup

class Item(object):
    """Plain record holding the fields scraped for one forum thread.

    Every field defaults to ``None`` at class level; the scraper assigns
    per-instance values after creating an ``Item()``.
    """

    # Thread title text.
    title = None
    # Name of the user who started the thread.
    firstAuthor = None
    # Text of the thread's creation-time element.
    firstTime = None
    # Text of the reply-count element.
    reNum = None
    # Text of the thread abstract shown in the listing.
    content = None
    # Name of the user who replied last.
    lastAuthor = None
    # Text of the last-reply-time element.
    lastTime = None

class GetTiebaInfo(object):
    """Scrape thread summaries from a Baidu Tieba listing and save them.

    Constructing an instance runs the whole job: it builds the listing-page
    URLs, downloads and parses each page, and appends one line per thread
    to ``tieba.txt``.  Written for Python 2 (downloads go through
    ``urllib2``).
    """

    # NOTE(review): the expression that produced the page offsets was missing
    # from the published listing.  A step of 50 is assumed because the sample
    # URL ends in ``pn=50`` -- confirm against the live site.
    PAGE_STEP = 50

    def __init__(self, url):
        """Run the complete scrape for the listing at *url*.

        Args:
            url: Listing URL whose text after the last ``=`` is the page
                offset; that part is replaced for every page fetched.
        """
        self.url = url
        self.pageSum = 5
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return the listing URLs for the first *pageSum* pages.

        The text after the last ``=`` in ``self.url`` is replaced by the
        offsets ``0, PAGE_STEP, 2 * PAGE_STEP, ...``.

        Args:
            pageSum: Number of page URLs to generate.

        Returns:
            A list of ``pageSum`` URLs (empty when ``pageSum`` is 0).
        """
        # Everything up to the last '=' is shared by all pages.
        prefix = self.url.split('=')[:-1]
        return [
            '='.join(prefix + [str(page * self.PAGE_STEP)])
            for page in range(pageSum)
        ]

    def spider(self, urls):
        """Download every URL in *urls* and parse the threads listed there.

        Pages that could not be downloaded are skipped instead of being
        handed to the parser.

        Args:
            urls: Iterable of listing-page URLs.

        Returns:
            A list of ``Item`` objects, in page order.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # The download failed and was already reported.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('li', class_=['j_thread_list', 'clearfix']):
                item = self._parseThread(tag)
                if item is not None:
                    items.append(item)
        return items

    def _parseThread(self, tag):
        """Build an ``Item`` from one thread ``<li>``; None if it has no abstract."""
        abstract = tag.find(
            'div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '})
        if abstract is None:
            return None

        item = Item()
        item.title = self._nodeText(tag.find('a', attrs={'class': 'j_th_tit'}))
        item.firstAuthor = self._linkText(
            tag.find('span', attrs={'class': 'frs-author-name-wrap'}))
        # The title attributes are matched as unicode text; Beautiful Soup
        # compares against decoded attribute values, so no encoding is needed.
        item.firstTime = self._nodeText(
            tag.find('span', attrs={'title': u'创建时间'}))
        item.reNum = self._nodeText(
            tag.find('span', attrs={'title': u'回复'}))
        item.content = self._nodeText(abstract)
        item.lastAuthor = self._linkText(
            tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}))
        item.lastTime = self._nodeText(
            tag.find('span', attrs={'title': u'最后回复时间'}))
        return item

    @staticmethod
    def _nodeText(node):
        """Return the stripped text of *node*, or ``u''`` when it is None."""
        if node is None:
            return u''
        return node.get_text().strip()

    @staticmethod
    def _linkText(node):
        """Return the stripped text of the first ``<a>`` inside *node*."""
        if node is None:
            return u''
        return GetTiebaInfo._nodeText(node.a)

    def pipelines(self, items):
        """Append one tab-separated line per item to ``tieba.txt``.

        Each field is written as ``name:value`` followed by a tab, and the
        line ends with a newline.  ``firstTime`` is collected by the spider
        but is not part of the output line.

        Args:
            items: Iterable of ``Item`` objects to write.
        """
        # Local import keeps this method self-contained; io.open encodes the
        # text once, at the file boundary.
        import io

        with io.open('tieba.txt', 'a', encoding='utf8') as fp:
            for item in items:
                fields = (
                    (u'title', item.title),
                    (u'firstAuthor', item.firstAuthor),
                    (u'reNum', item.reNum),
                    (u'content', item.content),
                    (u'lastAuthor', item.lastAuthor),
                    (u'lastTime', item.lastTime),
                )
                line = u''.join(u'%s:%s\t' % pair for pair in fields)
                fp.write(line + u'\n')

    def getResponseContent(self, url):
        """Download *url* and return the raw response body.

        Args:
            url: Text URL; it is UTF-8 encoded before the request is made.

        Returns:
            The response body, or ``None`` when the download failed (in
            which case ``fail`` is printed).
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
            try:
                return response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except Exception:
            # Best-effort per page, but unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            print('fail')
            return None

if __name__ == '__main__':
    # Sample listing URL.  GetTiebaInfo does all its work in the constructor,
    # so creating the object runs the scrape.
    url = u'http://tieba.baidu.com/f?kw=战狼2&ie=utf-8&pn=50'
    GetTiebaInfo(url)

页: [1]

运维网's Archiver

python bs4抓取百度贴吧