# Scraping Baidu Tieba thread listings with Python and bs4
# -*- coding: utf-8 -*-
__author__ = 'fengzhankui'
import io
import urllib2

from bs4 import BeautifulSoup
class Item(object):
    """Plain record holding the fields scraped for one forum thread.

    Every field defaults to ``None`` at class level; the scraper assigns the
    real values on each instance after construction.
    """

    # Thread title (text of the ``a.j_th_tit`` link).
    title = None
    # Name of the user who started the thread.
    firstAuthor = None
    # Creation time of the thread, as displayed on the page.
    firstTime = None
    # Reply count, kept as the text shown on the page.
    reNum = None
    # Abstract / preview text of the thread.
    content = None
    # Name of the user who replied last.
    lastAuthor = None
    # Time of the last reply, as displayed on the page.
    lastTime = None

    # Field names in display order, used only by __repr__.
    _FIELDS = ('title', 'firstAuthor', 'firstTime', 'reNum',
               'content', 'lastAuthor', 'lastTime')

    def __repr__(self):
        """Return a debugging representation listing every field."""
        pairs = ', '.join(
            '%s=%r' % (name, getattr(self, name)) for name in self._FIELDS
        )
        return 'Item(%s)' % pairs
class GetTiebaInfo(object):
    """Scrape thread summaries from a Baidu Tieba listing into ``tieba.txt``.

    Constructing an instance runs the whole job: it builds the page URLs,
    downloads and parses each page, and appends the results to ``tieba.txt``
    in the current working directory.

    Attributes:
        url: The listing URL passed to the constructor.
        pageSum: Number of listing pages to fetch (fixed at 5).
        urls: Page URLs produced by :meth:`getUrls`.
        items: ``Item`` objects produced by :meth:`spider`.
    """

    # Increment of the trailing ``pn`` query value between consecutive pages.
    # NOTE(review): the original ``pns=`` expression was lost from the source;
    # a step of 50 is assumed from the ``pn=50`` sample URL -- confirm.
    pnStep = 50

    def __init__(self, url):
        """Run the scrape for *url* (performs network and file I/O).

        Args:
            url: Listing URL whose last ``=``-separated part is the ``pn``
                value, e.g. ``...&pn=50``.
        """
        self.url = url
        self.pageSum = 5
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return one URL per page by rewriting the trailing ``pn`` value.

        Args:
            pageSum: Number of pages; zero or less yields an empty list.

        Returns:
            A list of URLs whose final ``=``-separated part is replaced by
            ``0``, ``pnStep``, ``2 * pnStep`` and so on.
        """
        pns = [str(page * self.pnStep) for page in range(pageSum)]
        # Everything after the last '=' is taken to be the page offset.
        ul = self.url.split('=')
        urls = []
        for pn in pns:
            ul[-1] = pn
            urls.append('='.join(ul))
        return urls

    def spider(self, urls):
        """Download every URL and collect an ``Item`` per thread found.

        Pages that could not be downloaded are skipped, so one failed request
        does not abort the whole run.

        Args:
            urls: Iterable of listing page URLs.

        Returns:
            A list of ``Item`` objects in page order.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # The failure was already reported by getResponseContent.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('li', class_=['j_thread_list', 'clearfix']):
                item = self._parseThread(tag)
                if item is not None:
                    items.append(item)
        return items

    def _parseThread(self, tag):
        """Build an ``Item`` from one ``<li>`` tag, or return None to skip it."""
        # Entries without the abstract block are skipped, as before.
        abstract = tag.find(
            'div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '})
        if abstract is None:
            return None
        firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'})
        lastAuthor = tag.find(
            'span', attrs={'class': 'tb_icon_author_rely j_replyer'})
        item = Item()
        item.title = self._textOf(tag.find('a', attrs={'class': 'j_th_tit'}))
        item.firstAuthor = self._textOf(self._linkOf(firstAuthor))
        # The span titles below are the page's own labels:
        # "creation time", "replies" and "last reply time".
        item.firstTime = self._textOf(
            tag.find('span', attrs={'title': u'创建时间'}))
        item.reNum = self._textOf(tag.find('span', attrs={'title': u'回复'}))
        item.content = self._textOf(abstract)
        item.lastAuthor = self._textOf(self._linkOf(lastAuthor))
        item.lastTime = self._textOf(
            tag.find('span', attrs={'title': u'最后回复时间'}))
        return item

    def _linkOf(self, node):
        """Return the first ``<a>`` inside *node*, or None when *node* is None."""
        return None if node is None else node.a

    def _textOf(self, node):
        """Return the stripped text of *node*, or ``u''`` when it is missing."""
        if node is None:
            return u''
        return node.get_text().strip()

    def pipelines(self, items):
        """Append one tab-separated line per item to ``tieba.txt`` (UTF-8).

        Args:
            items: Iterable of objects exposing the ``Item`` text fields.
        """
        # NOTE(review): firstTime is collected by spider() but has never been
        # written to the file -- confirm whether that omission is intended.
        with io.open('tieba.txt', 'a', encoding='utf8') as fp:
            for item in items:
                fields = (
                    (u'title', item.title),
                    (u'firstAuthor', item.firstAuthor),
                    (u'reNum', item.reNum),
                    (u'content', item.content),
                    (u'lastAuthor', item.lastAuthor),
                    (u'lastTime', item.lastTime),
                )
                line = u''.join(
                    label + u':' + value + u'\t' for label, value in fields)
                fp.write(line + u'\n')

    def getResponseContent(self, url):
        """Fetch *url* and return the raw response body.

        Args:
            url: Unicode URL; it is UTF-8 encoded before the request.

        Returns:
            The response body, or None when the request could not be opened
            (in which case ``fail`` is printed).
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except Exception:
            # Best effort: report and carry on, but unlike a bare ``except``
            # this lets KeyboardInterrupt and SystemExit propagate.
            print('fail')
            return None
        try:
            return response.read()
        finally:
            response.close()
if __name__ == '__main__':
    # Script entry point: constructing GetTiebaInfo performs the whole scrape
    # (build page URLs, download, parse, append to tieba.txt).
    url = u'http://tieba.baidu.com/f?kw=战狼2&ie=utf-8&pn=50'
    GetTiebaInfo(url)
# Page: [1]