# [Python] Scraping Qiushibaike (糗事百科)
# coding=utf-8
import re
import urllib

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 merged urllib2 into urllib.request (same Request/urlopen API).
    import urllib.request as urllib2
class QiuShi:
    """Scraper that downloads and prints posts from qiushibaike.com "hot" pages.

    Each post is represented as a two-item list ``[title, text]`` where
    ``title`` is the ``title`` attribute of the post's ``<div class="content">``
    (the original author notes it holds the post time) and ``text`` is the
    div's inner text. Newlines are removed from both.

    The code uses only single-argument ``print(...)`` calls and ``u""``
    literals so that it behaves the same on Python 2 and Python 3.
    """

    # Captures (title attribute, inner text) of every <div class="content" ...>.
    # re.S lets "." match the newlines that occur inside a post.
    _ITEM_RE = re.compile(
        '<div.*?class="content".*?title="(.*?)">(.*?)</div>', re.S)

    def __init__(self):
        # NOTE: this was spelled ``_init_``, which Python never invokes as a
        # constructor, so ``self.page`` was never set.
        self.page = 1

    def GetQiuShis(self, page):
        """Fetch one "hot" page and return its posts.

        Args:
            page: Page number, as a string or an int.

        Returns:
            A list of ``[title, text]`` lists, one per post found on the page.

        Raises:
            Whatever ``urllib2.urlopen`` raises on network/HTTP failure, and
            ``UnicodeDecodeError`` if the response is not valid UTF-8.
        """
        # str() keeps string callers working and also accepts plain ints.
        url = "http://www.qiushibaike.com/hot/page/" + str(page)
        # Pretend to be a browser.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'user-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        # decode() converts the raw bytes into unicode text before matching.
        return self._ParseQiuShis(html.decode("utf-8"))

    def _ParseQiuShis(self, unicodeHtml):
        """Extract ``[title, text]`` pairs from already-decoded page HTML."""
        return [
            [title.replace("\n", ""), text.replace("\n", "")]
            for title, text in self._ITEM_RE.findall(unicodeHtml)
        ]

    def ShowQiuShi(self, contents):
        """Print every post: a numbered heading with its title, then its text.

        Args:
            contents: List of ``[title, text]`` pairs as returned by
                ``GetQiuShis``; both items are expected to be unicode text.
        """
        for count, content in enumerate(contents, 1):
            # Heading line uses the title, body line uses the text; the
            # original printed the whole pair twice.
            print(u"第%d条糗事 %s \n" % (count, content[0]))
            print(u"%s \n" % content[1])

    def Start(self):
        """Fetch and print pages 1 through 4."""
        for page in range(1, 5):
            print(u"第%d页:\n" % page)
            self.ShowQiuShi(self.GetQiuShis(str(page)))
# Run the scraper only when this file is executed as a script, so that merely
# importing the module does not start network requests.
if __name__ == "__main__":
    qiuShi = QiuShi()
    qiuShi.Start()
# Page: [1]  (pagination footer left over from the source web page)