使用 Python 抓取博客园首页的所有数据，并定时持续抓取新发布的内容存入 MongoDB 中

jialiguo · 发表于 2017-12-16 18:34:07

依赖包：　　

　　
1.jieba
　　

　　
2.pymongo
　　

　　
3.HTMLParser
　　

　　
# -*- coding: utf-8 -*-
　　
"""
　　
@author: jiangfuqiang
　　
"""
　　

　　
from HTMLParser import HTMLParser
　　
import re
　　
import time
　　
from datetime import date
　　
import pymongo
　　
import urllib2
　　
import sys
　　
import traceback
　　
import jieba
　　

　　
default_encoding = 'utf-8'

# Python 2 only: ``reload(sys)`` restores ``sys.setdefaultencoding`` so the
# interpreter-wide default encoding can be switched to UTF-8. The branch is
# skipped whenever the default encoding already is UTF-8.
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Module-level stop flag polled by the crawl loop in the ``__main__`` section;
# True means the crawl has nothing new left to fetch.
isExist = False
　　

　　
class FetchCnblog(HTMLParser):
    """Collect post entries from a cnblogs listing page.

    Feed the page HTML with ``feed()``. Every post whose numeric id is
    greater than ``id`` is appended, as a dict, to the list returned by
    ``getResult()``. When a post with an id at or below ``id`` is seen, the
    module-level ``isExist`` flag is set to True.
    """

    def __init__(self, id=0):
        """Create a parser that keeps only posts with an id above *id*."""
        HTMLParser.__init__(self)
        self.result = []           # finished post dicts, in page order
        self.data = {}             # the post currently being assembled
        self.isTitleLink = False   # inside <a class="titlelnk">
        self.id = id               # highest post id that is already stored
        self.isSummary = False     # inside <p class="post_item_summary">
        self.isPostItem = False    # inside <div class="post_item">
        self.isArticleView = False  # just saw <span class="article_view">

    def handle_data(self, data):
        """Capture the title text and the summary text of the current post."""
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        """Update the parser state from the class/href/src of each start tag."""
        if tag == 'a':
            self._handle_anchor(attrs)
        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'img':
            attr_map = dict(attrs)
            if attr_map.get('class') == 'pfs' and 'src' in attr_map:
                self.data['imgSrc'] = attr_map['src']
        elif tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_foot':
                    self.isSummary = False
                elif key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True

    def _handle_anchor(self, attrs):
        """Handle an <a> tag: mark a title link or finalize the current post."""
        # The flag lives at module level; without this declaration the
        # assignment below would only create a local variable.
        global isExist

        attr_map = dict(attrs)
        css_class = attr_map.get('class')
        if css_class == 'titlelnk':
            self.isTitleLink = True
            return
        if css_class != 'gray' or not self.isArticleView:
            return

        # The "gray" link inside the article_view span closes a post entry.
        self.isArticleView = False
        href = attr_map.get('href')
        if href is None:
            # No usable link: nothing to extract an id from.
            return

        self.data['readmoreLink'] = href
        self.isPostItem = False

        # Use the last run of digits in the URL: an earlier run may be
        # something else (e.g. a year in an archive-style path).
        digit_runs = re.findall(r'\d+', href)
        if not digit_runs:
            self.data = {}
            return

        post_id = int(digit_runs[-1])
        self.data['id'] = post_id
        if post_id <= self.id:
            # Already stored on an earlier run: drop it and signal the caller.
            self.data = {}
            isExist = True
            return

        # The misspelled 'srouce' key is kept as-is so that new documents
        # match the field name of documents written by earlier runs.
        self.data['srouce'] = "www.cnblogs.com"
        self.data['source_key'] = 'cnblogs'
        self.data['fetchTime'] = str(date.today())
        # A post without a captured title gets an empty keyword string
        # instead of raising KeyError.
        self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
        self.result.append(self.data)
        self.data = {}

    def getResult(self):
        """Return the list of post dicts collected so far."""
        return self.result
　　

　　

　　
if __name__ == "__main__":
    # Crawl the cnblogs home listing page by page and store every post that
    # is newer than the highest id saved by the previous run.
    #
    # NOTE(review): Connection / insert / update are legacy pymongo calls;
    # confirm the installed pymongo version still provides them.
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    watermark_saved = False
    # Plain ASCII punctuation: the header value must not contain the
    # full-width characters that had crept into this literal.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    reco = record.find_one({"type": 'cnblogs'})
    # Highest post id stored by the previous run. It must stay constant for
    # the whole crawl: filtering later pages against the newest id of this
    # run would discard every post on them.
    last_max_id = 0
    if reco:
        last_max_id = reco['maxId']
    while not isExist:
        try:
            req = urllib2.Request(url % count, headers=headers)
            response = urllib2.urlopen(req)
            try:
                data = response.read()
            finally:
                response.close()
            fj = FetchCnblog(last_max_id)
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                # Nothing new on this page: stop after this iteration.
                isExist = True
            else:
                # The first entry of the first page is the newest post.
                newest_id = None
                if not watermark_saved:
                    newest_id = int(result[0]['id'])
                # Insert oldest-first so stored order follows publish order.
                result.reverse()
                for doc in result:
                    fetchblog.insert(doc)
                # Persist the new watermark only after the documents are
                # stored, so a failed insert cannot skip posts next run.
                if newest_id is not None:
                    record.update({"type": 'cnblogs'},
                                  {"$set": {'maxId': newest_id}}, True, False)
                    watermark_saved = True
            print("page is %d" % count)
            count += 1
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % e)
        # Pause after every attempt, successful or not, so a failing page is
        # not re-requested in a tight loop.
        time.sleep(5)
　　

　　
程序如果在 Linux、Mac 下运行，可在 crontab -e 中设置定时任务；如果在 Windows 下运行，则自己在程序里加个定时器即可。

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

Red Hat RHCE 8 (EX294) Cert Guide

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

亿图图示专家(EDraw Max) V7.9 中文破解版

[经验分享] 运用python抓取博客园首页的所有数据，而且定时持续抓取新公布的内容存入mongodb中

浏览过的版块

扫码加入运维网微信交流群