sdfsdnfslk 发表于 2018-8-13 06:37:18

python爬虫——爬取古诗名句

#!/usr/bin/env python
# -*- coding: utf-8 -*-
  '''
  @Date    : 2017/12/21 12:35
  @Author: kaiqing.huang
  @File    : mingJuSpider.py
  '''
  from utils import MySpider, MongoBase
  from datetime import date
  from lxml import etree
  import sys
  class mingJuSpider():
      """Crawl the "mingju" listing pages of so.gushiwen.org and store each entry.

      Pages are fetched through ``MySpider`` and rows are persisted through
      ``MongoBase``; both come from the project's ``utils`` module and are
      treated as opaque here (presumably an HTTP client and a MongoDB wrapper --
      confirm against ``utils``).
      """

      # Listing URL template; ``{}`` is replaced by the page number.
      PAGE_URL = 'http://so.gushiwen.org/mingju/Default.aspx?p={}&c=&t='

      # XPath selecting one content container per listed entry.
      ROW_XPATH = '//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'

      def __init__(self):
          """Create the storage handle and the page fetcher."""
          self.db = MongoBase()
          self.spider = MySpider()

      def download(self, first_page=1, last_page=116):
          """Fetch every listing page in ``[first_page, last_page]`` and parse it.

          The defaults reproduce the original hard-coded range (pages 1..116).
          A page whose fetch returns a falsy value is skipped.

          :param first_page: number of the first page to fetch (inclusive).
          :param last_page: number of the last page to fetch (inclusive).
          """
          for page_id in range(first_page, last_page + 1):
              url = self.PAGE_URL.format(page_id)
              # Parenthesised form is valid on both Python 2 and Python 3.
              print(url)
              data = self.spider.get(url)
              if data:
                  self.parse(data)

      def parse(self, data):
          """Extract every entry from one listing page and store it.

          For each matched row, the text of its direct ``<a>`` children is
          collected. The whole list is stored as ``content`` and its last item
          as ``origin``, together with today's date as ``createTime``.

          :param data: HTML source of one listing page.
          """
          response = etree.HTML(data)
          if response is None:
              # lxml can yield no root element when the input holds no markup.
              return

          created = str(date.today())
          for row in response.xpath(self.ROW_XPATH):
              texts = row.xpath('a/text()')
              if not texts:
                  # A row without link text would make texts[-1] raise
                  # IndexError and abort the whole crawl; skip it instead.
                  continue
              # NOTE(review): 'content' keeps every link text, including the
              # one also stored as 'origin'. Kept as in the original so the
              # stored document shape does not change -- confirm whether only
              # the first item was intended.
              self.db.add_new_row(
                  'mingJuSpider',
                  {'content': texts, 'origin': texts[-1], 'createTime': created},
              )
  if __name__ == '__main__':
      # Script entry point: raise the interpreter's recursion ceiling, then run
      # the full crawl.
      # NOTE(review): nothing visible in this file recurses; the raised limit
      # is presumably needed by code in ``utils`` -- confirm before removing.
      sys.setrecursionlimit(100000)
      mingJuSpider().download()
页: [1]
查看完整版本: python爬虫——爬取古诗名句