dopost posted on 2015-12-2 14:40:42

Crawling all Zhihu Daily articles with Python + Scrapy

The item definition (items.py):

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ZhihudailyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()       # publication date of the article
    title = scrapy.Field()      # article title
    url = scrapy.Field()        # article URL
    content = scrapy.Field()    # article body text
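The post defines ZhihudailyItem but never persists it anywhere. As a hedged sketch that is not part of the original post, a minimal item pipeline like the one below could write each item to a JSON-lines file, assuming the spider actually yields items (see the variant after the spider); the file name zhihudaily.jl and the class name are made up for illustration.

# pipelines.py -- hypothetical addition, not in the original post
# -*- coding: utf-8 -*-
import codecs
import json


class JsonLinesPipeline(object):
    """Write every scraped item to zhihudaily.jl, one JSON object per line."""

    def open_spider(self, spider):
        self.file = codecs.open('zhihudaily.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

Enabling it would mean adding the class to ITEM_PIPELINES in settings.py (the exact module path depends on the project layout); Scrapy's built-in feed export via the -o command-line option is an equally valid, code-free alternative.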

  



The spider:

#!/usr/bin/python
# coding:utf-8
import scrapy


class ZhihudailySpider(scrapy.Spider):
    name = 'zhihudaily'
    # the list pages live on zhihudaily.ahorn.me, the articles on zhihu.com,
    # so both domains have to be allowed
    allowed_domains = ['zhihu.com', 'zhihudaily.ahorn.me']
    start_urls = ["http://zhihudaily.ahorn.me/page/1"]

    def parse(self, response):
        # each article entry sits inside a div.post block on the list page
        for sel in response.xpath("//div[@class='post']"):
            for sub in sel.xpath("./div/div"):
                url = sub.xpath("./a/@href").extract()[0]
                yield scrapy.Request(url, callback=self.parse_url)
        # walk the remaining list pages of the archive
        for page in range(2, 500):
            yield scrapy.Request("http://zhihudaily.ahorn.me/page/" + str(page),
                                 callback=self.parse)

    def parse_url(self, response):
        # print the article title and body paragraphs (Python 2 print statements)
        title = response.xpath("//h1[@class='headline-title']/text()").extract()
        print "Title:", "".join(title)
        print "*" * 72
        for p in response.xpath("//div[@class='content']/p/text()").extract():
            print p
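As written, parse_url only prints the results, so the ZhihudailyItem defined above is never filled. A hedged variant that yields items instead (so pipelines and feed exports can receive them) might look like the following; the import path zhihudaily.items is a guess at the project layout, and the date field is left unset because the original post never shows where it comes from.

#!/usr/bin/python
# coding:utf-8
# hypothetical variant -- the original post only prints the scraped fields
import scrapy
from zhihudaily.items import ZhihudailyItem   # import path is assumed


class ZhihudailySpider(scrapy.Spider):
    # ... name, allowed_domains, start_urls and parse() as above ...

    def parse_url(self, response):
        item = ZhihudailyItem()
        item['url'] = response.url
        item['title'] = "".join(
            response.xpath("//h1[@class='headline-title']/text()").extract())
        item['content'] = "\n".join(
            response.xpath("//div[@class='content']/p/text()").extract())
        # 'date' is declared on ZhihudailyItem, but the post never extracts
        # it, so it is not set here
        yield item

With that change the crawl could be started with scrapy crawl zhihudaily, optionally adding -o zhihudaily.json to dump the items through the built-in feed export.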
  