|
#!/usr/bin/python
#coding:utf-8
import scrapy
from scrapy.selector import Selector
import os
import requests
# Seconds to wait for an avatar download before giving up.
_IMAGE_TIMEOUT = 10


def _to_native(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` value is encoded to UTF-8 bytes, which is what
    the byte-string templates and ``os.getcwd()`` paths below expect.  On
    Python 3 the text is already ``str`` and is returned unchanged.
    """
    if isinstance(text, str):
        return text
    return text.encode('utf-8')


def _safe_filename(name):
    """Replace path separators in *name* so it stays inside the image dir."""
    for separator in ('/', '\\'):
        name = name.replace(separator, '_')
    return name


def _download_image(url, path):
    """Fetch *url* and write the response body to *path*.

    The parent directory of *path* is created when it does not exist yet.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
        EnvironmentError: when the directory or file cannot be written.
    """
    reply = requests.get(url, timeout=_IMAGE_TIMEOUT)
    # Do not store an HTTP error page under a ``.png`` name.
    reply.raise_for_status()
    directory = os.path.dirname(path)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(path, 'wb') as image_file:
        image_file.write(reply.content)


class NextSpider(scrapy.spiders.Spider):
    """Print every comment of the start page and save the author's avatar.

    Avatars are written to ``<cwd>/imgs/<user name>.png``.
    """

    name = 'nextspider'
    start_urls = ["http://group.jobbole.com/27740/#comm-77724"]

    def parse(self, response):
        """Walk the ``cmnt-list`` entries of *response*.

        For each ``<li>`` that has an avatar URL, a user name and message
        text, the user name and message are printed and the avatar is
        downloaded.  A failure while handling one comment is reported and
        does not stop the remaining comments from being processed.
        """
        img_dir = 'imgs'
        comments = Selector(response=response).xpath(
            '//ul[@class="cmnt-list"]/li')
        # Iterate the <li> nodes themselves and query relative to each one.
        # XPath positions are 1-based, so indexing with range(len(...))
        # would query a non-existent li[0] and never reach the last entry.
        for comment in comments:
            srcs = comment.xpath(
                './/div[@class="cmnt-header"]/a/img/@src').extract()
            names = comment.xpath(
                './/div[@class="cmnt-header"]/div/span[1]/a/text()').extract()
            msgs = comment.xpath(
                './/div[@class="cmnt-body"]/p/text()').extract()
            if not (srcs and names and msgs):
                continue
            try:
                # Resolve relative / protocol-relative avatar URLs.
                img_url = response.urljoin(srcs[0])
                filename = _to_native(names[0])
                # The first text node is skipped, as in the original code.
                # NOTE(review): presumably it is not part of the message
                # body -- confirm against the page markup.
                msg = ','.join([_to_native(part) for part in msgs[1:]])
                print('用户ID: {}\n发表信息: {}'.format(filename, msg))
                path = os.path.join(
                    os.getcwd(), img_dir, _safe_filename(filename) + '.png')
                # NOTE(review): this is a blocking download inside a Scrapy
                # callback; an images pipeline would avoid stalling the
                # crawler.
                _download_image(img_url, path)
            except Exception as e:
                # Deliberately broad: one bad comment must not abort the
                # rest of the page.
                print('错误: {}'.format(e))
|
|