def unzip(data):
    """Decompress a gzip stream held in memory and return its payload.

    Args:
        data: The complete gzip-compressed content as a byte string.

    Returns:
        The decompressed content as a byte string.
    """
    import gzip
    import io

    # io.BytesIO is available on Python 2.6+ and Python 3, whereas the
    # StringIO module only exists on Python 2.  The with-block closes the
    # GzipFile even if read() raises.
    with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
        return gz.read()
其它的压缩方式暂不讨论。
三、一个很丑陋的demo
# -*- coding: utf-8 -*-
'''
Created on 2015年1月28日
@author: zhang
'''
from bs4 import BeautifulSoup
# Accumulates matches: get_target() stores link text here, keyed by href.
result = dict()
# Keyword that a link's text must contain to be recorded.
key_word = u'李克强'
def unzip(data):
    """Decompress a gzip stream held in memory and return its payload.

    Args:
        data: The complete gzip-compressed content as a byte string.

    Returns:
        The decompressed content as a byte string.
    """
    import gzip
    import io

    # io.BytesIO is available on Python 2.6+ and Python 3, whereas the
    # StringIO module only exists on Python 2.  The with-block closes the
    # GzipFile even if read() raises.
    with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
        return gz.read()
def init_bs(url, encoding):
    """Download *url* and parse the page into a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the response body;
            bytes that are invalid in that encoding are dropped.

    Returns:
        A ``BeautifulSoup`` object built from the decoded HTML.  If the
        server reports a ``Content-Encoding`` other than gzip/identity,
        the document is built from an empty string.
    """
    from contextlib import closing
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    html_doc = ''
    # closing() releases the connection even when reading or decoding fails;
    # the Python 2 urllib2 response object is not a context manager itself.
    with closing(urlopen(url)) as response:
        coding = response.info().get('Content-Encoding') or 'identity'
        # Content-coding names are case-insensitive, so normalise first.
        coding = coding.strip().lower()
        if coding in ('gzip', 'x-gzip'):
            html_doc = unzip(response.read()).decode(encoding, 'ignore')
        elif coding == 'identity':
            # "identity" means the body was sent without any compression.
            html_doc = response.read().decode(encoding, 'ignore')
        # Any other coding (e.g. deflate) is not handled here: the page is
        # deliberately left empty instead of decoding compressed bytes.
    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    return BeautifulSoup(html_doc)
def get_target(soup):
    """Record every link in *soup* whose text contains ``key_word``.

    Matches are written to the module-level ``result`` dict, keyed by the
    link's ``href`` with the link text as the value.  A later match with
    the same ``href`` replaces the earlier entry.

    Args:
        soup: Parsed document offering ``find_all('a')``; every element it
            returns must provide ``get_text()`` and ``get('href')``.
    """
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            # An anchor without href has no target to report; storing it
            # would pile all such matches onto the single key None.
            continue
        text = link.get_text()
        if key_word in text:
            result[href] = text
# Pages to scan, keyed by the name of the character encoding they are listed
# under.  Each URL appears once so that no page is downloaded twice.
di = {
    'gb2312': [
        'http://www.sina.com.cn',
        'http://www.people.com.cn/',
        'http://www.163.com/',
        'http://www.qq.com/',
    ],
    'gbk': ['http://www.sohu.com'],
    'utf-8': ['http://www.huanqiu.com/', 'http://www.xinhuanet.com/'],
}
# Fetch every page listed in di and collect its matching links.  The dict key
# k is the encoding name, so it is passed to init_bs() to decode each page
# with the encoding it is listed under.
for k, v in di.items():  # items() is available on both Python 2 and 3
    for url in v:
        soup = init_bs(url, k)
        get_target(soup)