1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#encoding=utf-8
#author: walker
#date: 2014-11-26
#function: use BeautifulSoup to fetch URLs and their content
import sys, re, requests, urllib
from bs4 import BeautifulSoup
# Python 2 idiom: reload the sys module, then set the interpreter-wide default
# string encoding to UTF-8.
# NOTE(review): GetList hands a unicode keyword to urllib.urlencode, which
# presumably relies on this default encoding -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf8')
# Given a keyword, fetch the Baidu search results for it and print them.
def GetList(keyword):
    """Search Baidu for ``keyword`` and print each result's text and link.

    Args:
        keyword: The search term. A byte string is decoded as GB18030
            (the existing contract of this function); a unicode string is
            used as-is.

    Returns:
        None. The URL-encoded query is printed first; then, for every
        ``<a>`` inside the ``content_left`` container whose ``href`` matches
        ``baidu.com``, its text (encoded as GB18030) and its ``href`` are
        printed. Nothing more is printed when the page has no
        ``content_left`` container.
    """
    if isinstance(keyword, bytes):
        keyword = keyword.decode('gb18030')

    # Encode explicitly so urlencode never has to coerce unicode through the
    # interpreter's default encoding.
    urlwd = urllib.urlencode({'wd': keyword.encode('utf-8')})
    print(urlwd)

    url = (
        'http://www.baidu.com/s?ie=utf-8&csq=1&pstg=22&mod=2&isbd=1&cqid=9c0f47b700036f17&istc=8560&ver=0ApvSgUI_ODaje7cp4DVye9X2LZqWiCPEIS&chk=54753dd5&isid=BD651248E4C31919&'
        + urlwd
        + '&ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&rsv_pq=b05765d70003b6c0&rsv_t=ce54Z5LOdER%2Fagxs%2FORKVsCT6cE0zvMTaYpqpgprhExMhsqDACiVefXOze4&_ck=145469.1.129.57.22.735.37'
    )

    # Bound the wait so a stalled server cannot hang the script, and release
    # the session's connections once the response has been read.
    with requests.Session() as sn:
        r = sn.get(url=url, timeout=10)

    # Parse the raw bytes: r.text is likely to garble the Chinese text.
    # Naming the parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    # The results container is absent on error/verification pages; without
    # this guard the chained find_all would raise AttributeError on None.
    container = soup.find('div', id='content_left')
    if container is None:
        return

    # Escape the dot so only a literal "baidu.com" matches.
    links = container.find_all(name='a', href=re.compile(r'baidu\.com'))
    for item in links:
        print(item.getText().encode('gb18030'))
        print(item['href'])
if __name__ == '__main__':
    # GetList decodes byte-string keywords as GB18030, but a plain string
    # literal in this UTF-8 source file holds UTF-8 bytes, which do not
    # decode as GB18030. Build the keyword as unicode and hand over
    # GB18030 bytes explicitly.
    keyword = u'正则表达式'.encode('gb18030')
    GetList(keyword)
|