Python crawler example, 2017-3-14
# _*_ coding:utf-8 _*_
from bs4 import BeautifulSoup
import urllib2
# 2017-3-14: crawl the <python中文社区高级教程> pages (the pythontab advanced tutorials) and save each article as a .txt file, using urllib2 and BeautifulSoup 4
# Techniques covered (standalone sketches of points 2 and 3 follow the script):
#1. for-in loops, the list data type, slicing
#2. BeautifulSoup usage: finding content with CSS selectors via select()
#3. file operations: file naming and write modes
url = 'http://www.pythontab.com/html/pythonhexinbiancheng/index.html'  # advanced-tutorial index page
url_list = [url]  # collect the page links in a list, starting with the index page
for i in range(2, 20):
    # pages 2-19; the pagination pattern <base>/N.html is an assumption about the site
    url_list.append('http://www.pythontab.com/html/pythonhexinbiancheng/%d.html' % i)
#print(url_list[-1])
source_list = []
for j in url_list:
    request = urllib2.urlopen(j)  # open the listing page
    html = request.read()  # read the raw HTML source
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.select('#catlist > li > a')  # find the article titles
    #print titles
    links = soup.select('#catlist > li > a')
    #print links
    for title, link in zip(titles, links):
        data = {
            "title": title.get_text(),  # title text
            "link": link.get('href')  # read the href attribute of the <a> tag directly, e.g. href="http://www.pythontab.com/html/2017/pythonhexinbiancheng_0228/1120.html"
        }
        source_list.append(data)
#print source_list
for l in source_list:
    request = urllib2.urlopen(l['link'])  # fetch the article page via the stored href
    html = request.read()
    #print html
    soup = BeautifulSoup(html, 'html.parser')  # build a BeautifulSoup object for the article page
    text_p = soup.select('div.content')  # locate the article body
    #print text_p
    text = []
    for t in text_p:
        text.append(t.get_text().encode('utf-8'))
    #print text
    title_text = l['title']  # use the article title as the file name
    # strip characters that are illegal or awkward in file names:
    #title_text = title_text.replace('*', '').replace('/', 'or').replace('"', ' ').replace('?', 'wenhao').replace(':', ' ')
    # open(path + file name, mode (read/write))
    with open('study/%s.txt' % title_text, 'wb') as f:
        for a in text:
            f.write(a)
    print title_text
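
To make point 2 concrete, here is a minimal, self-contained sketch of the same select() / zip() pattern the script uses, run against a small hand-written HTML fragment that only mimics the #catlist structure (the fragment and its URLs are invented for illustration, not taken from the real site):

# _*_ coding:utf-8 _*_
from bs4 import BeautifulSoup

# Tiny stand-in for a listing page; the real #catlist markup may differ.
sample_html = '''
<ul id="catlist">
  <li><a href="http://www.pythontab.com/html/2017/a.html">Article A</a></li>
  <li><a href="http://www.pythontab.com/html/2017/b.html">Article B</a></li>
</ul>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
titles = soup.select('#catlist > li > a')  # CSS selector: <a> tags directly under the list items
links = soup.select('#catlist > li > a')   # same nodes; kept separate to mirror the script

for title, link in zip(titles, links):
    data = {
        "title": title.get_text(),  # visible link text
        "link": link.get('href')    # value of the href attribute
    }
    print(data)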
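
For point 3 and the commented-out replace() chain, here is a minimal sketch of turning a title into a safe file name and writing UTF-8 text with the same 'wb' mode; clean_filename and the example title are hypothetical names added for illustration, not part of the original script:

# _*_ coding:utf-8 _*_
import os

def clean_filename(title):
    # Replace characters that are illegal or awkward in file names,
    # mirroring the commented-out replace() chain in the script.
    for bad, good in (('*', ''), ('/', 'or'), ('"', ' '), ('?', 'wenhao'), (':', ' ')):
        title = title.replace(bad, good)
    return title

title_text = u'示例标题: what/why?'  # example title containing characters to strip
filename = 'study/%s.txt' % clean_filename(title_text)

if not os.path.isdir('study'):  # the script assumes the study/ folder already exists
    os.mkdir('study')

# 'wb' writes raw bytes, so the text is encoded to UTF-8 first (as the script does).
with open(filename, 'wb') as f:
    f.write(u'article body goes here'.encode('utf-8'))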