Python crawler example, 2017-3-14
# _*_ coding:utf-8 _*_
from bs4 import BeautifulSoup
import urllib2
# 2017-3-14: crawl the <python中文社区高级教程> pages (the pythontab advanced tutorials) and save each article as a .txt file, using urllib2 and BeautifulSoup 4
# Techniques covered (standalone sketches of points 2 and 3 follow the script):
#1. for-in loops, the list data type, slicing
#2. BeautifulSoup usage: finding content with CSS selectors via select()
#3. file operations: file naming and write modes
url = 'http://www.pythontab.com/html/pythonhexinbiancheng/index.html'  # advanced-tutorial index page
url_list = [url]  # collect the page links in a list, starting with the index page
for i in range(2, 20):
    # pages 2-19; the pagination pattern <base>/N.html is an assumption about the site
    url_list.append('http://www.pythontab.com/html/pythonhexinbiancheng/%d.html' % i)
#print(url_list[-1])
source_list = []
for j in url_list:
    request = urllib2.urlopen(j)  # open the listing page
    html = request.read()  # read the raw HTML source
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.select('#catlist > li > a')  # find the article titles
    #print titles
    links = soup.select('#catlist > li > a')
    #print links
    for title, link in zip(titles, links):
        data = {
            "title": title.get_text(),  # title text
            "link": link.get('href')  # read the href attribute of the <a> tag directly, e.g. href="http://www.pythontab.com/html/2017/pythonhexinbiancheng_0228/1120.html"
        }
        source_list.append(data)
#print source_list
for l in source_list:
    request = urllib2.urlopen(l['link'])  # fetch the article page via the stored href
    html = request.read()
    #print html
    soup = BeautifulSoup(html, 'html.parser')  # build a BeautifulSoup object for the article page
    text_p = soup.select('div.content')  # locate the article body
    #print text_p
    text = []
    for t in text_p:
        text.append(t.get_text().encode('utf-8'))
    #print text
    title_text = l['title']  # use the article title as the file name
    # strip characters that are illegal or awkward in file names:
    #title_text = title_text.replace('*', '').replace('/', 'or').replace('"', ' ').replace('?', 'wenhao').replace(':', ' ')
    # open(path + file name, mode (read/write))
    with open('study/%s.txt' % title_text, 'wb') as f:
        for a in text:
            f.write(a)
    print title_text
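
To make point 2 concrete, here is a minimal, self-contained sketch of the same select() / zip() pattern the script uses, run against a small hand-written HTML fragment that only mimics the #catlist structure (the fragment and its URLs are invented for illustration, not taken from the real site):

# _*_ coding:utf-8 _*_
from bs4 import BeautifulSoup

# Tiny stand-in for a listing page; the real #catlist markup may differ.
sample_html = '''
<ul id="catlist">
  <li><a href="http://www.pythontab.com/html/2017/a.html">Article A</a></li>
  <li><a href="http://www.pythontab.com/html/2017/b.html">Article B</a></li>
</ul>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
titles = soup.select('#catlist > li > a')  # CSS selector: <a> tags directly under the list items
links = soup.select('#catlist > li > a')   # same nodes; kept separate to mirror the script

for title, link in zip(titles, links):
    data = {
        "title": title.get_text(),  # visible link text
        "link": link.get('href')    # value of the href attribute
    }
    print(data)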
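
For point 3 and the commented-out replace() chain, here is a minimal sketch of turning a title into a safe file name and writing UTF-8 text with the same 'wb' mode; clean_filename and the example title are hypothetical names added for illustration, not part of the original script:

# _*_ coding:utf-8 _*_
import os

def clean_filename(title):
    # Replace characters that are illegal or awkward in file names,
    # mirroring the commented-out replace() chain in the script.
    for bad, good in (('*', ''), ('/', 'or'), ('"', ' '), ('?', 'wenhao'), (':', ' ')):
        title = title.replace(bad, good)
    return title

title_text = u'示例标题: what/why?'  # example title containing characters to strip
filename = 'study/%s.txt' % clean_filename(title_text)

if not os.path.isdir('study'):  # the script assumes the study/ folder already exists
    os.mkdir('study')

# 'wb' writes raw bytes, so the text is encoded to UTF-8 first (as the script does).
with open(filename, 'wb') as f:
    f.write(u'article body goes here'.encode('utf-8'))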