Python学习（2）

上帝大脸 发表于 2018-8-12 13:58:41

　　爬取网页的部分链接
　　#!/usr/bin/python
　　# -*- coding: utf-8 -*-
　　from urllib.request import urlopen
　　from bs4 import BeautifulSoup
　　import re
　　import random
　　# Link paths that getlink has already printed; shared across calls so each link is handled only once.
　　pages = set()
　　def getlink(pageurl):
　　global pages
　　html = urlopen('http://www.ftchinese.com' + pageurl)
　　bs_data = BeautifulSoup(html,'lxml')
　　#from ipdb import set_trace
　　#set_trace()
　　for link in bs_data.find_all('a',href = re.compile("^(/m/)")):
　　if 'href' in link.attrs:
　　if link.attrs['href'] not in pages:
　　#我们遇到了新页面
　　newpage = link.attrs['href']
　　print(newpage)
　　pages.add(newpage)
　　getlink(newpage)
　　# Start the crawl at the site root: getlink appends this empty path to the base URL.
　　getlink("")

页: [1]

运维网's Archiver

Python学习（2）