上帝大脸 发表于 2018-8-12 13:58:41

Python学习(2)

  爬取网页的部分链接
  #!/usr/bin/python
  # -*- coding: utf-8 -*-
  from urllib.request import urlopen
  from bs4 import BeautifulSoup
  import re
  import random
  # Hrefs that getlink() has already discovered; it checks and updates this
  # set so that each matching link is printed and followed only once.
  pages = set()
  def getlink(pageurl):
      """Crawl www.ftchinese.com depth-first, starting at *pageurl*.

      Every ``<a>`` whose ``href`` starts with ``/m/`` and has not been seen
      before is printed, recorded in the module-level ``pages`` set, and then
      crawled in turn before the remaining links of the current page are
      examined.

      Args:
          pageurl: Path appended to ``http://www.ftchinese.com`` to form the
              URL of the first page to fetch (``""`` for the site root).

      Raises:
          Whatever ``urlopen`` raises for a page that cannot be fetched; such
          errors are not caught here and abort the crawl.
      """
      link_pattern = re.compile("^(/m/)")

      def fetch_links(path):
          # Close the HTTP response as soon as the document has been parsed.
          with urlopen('http://www.ftchinese.com' + path) as response:
              bs_data = BeautifulSoup(response, 'lxml')
          return iter(bs_data.find_all('a', href=link_pattern))

      # An explicit stack of per-page link iterators replaces recursion, so
      # the crawl depth is not capped by the interpreter's recursion limit.
      # The visit order is the same as the recursive depth-first traversal.
      pending = [fetch_links(pageurl)]
      while pending:
          link = next(pending[-1], None)
          if link is None:
              # Current page exhausted: resume its parent page.
              pending.pop()
              continue
          newpage = link.attrs.get('href')
          if newpage is None or newpage in pages:
              continue
          # We have found a new page.
          print(newpage)
          pages.add(newpage)
          pending.append(fetch_links(newpage))
  # Start the crawl with an empty path, i.e. at the base URL that getlink()
  # prepends to every page path.
  getlink("")
页: [1]
查看完整版本: Python学习(2)