zhaolu 发表于 2018-8-13 11:05:27

python网页爬虫浅析

  
def getImg2(html, initialFile, finalFile):
    """Extract HTTPS ``.jpg`` URLs from a page and write them to a file.

    The text of ``html`` is split on double quotes and every fragment is
    written to ``initialFile``, each preceded by a newline. That file is
    then read back line by line, and every line that starts with
    ``https`` and contains ``jpg`` later on the same line is copied to
    ``finalFile``.

    Args:
        html: Page content. It is converted with ``str()``, so a ``bytes``
            object is processed as its ``repr`` text rather than decoded.
        initialFile: Path of the intermediate file holding all fragments.
            It is overwritten.
        finalFile: Path of the output file holding the matching lines.
            It is overwritten.

    Returns:
        None. The results are the two files written to disk.
    """
    # Use '"+' instead of '"*': a pattern that can match the empty string
    # makes re.split (Python 3.7+) break the text between every character.
    quote_pattern = re.compile(r'"+')
    # Lines that start with "https" and contain "jpg" further on.
    url_pattern = re.compile(r"^https.*jpg")

    fragments = quote_pattern.split(str(html))

    # The with-block guarantees the fragments are flushed and the file is
    # closed before it is reopened for reading below.
    with open(initialFile, mode='w') as fragment_file:
        for fragment in fragments:
            fragment_file.write("\n")
            fragment_file.write(fragment)

    with open(initialFile, mode='r') as fragment_file, \
            open(finalFile, mode='w') as url_file:
        for line in fragment_file:
            if url_pattern.match(line) is not None:
                # The line keeps its own trailing newline, if any.
                url_file.write(line)
页: [1]
查看完整版本: python网页爬虫浅析