python 抓取美女图片

zhangpengfei00 发表于 2018-8-8 06:33:20

# -*- coding:utf8 -*-　　
# __author__ = 'jony'
　　
from bs4 import BeautifulSoup
　　
import os, sys, urllib2,time,random
　　
import re
　　

　　
def GetUrl():
    """Collect the gallery page links from the listing page.

    Downloads the hard-coded listing URL, parses it with BeautifulSoup and
    gathers the ``href`` of every ``<a class="MMPic">`` that sits inside a
    ``<div class="MeinvTuPianBox">``.

    Returns:
        list: the ``href`` values in document order. Anchors without an
        ``href`` attribute are skipped, so the list never contains ``None``.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the listing page cannot be
        fetched within 10 seconds; nothing is caught here.
    """
    url = 'http://www.27270.com/ent/meinvtupian/'
    header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}

    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request, None, timeout=10)
    try:
        html = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    # The links could not be extracted with a single-line regular
    # expression, so the page is parsed with BeautifulSoup instead.
    # gb18030 is requested explicitly because decoding otherwise warned:
    # "Some characters could not be decoded, and were replaced with
    # REPLACEMENT CHARACTER."
    soup = BeautifulSoup(html, "html.parser", from_encoding="gb18030")

    urls = []
    # First narrow down to the MeinvTuPianBox <div> containers ...
    for box in soup.find_all('div', attrs={'class': 'MeinvTuPianBox'}):
        # ... then to the MMPic anchors inside them (an <a>, not a <div>).
        for anchor in box.find_all('a', attrs={'class': 'MMPic'}):
            href = anchor.get('href')
            if href:
                urls.append(href)
    return urls
　　
def _read_url(url, header, timeout=10):
    """Return the raw body of *url*, always closing the connection."""
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request, None, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()


def _save_page_image(page_url, header, pattern):
    """Save the image embedded in *page_url* under PICTURE/; return True on success."""
    try:
        match = pattern.search(_read_url(page_url, header))
        if match is None:
            # The page has no <img> tag of the expected shape.
            return False
        girlink = match.group(1)
        print(girlink)
        data = _read_url(girlink, header)
        # File name: current HHMMSS plus four random digits.
        filename = u'PICTURE/%s%d.jpg' % (time.strftime('%H%M%S'),
                                          random.randint(1000, 9999))
        with open(filename, 'wb') as code:
            code.write(data)
    except Exception as err:
        # Best effort: one broken page must not stop the whole run, but
        # Ctrl-C / SystemExit are no longer swallowed and the failure is
        # reported instead of being silently dropped.
        print('skipped %s: %s' % (page_url, err))
        return False
    return True


def GetImage(*urls):
    """Download the pictures of each gallery in *urls* into ``PICTURE/``.

    For every gallery URL the first page is fetched and its image saved;
    then the follow-up pages ``<base>_2.html`` .. ``<base>_14.html`` are
    tried, where ``<base>`` is the gallery URL without its ``.html``
    suffix (e.g. ``.../2016/156239_20.html`` is picture 20 of gallery
    ``156239``). Every page is best-effort: a failure is printed and the
    page is skipped. If the first page of a gallery fails, the rest of
    that gallery is skipped as well.

    Args:
        *urls: gallery page URLs, as returned by ``GetUrl``.

    Returns:
        None. The ``PICTURE`` directory must already exist.
    """
    header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
    # NOTE(review): both ".*" are greedy, so on a line holding several
    # <img> tags this captures more than one src -- confirm against the
    # live markup before tightening it.
    pattern = re.compile(r'<img alt=".*" src="(.*)" />')

    for url in urls:
        print(url)

        # Picture on the gallery's initial page.
        if not _save_page_image(url, header, pattern):
            continue

        # str.split returns a list; only the part before ".html" is the
        # base of the follow-up page URLs.
        orignurl = url.split('.html')[0]
        for i in range(2, 15):
            picurl = '%s_%s.html' % (orignurl, i)
            _save_page_image(picurl, header, pattern)
　　
if __name__ == '__main__':
    # Pictures are stored in a PICTURE directory below the current working
    # directory; create it on first run.
    picture_dir = os.path.join(os.getcwd(), u'PICTURE')
    if not os.path.isdir(picture_dir):
        os.mkdir(picture_dir)

    # Collect the gallery links, then download the pictures of each one.
    GetImage(*GetUrl())

页: [1]

运维网's Archiver

python 抓取美女图片