|
网页中的图片大多是通过 <img> 标签以相对路径引入的。例如，
通过正则匹配可以获取 image/bg.jpg，再与页面地址组合即可得到图片的完整地址。
除了直接引入的图片，还有通过 CSS、HTML 间接引入的图片，同样需要处理。
# -*- coding: utf-8 -*-
import urllib, httplib, urlparse
import sys
import re
def httpExists(url):
    """Return True if *url* answers an HTTP HEAD request with status 200.

    Redirect statuses (301/302/303/307) are followed recursively via the
    Location header.  An invalid port, any other status, or a network
    error returns False.
    """
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # explicit port in the netloc, e.g. "example.com:8080"
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # no port specified; httplib falls back to the default (80)
        port = None
    found = False
    connection = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:  # normal 'found' status
            found = True
        elif resp.status in (301, 302, 303, 307):
            # redirect: Location may be relative, so resolve against url.
            # (original only handled 302; permanent redirects were "not found")
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:  # everything else -> not found
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
    except Exception as e:
        print('%s %s %s' % (e.__class__, e, url))
    finally:
        # original leaked the socket; always close the connection
        if connection is not None:
            connection.close()
    return found
"""根据url获取文件名"""
def gGetFileName(url):
if url==None: return None
if url=="" : return ""
arr=url.split("/")
return arr[len(arr)-1]
"""根据url下载文件,文件名参数指定"""
def gDownloadWithFilename(url,savePath,file):
#参数检查,现忽略
try:
urlopen=urllib.URLopener()
fp = urlopen.open(url)
data = fp.read()
fp.close()
print 'download file url :',url
file=open(savePath + file,'w+b')
file.write(data)
file.close()
except IOError:
print "download error!"+ url
def gDownload(url, savePath):
    """Download *url* into *savePath*, deriving the file name from the URL."""
    gDownloadWithFilename(url, savePath, gGetFileName(url))
def getRexgList(lines, regx, searchRegx):
    """Collect the groups captured by *searchRegx* on lines matching *regx*.

    Both regexes are applied case-insensitively.  Results are
    de-duplicated in first-seen order.  Returns None when *lines* is
    None, otherwise a (possibly empty) list of captured strings.
    """
    if lines is None:
        return
    lists = []
    for line in lines:
        # cheap pre-filter: only run the capture regex on candidate lines
        if not re.search(regx, line, re.IGNORECASE):
            continue
        matchs = re.search(searchRegx, line, re.IGNORECASE)
        if matchs is not None:
            # original bound each group to 'str', shadowing the builtin
            for group in matchs.groups():
                if group not in lists:
                    lists.append(group)
    return lists
def checkLine(lines):
    """Print the url(...) reference captured on each line of *lines* (debug helper)."""
    for line in lines:
        # BUG FIX: the original called re.search(pattern, re.IGNORECASE),
        # omitting 'line' and passing the flag as the subject string,
        # which raised a TypeError on every call.
        matchs = re.search(r'url\((\S+)\)', line, re.IGNORECASE)
        if matchs is not None:
            print(matchs.groups())
def getPageLines(url):
    """Fetch *url* and return its body as a list of lines.

    Returns None when *url* is None, fails the httpExists() check, or
    the fetch itself raises.
    """
    if url is None:
        return
    if not httpExists(url):
        return
    page = None
    try:
        page = urllib.urlopen(url)
        return page.readlines()
    except IOError as e:
        # narrowed from a bare 'except' and now reports what failed
        print("getPageLines() error! %s : %s" % (e, url))
        return
    finally:
        # original leaked the handle when readlines() raised
        if page is not None:
            page.close()
def getCurrentPageImage(url, savePath):
    """Download the images referenced as src="images..." on the page at *url*.

    Relative src paths are resolved by prefixing *url*.
    """
    lines = getPageLines(url)
    if lines is None:
        # page could not be fetched; original crashed on len(None)
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if regxlists is None:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))
    for jpg in regxlists:
        # src paths are page-relative, so prefix the page URL
        gDownload(url + jpg, savePath)
def getCSSImages(link, savePath, url):
    """Download the images referenced via url(...) in the CSS file at *link*.

    Relative image paths are resolved against *url* (the page address).
    """
    lines = getPageLines(link)
    if lines is None:
        # CSS file could not be fetched; original crashed on len(None)
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'url\((\S+)\)', r'url\((\S+)\)')
    if regxlists is None:
        return
    # label fixed: the original printed 'getCurrentPageImage()' here
    print('getCSSImages() images.length %d' % len(regxlists))
    for jpg in regxlists:
        gDownload(url + jpg, savePath)
"""根据url获取其上的相关htm、html链接,返回list"""
def gGetHtmlLink(url):
#参数检查,现忽略
rtnList=[]
lines=getPageLines(url)
regx = r"""href="?(\S+)\.htm"""
for link in getRexgList(lines,regx,r'href="(\S+)"'):
link =url + link
if link not in rtnList:
rtnList.append(link)
print link
return rtnList
"""根据url获取其上的相关css链接,返回list"""
def gGetCSSLink(url):
#参数检查,现忽略
rtnList=[]
lines=getPageLines(url)
regx = r"""href="?(\S+)\.css"""
for link in getRexgList(lines,regx,r'href="(\S+)"'):
link = url + link
if link not in rtnList:
rtnList.append(link)
return rtnList
def getPageImage(url, savePath):
    """Download the images used by the page at *url* into *savePath*.

    Currently only images referenced from linked CSS files are fetched;
    the direct-HTML passes below are disabled.
    """
    # The original kept disabled code inside bare triple-quoted string
    # statements (still evaluated at runtime); converted to comments.
    # Disabled: images referenced directly by this page
    # getCurrentPageImage(url, savePath)
    # Disabled: images on linked .htm pages
    # links = gGetHtmlLink(url)
    # for link in links:
    #     getCurrentPageImage(link, savePath)
    links = gGetCSSLink(url)
    for link in links:
        print('get images on link: %s' % link)
        getCSSImages(link, savePath, url)
if __name__ == '__main__':
    # Demo entry point: scrape CSS-referenced images from one template
    # page into a local directory.
    url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    # destination directory; trailing slash required because
    # gDownloadWithFilename concatenates savePath + filename directly
    savePath = 'd:/tmp/'
    print 'download pic from [' + url +']'
    print 'save to [' +savePath+'] ...'
    getPageImage(url,savePath)
    print "download finished"
实际使用时，应根据目标 URL 的具体情况，分析并确定提取图片地址的方式。
|