wss1051 发表于 2015-4-24 05:47:14

百度空间相册下载器 python实现 by Gods_巨蚁(原创)

  最近学习python中,感觉python确实挺好用
  昨晚加今天实现了一个 百度空间 相册下载器
  下面开放源代码,作者:Gods_巨蚁,转载注明出处
  新QQ:1443561883
  


#coding: UTF-8
import urllib, re, os
__metaclass__ = type  # Python 2: make all classes defined in this module new-style
class AntAlbumDownload:
    """Download every photo of a Baidu Space (hi.baidu.com) photo album.

    Typical use::

        album = AntAlbumDownload()
        album.setAttr(name, url)   # album directory name + album URL
        album.download(0)          # 0 = all pages

    Photos are saved as <name>/0000.jpg, <name>/0001.jpg, ...
    """

    # Sample of the JS fragment embedded in an album index page that
    # patPage extracts from (newlines are collapsed before matching):
    #   imgarr={purl:"/zhongji/album/item/8177718de7d67312b21bba72.html",
    #           psrc:"http://hiphotos.baidu.com/zhongji/abpic/item/....jpg",
    #           psize:"300*200 61K", pcmtNum:0, pname:"移动.gif",
    #           pedit:'', pid:"8177718de7d67312b21bba72",
    #           isMobileUp:0, isLocked:0};
    # Groups: 1 = photo-page URL (site relative), 2 = thumbnail URL,
    #         3 = photo name, 4 = photo id.
    patPage = re.compile(
        r'''
        imgarr\={purl:"(.*?)",
        .*?
        psrc:"(.*?)",
        .*?
        pname:"(.*?)"
        .*?
        pid:"(.*?)"
        .*?};
        ''',
        re.VERBOSE
    )
    # Quick sanity-check pattern for an album index page (kept for debugging).
    pat = re.compile(r'imgarr\={purl:".*?",.*?psrc:".*?",.*?')

    # Sample of the "var Session = {...}" JS fragment in a single-photo page:
    #   var Session = { spaceURL: "/zhongji", ..., userName: "饥饿蚂蚁", ...,
    #                   photoDomain: 'http://hiphotos.baidu.com', ... };
    # Groups: 1 = space URL path, 2 = space owner's user name,
    #         3 = photo host domain.
    patImage = re.compile(
        r'''
        var\ Session\ =\ {
        .*?
        spaceURL:\ "(.*?)",
        .*?
        userName:\ "(.*?)",# 空间主人用户名
        .*?
        photoDomain:\ '(.*?)',
        .*?};
        ''',
        re.VERBOSE
    )
    # Quick sanity-check pattern for a single-photo page (kept for debugging).
    pat2 = re.compile(r'''
        var\ Session\ =\ {
        .*?
        spaceURL.*?
        userName:.*?
        photoDomain:
        ''',
        re.VERBOSE
    )

    def __init__(self):
        pass

    def _getPageText(self, url):
        """Fetch *url* and return its body with all newlines replaced by spaces.

        Collapsing newlines is essential: the JS fragments span several
        lines, and the patterns above are written without re.DOTALL.
        """
        page = urllib.urlopen(url)
        try:
            text = page.read()
        finally:
            # Always release the connection, even if read() fails.
            page.close()
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        return text

    def setAttr(self, name='', url=''):
        """Set the album directory name and album URL, prompting when empty.

        Also normalises the URL to the first index page and creates the
        target directory.
        """
        if not name:
            self.nameAlbum = raw_input('I will create the album directory, Input the name:')
        else:
            self.nameAlbum = name
        if not url:
            self.urlAlbum = raw_input('Input the URL of the album:')
        else:
            self.urlAlbum = url
        # Counter used only for the per-photo progress display.
        self.countAnalysisPage = 0
        # Baidu Space quirk: some album URLs carry the real path after a
        # '#' fragment; extract that part and rebase it on the site root.
        # BUG FIX: the original prepended the host to the WHOLE url and
        # ignored `loc`; take only the part after '#', as the comment said.
        loc = self.urlAlbum.find('#')
        if loc != -1:
            self.urlAlbum = 'http://hi.baidu.com' + self.urlAlbum[loc + 1:]
        # Rewrite the URL so it always points at page 0 (/index/0).
        self.urlAlbum = self._getIndexPageUrl(0)
        print('%s %s' % ('解析到相册首页URL为: ', self.urlAlbum))
        # BUG FIX: replaces the Windows-only `os.system('md ' + name)`,
        # which was also vulnerable to shell injection via the album name.
        if not os.path.isdir(self.nameAlbum):
            os.makedirs(self.nameAlbum)

    def analysisImagePage(self, url, imageId):
        """Analyze one photo page and return the full-size image URL.

        The original photo lives at photoDomain + spaceURL + /pic/item/<id>.jpg.
        """
        text = self._getPageText(url)
        print('分析图片页 当前页:%3d' % self.countAnalysisPage)
        self.countAnalysisPage += 1
        urlImage = self.patImage.search(text)
        return urlImage.group(3) + urlImage.group(1) + '/pic/item/' + imageId + '.jpg'

    def _getIndexPageUrl(self, iPage):
        """Return the album URL rewritten to point at page number *iPage*.

        Works whether or not self.urlAlbum already ends in /index/<n>.
        """
        # BUG FIX: the original pattern was r'(.*?)/index/(+)' — "(+)" is
        # not a valid regular expression ("nothing to repeat") and raised
        # re.error on every call; "(\d+)" was clearly intended.
        patIndexPage = re.compile(r'(.*?)/index/(\d+)')
        urlIndexPage = patIndexPage.search(self.urlAlbum)
        if urlIndexPage:
            return urlIndexPage.group(1) + '/index/' + str(iPage)
        else:
            return self.urlAlbum + '/index/' + str(iPage)

    def analysis(self, countPage):
        """Analyze album index pages and download every photo found.

        countPage -- number of album pages to analyze; 0 means all pages.
        """
        print('开始分析页面')
        # Full-size image URLs collected from every analyzed page.
        images = []
        # Highest album page index (0-based).
        maxPage = 0
        # Read page 0 of the album.
        textPage = self._getPageText(self.urlAlbum)
        # The "[尾页]" (last page) link carries the highest page index in
        # its "/index/<n>" href.
        # BUG FIX: the original pattern matched only the literal "[尾页]"
        # with NO capture group, so the group(1) call below raised
        # IndexError whenever the match succeeded.  Capture the page
        # number from the preceding href instead.
        # NOTE(review): exact HTML of the link is assumed — confirm
        # against a saved album page.
        patLastPage = re.compile(r'/index/(\d+)[^<]*?\[尾页\]')
        urlLastPage = patLastPage.search(textPage)
        if urlLastPage:
            print('尾页匹配成功')
            maxPage = int(urlLastPage.group(1))
        else:
            print('尾页匹配失败')
            maxPage = 0

        # 0 means "all pages"; also clamp a too-large request to the
        # real page count.
        if countPage == 0 or countPage > maxPage + 1:
            countPage = maxPage + 1
        for iPage in xrange(countPage):
            print('分析相册 当前页:%3d' % iPage)
            if iPage != 0:
                # Page 0 was already fetched above; fetch the others here.
                urlIndexPage = self._getIndexPageUrl(iPage)
                textPage = self._getPageText(urlIndexPage)
            for imagePage, imageSmall, imageName, imageId in self.patPage.findall(textPage):
                # imagePage is site-relative; build the photo-page URL,
                # then resolve it to the original-size image URL.
                urlImagePage = 'http://hi.baidu.com' + imagePage
                urlImage = self.analysisImagePage(urlImagePage, imageId)
                images.append(urlImage)
        print('分析完成,开始下载')
        # Renamed from "max" so the builtin is not shadowed.
        total = len(images)
        for index, image in enumerate(images):
            # BUG FIX: os.path.join instead of a hard-coded backslash in
            # r'%s\%04d.jpg', so the path also works outside Windows.
            pathImage = os.path.join(self.nameAlbum, '%04d.jpg' % index)
            urllib.urlretrieve(image, pathImage)
            print(pathImage)
            # BUG FIX: the original format '下载完成%.1f:%%' printed e.g.
            # "50.0:%"; the colon belongs before the number.
            print('下载完成:%.1f%%' % ((index + 1) * 100.0 / total))

    def download(self, countPage=0):
        """Download the album's photos.

        countPage -- number of album pages to process; 0 downloads all.
        """
        self.analysis(countPage)

def main():
    """Interactive entry point: prompt for album name and URL, download all pages."""
    album = AntAlbumDownload()
    album.setAttr('', '')
    album.download(0)


if __name__ == '__main__':
    # Guard so that importing this module does not immediately start an
    # interactive download session (the original called main() on import).
    main()
页: [1]
查看完整版本: 百度空间相册下载器 python实现 by Gods_巨蚁(原创)