用Python导出QQ空间的日志到WordPress

阅读模式 · 发表于 2015-4-20 12:03:14

用Python导出QQ空间的日志到WordPress
  　　文章来源：http://www.keakon.cn/bbs/thread-964-1-1.html
　　方法很简单，找出日志的地址，再遍历列出日志的内容。
因为单纯导出没用，还得转换成其他格式，所以我保存到一个列表里，每篇日志都对应其中的一个字典元素，字典的属性都用unicode编码。
然后dump出来，可以方便以后用Python进行再处理（默认为blogs.txt文件）。
并转换成了WordPress用的格式（默认为qzone.xml文件）。
本想用多线程来下载，但似乎没必要，因为只花了80秒，我的149篇日志就全部下载下来了。
如果空间有设置访问权限的话，可以用ClientCookie这个模块来处理，把注释改下就行了。
此外，这个也可以盗取别人的日志，但愿不要滥用…
最后，评论我没下载，因为WordPress好像不能导入评论。
代码如下：
　　复制内容到剪贴板
  代码:
# -*- coding: gbk -*-
from __future__ import with_statement

import codecs
import cPickle
from datetime import datetime
from datetime import timedelta
from os import linesep
from urllib2 import URLError
from urllib2 import urlopen
# Optional third-party module for cookie-aware requests. The pasted listing
# showed "Client"; the API names used with it match ClientCookie.
#import ClientCookie
# URL templates. mainUrl takes the QQ number positionally; listUrl and blogUrl
# take a mapping with the keys named in their placeholders.
mainUrl = 'http://%s.qzone.qq.com/'
listUrl = 'http://b.qzone.qq.com/cgi-bin/blognew/blog_output_toppage?uin=%(qq)s&vuin=0&property=GoRE&getall=1&imgdm=imgcache.qq.com&bdm=b.qzone.qq.com&cate=&numperpage=100&sorttype=0&arch=0&pos=%(pos)d&direct=1'
blogUrl = 'http://qzone.qq.com/blog/%(qq)s-%(blogid)s'

# strftime pattern for the header timestamp and each item's pubDate value.
# The UTC offset is hard-coded to +0800.
GMT_FORMAT = '%a, %d %b %Y %H:%M:%S +0800'

# Opening of the WordPress eXtended RSS (WXR 1.0) document.
# NOTE(review): the pasted listing had lost every XML tag here; the markup is
# reconstructed from the WXR channel layout, which the surviving lines match
# in order. The placeholder key stays 'deion' (a mangled "description")
# because that is the key main() supplies.
HEADER = u'''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:wp="http://wordpress.org/export/1.0/"
>
<channel>
  <title>%(author)s的QQ空间</title>
  <description>%(deion)s</description>
  <pubDate>%(time)s</pubDate>
  <generator>keakon的QQ空间导出程序</generator>
  <language>zh-CN</language>
  <wp:wxr_version>1.0</wp:wxr_version>
'''.replace('\n', linesep)

# Closes the elements opened by HEADER.
FOOTER = '''</channel>
</rss>
'''.replace('\n', linesep)

# Optional: route requests through ClientCookie with Internet Explorer's
# cookie jar (for spaces with access restrictions, per the article text).
# Identifiers are restored to the ClientCookie API; the pasted listing had
# dropped "Cookie" from each name.
#cj = ClientCookie.MSIECookieJar(delayload=True)
#cj.load_from_registry()
#opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
#ClientCookie.install_opener(opener)
def _extractMetaContent(html, marker):
    """Return the text between *marker* and the next double quote in *html*.

    The slice is decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        URLError: if *marker* or the closing quote is not present.
    """
    begin = html.find(marker)
    if begin == -1:
        raise URLError('HTML not complete.')
    begin += len(marker)
    end = html.find('"', begin)
    if end == -1:
        raise URLError('HTML not complete.')
    return html[begin:end].decode('utf8', 'replace')


def getBasicInfo(qq):
    """Fetch the space's front page and return ``(author, description)``.

    Both values are unicode strings read from the page's ``author`` and
    ``Description`` meta tags.

    Args:
        qq: QQ number substituted into ``mainUrl``.

    Raises:
        URLError: if either meta tag is missing from the downloaded page.
    """
    # The pasted listing had "<meta" and "Description" mangled into "<" and
    # "Deion"; the markers are restored here.
    AUTHOR = '<meta name="author" content="'
    DESC = '<meta name="Description" content="'

    #res = ClientCookie.urlopen(mainUrl % qq)
    res = urlopen(mainUrl % qq)
    try:
        html = res.read()
    finally:
        # Release the connection even when the read fails.
        res.close()

    author = _extractMetaContent(html, AUTHOR)
    description = _extractMetaContent(html, DESC)
    return author, description
def getBlogList(qq):
    """Collect the category and id of every post listed for *qq*.

    Pages are requested from ``listUrl`` at offsets 0, 100, 200, ... for as
    long as the number of posts collected equals the next offset, i.e. until
    a page comes back with fewer than a full page of entries.

    Args:
        qq: QQ number substituted into ``listUrl``.

    Returns:
        A list of dicts with keys ``'category'`` (unicode, decoded as
        gb18030) and ``'id'`` (raw string taken from the page).

    Raises:
        URLError: if a listing entry is cut off before its closing delimiter
            or has no matching ``selectBlog(`` call.
    """
    CATEGORY = "selectCategory('"
    BLOG = 'selectBlog('
    PAGE_SIZE = 100  # offset step; same value as numperpage in listUrl

    pos = 0
    blogs = []
    while pos == len(blogs):
        #res = ClientCookie.urlopen(listUrl % {'qq': qq, 'pos': pos})
        res = urlopen(listUrl % {'qq': qq, 'pos': pos})
        try:
            html = res.read()
        finally:
            res.close()

        begin = html.find(CATEGORY)
        while begin != -1:
            begin += len(CATEGORY)
            end = html.find("')", begin)
            if end == -1:
                # Without this check the slice below would silently run to
                # the second-to-last byte of the page.
                raise URLError('HTML not complete.')
            category = html[begin:end].decode('gb18030', 'replace')

            begin = html.find(BLOG, end)
            if begin == -1:
                raise URLError('HTML not complete.')
            begin += len(BLOG)
            end = html.find(')', begin)
            if end == -1:
                raise URLError('HTML not complete.')

            blogs.append({'category': category, 'id': html[begin:end]})
            begin = html.find(CATEGORY, end)

        pos += PAGE_SIZE
        print('已找到%d篇' % len(blogs))
    return blogs
def getBlogContent(qq, author, blogs, outFile):
  # Download each post in `blogs`, pull its title, publication time and body
  # out of the page HTML, store them on the post's dict, and write one
  # ITEM-formatted entry per post to outFile.
  #
  # qq      -- substituted into blogUrl as %(qq)s
  # author  -- substituted into ITEM as %(author)s
  # blogs   -- list of dicts; each must have an 'id' key. 'title', 'time' and
  #            'content' are stored on each dict as they are parsed.
  # outFile -- object whose write() receives the formatted entries
  #
  # A post whose page lacks an expected marker is reported with print and
  # skipped; nothing is written for it.
  #
  # NOTE(review): this listing has lost text and cannot run as shown.
  #   * TITLE and TITLE_END are empty strings, and the DETAIL assignment opens
  #     with a single quote but is closed by a triple quote several lines
  #     later. Presumably these held HTML markers that were stripped when the
  #     article was pasted -- restore them from the original article.
  #   * TIME_FORMAT, DETAIL_END, DETAIL_END_DIV, DATE_FORMAT and ITEM are used
  #     below but are not defined anywhere in the visible code. The lines
  #     between the DETAIL assignment and the closing triple quote use
  #     %(time)s, %(gmtTime)s and %(title)s, so they look like the tail of
  #     the ITEM template -- TODO confirm.
  global blogUrl
  TITLE = u''
  TIT_LEN = len(TITLE)
  TITLE_END = u''
  TIME = u'发表时间：'
  TIME_LEN = len(TIME)
  DETAIL = u'
%(time)s
%(gmtTime)s
open
open
%(title)s
publish
0
0
post


'''.replace('\n', linesep)
  for index, blog in enumerate(blogs):
url = blogUrl % {'qq': qq, 'blogid': blog['id']}
print '正在下载第%(index)d篇日志: %(url)s' % {'index': index + 1, 'url': url}
#res = Client.urlopen(url)
res = urlopen(url)
html = res.read()
res.close()
# The page bytes are decoded as gbk; undecodable bytes are replaced.
content = unicode(html, 'gbk', 'replace')
# Title: the text between TITLE and TITLE_END.
begin = content.find(TITLE)
if begin == -1:
   print 'HTML not complete. ID: ' + blog['id']
   continue
begin += TIT_LEN
end = content.find(TITLE_END, begin)
blog['title'] = content[begin:end]
# Publication time: the text after the TIME label up to the next CRLF,
# parsed with TIME_FORMAT.
begin = content.find(TIME, end)
if begin == -1:
   print 'HTML not complete. ID: ' + blog['id']
   continue
begin += TIME_LEN
end = content.find('\r\n', begin)
blog['time'] = datetime.strptime(content[begin:end].encode('gbk'), TIME_FORMAT)
# Body: starts after the first '>' following DETAIL and runs to DETAIL_END.
begin = content.find(DETAIL, end)
if begin == -1:
   print 'HTML not complete. ID: ' + blog['id']
   continue
# find() returns -1 when '>' is missing, so begin == 0 signals failure here.
begin = content.find('>', begin) + 1
if begin == 0:
   print 'HTML not complete. ID: ' + blog['id']
   continue
end = content.find(DETAIL_END, begin)
if end == -1:
   print 'HTML not complete. ID: ' + blog['id']
   continue
# Drop the last two closing div tags.
end2 = content.rfind(DETAIL_END_DIV, begin, end)
if end2 != -1:
   end3 = content.rfind(DETAIL_END_DIV, begin, end2)
   # Cut at the second-to-last match when there is one, else at the last.
   end = end3 != -1 and end3 or end2
blog['content'] = content[begin:end].strip()
# 'gmtTime' is the parsed time minus 8 hours; 'pubDate' uses GMT_FORMAT.
outFile.write(ITEM % {'title': blog['title'], 'author': author, 'content': blog['content'],
               'time': blog['time'].strftime(DATE_FORMAT),
               'gmtTime': (blog['time'] -timedelta(hours=8)).strftime(DATE_FORMAT),
               'pubDate': blog['time'].strftime(GMT_FORMAT)})
def main(qq, filename='qzone.xml', filename2='blogs.txt'):
    """Export every post of the given QQ space.

    Writes the WordPress import document to *filename* (UTF-8) and a pickle
    of the collected post dicts to *filename2*.

    Args:
        qq: QQ number of the space to export.
        filename: path of the import document to write.
        filename2: path of the pickle dump to write.

    Raises:
        SystemExit: with status 1 when no posts are found.
    """
    author, description = getBasicInfo(qq)
    blogs = getBlogList(qq)
    if not blogs:
        print('没有找到日志。若您设置了QQ空间权限，请用IE登录QQ空间，并启用。')
        # Same effect as exit(1), without depending on the site module.
        raise SystemExit(1)

    categories = set(blog['category'] for blog in blogs)

    with codecs.open(filename, 'w', 'utf8') as outFile:
        # Header. The key is spelled 'deion' because that is the placeholder
        # name used by the HEADER template.
        outFile.write(HEADER % {'author': author,
                                'deion': description,
                                'time': datetime.now().strftime(GMT_FORMAT)})
        # NOTE(review): the category line looks like it lost its surrounding
        # XML markup in the pasted listing -- restore from the original.
        for category in categories:
            outFile.write(u'  %(category)s%(linesep)s'
                          % {'category': category, 'linesep': linesep})
        # One item per post; this also fills in each dict in blogs.
        getBlogContent(qq, author, blogs, outFile)
        # Footer.
        outFile.write(FOOTER)

    # Dumped after getBlogContent so the pickle includes the parsed fields.
    with open(filename2, 'w') as outFile2:
        cPickle.dump(blogs, outFile2)
    print('全部导出完毕')
if __name__ == "__main__":
    import sys

    # Use the QQ number given on the command line; without one, fall back to
    # the placeholder below (replace it with your own QQ number).
    main(sys.argv[1] if len(sys.argv) > 1 else '123456789')

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

选中篇:

[经验分享] 用Python导出QQ空间的日志到WordPress

浏览过的版块

扫码加入运维网微信交流群


	选中篇: 置顶\|

选中 篇:

[经验分享] 用Python导出QQ空间的日志到WordPress

浏览过的版块

选中篇: