PHP备份人人网日志脚本 ver 0.1
今天确实是无聊了,写了一个把人人网日志备份为本地 HTML 文件的类。主要用到 cURL 登录和正则表达式解析页面。
使用方法:把下面完整脚本末尾的账号和密码改成你自己的;或者另外写一个运行脚本,例如:
<?php
// Minimal runner: load the class, then back up every blog post of the account.
include("renren.php");

$test = new renren("你的人人网账号", "你的人人网密码");
$test->download();
?>
然后开一个cmd,执行(注意,以命令行的方式运行脚本!)
>php rrBlogBackup.php
运行时的效果:
运行后会在运行的目录下,生成一堆日志html和一个index.html算是目录吧。
完整的脚本如下:
<?php
/**
 * Renren (renren.com) personal blog backup.
 *
 * Version 0.1
 * Author  BookMoth
 * Date    2010-04-19
 *
 * Logs in with cURL, scrapes the blog pages with regular expressions and
 * writes one HTML file per post plus an index.html into the download
 * directory. All progress messages are echoed to the console.
 */
class renren
{
    private $home_url = "http://www.renren.com";
    private $blog_url = "http://blog.renren.com/blog/0";
    private $ch;
    private $cookie_file;
    private $username, $password;
    // Map of saved file name => post title, used to build index.html.
    private $menu = array();
    // Directory the backup files are written to.
    private $download_path = 'blog';
    // Console encoding; mainly so that Windows cmd prints Chinese correctly.
    private $rt_encode = 'cp936';

    /**
     * Stores the URL-encoded credentials, creates the cookie file and makes
     * sure the download directory exists.
     *
     * @param string $username Renren account
     * @param string $password Renren password
     */
    public function __construct($username, $password)
    {
        $this->cookie_file = tempnam('./', 'xn');
        $this->username = urlencode($username);
        $this->password = urlencode($password);
        if (!is_dir($this->download_path)) {
            mkdir($this->download_path);
        }
    }

    /**
     * Main entry point: log in, collect the list of post URLs, save every
     * post as HTML and finally write the index page.
     */
    public function download()
    {
        if (!$this->login()) {
            return;
        }

        $total_blogs = $this->getTotalBlog();
        $this->_echo("[信息] 日志总数 {$total_blogs}\n");
        if ($total_blogs <= 0) {
            $this->_echo("[错误] 没有日志!");
            return;
        }

        $this->_echo("[信息] 获取日志列表\n");
        $blog_list = $this->getBlogList($total_blogs);

        if (!is_array($blog_list)) {
            $this->_echo("[错误] 获取日志失败\n");
            return;
        }

        $blog_list_length = count($blog_list);
        if ($blog_list_length != $total_blogs) {
            // Refuse a partial backup: the scraped list must match the
            // total reported by the site.
            $this->_echo("[错误] 获取日志列表不完成:{$blog_list_length}/{$total_blogs}\n");
            return;
        }

        $this->_echo("[信息] 获取日志列表完成\n");
        foreach ($blog_list as $blog_url) {
            $this->getBlog($blog_url);
        }
        $this->_echo("[信息] 下载日志全部完成\n");
        $this->makeIndex();
        $this->_echo("[信息] 创建日志目录完成\n");
    }

    /**
     * Performs the three-step login (PLogin.do -> callback.do -> Home.do).
     *
     * Each of the first two responses is expected to contain the next URL
     * as its first double-quoted string.
     *
     * @return bool true when the home page was reached, false otherwise
     */
    public function login()
    {
        // Step 1: PLogin.do
        $post_data = "email={$this->username}&password={$this->password}&origURL=http%3A%2F%2Fwww.renren.com%2FHome.do&domain=renren.com";
        $result = $this->_request("http://www.renren.com/PLogin.do", $post_data);

        // Step 2 (callback.do) and step 3 (Home.do): follow the quoted URL.
        for ($step = 0; $step < 2; $step++) {
            if (!preg_match('/"([^"]+)"/', $result, $url)) {
                $this->_echo("[错误] 登录失败 ");
                return false;
            }
            $result = $this->_getUrlContents($url[1]);
        }

        // The home page of a logged-in user contains this marker text.
        if (!preg_match("/最近来访/", $result)) {
            $this->_echo("[错误] 登录失败 ");
            return false;
        }

        $page_title = '';
        if (preg_match('/<title>([^<]+)<\/title>/', $result, $title)) {
            $page_title = $title[1];
        }
        $this->_echo("[信息] 登录成功 [{$page_title}]\n");
        return true;
    }

    /**
     * Reads the total number of posts from the pager of the first blog page.
     *
     * @return int number of posts, 0 when the pager could not be found
     */
    public function getTotalBlog()
    {
        $blog_page = $this->_getUrlContents($this->blog_url);
        // The displayed range is matched generically (not only "1-10") so
        // that a differently sized first page is still recognised.
        $pattern = '/<div class="blog-home"><div class="pager-top"><span>当前显示\d+-\d+篇\/共(\d+)篇/';
        if (!preg_match($pattern, $blog_page, $total_blog)) {
            return 0;
        }
        return (int) $total_blog[1];
    }

    /**
     * Collects the URLs of all posts by walking the paged blog list.
     *
     * @param int $total_blog total number of posts
     * @return array list of post URLs
     */
    public function getBlogList($total_blog)
    {
        // The code assumes 10 posts per page and a 0-based curpage index.
        $total_pages = (int) ($total_blog / 10);
        $blog_list = array();
        $pattern = '/href="(http:\/\/blog\.renren\.com\/blog\/\d{7,10}\/\d{7,10})">([^<]+)<\/a> <\/strong>/';

        for ($curr_page = 0; $curr_page <= $total_pages; $curr_page++) {
            $_url = "http://blog.renren.com/blog/0?curpage={$curr_page}&year=0&month=0&selitem=";
            $blog_page = $this->_getUrlContents($_url);
            if (preg_match_all($pattern, $blog_page, $match_result)) {
                // Group 1 holds the post URLs, group 2 the link texts.
                $blog_list = array_merge($blog_list, $match_result[1]);
            }
        }
        return $blog_list;
    }

    /**
     * Downloads the post at the given URL and saves it as an HTML file.
     *
     * @param string $blog_url
     */
    public function getBlog($blog_url)
    {
        $blog_page_html = $this->_getUrlContents($blog_url);
        $blog_title = trim($this->_getBlogTitle($blog_page_html));
        $blog_time = $this->_getBlogTime($blog_page_html);
        $blog_content = $this->_getBlogContent($blog_page_html);

        $tmp_html = '<html>';
        $tmp_html .= '<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /></head>';
        $tmp_html .= '<body><h1>' . $blog_title . '</h1>';
        $tmp_html .= '<br />' . $blog_time . '<br />';
        $tmp_html .= $blog_content . '</body></html>';

        // File name = URL path after the common prefix, with "/" -> "_".
        $url_prefix_length = strlen('http://blog.renren.com/blog/');
        $tmp_file_name = str_replace('/', '_', substr($blog_url, $url_prefix_length)) . '.html';
        $target = $this->download_path . DIRECTORY_SEPARATOR . $tmp_file_name;

        if (file_put_contents($target, $tmp_html) === false) {
            // Do not list a post in the index when its file was not written.
            $this->_echo("[错误] 保存日志失败[$blog_title] \n");
            return;
        }
        $this->menu[$tmp_file_name] = $blog_title;
        $this->_echo("[信息] 保存日志列表完成[$blog_title] \n");
    }

    /**
     * Extracts the post title from the page HTML.
     *
     * @param string $blog_html_str
     * @return string title, or '' when not found
     */
    private function _getBlogTitle($blog_html_str)
    {
        // NOTE(review): the published source opened with "<b>" but closed
        // with "</strong>"; the opening tag was probably rewritten by a
        // rich-text editor, so both spellings are accepted - confirm
        // against a real post page.
        if (!preg_match('/<(?:strong|b)>(.+)<\/strong>?/', $blog_html_str, $match_result)) {
            return '';
        }
        return $match_result[1];
    }

    /**
     * Extracts the post body (including its wrapping div) from the page HTML.
     *
     * @param string $blog_html_str
     * @return string content HTML, or '' when not found
     */
    private function _getBlogContent($blog_html_str)
    {
        // Non-greedy: the match ends at the first "</div>", so content that
        // itself contains a div is cut at that point.
        $pattern = '/<div id="blogContent" class="text-article">.+?<\/div>/s';
        if (!preg_match($pattern, $blog_html_str, $match_result)) {
            return '';
        }
        return $match_result[0];
    }

    /**
     * Extracts the post timestamp ("YYYY-MM-DD HH:MM") from the page HTML.
     *
     * @param string $blog_html_str
     * @return string timestamp, or '' when not found
     */
    private function _getBlogTime($blog_html_str)
    {
        $pattern = '/"timestamp">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})/';
        if (!preg_match($pattern, $blog_html_str, $match_result)) {
            return '';
        }
        return $match_result[1];
    }

    /**
     * Fetches the full body of the given URL using the session cookies.
     *
     * @param string $url
     * @return string response body, '' on failure
     */
    private function _getUrlContents($url)
    {
        return $this->_request($url);
    }

    /**
     * Runs one cURL request that shares the cookie file with all others.
     *
     * @param string      $url
     * @param string|null $post_data URL-encoded body; null sends a GET
     * @return string response body, '' on failure
     */
    private function _request($url, $post_data = null)
    {
        $this->ch = curl_init($url);
        if ($this->ch === false) {
            return '';
        }
        if ($post_data !== null) {
            curl_setopt($this->ch, CURLOPT_POSTFIELDS, $post_data);
        }
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->ch, CURLOPT_COOKIEJAR, $this->cookie_file);
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, $this->cookie_file);
        $result = curl_exec($this->ch);
        curl_close($this->ch);
        // curl_close() does nothing on PHP 8; dropping the handle makes sure
        // the cookie jar is written before the next request reads it.
        $this->ch = null;

        return $result === false ? '' : $result;
    }

    /**
     * Writes index.html linking to every saved post.
     */
    private function makeIndex()
    {
        $tmp_index = '<html>';
        $tmp_index .= '<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /></head>';
        $tmp_index .= '<body>';
        if (is_array($this->menu)) {
            foreach ($this->menu as $link => $title) {
                $tmp_index .= '<a href="' . $link . '" target="_blank">' . $title . '</a><br />';
            }
        }
        $tmp_index .= '</body></html>';
        file_put_contents($this->download_path . DIRECTORY_SEPARATOR . 'index.html', $tmp_index);
    }

    /**
     * Echoes a UTF-8 message converted to the console encoding.
     *
     * @param string $msg
     */
    private function _echo($msg)
    {
        $converted = iconv("utf-8", $this->rt_encode, $msg);
        // Fall back to the raw message rather than printing nothing.
        echo $converted === false ? $msg : $converted;
    }

    /**
     * Removes the temporary cookie file.
     */
    public function __destruct()
    {
        // tempnam() may have failed, leaving no file to remove.
        if (is_string($this->cookie_file) && is_file($this->cookie_file)) {
            unlink($this->cookie_file);
        }
    }
}

// Run the backup only when this file is the script being executed, so that
// it can also be included from another script without starting a second run.
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) === __FILE__) {
    $test = new renren("人人网账号", "人人网密码");
    $test->download();
}
?>
待完善的内容:
1、保存日志中的图片。
2、保存日志中的评论。
3、将零散的html备份文件,编译成chm,或者其他格式。
4、保存备份所有的状态。
不过,我不打算做这些了。
最后要说:PHP不光做网页,做脚本也非常好用!
PS:发现什么问题的话,敬请指教。
页:
[1]