php单线程爬虫类
[*]代码:
/**
* @desc:单线程爬虫类
* @author
* @property
* 1、callcontent 获取给定url页面中的内容的回调函数
* 2、calltodo 处理业务逻辑的回调函数 如:把抓取到的内容处理后存到数据库
* @method
* run 执行爬虫程序
* @param depth 深度 默认2
* @return void
*/
class crawl{
public $callcontent = 'getcontent';# callback that fetches the content of the page at a given url
public $calltodo = 'todo';# callback that performs the business logic, e.g. processing the fetched content and saving it to a database
private $url;# internal property: the url currently being processed
/*
@desc:内部方法,调用回调函数获取页面内容
@param url 传入到回调函数的参数
@return ret 页面内容
*/
private function getcontent($url){
    // Forward the url to whichever callable is configured in $callcontent
    // and hand its result back to the caller unchanged.
    return call_user_func($this->callcontent, $url);
}
/*
@desc:内部方法,调用回调函数进行业务处理
@param content 传入到回调函数的参数
*/
private function todo($content){
    // Pass the content to the callable configured in $calltodo;
    // whatever that callable returns is discarded.
    call_user_func($this->calltodo, $content);
}
/*
@desc:内部方法,获取页面中的超链接
@param content 页面内容
@return urls 获取到的超链接
*/
private function geturl($content){
    // Match every <a ...> tag that carries an href attribute and capture the
    // attribute value (group 1). The value may be wrapped in single quotes,
    // double quotes, or left unquoted; it ends at the first quote, whitespace
    // or '>' character. "<a\s" keeps other tags such as <abbr> from matching,
    // and the "\s" before href keeps attributes like data-href from matching.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*[\'"]?([^\'"\s>]*)[^>]*>/i';

    $urls = array();
    // preg_match_all yields the number of matches, or false on a PCRE error;
    // in both the "no match" and the error case the result stays empty.
    // The cast guards against a non-string (e.g. false/null) page body.
    if (preg_match_all($pattern, (string)$content, $matches)) {
        // Only the captured href values are wanted, not the full tag text.
        // array_unique keeps the first occurrence and preserves its key.
        $urls = array_unique($matches[1]);
    }

    // Debug dump of the collected links, kept from the original demonstration.
    var_dump($urls);

    return $urls;
}
[*]输出:
array(72) {
=>
string(22) "http://blog.运维网.com/"
=>
string(30) "http://blog.运维网.com/original"
=>
string(34) "http://blog.运维网.com/cloumn/index"
=>
string(28) "http://blog.运维网.com/expert"
=>
string(35) "http://blog.运维网.com/blogger/index"
=>
string(19) "javascript:void(0);"
=>
string(20) "http://edu.运维网.com"
=>
string(21) "http://blog.运维网.com"
=>
string(21) "http://down.运维网.com"
=>
string(21) "http://home.运维网.com"
=>
string(20) "http://bbs.运维网.com"
=>
string(18) "http://x.运维网.com"
=>
string(0) ""
=>
string(20) "http://wot.运维网.com"
=>
string(20) "http://www.运维网.com"
=>
string(89) "http://home.运维网.com/user/register?reback=http%253A%252F%252Fblog.运维网.com%252F12173069"
=>
string(78) "http://blog.运维网.com/user/login?reback=http%3A%2F%2Fblog.运维网.com%2F12173069"
=>
string(12) "javascript:;"
=>
string(34) "http://blog.运维网.com/search/index"
=>
string(40) "http://home.运维网.com/space?uid=12163069"
=>
string(37) "http://blog.运维网.com/12173069?type=1"
=>
string(37) "http://blog.运维网.com/12173069?type=2"
=>
string(37) "http://blog.运维网.com/12173069?type=3"
=>
string(30) "http://blog.运维网.com/12173069"
=>
string(33) "http://blog.运维网.com/12173069?s="
=>
string(34) "http://blog.运维网.com/12173069?s=3"
=>
string(34) "http://blog.运维网.com/12173069?s=4"
=>
string(34) "http://blog.运维网.com/12173069?s=5"
=>
string(34) "http://blog.运维网.com/12173069?s=6"
=>
string(38) "http://blog.运维网.com/12173069/2126752"
=>
string(38) "http://blog.运维网.com/12173069/2126693"
=>
string(38) "http://blog.运维网.com/12173069/2126661"
=>
string(38) "http://blog.运维网.com/12173069/2126657"
=>
string(38) "http://blog.运维网.com/12173069/2126596"
=>
string(38) "http://blog.运维网.com/12173069/2126591"
=>
string(38) "http://blog.运维网.com/12173069/2126496"
=>
string(38) "http://blog.运维网.com/12173069/2126420"
=>
string(38) "http://blog.运维网.com/12173069/2126324"
=>
string(38) "http://blog.运维网.com/12173069/2126210"
=>
string(38) "http://blog.运维网.com/12173069/2126090"
=>
string(38) "http://blog.运维网.com/12173069/2125724"
=>
string(38) "http://blog.运维网.com/12173069/2125666"
=>
string(38) "http://blog.运维网.com/12173069/2125424"
=>
string(38) "http://blog.运维网.com/12173069/2125359"
=>
string(38) "http://blog.运维网.com/12173069/2124937"
=>
string(38) "http://blog.运维网.com/12173069/2124923"
=>
string(38) "http://blog.运维网.com/12173069/2124720"
=>
string(38) "http://blog.运维网.com/12173069/2124693"
=>
string(38) "http://blog.运维网.com/12173069/2124499"
=>
string(33) "http://blog.运维网.com/12173069/p1"
=>
string(33) "http://blog.运维网.com/12173069/p2"
=>
string(33) "http://blog.运维网.com/12173069/p3"
=>
string(33) "http://blog.运维网.com/12173069/p4"
=>
string(33) "http://blog.运维网.com/12173069/p5"
=>
string(33) "http://blog.运维网.com/12173069/p6"
=>
string(33) "http://blog.运维网.com/12173069/p7"
=>
string(33) "http://blog.运维网.com/12173069/p8"
=>
string(34) "http://blog.运维网.com/12173069/p19"
=>
string(39) "http://blog.运维网.com/ityouknow/2124403"
=>
string(35) "http://blog.运维网.com/wyait/2125708"
=>
string(39) "http://blog.运维网.com/lumay0526/2124116"
=>
string(38) "http://blog.运维网.com/11010461/2123639"
=>
string(35) "http://blog.运维网.com/qiuyt/2124456"
=>
string(30) "http://blog.运维网.com/13716231"
=>
string(30) "http://blog.运维网.com/13108471"
=>
string(30) "http://blog.运维网.com/10316297"
=>
string(30) "http://blog.运维网.com/13718637"
=>
string(30) "http://blog.运维网.com/13681316"
=>
string(20) "http://www.运维网.com"
=>
string(37) "http://blog.运维网.com/blogger/publish"
=>
string(71) "http://wpa.qq.com/msgrd?v=3&uin=3591348659&site=qq&menu=yes"
=>
string(39) "http://blog.运维网.com/运维网blog/2057444"
}
页:
[1]