php爬虫 phpspider

santaclaus 发表于 2017-12-29 15:31:27

<?php
/**
 * Created by PhpStorm.
 * User: brady
 * Date: 2016/12/9
 * Time: 17:32
 *
 * Single-page scraping example: downloads one article page, extracts the
 * title, author, publish time and content with XPath selectors, and inserts
 * the title/author/content into the "content" table.
 *
 * The classes requests, selector and db are not defined in this file; they
 * are presumably made available by core/init.php -- confirm against the
 * framework.
 */

ini_set("memory_limit", "1024M");

require dirname(__FILE__).'/../core/init.php';

$url = "http://www.epooll.com/archives/806/";

$html = requests::get($url);

// Stop before touching the database when the download produced nothing;
// otherwise an empty row would be inserted.
// NOTE(review): assumes a failed request yields false, null or '' -- confirm
// against requests::get().
if ($html === false || $html === null || $html === '') {
    throw new RuntimeException("Failed to fetch {$url}");
}

// Extract the article title.
$selector = "//*[@id=\"content\"]/div/div/h1/a";
$title = selector::select($html, $selector);

// Extract the article author.
$selector = "//*[@id=\"content\"]/div/div/h6/span";
$author = selector::select($html, $selector);

// Strip the "作者：" label that precedes the author name.
$author = str_replace("作者：", "", $author);

// Extract the publish time.
// NOTE(review): this XPath is identical to the author XPath above, so both
// selections return the same node(s); it probably needs a more specific
// expression -- confirm against the page markup.
$selector = "//*[@id=\"content\"]/div/div/h6/span";
$time = selector::select($html, $selector);

// Strip the "发布时间：" label that precedes the timestamp.
$time = str_replace("发布时间：", '', $time);

// strtotime() returns false for text it cannot parse and does not accept
// arrays; guard both cases instead of silently formatting the Unix epoch.
$timestamp = is_string($time) ? strtotime($time) : false;
$time = ($timestamp === false) ? null : date("Y-m-d H:i:s", $timestamp);
// NOTE(review): $time is not part of $data below, so it is never stored.
// Add it once the matching column name of the "content" table is known.

// Extract the article content.
$selector = "//*[@id=\"content\"]/div/div";
$content = selector::select($html, $selector);

$data = array(
    'article_title' => $title,
    'article_author' => $author,
    'article_content' => $content,
);

// Insert the row and dump the result so the outcome can be inspected.
$res = db::insert("content", $data);

var_dump($res);

页: [1]

运维网's Archiver

php爬虫 phpspider