santaclaus 发表于 2017-12-29 15:31:27

php爬虫 phpspider

<?php
/**
 * Single-page scraping example.
 *
 * Downloads one article page, extracts the title, author, publish time and
 * body with XPath selectors, and inserts the title/author/body into the
 * "content" table, dumping the insert result for inspection.
 *
 * Created by PhpStorm.
 * User: brady
 * Date: 2016/12/9
 * Time: 17:32
 */

// Raise the memory limit for this run.
ini_set("memory_limit", "1024M");

// Framework bootstrap; the requests, selector and db classes used below are
// not defined in this file, so they are presumably loaded from here.
require dirname(__FILE__).'/../core/init.php';

$url = "http://www.epooll.com/archives/806/";

$html = requests::get($url);

// Stop before touching the database when the download produced nothing:
// otherwise every selector below runs against an empty document and an
// empty row is inserted.
if ($html === false || $html === null || $html === '') {
    echo "Failed to fetch {$url}\n";
    exit(1);
}

// Extract the article title.
$selector = "//*[@id=\"content\"]/div/div/h1/a";
$title = selector::select($html, $selector);

// Extract the article author.
$selector = "//*[@id=\"content\"]/div/div/h6/span";
$author = selector::select($html, $selector);

// Strip the "作者:" label so only the name remains.
$author = str_replace("作者:", "", $author);

// Extract the publish time.
// NOTE(review): this is the same XPath as the author selector above, so both
// variables start from the same node(s) — confirm against the page markup
// whether a more specific selector is needed.
$selector = "//*[@id=\"content\"]/div/div/h6/span";
$time = selector::select($html, $selector);
$time = str_replace("发布时间:", '', $time);

// strtotime() returns false for text it cannot parse, and date() would format
// that false as timestamp 0 (a bogus 1970 date). It also only accepts strings.
// Fall back to null instead of producing a misleading date.
$timestamp = is_string($time) ? strtotime($time) : false;
$time = ($timestamp === false) ? null : date("Y-m-d H:i:s", $timestamp);

// Extract the article body.
$selector = "//*[@id=\"content\"]/div/div";
$content = selector::select($html, $selector);

// Row to persist.
// NOTE(review): $time is extracted above but is not part of this row; add it
// here once the matching column name in the "content" table is confirmed.
$data = array(
    'article_title' => $title,
    'article_author' => $author,
    'article_content' => $content,
);

// Insert the row and dump whatever db::insert() returns.
$res = db::insert("content", $data);

var_dump($res);
  
页: [1]
查看完整版本: php爬虫 phpspider