|
有多种方法,常用的有:
第一种:PHP 自带的 SimpleXML
// Convert the payload from GB2312 to UTF-8 first; the conversion must happen before parsing.
$body_utf8 = mb_convert_encoding( $body, "utf8","gb2312");
$xml = simplexml_load_string($body_utf8);
// simplexml_load_string() returns false when the document is not well-formed;
// fail here instead of on a later property access.
if ($xml === false) {
    throw new RuntimeException('simplexml_load_string() could not parse the converted document');
}
第二种:开源的 simple_html_dom,它不要求输入内容必须是 UTF-8 编码。
// Build a DOM from the raw markup. str_get_html() returns false when the
// input cannot be loaded (for example an empty string), so check before use.
$html = str_get_html($body);
if ($html === false) {
    throw new RuntimeException('str_get_html() could not load the markup');
}

// Make sure the result list exists without discarding entries added earlier.
if (!isset($articles)) {
    $articles = [];
}

// Find all article blocks and collect image URL, title and author for each.
foreach ($html->find('div.book_news_style_form') as $article) {
    $img = $article->find('img', 0);
    $info = $article->find('div.book_news_style_text', 0);
    // find(selector, 0) yields null when nothing matches; skip incomplete
    // blocks rather than dereferencing null.
    if ($img === null || $info === null) {
        continue;
    }

    $titleLink = $info->find('h1 a', 0);
    $authorNode = $info->find('h2', 0);
    if ($titleLink === null || $authorNode === null) {
        continue;
    }

    $item['img'] = $img->src;
    $item['title'] = $titleLink->innertext;
    // getAuthor() is defined elsewhere; it receives the inner HTML of the <h2>.
    $item['author'] = getAuthor($authorNode->innertext);
    $articles[] = $item;
}

// Print the href of every anchor in the document, one per line.
foreach ($html->find('a') as $element) {
    echo $element->href . '<br>';
}
参考:http://www.cnblogs.com/likwo/archive/2011/08/24/2151793.html
开源:http://simplehtmldom.sourceforge.net/ |
|
|