#!/usr/bin/perl
# siteindexingbot.pl
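#
# A small recursive site indexer: starting from the URL given on the
# command line, it visits every reachable HTML page on the same site
# (honoring robots.txt) and writes one tab-separated line per page
# (URL, title, keywords) to indexed.txt.
#
# Usage (illustrative URL):
#   perl siteindexingbot.pl http://www.example.com/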
use warnings;
use strict;
use LWP::Simple;
use LWP::RobotUA;
use WWW::RobotRules;
use HTML::Parse;
use HTML::HeadParser;
use URI::URL;
my %scanned;    # tracks URLs already visited so we never loop
# the arrays and hashes used to store page data
my (@pages, %titles, %keywords);
my $url = $ARGV[0] or die "Usage: $0 <url>\n";
my $base_url = &globalize_url('/', $url);
# $base_url already ends in '/', so append the filename directly
my $robots_txt = $base_url . 'robots.txt';
my $robot_rules = WWW::RobotRules->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
);
# look for and parse the robots.txt file
if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
} else {
    print "robots.txt file not found.\n";
}
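# For reference, a robots.txt file like the following (illustrative)
# would block the crawler from everything under /cgi-bin/:
#
#   User-agent: *
#   Disallow: /cgi-bin/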
# build the robot user agent; LWP::RobotUA takes the agent name,
# a contact address, and the rules object as positional arguments
my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules
);
#$ua->proxy('http' => 'http://proxy.mylan.com/' );
$ua->delay(0.5 / 60);      # pause between requests, in minutes (default is 1)
$ua->timeout(30);
$ua->max_size(1024 * 100); # never download more than 100KB of a page
$ua->parse_head(1);
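# Note: with parse_head() enabled, LWP runs each HTML response's
# <head> through HTML::HeadParser, so <meta> tags surface as
# X-Meta-* response headers; get_info() below relies on this.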
&scan($base_url);
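# Write the collected data out as tab-separated lines. A line of
# indexed.txt might look like this (illustrative values, with <TAB>
# standing in for a literal tab character):
#   http://www.example.com/about.html<TAB>About Us<TAB>widgets, gadgets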
open(my $out, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
foreach my $page (@pages) {
    print $out join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
}
close($out);
exit;
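# scan($url) recursively indexes a page: it records the page's
# metadata via get_info(), then follows every same-site HTML link
# that robots.txt permits.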
sub scan {
    my $url = shift;
    print "Scanning '$url':\n";
    return if $scanned{$url};    # don't index the same page twice
    &get_info($url); # this is the extra subroutine
    $scanned{$url} = 1;
    my @links = &get_links($url);
    foreach my $link (@links) {
        unless ($robot_rules->allowed($link)) {
            print "Access to $link is not allowed by robots.txt\n";
            next;
        }
        # \Q...\E quotes regex metacharacters (., ?, etc.) in the URL
        unless ($link =~ /^\Q$base_url\E/i) {
            print "$link is not local to $base_url\n";
            next;
        }
        my $request = HTTP::Request->new('HEAD' => $link);
        my $response = $ua->request($request);
        my $content_type = $response->header('Content-Type') || '';
        if ($response->is_error) {
            print "Dead link to $link found on $url\n";
        } else {
            print "$url links to $link\n";
            # compare the media type alone; the header may also carry
            # a parameter, e.g. "text/html; charset=utf-8"
            if ($content_type =~ m{^text/html}i) {
                &scan($link);
            } else {
                print "$link is not HTML\n";
            }
        }
    }
    return;
}
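# globalize_url($link, $referring_url) resolves a possibly relative
# link against the page it appeared on and returns an absolute URL
# with any "#fragment" part removed.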
sub globalize_url {
    my ($link, $referring_url) = @_;
    my $url_obj = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    $absolute_url =~ s/#.*$//;    # strip any "#fragment" portion
    return $absolute_url;
}
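# get_links($url) fetches a page and returns absolute URLs for all
# links found in its <a>, <frame>, and <iframe> tags.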
sub get_links {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    return () unless $response->is_success;
    my $tree = HTML::Parse::parse_html($response->content);
    # each element of @$links_ref is [$url, $element, $attr, $tag]
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');
    my @links;
    foreach my $link (@$links_ref) {
        push(@links, &globalize_url($link->[0], $url));
    }
    $tree->delete;    # HTML::Element trees must be freed explicitly
    return @links;
}
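# get_info($url) fetches a page and records its URL, <title>, and
# meta keywords in the index data structures.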
sub get_info {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $html = $response->content;
    my ($title, $keywords);
    my $parser = HTML::HeadParser->new;
    $parser->parse($html);
    $title = $parser->header('Title') || 'Untitled Document';
    # parse_head() exposed <meta name="keywords"> as this header
    $keywords = $response->header('X-Meta-Keywords') || 'none';
    push(@pages, $url);
    $titles{$url} = $title;
    $keywords{$url} = $keywords;
    return;
}