#!/usr/bin/perl
# siteindexingbot.pl
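#
# A small recursive site indexer: starting from the URL given on the
# command line, it visits every reachable HTML page on the same site
# (honoring robots.txt) and writes one tab-separated line per page
# (URL, title, keywords) to indexed.txt.
#
# Usage (illustrative URL):
#   perl siteindexingbot.pl http://www.example.com/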
use warnings;
use strict;
use LWP::Simple;
use LWP::RobotUA;
use WWW::RobotRules;
use HTML::Parse;
use HTML::HeadParser;
use URI::URL;
my %scanned;    # tracks URLs already visited so we never loop
# the arrays and hashes used to store page data
my (@pages, %titles, %keywords);
my $url = $ARGV[0] or die "Usage: $0 <url>\n";
my $base_url = &globalize_url('/', $url);
# $base_url already ends in '/', so append the filename directly
my $robots_txt = $base_url . 'robots.txt';
my $robot_rules = WWW::RobotRules->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
);
# look for and parse the robots.txt file
if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
} else {
    print "robots.txt file not found.\n";
}
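# For reference, a robots.txt file like the following (illustrative)
# would block the crawler from everything under /cgi-bin/:
#
#   User-agent: *
#   Disallow: /cgi-bin/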
# build the robot user agent; LWP::RobotUA takes the agent name,
# a contact address, and the rules object as positional arguments
my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules
);
#$ua->proxy('http' => 'http://proxy.mylan.com/' );
$ua->delay(0.5 / 60);      # pause between requests, in minutes (default is 1)
$ua->timeout(30);
$ua->max_size(1024 * 100); # never download more than 100KB of a page
$ua->parse_head(1);
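# Note: with parse_head() enabled, LWP runs each HTML response's
# <head> through HTML::HeadParser, so <meta> tags surface as
# X-Meta-* response headers; get_info() below relies on this.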
&scan($base_url);
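# Write the collected data out as tab-separated lines. A line of
# indexed.txt might look like this (illustrative values, with <TAB>
# standing in for a literal tab character):
#   http://www.example.com/about.html<TAB>About Us<TAB>widgets, gadgets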
open(my $out, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
foreach my $page (@pages) {
    print $out join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
}
close($out);
exit;
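# scan($url) recursively indexes a page: it records the page's
# metadata via get_info(), then follows every same-site HTML link
# that robots.txt permits.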
sub scan {
    my $url = shift;
    print "Scanning '$url':\n";
    return if $scanned{$url};    # don't index the same page twice
    &get_info($url); # this is the extra subroutine
    $scanned{$url} = 1;
    my @links = &get_links($url);
    foreach my $link (@links) {
        unless ($robot_rules->allowed($link)) {
            print "Access to $link is not allowed by robots.txt\n";
            next;
        }
        # \Q...\E quotes regex metacharacters (., ?, etc.) in the URL
        unless ($link =~ /^\Q$base_url\E/i) {
            print "$link is not local to $base_url\n";
            next;
        }
        my $request = HTTP::Request->new('HEAD' => $link);
        my $response = $ua->request($request);
        my $content_type = $response->header('Content-Type') || '';
        if ($response->is_error) {
            print "Dead link to $link found on $url\n";
        } else {
            print "$url links to $link\n";
            # compare the media type alone; the header may also carry
            # a parameter, e.g. "text/html; charset=utf-8"
            if ($content_type =~ m{^text/html}i) {
                &scan($link);
            } else {
                print "$link is not HTML\n";
            }
        }
    }
    return;
}
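# globalize_url($link, $referring_url) resolves a possibly relative
# link against the page it appeared on and returns an absolute URL
# with any "#fragment" part removed.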
sub globalize_url {
    my ($link, $referring_url) = @_;
    my $url_obj = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    $absolute_url =~ s/#.*$//;    # strip any "#fragment" portion
    return $absolute_url;
}
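# get_links($url) fetches a page and returns absolute URLs for all
# links found in its <a>, <frame>, and <iframe> tags.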
sub get_links {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    return () unless $response->is_success;
    my $tree = HTML::Parse::parse_html($response->content);
    # each element of @$links_ref is [$url, $element, $attr, $tag]
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');
    my @links;
    foreach my $link (@$links_ref) {
        push(@links, &globalize_url($link->[0], $url));
    }
    $tree->delete;    # HTML::Element trees must be freed explicitly
    return @links;
}
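# get_info($url) fetches a page and records its URL, <title>, and
# meta keywords in the index data structures.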
sub get_info {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $html = $response->content;
    my ($title, $keywords);
    my $parser = HTML::HeadParser->new;
    $parser->parse($html);
    $title = $parser->header('Title') || 'Untitled Document';
    # parse_head() exposed <meta name="keywords"> as this header
    $keywords = $response->header('X-Meta-Keywords') || 'none';
    push(@pages, $url);
    $titles{$url} = $title;
    $keywords{$url} = $keywords;
    return;
}