Last week I got a large book "Perl Cookbook". It mentions an useful module HTML::LinkExtor in the book, seems handy to use. Right now, I just wanted to crawl some docs from MVS-OE archive webpage, so I wrote a small script that can demo how to use the module.
A DEMO script on how to use CPAN module HTML::LinkExtor
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;
binmode STDOUT, ':utf8';
my $url = "http://www2.marist.edu/htbin/wlvindex?mvs-oe";
my $base = "http://www2.marist.edu/htbin";
my $ref_links = extract_link($url, "", "a", "href");
foreach(@$ref_links)
{
my $sub_url = $_;
print "Parsing sub url: ".$sub_url."\n";
my $thread_links = extract_link($sub_url, $base, "a", "href");
foreach(@$thread_links)
{
print "GET\n";
print $_."\n";
get($_);
}
}
sub extract_link()
{
my $url = shift;
my $base= shift; # base URL
my $mytag = shift; # specified html tag name, such as a, form ...
my $attr_name = shift; # link pattern
$base =~ s/\/$//g;
$ua = LWP::UserAgent->new or dir $!;
# Set up a callback that collect image links
my @links = ();
sub callback {
my($tag, %attr) = @_;
return if $tag ne $mytag; # we only look closer at <img ...>
push(@links, $attr{$attr_name});
}
# Make the parser. Unfortunately, we don't know the base yet
# (it might be different from $url)
my $p = HTML::LinkExtor->new(\&callback);
# Request document and parse it as it arrives
my $res = $ua->request(HTTP::Request->new(GET => $url),
sub {$p->parse($_[0])}) or die $!;
# Expand all image URLs to absolute ones
$base or $base = $res->base;
@links = map { $_= url($_, $base)->abs; } @links;
return \@links;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.