#!/usr/longitude/bin/perl -w
# YAPC::Europe 2001
# "Parallel Processing with Perl and PVM" - Benjamin Holzman: Longitude, Inc.
# Listing One : "spider"
#
# Serial web spider.  Starting from a base URI (first command-line argument,
# default http://127.0.0.1/), fetch pages breadth-first, follow every link
# that stays under the base URI, and finally print one URI per distinct page
# body (pages are de-duplicated by the MD5 digest of their content).

use strict;
require LWP::UserAgent;
require URI;
require HTML::Parse;
use Digest::MD5 qw(md5);

my $ua   = LWP::UserAgent->new();
my $base = $ARGV[0] || "http://127.0.0.1/";

my(%visited);      # md5(content) -> first URI that produced that content
my(%requested);    # URI -> true once a GET has been issued for it
my(@URIs) = ($base);

# Each pass of the outer loop handles one "generation" of URIs and collects
# the next generation in %pending; the crawl ends when a pass finds nothing
# new to fetch.
while (@URIs) {
    my %pending = ();

    for my $reqURI (@URIs) {
        # Never issue the same request twice, even if many pages link to it.
        next if $requested{$reqURI}++;

        my $request  = HTTP::Request->new('GET', $reqURI);
        my $response = $ua->request($request);
        next unless $response->is_success;

        # Identical bodies reachable under different URIs are reported once.
        my $key = md5($response->content);
        next if exists $visited{$key};
        $visited{$key} = $reqURI;

        # Resolve relative links against the response's own base so that
        # redirects do not make us build URIs relative to the wrong location.
        for my $uri (extract_uris($response->content_ref, $response->base)) {
            # \Q...\E: the base is a literal prefix, not a pattern; without
            # quoting, characters such as '.' and '?' act as metacharacters.
            $pending{$uri} = 1 if $uri =~ /^\Q$base\E/;
        }
    }

    @URIs = grep { !$requested{$_} } keys %pending;
}

print "$_\n" for sort values %visited;

# extract_uris(\$html, $base)
#
# Parse the HTML document referenced by the first argument and return a list
# of absolute URI strings, one per link found in it.  Relative links are made
# absolute against $base.  Each returned string is rebuilt from the URI's
# scheme and opaque part, so any "#fragment" is dropped.
sub extract_uris {
    my($content, $base) = @_;

    my @uris;
    my $tree = HTML::Parse::parse_html($$content);

    for my $found (@{ $tree->extract_links }) {
        my($link) = @$found;    # first element of each entry is the link text

        my $uri    = URI->new($link)->abs($base);
        my $scheme = $uri->scheme;
        next unless defined $scheme;    # cannot be made absolute; skip it

        push @uris, join(":", $scheme, $uri->opaque);
    }

    # HTML::Element trees contain circular references and must be torn down
    # explicitly, otherwise every parsed page stays in memory.
    $tree->delete;

    return @uris;
}