This script demonstrates how to use LWP::UserAgent to extract useful information, such as email addresses, from a website.
g.pl
#!/usr/bin/perl -w
# Hao Chen
# The purpose of this script is to extract email addresses from the vansky.com website.
#
use strict;
use LWP::UserAgent;
# Crawl a range of posting pages, pull the first "Email:" field out of each
# page, and count how many pages every address appears on. Progress goes to
# STDERR; the final list of unique addresses goes to STDOUT.

# Base URL of a posting; the numeric posting id is appended to it.
my $url_root = 'http://www.vansky.com/vanphp/gg/shownews.php?id=';
# starting id (inclusive)
my $start = 10600;
# ending id (exclusive: the last page fetched is $end - 1)
my $end = 12000;
# seconds to pause after every request, successful or not
my $wait = 1;
# seconds before a stalled request is abandoned (LWP's default is 180)
my $timeout = 30;
# when true, progress and diagnostics are written to STDERR
my $verbose = 1;

my $ua = LWP::UserAgent->new;
$ua->agent( 'Mozilla/5.0' );
$ua->timeout( $timeout );

# email address => number of pages it was found on
my %emails;

for my $id ( $start .. $end - 1 )
{
    my $url = $url_root . $id;
    print STDERR $url . "\n" if ( $verbose );

    # get() issues a plain GET request for the URL.
    my $res = $ua->get( $url );
    if ( $res->is_success )
    {
        # Only the first "Email:" cell on a page is examined. "\n" is
        # excluded from the captured class so the match can never span
        # lines, which keeps this equivalent to scanning line by line and
        # stopping at the first line that matches.
        if ( $res->content =~ m{Email:</b> ([^<>/\n]*)</td></tr><tr>} )
        {
            my $email = $1;
            # Loose sanity check: something@something.something
            if ( $email =~ /.+\@.+\..+/ )
            {
                print STDERR $email . "\n" if ( $verbose );
                $emails{ $email }++;
            }
        }
    }
    else
    {
        # Report the failure instead of dropping the page silently.
        print STDERR "failed: " . $res->status_line . "\n" if ( $verbose );
    }

    # Pause after every request, including failed ones, so that a run of
    # errors does not turn into back-to-back requests against the server.
    sleep $wait;
}

print "######## email from $start to $end #########\n";
# Sorted so that repeated runs over the same data produce identical output.
foreach my $address ( sort keys %emails )
{
    print STDERR "$address => $emails{$address}\n";
    print $address . "\n";
}