Tuesday, February 21, 2006

A script to extract email addresses from the vansky.com website.



This script demonstrates how to use LWP::UserAgent to extract useful information, such as email addresses, from a website.

g.pl






#!/usr/bin/perl -w

# Hao Chen
# The purpose of this script is to extract email info. from vansky.com website
#

use strict;
use LWP::UserAgent;

# Base URL of a news item page; the numeric item id is appended to it.
my $url_root = 'http://www.vansky.com/vanphp/gg/shownews.php?id=';

# Range of item ids to scan: starting id and ending id.
my $start = 10600;
my $end   = 12000;

# Number of seconds handed to sleep() by the fetch loop.
my $wait = 1;

# HTTP client; it identifies itself with a browser-like User-Agent string.
my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0' );

# Working variables shared with the fetch loop further down.
my $url;
my $req;
my $res;
my $email;

# Tally of harvested addresses: address => number of times it was seen.
my %emails;

# When true, progress information is written to STDERR.
my $verbose = 1;

# Fetch every news item from $start through $end and tally the first
# plausible email address found on each page in %emails.
#
# The range is inclusive so that the "ending id" itself is scanned and the
# final "email from $start to $end" banner describes what was really fetched.
for my $id ( $start .. $end )
{
    my $page_url = $url_root . $id;
    print STDERR $page_url . "\n" if $verbose;

    my $response = $ua->get( $page_url );

    if ( $response->is_success )
    {
        LINE:
        foreach my $line ( split /\n/, $response->content )
        {
            # The address sits in the table cell that follows the "Email:" label.
            next LINE
                unless $line =~ m{Email:</b>&nbsp;([^<>/]*)</td></tr><tr>};

            my $address = $1;

            # Loose sanity check: something@something.something
            if ( $address =~ /.+\@.+\..+/ )
            {
                print STDERR $address . "\n" if $verbose;
                $emails{ $address }++;
            }

            # Only the first "Email:" line of a page is considered.
            last LINE;
        }
    }
    elsif ( $verbose )
    {
        # Report failures instead of dropping them silently.
        print STDERR "failed: $page_url (" . $response->status_line . ")\n";
    }

    # Throttle after every request, successful or not, so that a run of
    # missing ids does not hit the server back-to-back.
    sleep $wait;
}

# Final report.
#   STDOUT: a banner line followed by one unique address per line.
#   STDERR: each address together with the number of times it was seen.
# Addresses are sorted so that the output is identical from run to run;
# plain keys() order changes between runs because of hash randomization.
print "######## email from $start to $end #########\n";

foreach my $address ( sort keys %emails )
{
    print STDERR "$address => $emails{$address}\n";
    print $address . "\n";
}