This is an enhanced version of my webbot script for extracting ad information from vansky.com.
---------------------------------------------
#!/usr/bin/perl -w
# Hao Chen
# The purpose of this script is to extract ad information from the vansky.com
# website and write the data to the grab.dat and email.dat files.
#
# The grab.log file records the id range of the ads already grabbed, to avoid
# redundant work.
#
use strict;
use LWP::UserAgent;
# --- Endpoints ---------------------------------------------------------------
# Listing page; it is fetched once to discover the newest ad id.
my $url_hp = 'http://www.vansky.com/vanphp/gg/newsgroup.php';
# Prefix of a single ad page; the numeric ad id is appended to it.
my $url_root = 'http://www.vansky.com/vanphp/gg/shownews.php?id=';

# --- Crawl range and pacing --------------------------------------------------
# First and last ad id to fetch.  These are only defaults: the code below
# replaces $end with the id found on the listing page and $start with the
# value recovered from grab.log.
my ( $start, $end ) = ( 50000, 0 );
# Seconds to sleep after an ad page has been fetched successfully.
my $wait = 1;
# When true, progress lines are printed to STDERR.
my $verbose = 1;

# --- HTTP client -------------------------------------------------------------
# The site is queried with a browser-like User-Agent header.
my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0' );
# Scratch variables for the requests issued below.
my ( $url, $req, $res );
# Discover the newest ad id: fetch the listing page and take the id from the
# first shownews.php link that follows a "pageno_c=" marker on the same line.
# If the page cannot be fetched or contains no such link, $end keeps its
# current value and the problem is reported on STDERR instead of being
# swallowed silently.
$req = HTTP::Request->new( GET => $url_hp );
$res = $ua->request( $req );
if ( $res->is_success )
{
    # Without /s the dot never crosses a newline, so the match stays within
    # one line of the page, exactly as a line-by-line scan would.  \d+
    # (instead of \d*) guarantees $end is never set to an empty string.
    if ( $res->content =~ /pageno_c=.*?shownews\.php\?id=(\d+)'\)/ )
    {
        $end = $1;
    }
    else
    {
        print STDERR "can't find the latest ad id in $url_hp\n";
    }
}
else
{
    print STDERR "can't fetch $url_hp: " . $res->status_line . "\n";
}
# Work out the id range for this run.
#
# grab.log holds one line per earlier run in the form
#   "<timestamp> => <first id> - <last id>"
# and only its last line is consulted.  A missing log file is treated the
# same as an empty one, so the very first run starts from the default $start
# instead of dying.
my @log;
if ( -e 'grab.log' )
{
    open( my $log_fh, '<', 'grab.log' ) or die "Can't open file grab.log\n";
    @log = <$log_fh>;
    close( $log_fh );
}
if ( @log && $log[-1] =~ / => (\d+) - (\d+)/ )
{
    # The previous run fetched its last id as well (the grab loop is
    # inclusive), so resume one past it to avoid grabbing that ad twice.
    $start = $2 + 1;
}
else
{
    print STDERR "brand new task: start = $start\n";
}
# $start is now the first id not yet grabbed, so a single new ad
# ($end == $start) still counts as work to do.
if ( $end >= $start )
{
    print STDERR "new grab task: $start - $end\n";
}
else
{
    print STDERR "no new grab tasks!\n";
    exit;
}
# Record this run in grab.log, announce it on STDOUT, then open the two data
# files in append mode and stamp each with a header line for this run.
# FILE (grab.dat) and EMAIL (email.dat) stay open for the grab loop below.
my $stamp = localtime;    # scalar context: a human-readable timestamp string

open( my $log_out, '>>', 'grab.log' ) or die "Can't open file grab.log\n";
print {$log_out} "$stamp => $start - $end\n";
close( $log_out );

print "######## grab $start to $end #########\n";

open( FILE,  '>>', 'grab.dat' )  or die "Can't open file grab.dat\n";
open( EMAIL, '>>', 'email.dat' ) or die "Can't open file email.dat\n";
print FILE "##### $stamp => grab ad. $start to $end\n";
print EMAIL "##### $stamp => grab email. $start to $end\n";
# Fetch every ad page with an id in $start .. $end (inclusive), pull the ad
# fields out of the HTML and append them to grab.dat (FILE); addresses that
# look like an e-mail are also appended, lower-cased, to email.dat (EMAIL).
# Pages that cannot be fetched are reported (when $verbose) and skipped.
for my $id ( $start .. $end )
{
    $url = $url_root . $id;
    print STDERR $url . "\n" if ( $verbose );
    $req = HTTP::Request->new( GET => $url );
    $res = $ua->request( $req );
    if ( !$res->is_success )
    {
        # Say why the id was skipped instead of dropping it silently.
        print STDERR $id . ' : request failed: ' . $res->status_line . "\n" if ( $verbose );
        next;
    }

    # Collect the ad markup: everything from the first line mentioning
    # "Address:" up to and including the first line containing "</pre>",
    # joined into one string without line breaks.
    my $the_ad   = '';    # stays '' (not undef) when the page has no ad
    my $start_ad = 0;
    for my $line ( split( "\n", $res->content ) )
    {
        # split already removed "\n"; drop only a trailing "\r" (CRLF pages).
        # A blind chop would eat a real character on LF-only pages, e.g. the
        # '>' of "</pre>", and the end of the ad would never be detected.
        $line =~ s/\r\z//;
        $start_ad = 1 if ( $line =~ /Address:/ );    # found the ad. line
        $the_ad .= $line if ( $start_ad );
        last if ( $line =~ /<\/pre>/ );              # end of ad.
    }

    if ( $the_ad =~ /<font color=darkblue size=5>(.*?)<\/td>.*Author: <\/b>(.*?)<\/td>.*Email:<\/b> (.*?)<\/td>.*Tel\.:<\/b><\/td><td align=left>(.*?)<\/td>.*Address:<\/b> (.*?)<\/td>.*<pre>(.*?)<\/pre>/ )
    {
        my ( $title, $author, $email, $tel, $address, $ad ) = ( $1, $2, $3, $4, $5, $6 );

        # Only values shaped like "x@y.z" are treated as e-mail addresses.
        if ( $email =~ /.+\@.+\..+/ )
        {
            $email =~ s/ //g;
            print EMAIL lc( $email ) . "\n";
        }
        print STDERR $id . ' : ' . $tel . ' : ' . lc( $email ) . "\n" if ( $verbose );

        # Remove stray line-break characters only.  The former class
        # [\n|\r] also deleted literal '|' characters from the ad text.
        $ad =~ tr/\r\n//d;
        print FILE $id . ' : ' . $title . ' : ' . $author . ' : ' . $email . ' : ' . $tel . ' : ' . $address . ' : ' . $ad . "\n";
    }

    # Pause between successfully fetched pages.
    sleep $wait;
}
# Flush and close both data files.  Buffered write errors (a full disk, for
# example) only surface at close time, so report them instead of ignoring
# the return value.
close( FILE )  or warn "Can't close file grab.dat: $!\n";
close( EMAIL ) or warn "Can't close file email.dat: $!\n";
exit;