Wednesday, February 22, 2006

Enhanced webbot to grab ad info from vansky.com

This is an enhanced version of my webbot script for extracting ad info from vansky.com.

---------------------------------------------
#!/usr/bin/perl -w

# Hao Chen
# The purpose of this script is to extract ad info. from vansky.com website
# and write the data to grab.dat, email.dat files.
#
# grab.log records the range of ad ids grabbed by each run, to avoid redundant work.
#

use strict;
use LWP::UserAgent;

# Index page, fetched first to discover the last ad id of this run.
my $url_hp = 'http://www.vansky.com/vanphp/gg/newsgroup.php';

# Prefix of a single ad page; the numeric ad id is appended to it.
my $url_root = 'http://www.vansky.com/vanphp/gg/shownews.php?id=';

# Id range of this run.  $start is the default first id, used when
# grab.log has no usable last entry; $end is filled in from the index page.
my $start = 50000;
my $end   = 0;

# Seconds to sleep after each ad page that was fetched successfully.
my $wait = 1;

# Non-zero prints progress lines to STDERR.
my $verbose = 1;

# One HTTP client shared by every request, sent with a browser-like
# User-Agent string.
my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0' );

# Scratch variables reused for each request.
my $url;
my $req;
my $res;

# Fetch the index page and set $end from the first line that matches the
# "pageno_c=...shownews.php?id=N')" pattern.  $end keeps its initial value
# when the request fails or no line matches.
# NOTE(review): presumably the first such link is the newest ad -- confirm
# against the live page layout.
$req = HTTP::Request->new( GET => $url_hp );
$res = $ua->request( $req );

if ( $res->is_success )
{
   foreach my $line ( split( "\n", $res->content ) )
   {
      # \d+ rather than \d*? so an empty id can never be captured; $end is
      # compared numerically later on.
      if ( $line =~ /pageno_c=.*?shownews\.php\?id=(\d+)'\)/ )
      {
         $end = $1;
         last;
      }
   }
} else
{
   # Report the failure instead of silently leaving $end untouched, which
   # would otherwise surface only as a misleading "no new grab tasks!".
   print STDERR "Can't fetch $url_hp: " . $res->status_line . "\n";
}

# Decide where this run starts, bail out when there is nothing new, and
# append the range about to be grabbed to grab.log.
#
# Each grab.log line has the form "<localtime> => <start> - <end>"; only the
# last line is consulted.

# A missing grab.log simply means this is the first run, so treat it as an
# empty log instead of dying before the "brand new task" branch is reached.
my @log;
if ( -e 'grab.log' )
{
   open( my $log_in, '<', 'grab.log' ) or die "Can't open file grab.log\n";
   @log = <$log_in>;
   close( $log_in );
}

my $num_lines = scalar @log;

if ( $num_lines && $log[ $num_lines - 1 ] =~ / => (\d+) - (\d+)/ )
{
   # The previous run fetched ids up to and including its end id, so
   # resume on the id after it rather than fetching that ad twice.
   $start = $2 + 1;
} else
{
   print STDERR "brand new task: start = $start\n";
}

# The range $start..$end is inclusive, so a single new id is still a task.
if ( $end >= $start )
{
   print STDERR "new grab task: $start - $end\n";
} else
{
   print STDERR "no new grab tasks!\n";
   exit;
}

open( my $log_out, '>>', 'grab.log' ) or die "Can't open file grab.log\n";
my $currTime = localtime;
print {$log_out} $currTime . ' => ' . $start . ' - ' . $end . "\n";

# Buffered write errors only show up at close time, so check it.
close( $log_out ) or die "Can't write file grab.log\n";

# Fetch every ad page from $start to $end (inclusive) and append what can be
# parsed out of it to grab.dat (one line per ad) and email.dat (one
# lower-cased address per line).  Pages that fail to download or do not
# match the expected layout are skipped.

print "######## grab $start to $end #########\n";

open( my $ad_fh,    '>>', 'grab.dat' )  or die "Can't open file grab.dat\n";
open( my $email_fh, '>>', 'email.dat' ) or die "Can't open file email.dat\n";

print {$ad_fh} "##### $currTime => grab ad. $start to $end\n";
print {$email_fh} "##### $currTime => grab email. $start to $end\n";

for ( my $id = $start; $id <= $end; $id++ )
{
   $url = $url_root . $id;
   print STDERR $url . "\n" if ( $verbose );

   $req = HTTP::Request->new( GET => $url );
   $res = $ua->request( $req );

   if ( $res->is_success )
   {
      # Concatenate the page lines from the first one containing "Address:"
      # up to and including the first one containing "</pre>".
      # Start from '' so a page without "Address:" does not trigger an
      # uninitialized-value warning in the match below.
      my $the_ad   = '';
      my $start_ad = 0;
      foreach my $line ( split( "\n", $res->content ) )
      {
         # Drop only a trailing carriage return (CRLF pages).  A blind chop
         # would eat a real character on LF-only pages and could break the
         # </pre> detection.
         $line =~ s/\r\z//;

         $start_ad = 1 if ( $line =~ /Address:/ );    #found the ad. line
         $the_ad .= $line if ( $start_ad );
         last if ( $line =~ /<\/pre>/ );              #end of ad.
      }

      if ( $the_ad =~ /<font color=darkblue size=5>(.*?)<\/td>.*Author:&nbsp;&nbsp; <\/b>(.*?)<\/td>.*Email:<\/b>&nbsp;(.*?)<\/td>.*Tel\.:<\/b><\/td><td align=left>(.*?)<\/td>.*Address:<\/b>&nbsp;(.*?)<\/td>.*<pre>(.*?)<\/pre>/ )
      {
         my ( $title, $author, $email, $tel, $address, $ad ) = ( $1, $2, $3, $4, $5, $6 );

         # Only values that look like an address go to email.dat.
         if ( $email =~ /.+\@.+\..+/ )
         {
            $email =~ s/ //g;
            print {$email_fh} lc( $email ) . "\n";
         }
         print STDERR $id . ' : ' . $tel . ' : ' . lc( $email ) . "\n" if ( $verbose );

         # Keep each ad on one output line.  tr/// deletes CR and LF only;
         # the former character class [\n|\r] also stripped literal '|'.
         $ad =~ tr/\r\n//d;

         print {$ad_fh} $id . ' : ' . $title . ' : ' . $author . ' : ' . $email . ' : ' . $tel . ' : ' . $address . ' : ' . $ad . "\n";
      }
      sleep $wait;
   }

}

# Buffered write errors only show up at close time, so check both handles.
close( $ad_fh )    or die "Can't write file grab.dat\n";
close( $email_fh ) or die "Can't write file email.dat\n";

exit;