joboy84
07-06-2008, 01:20 PM
Hello,
I just begun to develop perl scripts.
My script:
1) It connects to mysql db to get some url's
2) It grab some other url's from given websites
3) It try to get the ip of the server where grabbed url's are hosted.
The script work without any problem with url's like http://www.mysite.com (from database)
But when the url look's like this:
http://www.mysite.com/index.php?method=pv&cat=&start=41
It will not consider the variable...It's like I entered http://www.mysite.com/index.php
Can you help me please?
Script:
#!/usr/bin/perl
use warnings;
umask 0;
use Config;
use DBI;
use DBD::mysql;
use LWP::Simple;
use threads;
use strict;
use CGI;
use CGI::Carp qw(warningsToBrowser fatalsToBrowser);
my $cgi = CGI->new;
print $cgi->header;
$| = 1;
$Config{useithreads} or die "Recompile Perl with threads to run this program. \n";
my @threads;
my $threadcount = 50;
my $enablethreading = 1;
my $dbh;
sub connectDB {
my $hostname = "localhost";
my $database = "xxxx";
my $username = "xxx";
my $password = "xxxxx";
$dbh = DBI->connect("DBI:mysql:$database:$hostname", $username, $password) or die "Can't connect to the DB: $DBI::errstr\n";
}
sub processHost {
my ($host) = @_;
my ($hostonly) = $host =~ m/[a-z0-9][a-z0-9-.]*[.][a-z.]+[a-z]/ig;
my ($ip) = `host -W 3 -t A $hostonly 2>/dev/null` =~ m/[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+/g;
connectDB();
if ($ip) {
print "<b>[ " . threads->self()->tid() . " ]</b> <a href=\"http://$host\">$host</a> - $ip <br/>\n";
$dbh->prepare("INSERT IGNORE INTO listeIp (Ipadress, url, Type, InsertDate) VALUES ('$ip', '$host', 'WebProxy', NOW())")->execute();
} else {
print "<b>[ " . threads->self()->tid() . " ]</b> <a href=\"http://$host\">$host</a> - NO IP <br/>\n";
$dbh->prepare("INSERT IGNORE INTO UrltoVerify (url) VALUES ('$host')")->execute();
}
}
sub parseHosts {
my ($url) = @_;
my $content = `curl --header "User-agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14" $url 2>&1`;
my $hostpattern = "[a-z0-9][a-z0-9-.]*[.][a-z.]+[a-z]/?[a-z0-9_./-]*";
my $p1 = "href=[\"]($hostpattern)[\"] onclick";
my $p2 = "href=[\"]http://($hostpattern)[\"] onclick";
my $c = 0;
while ($content =~ m/($p1)|($p2)/igs) {
if ($2) {
processHost($2);
} elsif ($4) {
processHost($4);
}
}
}
sub parseHostsThreaded {
my ($url) = @_;
for (my $t = 0; $t < $threadcount; $t++) {
if (!$threads[$t]) {
$threads[$t] = threads->new(\&parseHosts, $url);
return;
}
}
closeOpenThreads();
parseHostsThreaded($url);
}
sub closeOpenThreads {
for (my $t = 0; $t < $threadcount; $t++) {
if ($threads[$t]) {
$threads[$t]->join();
$threads[$t] = 0;
}
}
}
connectDB();
my $sth = $dbh->prepare("SELECT * FROM listeUrl WHERE category=1");
$sth->execute;
my $t = -1;
while (my @row = $sth->fetchrow_array()) {
if ($enablethreading) {
parseHostsThreaded($row[1]);
} else {
parseHosts($row[1]);
}
}
closeOpenThreads();
I just begun to develop perl scripts.
My script:
1) It connects to mysql db to get some url's
2) It grab some other url's from given websites
3) It try to get the ip of the server where grabbed url's are hosted.
The script work without any problem with url's like http://www.mysite.com (from database)
But when the url look's like this:
http://www.mysite.com/index.php?method=pv&cat=&start=41
It will not consider the variable...It's like I entered http://www.mysite.com/index.php
Can you help me please?
Script:
#!/usr/bin/perl
use warnings;
umask 0;
use Config;
use DBI;
use DBD::mysql;
use LWP::Simple;
use threads;
use strict;
use CGI;
use CGI::Carp qw(warningsToBrowser fatalsToBrowser);
my $cgi = CGI->new;
print $cgi->header;
$| = 1;
$Config{useithreads} or die "Recompile Perl with threads to run this program. \n";
my @threads;
my $threadcount = 50;
my $enablethreading = 1;
my $dbh;
sub connectDB {
my $hostname = "localhost";
my $database = "xxxx";
my $username = "xxx";
my $password = "xxxxx";
$dbh = DBI->connect("DBI:mysql:$database:$hostname", $username, $password) or die "Can't connect to the DB: $DBI::errstr\n";
}
sub processHost {
my ($host) = @_;
my ($hostonly) = $host =~ m/[a-z0-9][a-z0-9-.]*[.][a-z.]+[a-z]/ig;
my ($ip) = `host -W 3 -t A $hostonly 2>/dev/null` =~ m/[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+/g;
connectDB();
if ($ip) {
print "<b>[ " . threads->self()->tid() . " ]</b> <a href=\"http://$host\">$host</a> - $ip <br/>\n";
$dbh->prepare("INSERT IGNORE INTO listeIp (Ipadress, url, Type, InsertDate) VALUES ('$ip', '$host', 'WebProxy', NOW())")->execute();
} else {
print "<b>[ " . threads->self()->tid() . " ]</b> <a href=\"http://$host\">$host</a> - NO IP <br/>\n";
$dbh->prepare("INSERT IGNORE INTO UrltoVerify (url) VALUES ('$host')")->execute();
}
}
sub parseHosts {
my ($url) = @_;
my $content = `curl --header "User-agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14" $url 2>&1`;
my $hostpattern = "[a-z0-9][a-z0-9-.]*[.][a-z.]+[a-z]/?[a-z0-9_./-]*";
my $p1 = "href=[\"]($hostpattern)[\"] onclick";
my $p2 = "href=[\"]http://($hostpattern)[\"] onclick";
my $c = 0;
while ($content =~ m/($p1)|($p2)/igs) {
if ($2) {
processHost($2);
} elsif ($4) {
processHost($4);
}
}
}
sub parseHostsThreaded {
my ($url) = @_;
for (my $t = 0; $t < $threadcount; $t++) {
if (!$threads[$t]) {
$threads[$t] = threads->new(\&parseHosts, $url);
return;
}
}
closeOpenThreads();
parseHostsThreaded($url);
}
sub closeOpenThreads {
for (my $t = 0; $t < $threadcount; $t++) {
if ($threads[$t]) {
$threads[$t]->join();
$threads[$t] = 0;
}
}
}
connectDB();
my $sth = $dbh->prepare("SELECT * FROM listeUrl WHERE category=1");
$sth->execute;
my $t = -1;
while (my @row = $sth->fetchrow_array()) {
if ($enablethreading) {
parseHostsThreaded($row[1]);
} else {
parseHosts($row[1]);
}
}
closeOpenThreads();