echristo66
02-02-2009, 04:19 PM
I need to exclude .htm and .doc from my search engine. I have provided the actual script below which was taken from a book and modified by another. I am a perl novice, the script works fine for my small site, just want to exclude a few file types.
#!/usr/local/bin/perl
# use strict;
use CGI;
# use CGIBook::Error;
use CGI::Carp 'fatalsToBrowser';
my $DOCUMENT_ROOT = "/usr/local/apache2/htdocs/Continual_Learning/";
my $VIRTUAL_PATH = "../Continual_Learning";
local( *LOG );
open(LOG,">>dcjsrch.log");
my $q = new CGI;
my $spc = " ";
my $typed_query = $q->param( "query" );
my $new_query = $q->param( "newquery" );
$ip = $ENV{'REMOTE_ADDR'}; # "dotted quad" IP address
print LOG &sql_time, " ===== Search CLS start: query=$typed_query; IP=$ip;\n";
my $query;
my $results;
if ($new_query eq "") {
$query = quotemeta ( $typed_query );
} else {
$typed_query = $new_query;
$query = quotemeta ( $new_query );
}
$results = search ( $q, $query );
print $q->header( "text/html" ),
$q->start_html( "CLS Search" ),
$q->start_form,
$spc, "Enter New Search: ",
$q->textfield(-name=>'newquery',-default=>"",-size=>20),
$q->submit(-label=>'Submit'),
$q->br, $q->hr(),
$q->end_form,
$q->h1( "Search for: $typed_query" ),
$q->ul( $results || "No matches found" ),
$q->end_html;
sub search {
my ( $q, $query ) = @_;
my ( $t1f, $cmd, %matches, $file, @line, $results );
my ( $content, $num_matches, $link, $nbr_htm );
my ( $ilb, $irb, $my_title, %title, $key, $path );
my ( $left, $right, $ls );
local( *FILE );
$t1f = "/tmp/dcjsr";
$cmd = "cd $DOCUMENT_ROOT; find \. -exec grep -i $query {} \\; -print >$t1f";
print LOG " cmd=$cmd;\n";
system($cmd);
close($t1f);
open(FILE, "<$t1f") || die "Cannot open $t1f for reading.\n";
$cmd = "";
$num_matches = 0;
$my_title = "_no_title_found_";
while(<FILE>) {
chomp($_);
$ilb = index($_, "<title>");
if ($ilb >= 0) {
$irb = index($_, "</title>");
$my_title = substr($_,$ilb+7,($irb-$ilb-7));
# print LOG " found title=$my_title;\n";
}
if (/$query/io ) {
$num_matches++;
$ilb = index($_, '<');
if ($ilb >= 0) {
$irb = index($_, '>', $ilb+1);
} else {
$irb = 0;
}
while ( ($ilb >= 0) && ($irb >= 0) && ($ilb < $irb) ) {
$ls = length($_);
# print LOG " match.nbr=$num_matches; ilb=$ilb; irb=$irb; ls=$ls;\n";
if ($ilb > 0) {
$left = substr($_,0,$ilb);
} else {
$left = "";
}
if ($irlb < $ls) {
$right = substr($_,$irb+1);
} else {
$right = "";
}
$_ = $left . $right;
$ilb = index($_, '<');
if ($ilb >= 0) {
$irb = index($_, '>', $ilb+1);
} else {
$irb = 0;
}
}
$_ = html_escape( $_ );
s|($query)|<B>$1</B>|gio;
# print LOG " txt found in=$_;\n";
if ($cmd eq "") {
$cmd = $_;
} else {
$cmd = $cmd . "~" . $_;
}
} else {
# print LOG " dir.path=$_;\n";
$key = sprintf("%04d%s", (9999-$num_matches), $_);
$matches{$key} = $cmd;
$title{$key} = $my_title;
$cmd = "";
$num_matches = 0;
$my_title = "_no_title_found_";
}
}
close(FILE);
$results = "";
foreach $file (sort keys %matches) {
$num_matches = 9999 - substr($file,0,4);
$path = substr($file,4);
$my_title = $title{$file};
if ($my_title eq "_no_title_found_") {
$irb = rindex($path,'/');
if ($irb > 0) {
$my_title = substr($path,$irb+1);
} else {
$my_title = "_garbage_title_";
}
}
$content = $matches{$file};
$content =~ s|~|<br>|gio;
# print LOG " $num_matches lines in path=$path;\n";
print "$file : $matches{$file}\n";
$link = $q->a( { -href => "$VIRTUAL_PATH/$path" }, $my_title );
# print LOG " link=$link;\n";
$results .= $q->p( $q->b( $link ) . " ($num_matches matches)" .
$q->br . $content);
} # print each pathname followed by its matches.
return $results;
}
sub html_escape {
my ( $text ) = @_;
$text =~ s/&/&/g;
$text =~ s/</</g;
$text =~ s/>/>/g;
return $text;
}
sub sql_time {
#
# sql_time - return the current hour,minute,second as an ASCII
# string formatted like MySQL datetime: yyyy-mm-dd HH:MM:SS
#
local($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
sprintf("%04d-%02d-%02d %02d:%02d:%02d",
$year+1900,$mon+1,$mday,$hour,$min,$sec);
} # END sub sql_time
#!/usr/local/bin/perl
# use strict;
use CGI;
# use CGIBook::Error;
use CGI::Carp 'fatalsToBrowser';
my $DOCUMENT_ROOT = "/usr/local/apache2/htdocs/Continual_Learning/";
my $VIRTUAL_PATH = "../Continual_Learning";
local( *LOG );
open(LOG,">>dcjsrch.log");
my $q = new CGI;
my $spc = " ";
my $typed_query = $q->param( "query" );
my $new_query = $q->param( "newquery" );
$ip = $ENV{'REMOTE_ADDR'}; # "dotted quad" IP address
print LOG &sql_time, " ===== Search CLS start: query=$typed_query; IP=$ip;\n";
my $query;
my $results;
if ($new_query eq "") {
$query = quotemeta ( $typed_query );
} else {
$typed_query = $new_query;
$query = quotemeta ( $new_query );
}
$results = search ( $q, $query );
print $q->header( "text/html" ),
$q->start_html( "CLS Search" ),
$q->start_form,
$spc, "Enter New Search: ",
$q->textfield(-name=>'newquery',-default=>"",-size=>20),
$q->submit(-label=>'Submit'),
$q->br, $q->hr(),
$q->end_form,
$q->h1( "Search for: $typed_query" ),
$q->ul( $results || "No matches found" ),
$q->end_html;
sub search {
my ( $q, $query ) = @_;
my ( $t1f, $cmd, %matches, $file, @line, $results );
my ( $content, $num_matches, $link, $nbr_htm );
my ( $ilb, $irb, $my_title, %title, $key, $path );
my ( $left, $right, $ls );
local( *FILE );
$t1f = "/tmp/dcjsr";
$cmd = "cd $DOCUMENT_ROOT; find \. -exec grep -i $query {} \\; -print >$t1f";
print LOG " cmd=$cmd;\n";
system($cmd);
close($t1f);
open(FILE, "<$t1f") || die "Cannot open $t1f for reading.\n";
$cmd = "";
$num_matches = 0;
$my_title = "_no_title_found_";
while(<FILE>) {
chomp($_);
$ilb = index($_, "<title>");
if ($ilb >= 0) {
$irb = index($_, "</title>");
$my_title = substr($_,$ilb+7,($irb-$ilb-7));
# print LOG " found title=$my_title;\n";
}
if (/$query/io ) {
$num_matches++;
$ilb = index($_, '<');
if ($ilb >= 0) {
$irb = index($_, '>', $ilb+1);
} else {
$irb = 0;
}
while ( ($ilb >= 0) && ($irb >= 0) && ($ilb < $irb) ) {
$ls = length($_);
# print LOG " match.nbr=$num_matches; ilb=$ilb; irb=$irb; ls=$ls;\n";
if ($ilb > 0) {
$left = substr($_,0,$ilb);
} else {
$left = "";
}
if ($irlb < $ls) {
$right = substr($_,$irb+1);
} else {
$right = "";
}
$_ = $left . $right;
$ilb = index($_, '<');
if ($ilb >= 0) {
$irb = index($_, '>', $ilb+1);
} else {
$irb = 0;
}
}
$_ = html_escape( $_ );
s|($query)|<B>$1</B>|gio;
# print LOG " txt found in=$_;\n";
if ($cmd eq "") {
$cmd = $_;
} else {
$cmd = $cmd . "~" . $_;
}
} else {
# print LOG " dir.path=$_;\n";
$key = sprintf("%04d%s", (9999-$num_matches), $_);
$matches{$key} = $cmd;
$title{$key} = $my_title;
$cmd = "";
$num_matches = 0;
$my_title = "_no_title_found_";
}
}
close(FILE);
$results = "";
foreach $file (sort keys %matches) {
$num_matches = 9999 - substr($file,0,4);
$path = substr($file,4);
$my_title = $title{$file};
if ($my_title eq "_no_title_found_") {
$irb = rindex($path,'/');
if ($irb > 0) {
$my_title = substr($path,$irb+1);
} else {
$my_title = "_garbage_title_";
}
}
$content = $matches{$file};
$content =~ s|~|<br>|gio;
# print LOG " $num_matches lines in path=$path;\n";
print "$file : $matches{$file}\n";
$link = $q->a( { -href => "$VIRTUAL_PATH/$path" }, $my_title );
# print LOG " link=$link;\n";
$results .= $q->p( $q->b( $link ) . " ($num_matches matches)" .
$q->br . $content);
} # print each pathname followed by its matches.
return $results;
}
sub html_escape {
my ( $text ) = @_;
$text =~ s/&/&/g;
$text =~ s/</</g;
$text =~ s/>/>/g;
return $text;
}
sub sql_time {
#
# sql_time - return the current hour,minute,second as an ASCII
# string formatted like MySQL datetime: yyyy-mm-dd HH:MM:SS
#
local($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
sprintf("%04d-%02d-%02d %02d:%02d:%02d",
$year+1900,$mon+1,$mday,$hour,$min,$sec);
} # END sub sql_time