First and foremost, I am a PHP novice, so if there's a better or more efficient way of doing what I'm trying to do, please feel free to point it out.
I came across an old PHP script that was used to crawl a site and check the response code of each page it found. I have modified it to do a duplicate-content check. It uses the similar_text function to compare one page's content (specified by the user) against the content of each page it finds.
It's a little slow, but it's working. The only problem I'm having is that it stops after about the first 10 links, and I can't figure out why.
// Derive the domain used for same-site link filtering from the start URL,
// then report it together with the configured maximum depth ($i).
$domain = extract_domain_name($baseurl);
echo '<p class="small">Extracted domain name: <strong>', $domain, '</strong>. ';
echo 'Maximum depth: <strong>', $i, '</strong></p>';
/**
 * Download $page with cURL and scan its HTML for <a href> links that belong
 * to the global $domain.
 *
 * Returns false when the download produced no content. The rest of the body
 * (and therefore the success return value) is not visible in this excerpt.
 *
 * NOTE(review): this excerpt is truncated; the braces opened below are not
 * closed here.
 */
function get_urls($page){
global $domain, $i;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $page);
/* Return the response as a string instead of printing it */
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
/* Response headers are included at the start of the returned string */
curl_setopt($ch, CURLOPT_HEADER, true);
/* Spoof the User-Agent header value; just to be safe */
curl_setopt($ch, CURLOPT_USERAGENT,
'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
/* Timeouts (in seconds) for connecting and for the whole transfer, so the
script does not get stuck downloading huge files or waiting on a
nonresponsive server.
NOTE(review): 100 seconds per page is far longer than PHP's usual
max_execution_time; a single slow page can end the whole crawl. */
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100);
curl_setopt($ch, CURLOPT_TIMEOUT, 100);
/* FAILONERROR is set to 0 (off), so HTTP error responses such as
404 Not Found are NOT treated as cURL failures; their bodies are
returned like any other page. */
curl_setopt($ch, CURLOPT_FAILONERROR, 0);
/* Download the page */
$html = curl_exec($ch);
/* Report a transfer error.
NOTE(review): this calls curl_exec() a second time, so every page is
requested twice; testing $html === false would check the first transfer. */
if(curl_exec($ch) === false)
{
echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">' . curl_error($ch).'</p></strong>';
}
curl_close($ch);
/* Nothing usable was downloaded */
if(!$html) return false;
/* Extract the BASE tag (if present) for
relative-to-absolute URL conversions later */
if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i',$html, $matches)){
/* Collect the href value of every <a> tag into $m[1] */
preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m);
/* this regexp is a combination of numerous
versions I saw online*/
foreach($m[1] as $url) {
$url=trim($url);
/* Strip a trailing PHPSESSID parameter, a trailing #fragment and
javascript: links.
NOTE(review): the '/&/' => '&' pair is a no-op as written; presumably
it was meant to decode an HTML-escaped ampersand — confirm. */
$url=preg_replace(
array('/([\?&]PHPSESSID=\w+)$/i','/(#[^\/]*)$/i', '/&/','/^(javascript:.*)/i'),
array('','','&',''),
$url);
/* Turn relative URLs into absolute URLs; relative2absolute() is
defined elsewhere.
NOTE(review): $base_url is not assigned anywhere in the visible code;
presumably it should come from the BASE match ($matches[1]) or fall
back to $page — confirm. */
$url = relative2absolute($base_url, $url);
// keep only URLs whose host ends with (a sub-domain of) $domain
if(preg_match("/^http[s]?:\/\/[^\/]*".str_replace('.', '\.', $domain)."/i", $url))
{
$depth= substr_count($url, "/")-2 ;
/* Link depth = number of slashes in the URL minus 2
(presumably discounting the "//" after the scheme).
*/
// Functions to crawl the next page
/**
 * Find the first entry of the global $pages array whose value is still
 * empty (loosely equal to NULL), print its 1-based position as "[n] - ",
 * and return its key. Returns NULL when no such entry exists.
 */
function next_page(){
    global $pages;
    $position = 0;
    foreach ($pages as $candidate => $status) {
        $position++;
        if ($status == NULL) {
            echo "[$position] - ";
            return $candidate;
        }
    }
    return NULL;
}
/**
 * Crawl one page: collect its links via get_urls(), prepare a header-only
 * cURL request for the same page, and load the two documents to compare
 * (the user-supplied reference page and the crawled page).
 *
 * NOTE(review): this excerpt is truncated; the comparison itself, the
 * bookkeeping on $pages and the closing brace are not visible here.
 */
function add_urls($page){ // adds new unique URLs to the array and checks each URL's server header status
global $pages, $maxlinks;
// NOTE(review): microtime() without `true` returns a "msec sec" string, so
// the subtraction below does not yield elapsed seconds; microtime(true)
// returns a float that can be subtracted.
$start = microtime();
$urls = get_urls($page);
$resptime = microtime() - $start; // timing makes it possible to find out on which page the crawler stops responding.
// Start checking the server header: header-only request (no body), following redirects
$ch = curl_init($page);
curl_setopt($ch, CURLOPT_NOBODY, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
// If the status code is 200, then print OK, else = NO
// if($info['http_code']==200) {
// Reference page chosen by the user, and the page currently being crawled.
// NOTE(review): the reference page is downloaded again on every call, and
// $page is fetched here in addition to the requests made by get_urls();
// fetching the reference once outside the crawl loop would avoid repeated
// downloads.
// NOTE(review): $_POST['page1'] is passed to file_get_contents() unvalidated,
// so a visitor can make the server read arbitrary URLs or local paths.
$page1 = ($_POST['page1']);
$page1data = file_get_contents($page1);
$page2 = file_get_contents($page);
// The start URL is always reported as link number 1 and crawled first.
echo '[1] - ';
add_urls($baseurl);
// Keep crawling whatever next_page() hands back until it reports nothing left.
for ($page = next_page(); $page != NULL; $page = next_page()) {
    add_urls($page);
}
echo '<p class="small">Amount of crawled links: <strong>', count($pages), '</strong></p>';
// Ending with fewer pages than $maxlinks means the site ran out of links.
if (count($pages) < $maxlinks) {
    echo '<p class="small">Sorry, no more links to crawl!!</p>';
}
}
?><?php
/**
 * Reduce a URL to the last two dot-separated labels of its host,
 * e.g. "http://www.example.com/page" -> "example.com".
 *
 * The scheme is optional and may be http:// or https:// (case-insensitive).
 * The original letter case of the host is preserved. Multi-part public
 * suffixes are not special-cased, so "www.example.co.uk" yields "co.uk".
 *
 * @param string $url URL or bare host name.
 * @return string The two-label domain, or '' when the input has no host or
 *                the host has fewer than two labels (e.g. "localhost").
 */
function extract_domain_name($url){
    // Skip an optional http:// or https:// prefix, then take everything up
    // to the first slash as the host. Without the "s?" an https URL would
    // yield "https:" as the host and no domain at all.
    if (!preg_match('/^(?:https?:\/\/)?([^\/]+)/i', (string) $url, $hostMatch)) {
        return '';
    }
    // Keep only the last two segments of the host name.
    if (!preg_match('/[^\.\/]+\.[^\.\/]+$/', $hostMatch[1], $domainMatch)) {
        return '';
    }
    return $domainMatch[0];
}
Bookmarks