/    Sign up×
Community /Pin to ProfileBookmark

Where Is this Link Coming From in Xampp ?

Php cURL Folks,

I found this Web crawler:
https://stackoverflow.com/questions/2313107/how-do-i-make-a-simple-crawler-in-php

[code]
<?php

class crawler
{
protected $_url;
protected $_depth;
protected $_host;
protected $_useHttpAuth = false;
protected $_user;
protected $_pass;
protected $_seen = array();
protected $_filter = array();

public function __construct($url, $depth = 5)
{
$this->_url = $url;
$this->_depth = $depth;
$parse = parse_url($url);
$this->_host = $parse[‘host’];
}

protected function _processAnchors($content, $url, $depth)
{
$dom = new DOMDocument(‘1.0’);
@$dom->loadHTML($content);
$anchors = $dom->getElementsByTagName(‘a’);

foreach ($anchors as $element) {
$href = $element->getAttribute(‘href’);
if (0 !== strpos($href, ‘http’)) {
$path = ‘/’ . ltrim($href, ‘/’);
if (extension_loaded(‘http’)) {
$href = http_build_url($url, array(‘path’ => $path));
} else {
$parts = parse_url($url);
$href = $parts[‘scheme’] . ‘://’;
if (isset($parts[‘user’]) && isset($parts[‘pass’])) {
$href .= $parts[‘user’] . ‘:’ . $parts[‘pass’] . ‘@’;
}
$href .= $parts[‘host’];
if (isset($parts[‘port’])) {
$href .= ‘:’ . $parts[‘port’];
}
$href .= $path;
}
}
// Crawl only link that belongs to the start domain
$this->crawl_page($href, $depth – 1);
}
}

protected function _getContent($url)
{
$handle = curl_init($url);
if ($this->_useHttpAuth) {
curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
curl_setopt($handle, CURLOPT_USERPWD, $this->_user . “:” . $this->_pass);
}
// follows 302 redirect, creates problem wiht authentication
// curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
// return the content
curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);

/* Get the HTML or whatever is linked in $url. */
$response = curl_exec($handle);
// response total time
$time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
/* Check for 404 (file not found). */
$httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

curl_close($handle);
return array($response, $httpCode, $time);
}

protected function _printResult($url, $depth, $httpcode, $time)
{
ob_end_flush();
$currentDepth = $this->_depth – $depth;
$count = count($this->_seen);
echo “N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>”;
ob_start();
flush();
}

protected function isValid($url, $depth)
{
if (strpos($url, $this->_host) === false
|| $depth === 0
|| isset($this->_seen[$url])
) {
return false;
}
foreach ($this->_filter as $excludePath) {
if (strpos($url, $excludePath) !== false) {
return false;
}
}
return true;
}

public function crawl_page($url, $depth)
{
if (!$this->isValid($url, $depth)) {
return;
}
// add to the seen URL
$this->_seen[$url] = true;
// get Content and Return Code
list($content, $httpcode, $time) = $this->_getContent($url);
// print Result for current Page
$this->_printResult($url, $depth, $httpcode, $time);
// process subPages
$this->_processAnchors($content, $url, $depth);
}

public function setHttpAuth($user, $pass)
{
$this->_useHttpAuth = true;
$this->_user = $user;
$this->_pass = $pass;
}

public function addFilterPath($path)
{
$this->_filter[] = $path;
}

public function run()
{
$this->crawl_page($this->_url, $this->_depth);
}
}

// USAGE
$startURL = ‘http://localhost/test/crawler/3/1.php’;
$depth = 6;
$username = ‘YOURUSER’;
$password = ‘YOURPASS’;
$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude path with the following structure to be processed
$crawler->addFilterPath(‘customer/account/login/referer’);
$crawler->run();

?>
[/code]

I just tested it after creating a few pages on my Xampp.
I created index.php where I pasted the above crawler code. Pasted it here:
http://localhost/test/crawler/3/1.php

That same folder has these files:
1.php (linking to 2.php & 10.php)
2.php (linking to 3.php & 20.php)
3.php (linking to 4.php & 30.php)
4.php (linking to 5.php & 40.php)
5.php (linking to1.php & 50.php)

10.php (linking to 20.php & 2.php)
20.php (linking to 30.php & 3.php)
30.php (linking to 40.php & 4.php)
40.php (linking to 50.php & 5.php)
50.php (linking to 10.php & 1.php)

I set the crawler to start spidering from 1.php to 50 depths.
It should have found all the 10 links mentioned on the above list just running depth 1 or 2.
But it only spits out these ….

**N::1,CODE::200,TIME::0.016,DEPTH::0 URL::http://localhost/test/crawler/3/1.php
N::2,CODE::404,TIME::0.031,DEPTH::1 URL::http://localhost/2.php
N::3,CODE::403,TIME::0.031,DEPTH::2 URL::http://localhost/mailto:postmaster@localhost
N::4,CODE::302,TIME::0.031,DEPTH::3 URL::http://localhost/
N::5,CODE::404,TIME::0.016,DEPTH::1 URL::http://localhost/10.php**

Now, where did this come from ?
**N::3,CODE::403,TIME::0.031,DEPTH::2 URL::http://localhost/mailto:postmaster@localhost**

I can’t find this link or file: **http://localhost/mailto:postmaster@localhost**
Where is it and how come crawler found it when I can’t manually ?
What is wrong with the crawler code ?
Can someone have a look and fix it and then send over ? It seems it is having problems going from depth to depth. Starts and stops at depth: 1.

For my learning purpose, I’d appreciate it if you can add as many comments on the lines as the php script lacks comments on each line. Very little comments present. A lot of lines I do not understand. Would have been best if all the lines contained comments.

to post a comment
PHP

1 Comments(s)

Copy linkTweet thisAlerts:
@developer_webauthorMay 10.2020 — No one encountered this in Xampp before ? 😁
×

Success!

Help @developer_web spread the word by sharing this article on Twitter...

Tweet This
Sign in
Forgot password?
Sign in with TwitchSign in with GithubCreate Account
about: ({
version: 0.1.9 BETA 4.26,
whats_new: community page,
up_next: more Davinci•003 tasks,
coming_soon: events calendar,
social: @webDeveloperHQ
});

legal: ({
terms: of use,
privacy: policy
});
changelog: (
version: 0.1.9,
notes: added community page

version: 0.1.8,
notes: added Davinci•003

version: 0.1.7,
notes: upvote answers to bounties

version: 0.1.6,
notes: article editor refresh
)...
recent_tips: (
tipper: @Yussuf4331,
tipped: article
amount: 1000 SATS,

tipper: @darkwebsites540,
tipped: article
amount: 10 SATS,

tipper: @Samric24,
tipped: article
amount: 1000 SATS,
)...