Forked from
Digital Experience Group / UNL_WDN_Assessment
42 commits behind the upstream repository.
-
Brett Bieber authored
Spider.php 4.60 KiB
<?php
/**
* pear2\Spider\Main
*
* PHP version 5
*
* @category Yourcategory
* @package PEAR2_Spider
* @author Your Name <handle@php.net>
* @copyright 2010 Your Name
* @license http://www.opensource.org/licenses/bsd-license.php New BSD License
* @version SVN: $Id$
* @link http://svn.php.net/repository/pear2/PEAR2_Spider
*/
/**
 * Main class for PEAR2_Spider
 *
 * Downloads a page, parses it, hands the parsed document to every registered
 * logger and then recursively follows the links found on it, up to
 * MAX_DEPTH levels deep.
 *
 * @category Yourcategory
 * @package PEAR2_Spider
 * @author Your Name <handle@php.net>
 * @copyright 2010 Your Name
 * @license http://www.opensource.org/licenses/bsd-license.php New BSD License
 * @link http://svn.php.net/repository/pear2/PEAR2_Spider
 */
class Spider
{
    /**
     * Maximum recursion depth; the page passed to spider() is depth 1.
     */
    const MAX_DEPTH = 50;

    /**
     * Loggers notified with ($uri, $xpath) for every page visited.
     *
     * @var Spider_LoggerAbstract[]
     */
    protected $loggers = array();

    /**
     * Class names of filters wrapped around the list of discovered URIs.
     *
     * @var string[]
     */
    protected $filters = array();

    /**
     * @var Spider_Downloader
     */
    protected $downloader = null;

    /**
     * @var Spider_ParserInterface
     */
    protected $parser = null;

    /**
     * URIs already visited, stored as keys (value is always true).
     *
     * @var array
     */
    protected $visited = array();

    /**
     * Constructor.
     *
     * The parser is hinted with the interface, matching setParser(), which
     * every parser is forwarded to anyway.
     *
     * @param Spider_Downloader      $downloader Fetches the content of a URI
     * @param Spider_ParserInterface $parser     Turns content into a DOMXPath
     */
    public function __construct(
        Spider_Downloader $downloader,
        Spider_ParserInterface $parser
    ) {
        $this->setDownloader($downloader);
        $this->setParser($parser);
    }

    /**
     * Replace the downloader used to fetch pages.
     *
     * @param Spider_Downloader $downloader Downloader to use
     *
     * @return void
     */
    public function setDownloader(Spider_Downloader $downloader)
    {
        $this->downloader = $downloader;
    }

    /**
     * Replace the parser used to parse downloaded content.
     *
     * @param Spider_ParserInterface $parser Parser to use
     *
     * @return void
     */
    public function setParser(Spider_ParserInterface $parser)
    {
        $this->parser = $parser;
    }

    /**
     * Register a logger; the same instance is only registered once.
     *
     * @param Spider_LoggerAbstract $logger Logger to notify for every page
     *
     * @return void
     */
    public function addLogger(Spider_LoggerAbstract $logger)
    {
        // Strict comparison checks object identity. A loose in_array() would
        // silently drop a second logger whose properties happen to be equal.
        if (!in_array($logger, $this->loggers, true)) {
            $this->loggers[] = $logger;
        }
    }

    /**
     * Register a URI filter class; each class is only registered once.
     *
     * The class is instantiated with the current URI collection as its only
     * constructor argument (see spiderPage()).
     *
     * @param string $filterClass Name of the filter class
     *
     * @return void
     */
    public function addUriFilter($filterClass)
    {
        if (!in_array($filterClass, $this->filters, true)) {
            $this->filters[] = $filterClass;
        }
    }

    /**
     * Start spidering at the given URI.
     *
     * @param string $baseUri Absolute URI to start from
     *
     * @return void
     *
     * @throws Exception If $baseUri is not a valid URL, or a page fails
     */
    public function spider($baseUri)
    {
        // FILTER_VALIDATE_URL already requires a scheme. The former
        // FILTER_FLAG_SCHEME_REQUIRED flag was removed in PHP 8.0.
        if (!filter_var($baseUri, FILTER_VALIDATE_URL)) {
            throw new Exception('Invalid URI: ' . $baseUri);
        }

        $this->spiderPage($baseUri, $baseUri);
    }

    /**
     * Download, parse and log one page, then recurse into its links.
     *
     * @param string $baseUri URI the crawl started from
     * @param string $uri     URI of the page to process
     * @param int    $depth   Current recursion depth (1 for the start page)
     *
     * @return void
     *
     * @throws Exception With code 404 when processing a linked page fails;
     *                   the original exception is attached as "previous"
     */
    protected function spiderPage($baseUri, $uri, $depth = 1)
    {
        // Mark as visited before recursing so cyclic links terminate.
        $this->visited[$uri] = true;

        $content = $this->downloader->download($uri);
        $xpath   = $this->parser->parse($content, $uri);

        foreach ($this->loggers as $logger) {
            $logger->log($uri, $xpath);
        }

        if ($depth >= self::MAX_DEPTH) {
            return;
        }

        // NOTE(review): links are resolved against $baseUri (the crawl
        // root), not against $uri (the page they were found on) - confirm
        // this is intended for pages living in sub-directories.
        $subUris = $this->getUris($baseUri, $xpath);

        // Each filter wraps the previous collection.
        foreach ($this->filters as $filterClass) {
            $subUris = new $filterClass($subUris);
        }

        foreach ($subUris as $subUri) {
            if (array_key_exists($subUri, $this->visited)) {
                continue;
            }
            try {
                $this->spiderPage($baseUri, $subUri, $depth + 1);
            } catch (Exception $e) {
                // Report the page that actually contained the link.
                throw new Exception(
                    $uri . ' linked to a page that does not exist! ' . $subUri,
                    404,
                    $e
                );
            }
        }
    }

    /**
     * Collect the link targets found in a parsed page.
     *
     * Every a/@href value is made absolute with absolutePath(). A URI is
     * kept when it starts with $baseUri. A URI that neither starts with
     * $baseUri nor carries an http(s)/ftp scheme is kept prefixed with the
     * document's base/@href value, if any.
     *
     * @param string   $baseUri URI the crawl started from
     * @param DOMXPath $xpath   Parsed page; presumably has the "xhtml"
     *                          namespace prefix registered by the parser
     *
     * @return Spider_UriIterator
     */
    protected function getUris($baseUri, DOMXPath $xpath)
    {
        $uris = array();

        $baseHref      = '';
        $baseHrefNodes = $xpath->query('//xhtml:base/@href');
        if ($baseHrefNodes->length > 0) {
            $baseHref = (string)$baseHrefNodes->item(0)->nodeValue;
        }

        $nodes = $xpath->query('//xhtml:a[@href]/@href');
        foreach ($nodes as $node) {
            $uri = $this->absolutePath((string)$node->nodeValue, $baseUri);
            if (empty($uri)) {
                continue;
            }

            if (strncmp($baseUri, $uri, strlen($baseUri)) === 0) {
                $uris[] = $uri;
            } elseif ($uri !== '.'
                && preg_match('!^(https?|ftp)://!i', $uri) === 0
            ) {
                $uris[] = $baseHref . $uri;
            }
        }

        return new Spider_UriIterator($uris);
    }

    /**
     * Resolve a link target against a base URI.
     *
     * - A value that already is a valid URL is returned unchanged.
     * - "//host/path" inherits the scheme of the base URI.
     * - "/path" is resolved against scheme, user info, host and port of the
     *   base URI.
     * - Anything else is appended to the directory of the base URI's path;
     *   query string and fragment of the base URI are ignored.
     *
     * @param string $relativeUri Link target as found in the document
     * @param string $baseUri     Absolute URI to resolve against
     *
     * @return string
     */
    public function absolutePath($relativeUri, $baseUri)
    {
        if (filter_var($relativeUri, FILTER_VALIDATE_URL)) {
            return $relativeUri;
        }

        $baseParts = parse_url($baseUri);
        if (!is_array($baseParts)
            || !isset($baseParts['scheme'], $baseParts['host'])
        ) {
            // Base cannot be decomposed; fall back to plain concatenation.
            return $baseUri . $relativeUri;
        }

        // Protocol-relative reference: only the scheme is inherited.
        if (strncmp($relativeUri, '//', 2) === 0) {
            return $baseParts['scheme'] . ':' . $relativeUri;
        }

        $authority = $baseParts['scheme'] . '://';
        if (isset($baseParts['user'])) {
            $authority .= $baseParts['user'];
            if (isset($baseParts['pass'])) {
                $authority .= ':' . $baseParts['pass'];
            }
            $authority .= '@';
        }
        $authority .= $baseParts['host'];
        if (isset($baseParts['port'])) {
            $authority .= ':' . $baseParts['port'];
        }

        if (substr($relativeUri, 0, 1) === '/') {
            return $authority . $relativeUri;
        }

        // A base without any path ("http://example.com") lives at "/".
        $path      = isset($baseParts['path']) ? $baseParts['path'] : '/';
        $lastSlash = strrpos($path, '/');
        $directory = ($lastSlash === false)
            ? '/'
            : substr($path, 0, $lastSlash + 1);

        return $authority . $directory . $relativeUri;
    }
}