<?php
/**
 * pear2\Spider\Main
 *
 * PHP version 5
 *
 * @category  Yourcategory
 * @package   PEAR2_Spider
 * @author    Your Name <handle@php.net>
 * @copyright 2010 Your Name
 * @license   http://www.opensource.org/licenses/bsd-license.php New BSD License
 * @version   SVN: $Id$
 * @link      http://svn.php.net/repository/pear2/PEAR2_Spider
 */

/**
 * Main class for PEAR2_Spider
 *
 * @category  Yourcategory
 * @package   PEAR2_Spider
 * @author    Your Name <handle@php.net>
 * @copyright 2010 Your Name
 * @license   http://www.opensource.org/licenses/bsd-license.php New BSD License
 * @link      http://svn.php.net/repository/pear2/PEAR2_Spider
 */
class Spider
{
    /**
     * Maximum link depth that is followed; the starting page is depth 1.
     */
    const MAX_DEPTH = 50;

    /**
     * Loggers notified once for every page that is spidered.
     *
     * @var array
     */
    protected $loggers = array();

    /**
     * Class names of the URI filters wrapped around each page's link list.
     *
     * @var array
     */
    protected $filters = array();

    /**
     * Object used to fetch page content.
     *
     * @var Spider_Downloader
     */
    protected $downloader = null;

    /**
     * Object used to turn page content into a DOMXPath.
     *
     * @var Spider_ParserInterface
     */
    protected $parser = null;

    /**
     * URIs already spidered, stored as array keys (uri => true).
     *
     * @var array
     */
    protected $visited = array();

    /**
     * Create a spider with the downloader and parser it should use.
     *
     * The parser is hinted with the same interface setParser() requires, so
     * both entry points accept exactly the same objects.
     *
     * @param Spider_Downloader      $downloader Fetches the content of a URI
     * @param Spider_ParserInterface $parser     Parses fetched content
     */
    public function __construct(
        Spider_Downloader $downloader,
        Spider_ParserInterface $parser)
    {
        $this->setDownloader($downloader);
        $this->setParser($parser);
    }

    /**
     * Replace the downloader used to fetch pages.
     *
     * @param Spider_Downloader $downloader Fetches the content of a URI
     *
     * @return void
     */
    public function setDownloader(Spider_Downloader $downloader)
    {
        $this->downloader = $downloader;
    }

    /**
     * Replace the parser used to process downloaded content.
     *
     * @param Spider_ParserInterface $parser Parses fetched content
     *
     * @return void
     */
    public function setParser(Spider_ParserInterface $parser)
    {
        $this->parser = $parser;
    }

    /**
     * Register a logger; registering the same instance twice has no effect.
     *
     * @param Spider_LoggerAbstract $logger Logger to notify for each page
     *
     * @return void
     */
    public function addLogger(Spider_LoggerAbstract $logger)
    {
        // Strict comparison matches on object identity. A loose comparison
        // would treat two distinct loggers with equal properties as the same
        // logger and silently drop the second one.
        if (!in_array($logger, $this->loggers, true)) {
            $this->loggers[] = $logger;
        }
    }

    /**
     * Register a URI filter by class name; duplicates are ignored.
     *
     * While spidering, each registered class is instantiated with the current
     * collection of sub-URIs as its only constructor argument, and the new
     * instance replaces that collection.
     *
     * @param string $filterClass Name of the filter class
     *
     * @return void
     */
    public function addUriFilter($filterClass)
    {
        if (!in_array($filterClass, $this->filters, true)) {
            $this->filters[] = $filterClass;
        }
    }

    /**
     * Spider a site, starting at the given URI.
     *
     * @param string $baseUri Absolute URI to start from
     *
     * @return void
     *
     * @throws Exception If $baseUri is not a valid URL, or a page fails
     */
    public function spider($baseUri)
    {
        // FILTER_VALIDATE_URL already requires a scheme. The former
        // FILTER_FLAG_SCHEME_REQUIRED constant no longer exists in PHP 8, so
        // it must not be referenced here.
        if (!filter_var($baseUri, FILTER_VALIDATE_URL)) {
            throw new Exception('Invalid URI: ' . $baseUri);
        }
        $this->spiderPage($baseUri, $baseUri);
    }

    /**
     * Download, parse and log one page, then recurse into its links.
     *
     * @param string $baseUri URI the crawl started from
     * @param string $uri     URI of the page to process
     * @param int    $depth   Depth of this page; the starting page is 1
     *
     * @return void
     *
     * @throws Exception If spidering a linked page throws; the original
     *                   exception is attached as the previous exception
     */
    protected function spiderPage($baseUri, $uri, $depth = 1)
    {
        // Mark the page before recursing so that link cycles terminate.
        $this->visited[$uri] = true;

        $content = $this->downloader->download($uri);
        $xpath   = $this->parser->parse($content, $uri);

        foreach ($this->loggers as $logger) {
            $logger->log($uri, $xpath);
        }

        if ($depth >= self::MAX_DEPTH) {
            return;
        }

        // Relative links are resolved against the page they were found on.
        $subUris = $this->getUris($baseUri, $xpath, $uri);

        // Each filter wraps the result of the previous one.
        foreach ($this->filters as $filter_class) {
            $subUris = new $filter_class($subUris);
        }

        foreach ($subUris as $subUri) {
            if (array_key_exists($subUri, $this->visited)) {
                continue;
            }

            try {
                $this->spiderPage($baseUri, $subUri, $depth + 1);
            } catch (Exception $e) {
                // Name the page that actually contains the link.
                throw new Exception(
                    $uri . ' linked to a page that does not exist! ' . $subUri,
                    404,
                    $e
                );
            }
        }
    }

    /**
     * Collect the link targets of a parsed page.
     *
     * Every anchor href is made absolute with absolutePath(). Results that
     * start with $baseUri are kept as they are. Any other result that is not
     * '.' and does not start with an http, https or ftp scheme is kept with
     * the document's <base href> value (if any) prepended.
     *
     * NOTE(review): the queries use the "xhtml" prefix, so this assumes the
     * parser registered that namespace prefix on the DOMXPath - confirm.
     *
     * @param string      $baseUri    URI the crawl started from
     * @param DOMXPath    $xpath      XPath object for the parsed page
     * @param string|null $currentUri URI of the page the links were found on;
     *                                when omitted, $baseUri is used
     *
     * @return Spider_UriIterator
     */
    protected function getUris($baseUri, DOMXPath $xpath, $currentUri = null)
    {
        if ($currentUri === null) {
            $currentUri = $baseUri;
        }

        $uris = array();

        $baseHrefNodes = $xpath->query("//xhtml:base/@href");
        $baseHref      = '';
        if ($baseHrefNodes->length > 0) {
            $baseHref = (string)$baseHrefNodes->item(0)->nodeValue;
        }

        $nodes = $xpath->query("//xhtml:a[@href]/@href");

        foreach ($nodes as $node) {
            $uri = $this->absolutePath((string)$node->nodeValue, $currentUri);

            if (empty($uri)) {
                continue;
            }

            if (strncmp($baseUri, $uri, strlen($baseUri)) === 0) {
                $uris[] = $uri;
            } elseif (
                   $uri != '.'
                && preg_match('!^(https?|ftp)://!i', $uri) === 0
            ) {
                $uris[] = $baseHref . $uri;
            }
        }

        return new Spider_UriIterator($uris);
    }

    /**
     * Turn a link found on a page into an absolute URI.
     *
     * - A value that already validates as a URL is returned unchanged.
     * - A value starting with "//" inherits only the scheme of $baseUri.
     * - A value starting with "/" is appended to the scheme, userinfo, host
     *   and port of $baseUri.
     * - Anything else is appended to the directory of $baseUri's path; the
     *   query string and fragment of $baseUri are ignored.
     *
     * Dot segments ("./" and "../") are not collapsed.
     *
     * @param string $relativeUri Link value as found in the document
     * @param string $baseUri     URI the link is relative to
     *
     * @return string
     */
    public function absolutePath($relativeUri, $baseUri)
    {
        if (filter_var($relativeUri, FILTER_VALIDATE_URL)) {
            return $relativeUri;
        }

        $parts = parse_url($baseUri);
        if (!is_array($parts)) {
            // parse_url() returns false for seriously malformed input.
            $parts = array();
        }

        if (substr($relativeUri, 0, 2) === '//' && isset($parts['scheme'])) {
            return $parts['scheme'] . ':' . $relativeUri;
        }

        $authority = $this->getAuthority($parts);

        if (substr($relativeUri, 0, 1) === '/') {
            return $authority . $relativeUri;
        }

        // Keep everything up to and including the last slash of the path.
        $path      = isset($parts['path']) ? $parts['path'] : '';
        $lastSlash = strrpos($path, '/');
        $directory = ($lastSlash === false)
            ? ''
            : substr($path, 0, $lastSlash + 1);

        // A base such as "http://example.com" has no path at all; without
        // this the link would be glued directly onto the host name.
        if ($authority !== '' && $directory === '') {
            $directory = '/';
        }

        return $authority . $directory . $relativeUri;
    }

    /**
     * Build "scheme://user:pass@host:port" from parse_url() components.
     *
     * @param array $parts Components as returned by parse_url()
     *
     * @return string Empty string when the components contain no host
     */
    private function getAuthority(array $parts)
    {
        if (!isset($parts['host'])) {
            return '';
        }

        $authority = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';

        if (isset($parts['user'])) {
            $authority .= $parts['user'];
            if (isset($parts['pass'])) {
                $authority .= ':' . $parts['pass'];
            }
            $authority .= '@';
        }

        $authority .= $parts['host'];

        if (isset($parts['port'])) {
            $authority .= ':' . $parts['port'];
        }

        return $authority;
    }
}