Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • s-mfairch4/UNL_WDN_Assessment
  • dxg/UNL_WDN_Assessment
2 results
Show changes
Commits on Source (42)
Showing
with 635 additions and 31 deletions
......@@ -17,6 +17,5 @@
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.php.core.PHPNature</nature>
</natures>
</projectDescription>
-- Stores one row per page URL recorded for a site, together with its last
-- known validation state.
CREATE TABLE IF NOT EXISTS `assessment` (
-- NOTE(review): NOT NULL without a default; the INSERT in
-- UNL_WDN_Assessment::addUri() does not supply this column - confirm intent.
`site_id` int(11) NOT NULL,
-- Base URL the page was recorded under (the value passed to UNL_WDN_Assessment).
`baseurl` varchar(255) NOT NULL,
-- Absolute URL of the page itself.
`url` varchar(1000) NOT NULL,
-- Validation state: inserted as 'unknown', later updated to 'true' or 'false'.
`valid` enum('true','false','unknown') DEFAULT NULL,
-- Presumably an HTTP status code; not written by the visible PHP code - TODO confirm.
`code` int(4) DEFAULT '0',
-- Time of the last insert/update performed by the application.
`timestamp` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
-- Only the first 500 characters of `url` take part in the primary key.
PRIMARY KEY (`baseurl`,`site_id`,`url`(500)),
KEY `valid` (`valid`),
KEY `baseurl` (`baseurl`),
KEY `url` (`url`)
);
No preview for this file type
<?xml version="1.0" encoding="UTF-8"?>
<channel version="1.0" xmlns="http://pear.php.net/channel-1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pear.php.net/channel-1.0 http://pear.php.net/dtd/channel-1.0.xsd">
<name>pear.unl.edu</name>
<summary>UNL PHP Extension and Application Repository</summary>
<suggestedalias>unl</suggestedalias>
<servers>
<primary>
<rest>
<baseurl type="REST1.0">http://pear.unl.edu/Chiara_PEAR_Server_REST/</baseurl>
<baseurl type="REST1.1">http://pear.unl.edu/Chiara_PEAR_Server_REST/</baseurl>
<baseurl type="REST1.3">http://pear.unl.edu/Chiara_PEAR_Server_REST/</baseurl>
</rest>
</primary>
</servers>
</channel>
pear.unl.edu
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://pear.php.net/dtd/package-2.0" xmlns:tasks="http://pear.php.net/dtd/tasks-1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packagerversion="1.8.0alpha1" version="2.0" xsi:schemaLocation="http://pear.php.net/dtd/tasks-1.0 http://pear.php.net/dtd/tasks-1.0.xsd http://pear.php.net/dtd/package-2.0 http://pear.php.net/dtd/package-2.0.xsd">
<name>UNL_Autoload</name>
<channel>pear.unl.edu</channel>
<summary>An autoloader implementation for UNL PEAR packages</summary>
<description>This package provides an autoloader for classes beginning
with UNL_ and is mainly used for autoloading package files from http://pear.unl.edu/.</description>
<lead>
<name>Brett Bieber</name>
<user>saltybeagle</user>
<email>brett.bieber@gmail.com</email>
<active>yes</active>
</lead>
<date>2010-01-21</date>
<time>11:02:13</time>
<version>
<release>0.5.0</release>
<api>0.5.0</api>
</version>
<stability>
<release>alpha</release>
<api>alpha</api>
</stability>
<license uri="http://www1.unl.edu/wdn/wiki/Software_License">BSD License</license>
<notes>* First release.</notes>
<contents>
<dir name="/">
<file baseinstalldir="/" md5sum="2d13c44763ebe506f915d211dcd8f00a" name="UNL/Autoload.php" role="php"/>
</dir>
</contents>
<dependencies>
<required>
<php>
<min>5.2.0</min>
</php>
<pearinstaller>
<min>1.4.3</min>
</pearinstaller>
</required>
</dependencies>
<phprelease>
<changelog>
<release>
<version>
<release>0.5.0</release>
<api>0.5.0</api>
</version>
<stability>
<release>alpha</release>
<api>alpha</api>
</stability>
<date>2008-11-10</date>
<license uri="http://www1.unl.edu/wdn/wiki/Software_License">BSD License</license>
<notes>* First release.</notes>
</release>
</changelog>
</phprelease>
</package>
File added
......@@ -44,7 +44,6 @@
/**
* Base class for exceptions in PEAR
*/
require_once 'PEAR/Exception.php';
/**
* Exception class for HTTP_Request2 package
......@@ -56,7 +55,7 @@ require_once 'PEAR/Exception.php';
* @package HTTP_Request2
* @version Release: 0.5.1
*/
class HTTP_Request2_Exception extends PEAR_Exception
class HTTP_Request2_Exception extends Exception
{
}
?>
\ No newline at end of file
......@@ -4,10 +4,11 @@
*
* PHP version 5
*
* @category Yourcategory
* @category Tools
* @package PEAR2_Spider
* @author Your Name <handle@php.net>
* @copyright 2010 Your Name
* @author Michael Gauthier <mike@silverorange.com>
* @author Brett Bieber <saltybeagle@php.net>
* @copyright 2010 silverorange Inc.
* @license http://www.opensource.org/licenses/bsd-license.php New BSD License
* @version SVN: $Id$
* @link http://svn.php.net/repository/pear2/PEAR2_Spider
......@@ -16,10 +17,11 @@
/**
* Main class for PEAR2_Spider
*
* @category Yourcategory
* @category Tools
* @package PEAR2_Spider
* @author Your Name <handle@php.net>
* @copyright 2010 Your Name
* @author Michael Gauthier <mike@silverorange.com>
* @author Brett Bieber <saltybeagle@php.net>
* @copyright 2010 silverorange Inc.
* @license http://www.opensource.org/licenses/bsd-license.php New BSD License
* @link http://svn.php.net/repository/pear2/PEAR2_Spider
*/
......@@ -31,6 +33,7 @@ class Spider
protected $filters = array();
protected $downloader = null;
protected $parser = null;
protected $start_base = null;
protected $visited = array();
public function __construct(
......@@ -70,7 +73,8 @@ class Spider
if (!filter_var($baseUri, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED)) {
throw new Exception('Invalid URI: ' . $baseUri);
}
$this->spiderPage($baseUri, $baseUri);
$this->start_base = self::getUriBase($baseUri);
$this->spiderPage($this->start_base, $baseUri);
}
protected function spiderPage($baseUri, $uri, $depth = 1)
......@@ -82,7 +86,7 @@ class Spider
$xpath = $this->parser->parse($content, $uri);
foreach ($this->loggers as $logger) {
$logger->log($uri, $xpath);
$logger->log($uri, $depth, $xpath);
}
// spider sub-pages
......@@ -96,9 +100,9 @@ class Spider
foreach ($subUris as $subUri) {
if (!array_key_exists($subUri, $this->visited)) {
try {
$this->spiderPage($baseUri, $subUri, $depth + 1);
$this->spiderPage(self::getURIBase($subUri), $subUri, $depth + 1);
} catch(Exception $e) {
throw new Exception($baseUri . ' linked to a page that does not exist!' .$subUri, 404, $e);
echo "\nThe page, ".$uri.' linked to a page that could not be accessed: ' . $subUri.PHP_EOL;
}
}
}
......@@ -120,29 +124,44 @@ class Spider
}
$nodes = $xpath->query(
"//xhtml:a[@href]/@href"
"//xhtml:a[@href]/@href | //a[@href]/@href"
);
foreach ($nodes as $node) {
$uri = $this->absolutePath((string)$node->nodeValue, $baseUri);
if (!empty($uri)) {
if (strncmp($baseUri, $uri, strlen($baseUri)) === 0) {
$uris[] = $uri;
} elseif (
$uri != '.'
&& preg_match('!^(https?|ftp)://!i', $uri) === 0
) {
$uris[] = $baseHref . $uri;
$uri = trim((string)$node->nodeValue);
if (substr($uri, 0, 7) != 'mailto:'
&& substr($uri, 0, 11) != 'javascript:') {
$uri = self::absolutePath($uri, $baseUri);
if (!empty($uri)) {
if (strncmp($this->start_base, $uri, strlen($this->start_base)) === 0) {
$uris[] = $uri;
} elseif (
$uri != '.'
&& preg_match('!^(https?|ftp)://!i', $uri) === 0
) {
$uris[] = $baseHref . $uri;
}
}
}
}
sort($uris);
return new Spider_UriIterator($uris);
}
public function absolutePath($relativeUri, $baseUri)
public static function absolutePath($relativeUri, $baseUri)
{
if (filter_var($relativeUri, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED)) {
// URL is already absolute
return $relativeUri;
}
$new_base_url = $baseUri;
$base_url_parts = parse_url($baseUri);
......@@ -152,16 +171,34 @@ class Spider
}
$new_txt = '';
if (!filter_var($relativeUri, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED)) {
if (substr($relativeUri, 0, 1) == '/') {
$new_base_url = $base_url_parts['scheme'].'://'.$base_url_parts['host'];
}
$new_txt .= $new_base_url;
if (substr($relativeUri, 0, 1) == '/') {
$new_base_url = $base_url_parts['scheme'].'://'.$base_url_parts['host'];
}
$new_txt .= $new_base_url;
$absoluteUri = $new_txt.$relativeUri;
// Convert /dir/../ into /
while (preg_match('/\/[^\/]+\/\.\.\//', $absoluteUri)) {
$absoluteUri = preg_replace('/\/[^\/]+\/\.\.\//', '/', $absoluteUri);
}
return $absoluteUri;
}
/**
 * Returns the "directory" part of a URI, always ending in a slash when one
 * can be determined.
 *
 * Examples:
 *   http://example.com/dir/page.html      => http://example.com/dir/
 *   http://example.com/dir/page.php?id=1  => http://example.com/dir/
 *   http://example.com/dir/               => http://example.com/dir/
 *   http://example.com                    => http://example.com/
 *
 * @param string $uri The URI to examine
 *
 * @return string The URI truncated after the last slash of its path
 */
public static function getUriBase($uri)
{
    // The query string and fragment never belong to the base. The previous
    // implementation measured the basename against the end of the full URI,
    // which chopped characters off the query string instead of the path.
    $withoutQuery = substr($uri, 0, strcspn($uri, '?#'));

    $parts = parse_url($withoutQuery);
    if ($parts === false) {
        // Seriously malformed input: leave it untouched, as before
        return $uri;
    }

    if (!isset($parts['path'])) {
        // No path at all (e.g. http://example.com): the base is the site root
        return $withoutQuery . '/';
    }

    if (substr($withoutQuery, -1) == '/') {
        // Already a directory-style URI
        return $withoutQuery;
    }

    $lastSlash = strrpos($withoutQuery, '/');
    if ($lastSlash === false) {
        // A bare file name has no directory part
        return '';
    }

    // Drop the trailing file name, keep the slash before it
    return substr($withoutQuery, 0, $lastSlash + 1);
}
}
<?php
abstract class Spider_LoggerAbstract
{
abstract public function log($uri, DOMXPath $xpath);
abstract public function log($uri, $depth, DOMXPath $xpath);
}
\ No newline at end of file
<?php
/**
 * Autoloader for classes whose names start with "UNL_".
 *
 * The class name is mapped to a file by replacing underscores with slashes
 * and appending ".php"; the file is searched for on the include_path.
 *
 * The script is terminated when the file exists but does not declare the
 * class, or when the file is missing and the lookup was not triggered by
 * class_exists()/interface_exists().
 *
 * @param string $class Name of the class or interface to load
 *
 * @return bool true when the class was loaded, false when it is not handled
 *              here or when an existence check failed to find it
 */
function UNL_Autoload($class)
{
    // Anything outside the UNL_ prefix is left to other autoloaders
    if (strncmp($class, 'UNL_', 4) !== 0) {
        return false;
    }

    $file = str_replace('_', '/', $class) . '.php';

    // Probe the include_path without triggering a warning for missing files
    $handle = @fopen($file, 'r', true);

    if (!$handle) {
        $e = new Exception('Class ' . $class . ' could not be loaded from ' .
            $file . ', file does not exist (include_path="' . get_include_path() .
            '") [UNL_Autoload version 1.0]');
        $trace = $e->getTrace();

        // A failed class_exists()/interface_exists() probe is not an error;
        // the calling frame sits one or two levels up the stack.
        foreach (array(2, 1) as $level) {
            if (isset($trace[$level]['function'])
                && in_array($trace[$level]['function'], array('class_exists', 'interface_exists'))) {
                return false;
            }
        }

        die ((string) $e);
    }

    fclose($handle);
    require $file;

    if (class_exists($class, false) || interface_exists($class, false)) {
        return true;
    }

    die(new Exception('Class ' . $class . ' was not present in ' .
        $file . ' (include_path="' . get_include_path() .
        '") [UNL_Autoload version 1.0]'));
}
// set up __autoload
// Prefer the SPL autoload stack when it is available; otherwise fall back to
// declaring a global __autoload() that delegates to UNL_Autoload().
// NOTE(review): declaring a function named __autoload is rejected by newer PHP
// versions - confirm the supported PHP range before relying on this fallback.
if (function_exists('spl_autoload_register')) {
    // $_____t is false when no SPL autoloaders have been registered yet
    if (!($_____t = spl_autoload_functions()) || !in_array('UNL_Autoload', spl_autoload_functions())) {
        spl_autoload_register('UNL_Autoload');
        if (function_exists('__autoload') && ($_____t === false)) {
            // __autoload() was being used, but now would be ignored, add
            // it to the autoload stack
            spl_autoload_register('__autoload');
        }
    }
    // Temporary variable; removed so it does not leak into the including scope
    unset($_____t);
} elseif (!function_exists('__autoload')) {
    function __autoload($class) { return UNL_Autoload($class); }
}
// set up include_path if it doesn't register our current location
// "Current location" is the parent of the directory containing this file, so
// that a class file path such as UNL/Foo.php resolves relative to it.
$____paths = explode(PATH_SEPARATOR, get_include_path());
$____found = false;
foreach ($____paths as $____path) {
    if ($____path == dirname(dirname(__FILE__))) {
        $____found = true;
        break;
    }
}
if (!$____found) {
    // Appended last, so existing include_path entries keep priority
    set_include_path(get_include_path() . PATH_SEPARATOR . dirname(dirname(__FILE__)));
}
// Clean up the temporaries used above
unset($____paths);
unset($____path);
unset($____found);
<?php
/**
 * Runs assessments (page logging, validation, link checking) against every
 * page found under a base URI and stores the results in the `assessment`
 * database table.
 */
class UNL_WDN_Assessment
{
    /**
     * Base URI the assessment is run against; also the `baseurl` column value.
     *
     * @var string
     */
    public $baseUri;

    /**
     * Database connection; must provide a PDO-style prepare() method.
     *
     * @var PDO
     */
    public $db;

    /**
     * @param string $baseUri Base URI of the site to assess
     * @param PDO    $db      Database connection used for the assessment table
     */
    public function __construct($baseUri, $db)
    {
        $this->baseUri = $baseUri;
        $this->db      = $db;
    }

    /**
     * Builds a spider with the standard URI filters and the page logger
     * attached. Callers add their own loggers before starting the crawl.
     *
     * @return Spider
     */
    protected function getSpider()
    {
        $plogger    = new UNL_WDN_Assessment_PageLogger($this);
        $downloader = new Spider_Downloader();
        $parser     = new Spider_Parser();
        $spider     = new Spider($downloader, $parser);

        $spider->addUriFilter('Spider_AnchorFilter');
        $spider->addUriFilter('Spider_MailtoFilter');
        $spider->addUriFilter('UNL_WDN_Assessment_FileExtensionFilter');

        $spider->addLogger($plogger);

        return $spider;
    }

    /**
     * Crawls the site, re-validating only the pages not recorded as valid,
     * and prints the validity status of each page.
     *
     * @return void
     */
    public function checkInvalid()
    {
        $vlogger = new UNL_WDN_Assessment_ValidateInvalidLogger($this);
        $slogger = new UNL_WDN_Assessment_ValidityStatusLogger($this);

        $spider = $this->getSpider();
        $spider->addLogger($vlogger);
        $spider->addLogger($slogger);

        $spider->spider($this->baseUri);
    }

    /**
     * Discards all stored results for this base URI, then crawls the site and
     * validates every page.
     *
     * @return void
     */
    public function reValidate()
    {
        $this->removeEntries();

        $vlogger = new UNL_WDN_Assessment_ValidationLogger($this);

        $spider = $this->getSpider();
        $spider->addLogger($vlogger);

        $spider->spider($this->baseUri);
    }

    /**
     * Crawls the site with only the page logger attached.
     *
     * @return void
     */
    public function logPages()
    {
        $spider = $this->getSpider();
        $spider->spider($this->baseUri);
    }

    /**
     * Crawls the site and checks the links found on every page.
     *
     * @return void
     */
    public function checkLinks()
    {
        $checker = new UNL_WDN_Assessment_LinkChecker($this);

        $spider = $this->getSpider();
        $spider->addLogger($checker);

        $spider->spider($this->baseUri);
    }

    /**
     * Deletes every stored row belonging to this base URI.
     *
     * @return void
     */
    public function removeEntries()
    {
        $sth = $this->db->prepare('DELETE FROM assessment WHERE baseurl = ?');
        $sth->execute(array($this->baseUri));
    }

    /**
     * Records a page under this base URI with an 'unknown' validity.
     *
     * @param string $uri Absolute URI of the page
     *
     * @return void
     */
    public function addUri($uri)
    {
        $sth = $this->db->prepare('INSERT INTO assessment (baseurl, url, valid, timestamp) VALUES (?, ?, ?, ?);');
        $sth->execute(array($this->baseUri, $uri, 'unknown', date('Y-m-d H:i:s')));
    }

    /**
     * Stores the outcome of a validation run for a page.
     *
     * @param string $uri    Absolute URI of the page
     * @param mixed  $result Truthy when the page validated, falsy otherwise
     *
     * @return void
     */
    public function setValidationResult($uri, $result)
    {
        $sth = $this->db->prepare('UPDATE assessment SET valid = ?, timestamp = ? WHERE baseurl = ? AND url = ?;');

        // The column is an enum of strings, not a boolean
        $result = $result ? 'true' : 'false';

        $sth->execute(array($result, date('Y-m-d H:i:s'), $this->baseUri, $uri));
    }

    /**
     * Returns every stored row for this base URI.
     *
     * @return array
     */
    public function getSubPages()
    {
        $sth = $this->db->prepare('SELECT * FROM assessment WHERE baseurl = ?;');
        $sth->execute(array($this->baseUri));
        return $sth->fetchAll();
    }

    /**
     * Tells whether the page is recorded as valid.
     *
     * @param string $uri Absolute URI of the page
     *
     * @return bool true only when the stored status is 'true'
     */
    public function pageWasValid($uri)
    {
        return $this->getValidityStatus($uri) === 'true';
    }

    /**
     * Looks up the stored validity status of a page.
     *
     * @param string $uri Absolute URI of the page
     *
     * @return string|null The stored `valid` value, or null when the page has
     *                     no row (or the column is NULL)
     */
    public function getValidityStatus($uri)
    {
        $sth = $this->db->prepare('SELECT valid FROM assessment WHERE baseurl = ? AND url = ?;');
        $sth->execute(array($this->baseUri, $uri));

        $row = $sth->fetch();

        // fetch() yields false when the page has not been recorded yet;
        // indexing that value raises a warning on current PHP versions.
        if (!is_array($row) || !isset($row['valid'])) {
            return null;
        }

        return $row['valid'];
    }
}
\ No newline at end of file
<?php
/**
 * URI filter that only lets through URIs that look like HTML pages: those
 * without a file extension, or with one of a fixed set of page extensions.
 */
class UNL_WDN_Assessment_FileExtensionFilter extends Spider_UriFilterInterface
{
    /**
     * Decides whether the current URI should be kept.
     *
     * @return bool true to keep the URI, false to drop it
     */
    function accept()
    {
        $info = pathinfo($this->current());

        // No extension at all: treated as a page
        if (!isset($info['extension'])) {
            return true;
        }

        // Comparison is case-sensitive, exactly like the extension list itself
        $pageExtensions = array('html', 'php', 'shtml', 'asp', 'aspx', 'jsp');

        return in_array($info['extension'], $pageExtensions, true);
    }
}
\ No newline at end of file
<?php
/**
 * Spider logger that requests every link found on a crawled page and reports
 * the ones that do not answer with HTTP 200.
 */
class UNL_WDN_Assessment_LinkChecker extends Spider_LoggerAbstract
{
    /**
     * Results shared by all instances: absolute link => true when the link
     * answered with HTTP 200, false otherwise.
     *
     * @var array
     */
    protected static $checked = array();

    /**
     * Checks all links of one crawled page.
     *
     * @param string   $uri   URI of the crawled page
     * @param int      $depth Crawl depth of the page, used to indent output
     * @param DOMXPath $xpath XPath object for the parsed page
     *
     * @return void
     */
    public function log($uri, $depth, DOMXPath $xpath)
    {
        $links = $this->getLinks($xpath);
        $this->checkLinks($uri, $links, $depth);
    }

    /**
     * Requests the given links in parallel (at most 50 at a time) and logs
     * every link that does not return HTTP 200.
     *
     * @param string $uri   URI of the page the links were found on
     * @param array  $links Raw href values taken from the page
     * @param int    $depth Crawl depth of the page, used to indent output
     *
     * @return void
     */
    function checkLinks($uri, $links, $depth)
    {
        $mcurl          = curl_multi_init();
        $curl           = array(); // requests in flight, keyed by link
        $activeRequests = 0;

        while (count($links) + $activeRequests > 0) {
            while ($activeRequests < 50 && count($links) > 0) {
                $link = Spider::absolutePath(array_shift($links), $uri);

                // FILTER_VALIDATE_URL already requires scheme and host.
                // Skip links already checked or currently being requested.
                if (filter_var($link, FILTER_VALIDATE_URL)
                    && !array_key_exists($link, self::$checked)
                    && !isset($curl[$link])) {
                    $curl[$link] = curl_init($link);
                    curl_setopt($curl[$link], CURLOPT_RETURNTRANSFER, true);
                    curl_setopt($curl[$link], CURLOPT_CONNECTTIMEOUT, 5);
                    curl_setopt($curl[$link], CURLOPT_LOW_SPEED_LIMIT, 10);
                    curl_setopt($curl[$link], CURLOPT_LOW_SPEED_TIME, 5);
                    curl_setopt($curl[$link], CURLOPT_FOLLOWLOCATION, false);
                    curl_setopt($curl[$link], CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6');
                    curl_multi_add_handle($mcurl, $curl[$link]);
                    $activeRequests++;
                }
            }

            usleep(500);
            curl_multi_exec($mcurl, $running);

            while ($msg = curl_multi_info_read($mcurl, $msgCount)) {
                $finishedCurl = $msg['handle'];
                $info         = curl_getinfo($finishedCurl);
                $activeRequests--;

                // Cache under the link that was requested, because that is
                // the key looked up before queueing; curl's effective URL
                // may be spelled differently.
                $link = array_search($finishedCurl, $curl, true);
                if ($link === false) {
                    $link = $info['url'];
                }

                self::$checked[$link] = ($info['http_code'] == 200);
                if (!self::$checked[$link]) {
                    $this->logLinkError($info, $depth);
                }

                // Release the handle for failures as well as successes
                unset($curl[$link]);
                curl_multi_remove_handle($mcurl, $finishedCurl);
                curl_close($finishedCurl);
            }
        }

        curl_multi_close($mcurl);
    }

    /**
     * Collects the href values of all anchors on the page, leaving out
     * mailto: and javascript: links.
     *
     * @param DOMXPath $xpath XPath object for the parsed page
     *
     * @return array Sorted list of raw href values
     */
    protected function getLinks(DOMXPath $xpath)
    {
        $links = array();

        $nodes = $xpath->query(
            "//xhtml:a[@href]/@href | //a[@href]/@href"
        );

        foreach ($nodes as $node) {
            $link = trim((string)$node->nodeValue);
            if (substr($link, 0, 7) != 'mailto:'
                && substr($link, 0, 11) != 'javascript:') {
                $links[] = $link;
            }
        }

        sort($links);

        return $links;
    }

    /**
     * Checks a single link sequentially and prints a message when it is
     * broken. Shares its result cache with checkLinks().
     *
     * @param string $uri   URI of the page the link was found on
     * @param string $link  Raw href value
     * @param int    $depth Crawl depth of the page, used to indent output
     *
     * @return false|null false when the link is not a checkable URL
     */
    protected function checkLink($uri, $link, $depth)
    {
        $link = Spider::absolutePath($link, $uri);

        if (!filter_var($link, FILTER_VALIDATE_URL)) {
            echo PHP_EOL.'Will not check '.$link;
            return false;
        }

        // $checked is static, so it must be reached through self::
        if (!array_key_exists($link, self::$checked)) {
            if ($contents = @file_get_contents($link)) {
                self::$checked[$link] = true;
            } else {
                self::$checked[$link] = false;
            }
        }

        if (!self::$checked[$link]) {
            echo PHP_EOL.str_repeat(' ', $depth) . " ->$link is a broken link";
        }
    }

    /**
     * Prints a line describing a link that did not return HTTP 200.
     *
     * @param array $info  Result of curl_getinfo() for the finished request
     * @param int   $depth Crawl depth of the page, used to indent output
     *
     * @return void
     */
    protected function logLinkError($info, $depth)
    {
        echo PHP_EOL.str_repeat(' ', $depth) . " ->{$info['url']} ";
        if ($info['http_code'] != 0) {
            echo "returned a {$info['http_code']}.";
            switch ($info['http_code']) {
                case '404':
                    echo ' This is broken and should be fixed!';
                    break;
                case '301':
                    echo ' This should be checked.';
                    break;
                case '302':
                    echo ' This is probably OK.';
                    break;
            }
            echo '<br />';
        } else {
            // An http_code of 0 means no HTTP response was received
            echo 'timed out';
        }
    }
}
\ No newline at end of file
<?php
/**
 * Spider logger that records every crawled page in the assessment and prints
 * an HTML element describing it.
 */
class UNL_WDN_Assessment_PageLogger extends Spider_LoggerAbstract
{
    /**
     *
     * @var UNL_WDN_Assessment
     */
    public $assessment;

    /**
     * @param UNL_WDN_Assessment $assessment Assessment the pages are added to
     */
    function __construct(UNL_WDN_Assessment $assessment)
    {
        $this->assessment = $assessment;
    }

    /**
     * Stores the page and prints a <div> carrying its depth and validity
     * status as CSS classes.
     *
     * @param string   $uri   URI of the crawled page
     * @param int      $depth Crawl depth of the page
     * @param DOMXPath $xpath XPath object for the parsed page (unused)
     *
     * @return void
     */
    function log($uri, $depth, DOMXPath $xpath)
    {
        $this->assessment->addUri($uri);

        // The URI comes from crawled markup, so it is escaped before being
        // written into the page.
        $safeUri = htmlspecialchars($uri, ENT_QUOTES, 'UTF-8');

        echo PHP_EOL.'<div id="uri_'.md5($uri).'" class="depth_'.$depth.' '.$this->assessment->getValidityStatus($uri).'">
<span class="uri">'.$safeUri.'</span>
</div>'.PHP_EOL;
    }
}
<?php
/**
 * Validation logger that skips pages already recorded as valid.
 */
class UNL_WDN_Assessment_ValidateInvalidLogger extends UNL_WDN_Assessment_ValidationLogger
{
    /**
     * Validates the page unless the assessment already reports it as valid.
     *
     * @param string   $uri   URI of the crawled page
     * @param int      $depth Crawl depth of the page
     * @param DOMXPath $xpath XPath object for the parsed page
     *
     * @return void
     */
    function log($uri, $depth, DOMXPath $xpath)
    {
        $alreadyValid = $this->assessment->pageWasValid($uri);
        if ($alreadyValid) {
            return;
        }

        parent::log($uri, $depth, $xpath);
    }
}
\ No newline at end of file
<?php
/**
 * Spider logger that runs each crawled page through the HTML validator and
 * stores the outcome in the assessment.
 */
class UNL_WDN_Assessment_ValidationLogger extends Spider_LoggerAbstract
{
    /**
     * Validator client used for every page.
     *
     * @var Services_W3C_HTMLValidator
     */
    public $validator;

    /**
     * Assessment that receives the validation results.
     *
     * @var UNL_WDN_Assessment
     */
    public $assessment;

    /**
     * @param UNL_WDN_Assessment $assessment Assessment the results are stored in
     */
    function __construct(UNL_WDN_Assessment $assessment)
    {
        $validator = new Services_W3C_HTMLValidator();
        // Point the client at the validator instance at validator.unl.edu
        $validator->validator_uri = 'http://validator.unl.edu/check';

        $this->validator  = $validator;
        $this->assessment = $assessment;
    }

    /**
     * Validates the page and records whether it passed.
     *
     * @param string   $uri   URI of the crawled page
     * @param int      $depth Crawl depth of the page (unused)
     * @param DOMXPath $xpath XPath object for the parsed page (unused)
     *
     * @return void
     */
    function log($uri, $depth, DOMXPath $xpath)
    {
        $response = $this->validator->validate($uri);
        $isValid  = $response->isValid();

        $this->assessment->setValidationResult($uri, $isValid);
    }
}
\ No newline at end of file
<?php
/**
 * Spider logger that prints the stored validity status of each crawled page
 * as an HTML <span>.
 */
class UNL_WDN_Assessment_ValidityStatusLogger extends Spider_LoggerAbstract
{
    /**
     * Assessment the status is read from.
     *
     * @var UNL_WDN_Assessment
     */
    public $assessment;

    /**
     * @param UNL_WDN_Assessment $assessment Assessment the status is read from
     */
    function __construct(UNL_WDN_Assessment $assessment)
    {
        $this->assessment = $assessment;
    }

    /**
     * Prints the status both as a CSS class and as the element's text.
     *
     * @param string   $uri   URI of the crawled page
     * @param int      $depth Crawl depth of the page (unused)
     * @param DOMXPath $xpath XPath object for the parsed page (unused)
     *
     * @return void
     */
    function log($uri, $depth, DOMXPath $xpath)
    {
        $status = $this->assessment->getValidityStatus($uri);
        // The element id is derived from the URI so scripts can address it
        $hash   = md5($uri);

        $markup  = '<span id="validity_'.$hash.'" class=" validity '.$status.'">';
        $markup .= $status.'</span>'.PHP_EOL;

        echo $markup;
    }
}
\ No newline at end of file
/**
 * Queues a validity check for every listed URI and starts the queue.
 *
 * Each check is queued on the body element's "validation" queue, so the
 * checks run one after another rather than all at once.
 */
function validateAll()
{
    WDN.jQuery('.uri').each(function(){
        // Grab the URI as text: .html() would hand back entity-encoded
        // markup (e.g. "&amp;" for "&"), which is not the real address.
        var uri = WDN.jQuery(this).text();
        var uriDiv = WDN.jQuery(this).parent();
        WDN.jQuery('body').queue('validation', function() {
            checkValidity(uri, uriDiv);
        });
    });
    // Kick off the first queued check
    WDN.jQuery('body').dequeue('validation');
}
/**
 * Queues a validity check for every URI currently marked invalid or unknown
 * and starts the queue.
 *
 * Each check is queued on the body element's "validation" queue, so the
 * checks run one after another rather than all at once.
 */
function validateInvalid()
{
    WDN.jQuery('.false .uri, .unknown .uri').each(function(){
        // Grab the URI as text: .html() would hand back entity-encoded
        // markup (e.g. "&amp;" for "&"), which is not the real address.
        var uri = WDN.jQuery(this).text();
        var uriDiv = WDN.jQuery(this).parent();
        WDN.jQuery('body').queue('validation', function() {
            checkValidity(uri, uriDiv);
        });
    });
    // Kick off the first queued check
    WDN.jQuery('body').dequeue('validation');
}
/**
 * Requests the validation result for one URI and hands it to
 * handleJSONResult().
 *
 * @param {string} uri    Address of the page to validate
 * @param {Object} uriDiv jQuery object for the element that displays the URI
 */
function checkValidity(uri, uriDiv)
{
    uriDiv.removeClass('true false');
    // Tell the user we're loading the result
    uriDiv.append('<img class="loading" src="/wdn/templates_3.0/css/header/images/colorbox/loading.gif" />');
    // Fetch the validator results in JSON format.
    // encodeURIComponent (unlike escape) encodes "+" and emits UTF-8 percent
    // sequences, so the server receives the address unchanged.
    // NOTE(review): baseURI is inserted as-is; confirm it is already URL-safe.
    WDN.get('validator.php?base='+baseURI+'&u='+encodeURIComponent(uri), null, function(result) {
        handleJSONResult(result, uriDiv);
    }, 'json');
}
/**
 * Applies a validation result to the element showing the URI and lets the
 * next queued validation start.
 *
 * @param {Object} result Response object; its `validity` member decides the outcome
 * @param {Object} uriDiv jQuery object for the element that displays the URI
 */
function handleJSONResult(result, uriDiv)
{
    WDN.log(result);

    // The request is finished: drop the spinner
    uriDiv.children('.loading').remove();

    // Advance the queue before touching the classes, as the next check
    // does not depend on this element
    WDN.jQuery('body').dequeue('validation');

    // The class name doubles as the status: 'true' for valid, 'false' otherwise
    var statusClass = result.validity ? 'true' : 'false';
    uriDiv.addClass(statusClass);
}
<?php
/**
 * Loads a class by turning its name into a path on the include_path:
 * underscores become directory separators and ".php" is appended.
 *
 * @param string $class Name of the class to load
 *
 * @return void
 */
function autoload($class)
{
    $relativePath = str_replace('_', '/', $class) . '.php';

    include $relativePath;
}
// Register autoload() so classes are located on the include_path on first use
spl_autoload_register("autoload");
// Search ../src and ../lib/php (relative to this file) for class files.
// The previous include_path value is replaced, not extended.
set_include_path(dirname(__FILE__).'/../src'.PATH_SEPARATOR.dirname(__FILE__).'/../lib/php');
// Database handle shared by the scripts that include this file.
// NOTE(review): host, database name and credentials are hard-coded here.
$db = new PDO(
    'mysql:host=localhost;dbname=wdn',
    'wdn',
    'wdn'
);
// Display every error, including notices.
// NOTE(review): this only takes effect for code that runs after this point.
ini_set('display_errors', true);
error_reporting(E_ALL);
\ No newline at end of file