Skip to content
Snippets Groups Projects
Commit 5fec2c10 authored by Laurent Destailleur's avatar Laurent Destailleur
Browse files

New: Enhance error management and support Atom feeds in rss module.

parent 16564de3
No related branches found
No related tags found
No related merge requests found
...@@ -19,14 +19,14 @@ ...@@ -19,14 +19,14 @@
* \file htdocs/core/class/rssparser.class.php * \file htdocs/core/class/rssparser.class.php
* \ingroup core * \ingroup core
* \brief File of class to parse rss feeds * \brief File of class to parse rss feeds
* \version $Id: rssparser.class.php,v 1.5 2011/08/26 23:06:16 eldy Exp $ * \version $Id: rssparser.class.php,v 1.3 2011/08/26 19:09:02 eldy Exp $
*/ */
class RssParser class RssParser
{ {
var $db; var $db;
var $error; var $error;
protected $_format=''; protected $_format='rss';
protected $_urlRSS; protected $_urlRSS;
protected $_language; protected $_language;
protected $_generator; protected $_generator;
...@@ -53,12 +53,6 @@ class RssParser ...@@ -53,12 +53,6 @@ class RssParser
public function getLastFetchDate() { return $this->_lastfetchdate; } public function getLastFetchDate() { return $this->_lastfetchdate; }
public function getItems() { return $this->_rssarray; } public function getItems() { return $this->_rssarray; }
// For parsing with xmlparser
var $stack = array(); // parser stack
var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
/** /**
* Constructor * Constructor
*/ */
...@@ -81,8 +75,6 @@ class RssParser ...@@ -81,8 +75,6 @@ class RssParser
{ {
include_once(DOL_DOCUMENT_ROOT.'/lib/files.lib.php'); include_once(DOL_DOCUMENT_ROOT.'/lib/files.lib.php');
$str=''; // This will contain content of feed
// Check parameters // Check parameters
if (! dol_is_url($urlRSS)) if (! dol_is_url($urlRSS))
{ {
...@@ -115,54 +107,41 @@ class RssParser ...@@ -115,54 +107,41 @@ class RssParser
} }
} }
// Load file into $str // Load file into $rss
if ($foundintocache) // Cache file found and is not too old if ($foundintocache) // Cache file found and is not too old
{ {
$str = file_get_contents($newpathofdestfile); $str = file_get_contents($newpathofdestfile);
$rss = simplexml_load_string(unserialize($str));
} }
else else
{ {
try { try {
ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"); ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");
ini_set("max_execution_time", 10); ini_set("max_execution_time", 10);
$str = file_get_contents($this->_urlRSS); if (! empty($conf->global->MAIN_SIMPLEXMLLOAD_DEBUG)) $rss = simplexml_load_file($this->_urlRSS);
else
{
//libxml_use_internal_errors(false);
$rss = @simplexml_load_file($this->_urlRSS);
}
} }
catch (Exception $e) { catch (Exception $e) {
print 'Error retrieving URL '.$this->urlRSS.' - '.$e->getMessage(); print 'Error retrieving URL '.$this->urlRSS.' - '.$e->getMessage();
} }
} }
// Convert $str into xml
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
{
//print 'xx'.LIBXML_NOCDATA;
libxml_use_internal_errors(false);
$rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
}
else
{
$xmlparser=xml_parser_create('');
if (!is_resource($xmlparser)) { $this->error="ErrorFailedToCreateParser"; return -1; }
xml_set_object( $xmlparser, $this );
xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element' );
xml_set_character_data_handler( $xmlparser, 'feed_cdata' );
$status = xml_parse( $xmlparser, $str );
xml_parser_free( $xmlparser );
$rss=$this;
//var_dump($this);exit;
}
// If $rss loaded // If $rss loaded
if ($rss) if ($rss)
{ {
$items=array();
// Save file into cache // Save file into cache
if (empty($foundintocache) && $cachedir) if (empty($foundintocache) && $cachedir)
{ {
dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk."); dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk.");
if (! dol_is_dir($cachedir)) dol_mkdir($cachedir); if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
$fp = fopen($newpathofdestfile, 'w'); $fp = fopen($newpathofdestfile, 'w');
fwrite($fp, $str); fwrite($fp, serialize($rss->asXML()));
fclose($fp); fclose($fp);
if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK; if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
@chmod($newpathofdestfile, octdec($newmask)); @chmod($newpathofdestfile, octdec($newmask));
...@@ -170,21 +149,11 @@ class RssParser ...@@ -170,21 +149,11 @@ class RssParser
$this->_lastfetchdate=$nowgmt; $this->_lastfetchdate=$nowgmt;
} }
unset($str); // Free memory
if (empty($rss->_format)) // If format not detected automatically
{
$rss->_format='rss'; $rss->_format='rss';
if (empty($rss->channel)) $rss->_format='atom'; if (empty($rss->channel)) $rss->_format='atom';
}
$items=array();
// Save description entries // Save description entries
if ($rss->_format == 'rss') if ($rss->_format == 'rss')
{
//var_dump($rss);
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
{ {
if (!empty($rss->channel->language)) $this->_language = (string) $rss->channel->language; if (!empty($rss->channel->language)) $this->_language = (string) $rss->channel->language;
if (!empty($rss->channel->generator)) $this->_generator = (string) $rss->channel->generator; if (!empty($rss->channel->generator)) $this->_generator = (string) $rss->channel->generator;
...@@ -194,108 +163,46 @@ class RssParser ...@@ -194,108 +163,46 @@ class RssParser
if (!empty($rss->channel->link)) $this->_link = (string) $rss->channel->link; if (!empty($rss->channel->link)) $this->_link = (string) $rss->channel->link;
if (!empty($rss->channel->title)) $this->_title = (string) $rss->channel->title; if (!empty($rss->channel->title)) $this->_title = (string) $rss->channel->title;
if (!empty($rss->channel->description)) $this->_description = (string) $rss->channel->description; if (!empty($rss->channel->description)) $this->_description = (string) $rss->channel->description;
} $items=$rss->channel->item;
else
{
if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language'];
if (!empty($rss->channel['rss_generator'])) $this->_generator = (string) $rss->channel['rss_generator'];
if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright'];
if (!empty($rss->channel['rss_lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['rss_lastbuilddate'];
if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url'];
if (!empty($rss->channel['rss_link'])) $this->_link = (string) $rss->channel['rss_link'];
if (!empty($rss->channel['rss_title'])) $this->_title = (string) $rss->channel['rss_title'];
if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description'];
}
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) $items=$rss->channel->item; // With simplexml
else $items=$rss->items; // With xmlparse
//var_dump($items);exit;
} }
else if ($rss->_format == 'atom') else if ($rss->_format == 'atom')
{
//var_dump($rss);
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
{ {
if (!empty($rss->generator)) $this->_generator = (string) $rss->generator; if (!empty($rss->generator)) $this->_generator = (string) $rss->generator;
if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified; if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified;
if (!empty($rss->link->href)) $this->_link = (string) $rss->link->href; if (!empty($rss->link->href)) $this->_link = (string) $rss->link->href;
if (!empty($rss->title)) $this->_title = (string) $rss->title; if (!empty($rss->title)) $this->_title = (string) $rss->title;
if (!empty($rss->description)) $this->_description = (string) $rss->description; if (!empty($rss->description)) $this->_description = (string) $rss->description;
} $tmprss=xml2php($rss);
else $items=$tmprss['entry'];
{
//if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language'];
if (!empty($rss->channel['generator'])) $this->_generator = (string) $rss->channel['generator'];
//if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright'];
if (!empty($rss->channel['modified'])) $this->_lastbuilddate = (string) $rss->channel['modified'];
//if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url'];
if (!empty($rss->channel['link'])) $this->_link = (string) $rss->channel['link'];
if (!empty($rss->channel['title'])) $this->_title = (string) $rss->channel['title'];
//if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description'];
}
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) { $tmprss=xml2php($rss); $items=$tmprss['entry'];} // With simplexml
else $items=$rss->items; // With xmlparse
//var_dump($items);exit;
} }
$i = 0; $i = 0;
// Loop on each record // Loop on each record
foreach($items as $item) foreach($items as $item)
{ {
//var_dump($item);exit;
if ($rss->_format == 'rss') if ($rss->_format == 'rss')
{
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
{ {
$itemLink = (string) $item->link; $itemLink = (string) $item->link;
$itemTitle = (string) $item->title; $itemTitle = (string) $item->title;
$itemDescription = (string) $item->description; $itemDescription = (string) $item->description;
$itemPubDate = (string) $item->pubDate; $itemPubDate = (string) $item->pubDate;
$itemId = ''; $itemId = '';
$itemAuthor = '';
}
else
{
$itemLink = (string) $item['rss_link'];
$itemTitle = (string) $item['rss_title'];
$itemDescription = (string) $item['rss_description'];
$itemPubDate = (string) $item['rss_pubdate'];
$itemId = (string) $item['rss_guid'];
$itemAuthor = (string) $item['rss_author'];
}
// Loop on each category // Loop on each category
$itemCategory=array(); $itemCategory=array();
if (is_array($item->category))
{
foreach ($item->category as $cat) foreach ($item->category as $cat)
{ {
$itemCategory[] = (string) $cat; $itemCategory[] = (string) $cat;
} }
} }
}
else if ($rss->_format == 'atom') else if ($rss->_format == 'atom')
{
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
{
$itemLink = (string) $item['link']['href'];
$itemTitle = (string) $item['title'];
$itemDescription = (string) $item['summary'];
$itemPubDate = (string) $item['created'];
$itemId = (string) $item['id'];
$itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
}
else
{ {
$itemLink = (string) $item['link']['href']; $itemLink = (string) $item['link']['href'];
$itemTitle = (string) $item['title']; $itemTitle = (string) $item['title'];
$itemDescription = (string) $item['summary']; $itemDescription = (string) $item['summary'];
$itemPubDate = (string) $item['created']; $itemPubDate = (string) $item['created'];
$itemId = (string) $item['id']; $itemId = (string) $item['id'];
$itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
}
} }
else print 'ErrorBadFeedFormat';
// Add record to result array // Add record to result array
$this->_rssarray[$i] = array( $this->_rssarray[$i] = array(
...@@ -304,8 +211,7 @@ class RssParser ...@@ -304,8 +211,7 @@ class RssParser
'description'=>$itemDescription, 'description'=>$itemDescription,
'pubDate'=>$itemPubDate, 'pubDate'=>$itemPubDate,
'category'=>$itemCategory, 'category'=>$itemCategory,
'id'=>$itemId, 'id'=>$itemId);
'author'=>$itemAuthor);
$i++; $i++;
...@@ -321,269 +227,6 @@ class RssParser ...@@ -321,269 +227,6 @@ class RssParser
} }
} }
/**
 * Triggered when an opening tag is found (start-element handler of the XML parser).
 *
 * Detects the feed format from the root element, tracks which section of the
 * feed is being read (channel, item/entry, textinput, image, Atom content
 * construct) and pushes ordinary elements on the parser stack.
 *
 * @param $p          XML parser that triggered the event (not used)
 * @param $element    Tag name, optionally prefixed with a namespace ("ns:tag")
 * @param $attrs      Attributes of the tag (keys are converted to lower case)
 */
function feed_start_element($p, $element, &$attrs)
{
    $el = $element = strtolower($element);
    $attrs = array_change_key_case($attrs, CASE_LOWER);

    // Check for a namespace, and split if found
    $ns = false;
    if (strpos($element, ':')) {
        list($ns, $el) = explode(':', $element, 2);
    }
    if ($ns && $ns != 'rdf') {
        $this->current_namespace = $ns;
    }

    // If feed type isn't set, then this is the first element of the feed: identify feed from root element
    if (empty($this->_format))
    {
        // The version attribute is optional (Atom 1.0 feeds do not carry it), so read it without notice
        $version = isset($attrs['version']) ? $attrs['version'] : null;

        if ($el == 'rdf') {
            $this->_format = 'rss';
            $this->feed_version = '1.0';
        }
        elseif ($el == 'rss') {
            $this->_format = 'rss';
            $this->feed_version = $version;
        }
        elseif ($el == 'feed') {
            $this->_format = 'atom';
            $this->feed_version = $version;
            $this->inchannel = true;
        }
        return;
    }

    if ($el == 'channel')
    {
        $this->inchannel = true;
    }
    elseif ($el == 'item' || $el == 'entry')
    {
        $this->initem = true;
        if (isset($attrs['rdf:about'])) {
            $this->current_item['about'] = $attrs['rdf:about'];
        }
    }
    // If we're in the default namespace of an RSS feed, record textinput or image fields
    elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'textinput')
    {
        $this->intextinput = true;
    }
    elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'image')
    {
        $this->inimage = true;
    }
    // Handle atom content constructs
    elseif ($this->_format == 'atom' && in_array($el, $this->_CONTENT_CONSTRUCTS))
    {
        // Avoid clashing with RSS mod_content
        if ($el == 'content') {
            $el = 'atom_content';
        }
        $this->incontent = $el;
    }
    // If inside an Atom content construct (e.g. content or summary) field, treat tags as text
    elseif ($this->_format == 'atom' && $this->incontent)
    {
        // If tags are inlined, then flatten them back to a name="value" list.
        // Built inline so this method does not depend on an external map_attrs() helper.
        $pairs = array();
        foreach ($attrs as $name => $value) {
            $pairs[] = $name.'="'.$value.'"';
        }
        $attrs_str = join(' ', $pairs);

        $this->append_content("<$element $attrs_str>");
        array_unshift($this->stack, $el);
    }
    // Atom supports many links per containing element.
    // Link elements with rel='alternate' are treated as being equivalent to
    // RSS's simple link element.
    elseif ($this->_format == 'atom' && $el == 'link')
    {
        // A link without rel attribute must be interpreted as rel='alternate' (RFC 4287)
        $rel = isset($attrs['rel']) ? $attrs['rel'] : 'alternate';
        $href = isset($attrs['href']) ? $attrs['href'] : '';

        $link_el = ($rel == 'alternate') ? 'link' : 'link_'.$rel;
        $this->append($link_el, $href);
    }
    // Set stack[0] to current element
    else {
        array_unshift($this->stack, $el);
    }
}
/**
 * Triggered when character data is found (character-data handler of the XML parser).
 *
 * Inside an Atom content construct the text is appended to that construct;
 * otherwise it is appended to the field named after the current parser stack.
 *
 * @param $p      XML parser that triggered the event (not used)
 * @param $text   Text found
 */
function feed_cdata ($p, $text) {
    $insideAtomContent = ($this->_format == 'atom' && $this->incontent);

    if (! $insideAtomContent) {
        // Field name is the stack from outermost to innermost element, joined with '_'
        $fieldName = implode('_', array_reverse($this->stack));
        $this->append($fieldName, $text);
        return;
    }

    $this->append_content($text);
}
/**
 * Triggered when a closing tag is found (end-element handler of the XML parser).
 *
 * Stores the finished item/entry, leaves the section that is being closed
 * and pops the parser stack for ordinary elements.
 *
 * @param $p      XML parser that triggered the event (not used)
 * @param $el     Tag being closed
 */
function feed_end_element ($p, $el) {
    $el = strtolower($el);

    if ($el == 'item' || $el == 'entry') {
        // Record is complete: save it and start with a fresh one
        $this->items[] = $this->current_item;
        $this->current_item = array();
        $this->initem = false;
    }
    elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'textinput') {
        $this->intextinput = false;
    }
    elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'image') {
        $this->inimage = false;
    }
    elseif ($this->_format == 'atom' && in_array($el, $this->_CONTENT_CONSTRUCTS)) {
        $this->incontent = false;
    }
    elseif ($el == 'channel' || $el == 'feed') {
        $this->inchannel = false;
    }
    else {
        if ($this->_format == 'atom' && $this->incontent) {
            // Tag inlined into an Atom content construct: write it back as text,
            // as a closing tag if it matches the top of the stack, else as an empty tag
            $this->append_content(($this->stack[0] == $el) ? "</$el>" : "<$el />");
        }
        array_shift($this->stack);
    }

    $this->current_namespace = false;
}
/**
 * Append a string to another one, with no warning if the first operand is not defined yet
 *
 * @param $str1   String to complete (modified by reference, created as empty string if not set)
 * @param $str2   String to add at the end of $str1
 */
function concat (&$str1, $str2="") {
    $str1 = (isset($str1) ? $str1 : "") . $str2;
}
/**
 * Append text to the content construct currently open ($this->incontent),
 * in the current item if we are inside an item, else in the channel.
 *
 * @param $text   Text to append
 */
function append_content($text) {
    if ($this->initem) {
        $this->concat($this->current_item[$this->incontent], $text);
        return;
    }

    if ($this->inchannel) {
        $this->concat($this->channel[$this->incontent], $text);
    }
}
/**
 * Smart append - field and namespace aware.
 *
 * Appends text to field $el of the section currently being read. When a
 * namespace is active, the field is stored under a sub-array named after it.
 *
 * @param $el     Name of field to complete (nothing is done if empty)
 * @param $text   Text to append
 */
function append($el, $text) {
    if (!$el) {
        return;
    }

    $ns = $this->current_namespace;

    if ($ns) {
        // Namespaced field: item first, then channel, textinput, image
        if ($this->initem) {
            $this->concat($this->current_item[$ns][$el], $text);
        }
        elseif ($this->inchannel) {
            $this->concat($this->channel[$ns][$el], $text);
        }
        elseif ($this->intextinput) {
            $this->concat($this->textinput[$ns][$el], $text);
        }
        elseif ($this->inimage) {
            $this->concat($this->image[$ns][$el], $text);
        }
        return;
    }

    // Default namespace: item first, then textinput, image and finally channel
    if ($this->initem) {
        $this->concat($this->current_item[$el], $text);
    }
    elseif ($this->intextinput) {
        $this->concat($this->textinput[$el], $text);
    }
    elseif ($this->inimage) {
        $this->concat($this->image[$el], $text);
    }
    elseif ($this->inchannel) {
        $this->concat($this->channel[$el], $text);
    }
}
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment