From 5fec2c10f27943ce0b5a2bba281395a69a2f0a31 Mon Sep 17 00:00:00 2001
From: Laurent Destailleur <eldy@users.sourceforge.net>
Date: Fri, 26 Aug 2011 19:09:02 +0000
Subject: [PATCH] New: Enhance error management and support Atom feeds in rss
 module.

---
 htdocs/core/class/rssparser.class.php | 445 +++-----------------------
 1 file changed, 44 insertions(+), 401 deletions(-)

diff --git a/htdocs/core/class/rssparser.class.php b/htdocs/core/class/rssparser.class.php
index 8a5776a9330..4a8277f52bf 100755
--- a/htdocs/core/class/rssparser.class.php
+++ b/htdocs/core/class/rssparser.class.php
@@ -19,14 +19,14 @@
  *      \file       htdocs/core/class/rssparser.class.php
  *      \ingroup    core
  *      \brief      File of class to parse rss feeds
- *      \version    $Id: rssparser.class.php,v 1.5 2011/08/26 23:06:16 eldy Exp $
+ *      \version    $Id: rssparser.class.php,v 1.3 2011/08/26 19:09:02 eldy Exp $
  */
 class RssParser
 {
     var $db;
     var $error;
 
-	protected $_format='';
+	protected $_format='rss';
 	protected $_urlRSS;
 	protected $_language;
 	protected $_generator;
@@ -53,12 +53,6 @@ class RssParser
 	public function getLastFetchDate() { return $this->_lastfetchdate; }
 	public function getItems()         { return $this->_rssarray; }
 
-
-	// For parsing with xmlparser
-    var $stack               = array(); // parser stack
-    var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
-
-
 	/**
 	 * 		Constructor
 	 */
@@ -81,8 +75,6 @@ class RssParser
 	{
 	    include_once(DOL_DOCUMENT_ROOT.'/lib/files.lib.php');
 
-	    $str='';    // This will contain content of feed
-
 	    // Check parameters
 	    if (! dol_is_url($urlRSS))
 	    {
@@ -115,54 +107,41 @@ class RssParser
 			}
         }
 
-		// Load file into $str
+		// Load file into $rss
 		if ($foundintocache)    // Cache file found and is not too old
 		{
 		    $str = file_get_contents($newpathofdestfile);
+		    $rss = simplexml_load_string(unserialize($str));
 		}
 		else
 		{
 		    try {
 		        ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");
                 ini_set("max_execution_time", 10);
-                $str = file_get_contents($this->_urlRSS);
+		        if (! empty($conf->global->MAIN_SIMPLEXMLLOAD_DEBUG)) $rss = simplexml_load_file($this->_urlRSS);
+		        else
+		        {
+		            //libxml_use_internal_errors(false);
+		            $rss = @simplexml_load_file($this->_urlRSS);
+		        }
 		    }
 		    catch (Exception $e) {
 		         print 'Error retrieving URL '.$this->urlRSS.' - '.$e->getMessage();
 		    }
 		}
 
-		// Convert $str into xml
-		if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
-        {
-            //print 'xx'.LIBXML_NOCDATA;
-            libxml_use_internal_errors(false);
-            $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
-        }
-        else
-        {
-            $xmlparser=xml_parser_create('');
-            if (!is_resource($xmlparser)) { $this->error="ErrorFailedToCreateParser"; return -1; }
-
-            xml_set_object( $xmlparser, $this );
-            xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element' );
-            xml_set_character_data_handler( $xmlparser, 'feed_cdata' );
-            $status = xml_parse( $xmlparser, $str );
-            xml_parser_free( $xmlparser );
-            $rss=$this;
-            //var_dump($this);exit;
-        }
-
 		// If $rss loaded
 		if ($rss)
 		{
+		    $items=array();
+
 		    // Save file into cache
 		    if (empty($foundintocache) && $cachedir)
 		    {
 				dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk.");
 		        if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
 		        $fp = fopen($newpathofdestfile, 'w');
-                fwrite($fp, $str);
+                fwrite($fp, serialize($rss->asXML()));
                 fclose($fp);
 		        if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
 		        @chmod($newpathofdestfile, octdec($newmask));
@@ -170,132 +149,60 @@ class RssParser
 		        $this->_lastfetchdate=$nowgmt;
 		    }
 
-		    unset($str);    // Free memory
-
-		    if (empty($rss->_format))    // If format not detected automatically
-		    {
-		        $rss->_format='rss';
-		        if (empty($rss->channel)) $rss->_format='atom';
-		    }
-
-		    $items=array();
+		    $rss->_format='rss';
+		    if (empty($rss->channel)) $rss->_format='atom';
 
 		    // Save description entries
 			if ($rss->_format == 'rss')
 			{
-			    //var_dump($rss);
-    			if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
-    			{
-        			if (!empty($rss->channel->language))      $this->_language = (string) $rss->channel->language;
-        			if (!empty($rss->channel->generator))     $this->_generator = (string) $rss->channel->generator;
-        			if (!empty($rss->channel->copyright))     $this->_copyright = (string) $rss->channel->copyright;
-        			if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate;
-        			if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0];
-        			if (!empty($rss->channel->link))		  $this->_link = (string) $rss->channel->link;
-        			if (!empty($rss->channel->title))         $this->_title = (string) $rss->channel->title;
-        			if (!empty($rss->channel->description))	  $this->_description = (string) $rss->channel->description;
-    			}
-    			else
-    			{
-        			if (!empty($rss->channel['rss_language']))      $this->_language = (string) $rss->channel['rss_language'];
-        			if (!empty($rss->channel['rss_generator']))     $this->_generator = (string) $rss->channel['rss_generator'];
-        			if (!empty($rss->channel['rss_copyright']))     $this->_copyright = (string) $rss->channel['rss_copyright'];
-        			if (!empty($rss->channel['rss_lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['rss_lastbuilddate'];
-        			if (!empty($rss->image['rss_url']))             $this->_imageurl = (string) $rss->image['rss_url'];
-        			if (!empty($rss->channel['rss_link']))		    $this->_link = (string) $rss->channel['rss_link'];
-        			if (!empty($rss->channel['rss_title']))         $this->_title = (string) $rss->channel['rss_title'];
-        			if (!empty($rss->channel['rss_description']))   $this->_description = (string) $rss->channel['rss_description'];
-    			}
-
-    			if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) $items=$rss->channel->item;    // With simplexml
-    			else $items=$rss->items;                                                              // With xmlparse
-    			//var_dump($items);exit;
+    			if (!empty($rss->channel->language))      $this->_language = (string) $rss->channel->language;
+    			if (!empty($rss->channel->generator))     $this->_generator = (string) $rss->channel->generator;
+    			if (!empty($rss->channel->copyright))     $this->_copyright = (string) $rss->channel->copyright;
+    			if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate;
+    			if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0];
+    			if (!empty($rss->channel->link))		  $this->_link = (string) $rss->channel->link;
+    			if (!empty($rss->channel->title))         $this->_title = (string) $rss->channel->title;
+    			if (!empty($rss->channel->description))	  $this->_description = (string) $rss->channel->description;
+    			$items=$rss->channel->item;
 			}
 			else if ($rss->_format == 'atom')
 			{
-			    //var_dump($rss);
-    			if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
-    			{
-    			    if (!empty($rss->generator))     $this->_generator = (string) $rss->generator;
-        			if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified;
-        			if (!empty($rss->link->href))    $this->_link = (string) $rss->link->href;
-        			if (!empty($rss->title))         $this->_title = (string) $rss->title;
-        			if (!empty($rss->description))	 $this->_description = (string) $rss->description;
-    			}
-    			else
-    			{
-        			//if (!empty($rss->channel['rss_language']))      $this->_language = (string) $rss->channel['rss_language'];
-        			if (!empty($rss->channel['generator']))     $this->_generator = (string) $rss->channel['generator'];
-        			//if (!empty($rss->channel['rss_copyright']))     $this->_copyright = (string) $rss->channel['rss_copyright'];
-        			if (!empty($rss->channel['modified'])) $this->_lastbuilddate = (string) $rss->channel['modified'];
-        			//if (!empty($rss->image['rss_url']))             $this->_imageurl = (string) $rss->image['rss_url'];
-        			if (!empty($rss->channel['link']))		    $this->_link = (string) $rss->channel['link'];
-        			if (!empty($rss->channel['title']))         $this->_title = (string) $rss->channel['title'];
-        			//if (!empty($rss->channel['rss_description']))   $this->_description = (string) $rss->channel['rss_description'];
-    			}
-    			if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))  { $tmprss=xml2php($rss); $items=$tmprss['entry'];} // With simplexml
-    			else $items=$rss->items;                                                              // With xmlparse
-    			//var_dump($items);exit;
+    			if (!empty($rss->generator))     $this->_generator = (string) $rss->generator;
+    			if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified;
+    			if (!empty($rss->link->href))    $this->_link = (string) $rss->link->href;
+    			if (!empty($rss->title))         $this->_title = (string) $rss->title;
+    			if (!empty($rss->description))	 $this->_description = (string) $rss->description;
+    			$tmprss=xml2php($rss);
+    			$items=$tmprss['entry'];
 			}
 
 			$i = 0;
 			// Loop on each record
 			foreach($items as $item)
 			{
-			    //var_dump($item);exit;
     			if ($rss->_format == 'rss')
     			{
-    			    if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
-    			    {
-        			    $itemLink = (string) $item->link;
-        			    $itemTitle = (string) $item->title;
-        				$itemDescription = (string) $item->description;
-        			    $itemPubDate = (string) $item->pubDate;
-                        $itemId = '';
-                        $itemAuthor = '';
-    			    }
-    			    else
-    			    {
-        			    $itemLink = (string) $item['rss_link'];
-        			    $itemTitle = (string) $item['rss_title'];
-        				$itemDescription = (string) $item['rss_description'];
-        			    $itemPubDate = (string) $item['rss_pubdate'];
-                        $itemId = (string) $item['rss_guid'];
-                        $itemAuthor = (string) $item['rss_author'];
-    			    }
+    			    $itemLink = (string) $item->link;
+    			    $itemTitle = (string) $item->title;
+    				$itemDescription = (string) $item->description;
+    			    $itemPubDate = (string) $item->pubDate;
+                    $itemId = '';
 
     				// Loop on each category
     				$itemCategory=array();
-    				if (is_array($item->category))
+    				foreach ($item->category as $cat)
     				{
-        				foreach ($item->category as $cat)
-        				{
-        					$itemCategory[] = (string) $cat;
-        				}
+    					$itemCategory[] = (string) $cat;
     				}
     			}
     			else if ($rss->_format == 'atom')
     			{
-    			    if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
-    			    {
-    			        $itemLink = (string) $item['link']['href'];
-        			    $itemTitle = (string) $item['title'];
-        				$itemDescription = (string) $item['summary'];
-        			    $itemPubDate = (string) $item['created'];
-                        $itemId = (string) $item['id'];
-                        $itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
-    			    }
-    			    else
-    			    {
-        			    $itemLink = (string) $item['link']['href'];
-        			    $itemTitle = (string) $item['title'];
-        				$itemDescription = (string) $item['summary'];
-        			    $itemPubDate = (string) $item['created'];
-                        $itemId = (string) $item['id'];
-                        $itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
-    			    }
+    			    $itemLink = (string) $item['link']['href'];
+    			    $itemTitle = (string) $item['title'];
+    				$itemDescription = (string) $item['summary'];
+    			    $itemPubDate = (string) $item['created'];
+                    $itemId = (string) $item['id'];
     			}
-    			else print 'ErrorBadFeedFormat';
 
 				// Add record to result array
 				$this->_rssarray[$i] = array(
@@ -304,8 +211,7 @@ class RssParser
 					'description'=>$itemDescription,
 					'pubDate'=>$itemPubDate,
 					'category'=>$itemCategory,
-				    'id'=>$itemId,
-				    'author'=>$itemAuthor);
+				    'id'=>$itemId);
 
 				$i++;
 
@@ -321,269 +227,6 @@ class RssParser
 		}
 	}
 
-
-
-	/**
-	 * 	Triggered when opened tag is found
-	 *
-	 * 	@param		$p
-	 *  @param		$element	Tag
-	 *  @param		$attrs		Attributes of tags
-	 */
-    function feed_start_element($p, $element, &$attrs)
-    {
-        $el = $element = strtolower($element);
-        $attrs = array_change_key_case($attrs, CASE_LOWER);
-
-        // check for a namespace, and split if found
-        $ns = false;
-        if ( strpos( $element, ':' ) ) {
-            list($ns, $el) = explode( ':', $element, 2);
-        }
-        if ( $ns and $ns != 'rdf' ) {
-            $this->current_namespace = $ns;
-        }
-
-        // if feed type isn't set, then this is first element of feed identify feed from root element
-        if (empty($this->_format))
-        {
-            if ( $el == 'rdf' ) {
-                $this->_format = 'rss';
-                $this->feed_version = '1.0';
-            }
-            elseif ( $el == 'rss' ) {
-                $this->_format = 'rss';
-                $this->feed_version = $attrs['version'];
-            }
-            elseif ( $el == 'feed' ) {
-                $this->_format = 'atom';
-                $this->feed_version = $attrs['version'];
-                $this->inchannel = true;
-            }
-            return;
-        }
-
-        if ( $el == 'channel' )
-        {
-            $this->inchannel = true;
-        }
-        elseif ($el == 'item' or $el == 'entry' )
-        {
-            $this->initem = true;
-            if ( isset($attrs['rdf:about']) ) {
-                $this->current_item['about'] = $attrs['rdf:about'];
-            }
-        }
-
-        // if we're in the default namespace of an RSS feed,
-        //  record textinput or image fields
-        elseif (
-            $this->_format == 'rss' and
-            $this->current_namespace == '' and
-            $el == 'textinput' )
-        {
-            $this->intextinput = true;
-        }
-
-        elseif (
-            $this->_format == 'rss' and
-            $this->current_namespace == '' and
-            $el == 'image' )
-        {
-            $this->inimage = true;
-        }
-
-        # handle atom content constructs
-        elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
-        {
-            // avoid clashing w/ RSS mod_content
-            if ($el == 'content' ) {
-                $el = 'atom_content';
-            }
-
-            $this->incontent = $el;
-
-
-        }
-
-        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
-        elseif ($this->_format == 'atom' and $this->incontent )
-        {
-            // if tags are inlined, then flatten
-            $attrs_str = join(' ',
-                    array_map('map_attrs',
-                    array_keys($attrs),
-                    array_values($attrs) ) );
-
-            $this->append_content( "<$element $attrs_str>"  );
-
-            array_unshift( $this->stack, $el );
-        }
-
-        // Atom support many links per containging element.
-        // Magpie treats link elements of type rel='alternate'
-        // as being equivalent to RSS's simple link element.
-        //
-        elseif ($this->_format == 'atom' and $el == 'link' )
-        {
-            if ( isset($attrs['rel']) and $attrs['rel'] == 'alternate' )
-            {
-                $link_el = 'link';
-            }
-            else {
-                $link_el = 'link_' . $attrs['rel'];
-            }
-
-            $this->append($link_el, $attrs['href']);
-        }
-        // set stack[0] to current element
-        else {
-            array_unshift($this->stack, $el);
-        }
-    }
-
-
-	/**
-	 * 	Triggered when CDATA is found
-	 *
-	 * 	@param		$p
-	 *  @param		$element	Tag
-	 *  @param		$attrs		Attributes of tags
-	 */
-    function feed_cdata ($p, $text) {
-        if ($this->_format == 'atom' and $this->incontent)
-        {
-            $this->append_content( $text );
-        }
-        else {
-            $current_el = join('_', array_reverse($this->stack));
-            $this->append($current_el, $text);
-        }
-    }
-
-	/**
-	 * 	Triggered when closed tag is found
-	 *
-	 * 	@param		$p
-	 *  @param		$element	Tag
-	 */
-    function feed_end_element ($p, $el) {
-        $el = strtolower($el);
-
-        if ( $el == 'item' or $el == 'entry' )
-        {
-            $this->items[] = $this->current_item;
-            $this->current_item = array();
-            $this->initem = false;
-        }
-        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
-        {
-            $this->intextinput = false;
-        }
-        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
-        {
-            $this->inimage = false;
-        }
-        elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
-        {
-            $this->incontent = false;
-        }
-        elseif ($el == 'channel' or $el == 'feed' )
-        {
-            $this->inchannel = false;
-        }
-        elseif ($this->_format == 'atom' and $this->incontent  ) {
-            // balance tags properly
-            // note:  i don't think this is actually neccessary
-            if ( $this->stack[0] == $el )
-            {
-                $this->append_content("</$el>");
-            }
-            else {
-                $this->append_content("<$el />");
-            }
-
-            array_shift( $this->stack );
-        }
-        else {
-            array_shift( $this->stack );
-        }
-
-        $this->current_namespace = false;
-    }
-
-
-	/**
-	 * 	To concat 2 string with no warning if an operand is not defined
-	 *
-	 * 	@param		$str1
-	 *  @param		$str2
-	 */
-    function concat (&$str1, $str2="") {
-        if (!isset($str1) ) {
-            $str1="";
-        }
-        $str1 .= $str2;
-    }
-
-	/**
-	 */
-    function append_content($text) {
-        if ( $this->initem ) {
-            $this->concat( $this->current_item[ $this->incontent ], $text );
-        }
-        elseif ( $this->inchannel ) {
-            $this->concat( $this->channel[ $this->incontent ], $text );
-        }
-    }
-
-	/**
-	 * 	smart append - field and namespace aware
-	 */
-    function append($el, $text) {
-        if (!$el) {
-            return;
-        }
-        if ( $this->current_namespace )
-        {
-            if ( $this->initem ) {
-                $this->concat(
-                    $this->current_item[ $this->current_namespace ][ $el ], $text);
-            }
-            elseif ($this->inchannel) {
-                $this->concat(
-                    $this->channel[ $this->current_namespace][ $el ], $text );
-            }
-            elseif ($this->intextinput) {
-                $this->concat(
-                    $this->textinput[ $this->current_namespace][ $el ], $text );
-            }
-            elseif ($this->inimage) {
-                $this->concat(
-                    $this->image[ $this->current_namespace ][ $el ], $text );
-            }
-        }
-        else {
-            if ( $this->initem ) {
-                $this->concat(
-                    $this->current_item[ $el ], $text);
-            }
-            elseif ($this->intextinput) {
-                $this->concat(
-                    $this->textinput[ $el ], $text );
-            }
-            elseif ($this->inimage) {
-                $this->concat(
-                    $this->image[ $el ], $text );
-            }
-            elseif ($this->inchannel) {
-                $this->concat(
-                    $this->channel[ $el ], $text );
-            }
-
-        }
-    }
-
 }
 
 
-- 
GitLab