From 933c59c405a303bbbc2f7e5b87d913b4be06f95b Mon Sep 17 00:00:00 2001 From: Laurent Destailleur <eldy@users.sourceforge.net> Date: Fri, 26 Aug 2011 22:38:27 +0000 Subject: [PATCH] Rss parser can use xml_parse or simplexml functions --- htdocs/core/class/rssparser.class.php | 442 +++++++++++++++++++++++--- 1 file changed, 399 insertions(+), 43 deletions(-) diff --git a/htdocs/core/class/rssparser.class.php b/htdocs/core/class/rssparser.class.php index 4a8277f52bf..d0d1af6f895 100755 --- a/htdocs/core/class/rssparser.class.php +++ b/htdocs/core/class/rssparser.class.php @@ -19,7 +19,7 @@ * \file htdocs/core/class/rssparser.class.php * \ingroup core * \brief File of class to parse rss feeds - * \version $Id: rssparser.class.php,v 1.3 2011/08/26 19:09:02 eldy Exp $ + * \version $Id: rssparser.class.php,v 1.4 2011/08/26 22:38:27 eldy Exp $ */ class RssParser { @@ -53,6 +53,11 @@ class RssParser public function getLastFetchDate() { return $this->_lastfetchdate; } public function getItems() { return $this->_rssarray; } + + // For parsing with xmlparser + var $stack = array(); // parser stack + + /** * Constructor */ @@ -75,6 +80,8 @@ class RssParser { include_once(DOL_DOCUMENT_ROOT.'/lib/files.lib.php'); + $str=''; // This will contain content of feed + // Check parameters if (! dol_is_url($urlRSS)) { @@ -107,41 +114,54 @@ class RssParser } } - // Load file into $rss + // Load file into $str if ($foundintocache) // Cache file found and is not too old { $str = file_get_contents($newpathofdestfile); - $rss = simplexml_load_string(unserialize($str)); } else { try { ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"); ini_set("max_execution_time", 10); - if (! empty($conf->global->MAIN_SIMPLEXMLLOAD_DEBUG)) $rss = simplexml_load_file($this->_urlRSS); - else - { - //libxml_use_internal_errors(false); - $rss = @simplexml_load_file($this->_urlRSS); - } + $str = file_get_contents($this->_urlRSS); } catch (Exception $e) { print 'Error retrieving URL '.$this->urlRSS.' - '.$e->getMessage(); } } + // Convert $str into xml + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) + { + //print 'xx'.LIBXML_NOCDATA; + libxml_use_internal_errors(false); + $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA); + } + else + { + $xmlparser=xml_parser_create(''); + if (!is_resource($xmlparser)) { $this->error="ErrorFailedToCreateParser"; return -1; } + + xml_set_object( $xmlparser, $this ); + xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element' ); + xml_set_character_data_handler( $xmlparser, 'feed_cdata' ); + $status = xml_parse( $xmlparser, $str ); + xml_parser_free( $xmlparser ); + $rss=$this; + //var_dump($this);exit; + } + // If $rss loaded if ($rss) { - $items=array(); - // Save file into cache if (empty($foundintocache) && $cachedir) { dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk."); if (! dol_is_dir($cachedir)) dol_mkdir($cachedir); $fp = fopen($newpathofdestfile, 'w'); - fwrite($fp, serialize($rss->asXML())); + fwrite($fp, $str); fclose($fp); if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK; @chmod($newpathofdestfile, octdec($newmask)); @@ -149,60 +169,131 @@ class RssParser $this->_lastfetchdate=$nowgmt; } - $rss->_format='rss'; - if (empty($rss->channel)) $rss->_format='atom'; + unset($str); // Free memory + + if (empty($rss->_format)) // If format not detected automatically + { + $rss->_format='rss'; + if (empty($rss->channel)) $rss->_format='atom'; + } + + $items=array(); // Save description entries if ($rss->_format == 'rss') { - if (!empty($rss->channel->language)) $this->_language = (string) $rss->channel->language; - if (!empty($rss->channel->generator)) $this->_generator = (string) $rss->channel->generator; - if (!empty($rss->channel->copyright)) $this->_copyright = (string) $rss->channel->copyright; - if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate; - if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0]; - if (!empty($rss->channel->link)) $this->_link = (string) $rss->channel->link; - if (!empty($rss->channel->title)) $this->_title = (string) $rss->channel->title; - if (!empty($rss->channel->description)) $this->_description = (string) $rss->channel->description; - $items=$rss->channel->item; + //var_dump($rss); + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) + { + if (!empty($rss->channel->language)) $this->_language = (string) $rss->channel->language; + if (!empty($rss->channel->generator)) $this->_generator = (string) $rss->channel->generator; + if (!empty($rss->channel->copyright)) $this->_copyright = (string) $rss->channel->copyright; + if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate; + if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0]; + if (!empty($rss->channel->link)) $this->_link = (string) $rss->channel->link; + if (!empty($rss->channel->title)) $this->_title = (string) $rss->channel->title; + if (!empty($rss->channel->description)) $this->_description = (string) $rss->channel->description; + } + else + { + if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language']; + if (!empty($rss->channel['rss_generator'])) $this->_generator = (string) $rss->channel['rss_generator']; + if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright']; + if (!empty($rss->channel['rss_lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['rss_lastbuilddate']; + if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url']; + if (!empty($rss->channel['rss_link'])) $this->_link = (string) $rss->channel['rss_link']; + if (!empty($rss->channel['rss_title'])) $this->_title = (string) $rss->channel['rss_title']; + if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description']; + } + + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) $items=$rss->channel->item; // With simplexml + else $items=$rss->items; // With xmlparse + //var_dump($items);exit; } else if ($rss->_format == 'atom') { - if (!empty($rss->generator)) $this->_generator = (string) $rss->generator; - if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified; - if (!empty($rss->link->href)) $this->_link = (string) $rss->link->href; - if (!empty($rss->title)) $this->_title = (string) $rss->title; - if (!empty($rss->description)) $this->_description = (string) $rss->description; - $tmprss=xml2php($rss); - $items=$tmprss['entry']; + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) + { + if (!empty($rss->generator)) $this->_generator = (string) $rss->generator; + if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified; + if (!empty($rss->link->href)) $this->_link = (string) $rss->link->href; + if (!empty($rss->title)) $this->_title = (string) $rss->title; + if (!empty($rss->description)) $this->_description = (string) $rss->description; + } + else + { + if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language']; + if (!empty($rss->channel['rss_generator'])) $this->_generator = (string) $rss->channel['rss_generator']; + if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright']; + if (!empty($rss->channel['rss_lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['rss_lastbuilddate']; + if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url']; + if (!empty($rss->channel['rss_link'])) $this->_link = (string) $rss->channel['rss_link']; + if (!empty($rss->channel['rss_title'])) $this->_title = (string) $rss->channel['rss_title']; + if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description']; + } + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) { $tmprss=xml2php($rss); $items=$tmprss['entry'];} // With simplexml + else $items=$rss->items; // With xmlparse + //var_dump($items);exit; } $i = 0; // Loop on each record foreach($items as $item) { + //var_dump($item);exit; if ($rss->_format == 'rss') { - $itemLink = (string) $item->link; - $itemTitle = (string) $item->title; - $itemDescription = (string) $item->description; - $itemPubDate = (string) $item->pubDate; - $itemId = ''; + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) + { + $itemLink = (string) $item->link; + $itemTitle = (string) $item->title; + $itemDescription = (string) $item->description; + $itemPubDate = (string) $item->pubDate; + $itemId = ''; + $itemAuthor = ''; + } + else + { + $itemLink = (string) $item['rss_link']; + $itemTitle = (string) $item['rss_title']; + $itemDescription = (string) $item['rss_description']; + $itemPubDate = (string) $item['rss_pubdate']; + $itemId = (string) $item['rss_guid']; + $itemAuthor = (string) $item['rss_author']; + } // Loop on each category $itemCategory=array(); - foreach ($item->category as $cat) + if (is_array($item->category)) { - $itemCategory[] = (string) $cat; + foreach ($item->category as $cat) + { + $itemCategory[] = (string) $cat; + } } } else if ($rss->_format == 'atom') { - $itemLink = (string) $item['link']['href']; - $itemTitle = (string) $item['title']; - $itemDescription = (string) $item['summary']; - $itemPubDate = (string) $item['created']; - $itemId = (string) $item['id']; + if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) + { + $itemLink = (string) $item['link']['href']; + $itemTitle = (string) $item['title']; + $itemDescription = (string) $item['summary']; + $itemPubDate = (string) $item['created']; + $itemId = (string) $item['id']; + $itemAuthor = ''; + } + else + { + $itemLink = (string) $item['rss_link']; + $itemTitle = (string) $item['rss_title']; + $itemDescription = (string) $item['rss_description']; + $itemPubDate = (string) $item['rss_pubdate']; + $itemId = (string) $item['rss_guid']; + $itemAuthor = (string) $item['rss_author']; + } } + else print 'ErrorBadFeedFormat'; // Add record to result array $this->_rssarray[$i] = array( @@ -211,7 +302,8 @@ class RssParser 'description'=>$itemDescription, 'pubDate'=>$itemPubDate, 'category'=>$itemCategory, - 'id'=>$itemId); + 'id'=>$itemId, + 'author'=>$itemAuthor); $i++; @@ -227,6 +319,270 @@ class RssParser } } + + + /** + * Triggered when opened tag is found + * + * @param $p + * @param $element Tag + * @param $attrs Attributes of tags + */ + function feed_start_element($p, $element, &$attrs) + { + $el = $element = strtolower($element); + $attrs = array_change_key_case($attrs, CASE_LOWER); + + // check for a namespace, and split if found + $ns = false; + if ( strpos( $element, ':' ) ) { + list($ns, $el) = explode( ':', $element, 2); + } + if ( $ns and $ns != 'rdf' ) { + $this->current_namespace = $ns; + } + + # if feed type isn't set, then this is first element of feed + # identify feed from root element + # + if (!isset($this->_format) ) { + if ( $el == 'rdf' ) { + $this->_format = 'rss'; + $this->feed_version = '1.0'; + } + elseif ( $el == 'rss' ) { + $this->_format = 'rss'; + $this->feed_version = $attrs['version']; + } + elseif ( $el == 'feed' ) { + $this->_format = 'atom'; + $this->feed_version = $attrs['version']; + $this->inchannel = true; + } + return; + } + + if ( $el == 'channel' ) + { + $this->inchannel = true; + } + elseif ($el == 'item' or $el == 'entry' ) + { + $this->initem = true; + if ( isset($attrs['rdf:about']) ) { + $this->current_item['about'] = $attrs['rdf:about']; + } + } + + // if we're in the default namespace of an RSS feed, + // record textinput or image fields + elseif ( + $this->_format == 'rss' and + $this->current_namespace == '' and + $el == 'textinput' ) + { + $this->intextinput = true; + } + + elseif ( + $this->_format == 'rss' and + $this->current_namespace == '' and + $el == 'image' ) + { + $this->inimage = true; + } + + # handle atom content constructs + elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) ) + { + // avoid clashing w/ RSS mod_content + if ($el == 'content' ) { + $el = 'atom_content'; + } + + $this->incontent = $el; + + + } + + // if inside an Atom content construct (e.g. content or summary) field treat tags as text + elseif ($this->_format == 'atom' and $this->incontent ) + { + // if tags are inlined, then flatten + $attrs_str = join(' ', + array_map('map_attrs', + array_keys($attrs), + array_values($attrs) ) ); + + $this->append_content( "<$element $attrs_str>" ); + + array_unshift( $this->stack, $el ); + } + + // Atom support many links per containging element. + // Magpie treats link elements of type rel='alternate' + // as being equivalent to RSS's simple link element. + // + elseif ($this->_format == 'atom' and $el == 'link' ) + { + if ( isset($attrs['rel']) and $attrs['rel'] == 'alternate' ) + { + $link_el = 'link'; + } + else { + $link_el = 'link_' . $attrs['rel']; + } + + $this->append($link_el, $attrs['href']); + } + // set stack[0] to current element + else { + array_unshift($this->stack, $el); + } + } + + + /** + * Triggered when CDATA is found + * + * @param $p + * @param $element Tag + * @param $attrs Attributes of tags + */ + function feed_cdata ($p, $text) { + if ($this->_format == 'atom' and $this->incontent) + { + $this->append_content( $text ); + } + else { + $current_el = join('_', array_reverse($this->stack)); + $this->append($current_el, $text); + } + } + + /** + * Triggered when closed tag is found + * + * @param $p + * @param $element Tag + */ + function feed_end_element ($p, $el) { + $el = strtolower($el); + + if ( $el == 'item' or $el == 'entry' ) + { + $this->items[] = $this->current_item; + $this->current_item = array(); + $this->initem = false; + } + elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' ) + { + $this->intextinput = false; + } + elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' ) + { + $this->inimage = false; + } + elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) ) + { + $this->incontent = false; + } + elseif ($el == 'channel' or $el == 'feed' ) + { + $this->inchannel = false; + } + elseif ($this->_format == 'atom' and $this->incontent ) { + // balance tags properly + // note: i don't think this is actually neccessary + if ( $this->stack[0] == $el ) + { + $this->append_content("</$el>"); + } + else { + $this->append_content("<$el />"); + } + + array_shift( $this->stack ); + } + else { + array_shift( $this->stack ); + } + + $this->current_namespace = false; + } + + + /** + * To concat 2 string with no warning if an operand is not defined + * + * @param $str1 + * @param $str2 + */ + function concat (&$str1, $str2="") { + if (!isset($str1) ) { + $str1=""; + } + $str1 .= $str2; + } + + /** + */ + function append_content($text) { + if ( $this->initem ) { + $this->concat( $this->current_item[ $this->incontent ], $text ); + } + elseif ( $this->inchannel ) { + $this->concat( $this->channel[ $this->incontent ], $text ); + } + } + + /** + * smart append - field and namespace aware + */ + function append($el, $text) { + if (!$el) { + return; + } + if ( $this->current_namespace ) + { + if ( $this->initem ) { + $this->concat( + $this->current_item[ $this->current_namespace ][ $el ], $text); + } + elseif ($this->inchannel) { + $this->concat( + $this->channel[ $this->current_namespace][ $el ], $text ); + } + elseif ($this->intextinput) { + $this->concat( + $this->textinput[ $this->current_namespace][ $el ], $text ); + } + elseif ($this->inimage) { + $this->concat( + $this->image[ $this->current_namespace ][ $el ], $text ); + } + } + else { + if ( $this->initem ) { + $this->concat( + $this->current_item[ $el ], $text); + } + elseif ($this->intextinput) { + $this->concat( + $this->textinput[ $el ], $text ); + } + elseif ($this->inimage) { + $this->concat( + $this->image[ $el ], $text ); + } + elseif ($this->inchannel) { + $this->concat( + $this->channel[ $el ], $text ); + } + + } + } + } -- GitLab