rss_parse.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605
  1. <?php
  2. /**
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_parse.inc - parse an RSS or Atom feed
  5. * return as a simple object.
  6. *
  7. * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
  8. *
  9. * The latest version of MagpieRSS can be obtained from:
  10. * http://magpierss.sourceforge.net
  11. *
  12. * For questions, help, comments, discussion, etc., please join the
  13. * Magpie mailing list:
  14. * magpierss-general@lists.sourceforge.net
  15. *
  16. * @author Kellan Elliott-McCrea <kellan@protest.net>
  17. * @version 0.7a
  18. * @license GPL
  19. * @package chamilo.include.rss
  20. */
  21. /**
  22. * Code
  23. */
  // Feed-type markers stored in MagpieRSS::$feed_type. Guarded so that loading
  // this file next to other code that already defines RSS/ATOM does not raise
  // a "constant already defined" warning.
  if (!defined('RSS')) {
      define('RSS', 'RSS');
  }
  if (!defined('ATOM')) {
      define('ATOM', 'Atom');
  }
  // Shared helpers; MAGPIE_DIR must be defined by whoever includes this file.
  require_once (MAGPIE_DIR . 'rss_utils.inc');
  /**
   * Hybrid parser and result object: takes an RSS or Atom document as a string,
   * parses it in the constructor and exposes the result as plain arrays
   * ($channel, $items, $image, $textinput).
   *
   * see: rss_fetch.inc for a simpler interface with integrated caching support
   *
   * @package chamilo.include.rss
   */
  class MagpieRSS {
      public $parser;                    // XML parser handle while parsing, null afterwards
      public $current_item = array();    // item currently being parsed
      public $items = array();           // collection of parsed items
      public $channel = array();         // hash of channel fields
      public $textinput = array();
      public $image = array();
      public $feed_type;                 // RSS or ATOM, set from the root element
      public $feed_version;              // value of the root element's "version" attribute
      public $encoding = '';             // output encoding of parsed rss
      private $_source_encoding = '';    // only set if we have to parse xml prolog
      public $ERROR = "";
      public $WARNING = "";

      // define some constants
      private $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
      private $_KNOWN_ENCODINGS = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

      // parser variables, useless if you're not a parser, treat as private
      public $stack = array();           // parser stack
      public $inchannel = false;
      public $initem = false;
      public $incontent = false;         // if in Atom <content mode="xml"> field
      public $intextinput = false;
      public $inimage = false;
      public $current_namespace = false;

      /**
       * Set up the XML parser, parse $source, and populate this object.
       *
       * On failure the message is recorded in $this->ERROR (see error()) and
       * the object is left with whatever was parsed up to that point.
       *
       * NOTE: Probably a good idea to leave the encoding options alone unless
       * you know what you're doing as PHP's character set support is
       * a little weird.
       *
       * @param string $source          string containing the RSS to be parsed
       * @param string $output_encoding output the parsed RSS in this character
       *                                set; defaults to ISO-8859-1
       * @param string $input_encoding  the character set of the incoming RSS
       *                                source. Leave blank and Magpie will try
       *                                to figure it out.
       * @param bool   $detect_encoding if false Magpie won't attempt to detect
       *                                source encoding. (caveat emptor)
       */
      public function __construct($source, $output_encoding = 'ISO-8859-1',
                                  $input_encoding = null, $detect_encoding = true)
      {
          // Without the XML extension nothing below can work, so stop here.
          if (!function_exists('xml_parser_create')) {
              $this->error("Failed to load PHP's XML Extension. " .
                  "http://www.php.net/manual/en/ref.xml.php",
                  E_USER_ERROR);
              return;
          }

          list($parser, $source) = $this->create_parser($source,
              $output_encoding, $input_encoding, $detect_encoding);

          if (!$this->is_parser($parser)) {
              $this->error("Failed to create an instance of PHP's XML parser. " .
                  "http://www.php.net/manual/en/ref.xml.php",
                  E_USER_ERROR);
              return;
          }

          $this->parser = $parser;

          // Bind the handlers as object callables; this works without
          // xml_set_object() and without by-name method lookup.
          xml_set_element_handler($this->parser,
              array($this, 'feed_start_element'),
              array($this, 'feed_end_element'));
          xml_set_character_data_handler($this->parser, array($this, 'feed_cdata'));

          $status = xml_parse($this->parser, $source);

          if (!$status) {
              $errorcode = xml_get_error_code($this->parser);
              if ($errorcode != XML_ERROR_NONE) {
                  $xml_error  = xml_error_string($errorcode);
                  $error_line = xml_get_current_line_number($this->parser);
                  $error_col  = xml_get_current_column_number($this->parser);
                  $errormsg   = "$xml_error at line $error_line, column $error_col";
                  $this->error($errormsg);
              }
          }

          // Only resource-style parsers need an explicit free; either way drop
          // the handle so the populated object holds plain data only.
          if (is_resource($this->parser)) {
              xml_parser_free($this->parser);
          }
          $this->parser = null;

          $this->normalize();
      }

      /**
       * Backward-compatible alias for code that still calls the PHP4-style
       * constructor name explicitly (e.g. parent::MagpieRSS(...)).
       */
      public function MagpieRSS($source, $output_encoding = 'ISO-8859-1',
                                $input_encoding = null, $detect_encoding = true)
      {
          $this->__construct($source, $output_encoding, $input_encoding, $detect_encoding);
      }

      /**
       * True when $parser is a usable XML parser handle: a resource on older
       * PHP versions, an XMLParser object on PHP 8+.
       */
      private function is_parser($parser) {
          return is_resource($parser) || $parser instanceof XMLParser;
      }

      /**
       * Start-tag handler: identifies the feed type from the root element and
       * afterwards tracks which container (channel, item, image, ...) we are in.
       *
       * @param mixed  $p       parser handle (unused)
       * @param string $element element name as reported by the parser
       * @param array  $attrs   attribute name => value
       */
      public function feed_start_element($p, $element, $attrs) {
          $el = $element = strtolower($element);
          $attrs = array_change_key_case($attrs, CASE_LOWER);

          // check for a namespace prefix, and split it off if found
          $ns = false;
          if (strpos($element, ':')) {
              list($ns, $el) = explode(':', $element, 2);
          }
          if ($ns and $ns != 'rdf') {
              $this->current_namespace = $ns;
          }

          // if feed type isn't set, then this is the first element of the
          // feed: identify the feed from the root element
          if (!isset($this->feed_type)) {
              // the version attribute is optional; a missing one leaves null
              $version = isset($attrs['version']) ? $attrs['version'] : null;
              if ($el == 'rdf') {
                  $this->feed_type = RSS;
                  $this->feed_version = '1.0';
              }
              elseif ($el == 'rss') {
                  $this->feed_type = RSS;
                  $this->feed_version = $version;
              }
              elseif ($el == 'feed') {
                  $this->feed_type = ATOM;
                  $this->feed_version = $version;
                  $this->inchannel = true;
              }
              return;
          }

          if ($el == 'channel') {
              $this->inchannel = true;
          }
          elseif ($el == 'item' or $el == 'entry') {
              $this->initem = true;
              if (isset($attrs['rdf:about'])) {
                  $this->current_item['about'] = $attrs['rdf:about'];
              }
          }
          // if we're in the default namespace of an RSS feed,
          // record textinput or image fields
          elseif ($this->feed_type == RSS and !$this->current_namespace and $el == 'textinput') {
              $this->intextinput = true;
          }
          elseif ($this->feed_type == RSS and !$this->current_namespace and $el == 'image') {
              $this->inimage = true;
          }
          // handle atom content constructs
          elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true)) {
              // avoid clashing w/ RSS mod_content
              if ($el == 'content') {
                  $el = 'atom_content';
              }
              $this->incontent = $el;
          }
          // inside an Atom content construct (e.g. content or summary),
          // treat nested tags as text
          elseif ($this->feed_type == ATOM and $this->incontent) {
              // if tags are inlined, then flatten
              $attrs_str = join(' ',
                  array_map('map_attrs',
                      array_keys($attrs),
                      array_values($attrs)));

              $this->append_content("<$element $attrs_str>");
              array_unshift($this->stack, $el);
          }
          // Atom supports many links per containing element.
          // Magpie treats link elements of type rel='alternate'
          // as being equivalent to RSS's simple link element.
          elseif ($this->feed_type == ATOM and $el == 'link') {
              $rel  = isset($attrs['rel'])  ? $attrs['rel']  : '';
              $href = isset($attrs['href']) ? $attrs['href'] : '';
              $link_el = ($rel == 'alternate') ? 'link' : 'link_' . $rel;
              $this->append($link_el, $href);
          }
          // set stack[0] to current element
          else {
              array_unshift($this->stack, $el);
          }
      }

      /**
       * Character-data handler: routes text either into the current Atom
       * content construct or into the field named by the element stack.
       */
      public function feed_cdata($p, $text) {
          if ($this->feed_type == ATOM and $this->incontent) {
              $this->append_content($text);
          }
          else {
              // nested elements become "outer_inner" field names
              $current_el = join('_', array_reverse($this->stack));
              $this->append($current_el, $text);
          }
      }

      /**
       * End-tag handler: closes the container opened by feed_start_element()
       * and resets the current namespace.
       */
      public function feed_end_element($p, $el) {
          $el = strtolower($el);

          if ($el == 'item' or $el == 'entry') {
              $this->items[] = $this->current_item;
              $this->current_item = array();
              $this->initem = false;
          }
          elseif ($this->feed_type == RSS and !$this->current_namespace and $el == 'textinput') {
              $this->intextinput = false;
          }
          elseif ($this->feed_type == RSS and !$this->current_namespace and $el == 'image') {
              $this->inimage = false;
          }
          elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true)) {
              $this->incontent = false;
          }
          elseif ($el == 'channel' or $el == 'feed') {
              $this->inchannel = false;
          }
          elseif ($this->feed_type == ATOM and $this->incontent) {
              // balance tags properly
              // note: i don't think this is actually necessary
              if (isset($this->stack[0]) and $this->stack[0] == $el) {
                  $this->append_content("</$el>");
              }
              else {
                  $this->append_content("<$el />");
              }
              array_shift($this->stack);
          }
          else {
              array_shift($this->stack);
          }

          $this->current_namespace = false;
      }

      /**
       * Append $str2 to $str1 in place, treating an unset $str1 as "".
       */
      public function concat(&$str1, $str2 = "") {
          if (!isset($str1)) {
              $str1 = "";
          }
          $str1 .= $str2;
      }

      /**
       * Append text to the Atom content construct currently being read, on the
       * current item if we are inside one, otherwise on the channel.
       */
      public function append_content($text) {
          if ($this->initem) {
              $this->concat($this->current_item[$this->incontent], $text);
          }
          elseif ($this->inchannel) {
              $this->concat($this->channel[$this->incontent], $text);
          }
      }

      /**
       * Smart append - field and namespace aware.
       *
       * Namespaced fields are stored one level deeper, keyed by the namespace
       * prefix. Text outside every known container is dropped.
       */
      public function append($el, $text) {
          if (!$el) {
              return;
          }

          if ($this->current_namespace) {
              $ns = $this->current_namespace;
              if ($this->initem) {
                  $this->concat($this->current_item[$ns][$el], $text);
              }
              elseif ($this->inchannel) {
                  $this->concat($this->channel[$ns][$el], $text);
              }
              elseif ($this->intextinput) {
                  $this->concat($this->textinput[$ns][$el], $text);
              }
              elseif ($this->inimage) {
                  $this->concat($this->image[$ns][$el], $text);
              }
          }
          else {
              // channel is checked last here because items, textinput and
              // image are all nested inside an RSS channel
              if ($this->initem) {
                  $this->concat($this->current_item[$el], $text);
              }
              elseif ($this->intextinput) {
                  $this->concat($this->textinput[$el], $text);
              }
              elseif ($this->inimage) {
                  $this->concat($this->image[$el], $text);
              }
              elseif ($this->inchannel) {
                  $this->concat($this->channel[$el], $text);
              }
          }
      }

      /**
       * Mirror Atom fields onto their RSS names (and vice versa) and add a
       * 'date_timestamp' to items whose date could be parsed.
       */
      public function normalize() {
          // if atom populate rss fields
          if ($this->is_atom()) {
              $this->channel['description'] =
                  isset($this->channel['tagline']) ? $this->channel['tagline'] : null;

              foreach (array_keys($this->items) as $i) {
                  $item = $this->items[$i];
                  if (isset($item['summary'])) {
                      $item['description'] = $item['summary'];
                  }
                  if (isset($item['atom_content'])) {
                      $item['content']['encoded'] = $item['atom_content'];
                  }

                  // prefer <issued>, fall back to <modified>
                  $atom_date = null;
                  if (isset($item['issued'])) {
                      $atom_date = $item['issued'];
                  }
                  elseif (isset($item['modified'])) {
                      $atom_date = $item['modified'];
                  }
                  if ($atom_date) {
                      // parse_w3cdtf() is not defined in this file
                      $epoch = @parse_w3cdtf($atom_date);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          }
          elseif ($this->is_rss()) {
              $this->channel['tagline'] =
                  isset($this->channel['description']) ? $this->channel['description'] : null;

              foreach (array_keys($this->items) as $i) {
                  $item = $this->items[$i];
                  if (isset($item['description'])) {
                      $item['summary'] = $item['description'];
                  }
                  if (isset($item['content']['encoded'])) {
                      $item['atom_content'] = $item['content']['encoded'];
                  }

                  if ($this->is_rss() == '1.0' and isset($item['dc']['date'])) {
                      $epoch = @parse_w3cdtf($item['dc']['date']);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }
                  elseif (isset($item['pubdate'])) {
                      // strtotime() yields false on unparsable input, which
                      // fails the > 0 test below
                      $epoch = strtotime($item['pubdate']);
                      if ($epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          }
      }

      /**
       * @return mixed the feed version if this is an RSS feed, otherwise false
       */
      public function is_rss() {
          if ($this->feed_type == RSS) {
              return $this->feed_version;
          }
          return false;
      }

      /**
       * @return mixed the feed version if this is an Atom feed, otherwise false
       */
      public function is_atom() {
          if ($this->feed_type == ATOM) {
              return $this->feed_version;
          }
          return false;
      }

      /**
       * Create the XML parser and set its target (output) encoding.
       *
       * @return array array(parser, source); the source may have been
       *               re-encoded on the PHP4 path
       */
      public function create_parser($source, $out_enc, $in_enc, $detect) {
          // Compare whole versions: looking only at the first digit sent
          // every release after 5.x down the PHP4 path.
          if (version_compare(PHP_VERSION, '5.0.0', '>=')) {
              $parser = $this->php5_create_parser($in_enc, $detect);
          }
          else {
              list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
          }

          if ($out_enc) {
              $this->encoding = $out_enc;
              // the caller reports a failed creation; don't touch a bad handle
              if ($this->is_parser($parser)) {
                  xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
              }
          }

          return array($parser, $source);
      }

      /**
       * Instantiate an XML parser under PHP5 and later.
       *
       * PHP5 will do a fine job of detecting input encoding
       * if passed an empty string as the encoding.
       *
       * All hail libxml2!
       */
      public function php5_create_parser($in_enc, $detect) {
          if (!$detect && $in_enc) {
              $known = $this->known_encoding($in_enc);
              if ($known) {
                  return xml_parser_create($known);
              }
              // An encoding the parser does not know would make
              // xml_parser_create() fail; fall back to auto-detection.
              $this->error("Unsupported input encoding ($in_enc), " .
                  "falling back to auto-detection.",
                  E_USER_NOTICE);
          }
          return xml_parser_create('');
      }

      /**
       * Instantiate an XML parser under PHP4.
       *
       * Unfortunately PHP4's support for character encodings
       * and especially XML and character encodings sucks. As
       * long as the documents you parse only contain characters
       * from the ISO-8859-1 character set (a superset of ASCII,
       * and a subset of UTF-8) you're fine. However once you
       * step out of that comfy little world things get mad, bad,
       * and dangerous to know.
       *
       * The following code is based on SJM's work with FoF
       * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
       *
       * @return array array(parser, possibly re-encoded source)
       */
      public function php4_create_parser($source, $in_enc, $detect) {
          if (!$detect) {
              return array(xml_parser_create($in_enc), $source);
          }

          if (!$in_enc) {
              // read the encoding from the XML prolog; "\?" matches the
              // literal question mark of "<?xml"
              if (preg_match('/<\?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
                  $in_enc = strtoupper($m[1]);
                  $this->_source_encoding = $in_enc;
              }
              else {
                  $in_enc = 'UTF-8';
              }
          }

          if ($this->known_encoding($in_enc)) {
              return array(xml_parser_create($in_enc), $source);
          }

          // the detected encoding is not one of the simple encodings PHP knows

          // attempt to use the iconv extension to
          // cast the XML to a known encoding
          // @see http://php.net/iconv
          if (function_exists('iconv')) {
              $encoded_source = iconv($in_enc, 'UTF-8', $source);
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          // iconv didn't work, try mb_convert_encoding
          // @see http://php.net/mbstring
          if (function_exists('mb_convert_encoding')) {
              $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc);
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          // else
          $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
              "You may see strange artifacts, and mangled characters.",
              E_USER_NOTICE);

          return array(xml_parser_create(), $source);
      }

      /**
       * @return mixed the upper-cased encoding name if it is one of
       *               $_KNOWN_ENCODINGS, otherwise false
       */
      public function known_encoding($enc) {
          $enc = strtoupper((string) $enc);
          if (in_array($enc, $this->_KNOWN_ENCODINGS, true)) {
              return $enc;
          }
          return false;
      }

      /**
       * Report a problem and remember it on the object.
       *
       * The message is raised with trigger_error() when MAGPIE_DEBUG is
       * defined and true, otherwise written with error_log(). Notice-level
       * messages end up in $this->WARNING, everything else in $this->ERROR.
       *
       * @param string $errormsg message text
       * @param int    $lvl      one of the E_USER_* / E_* level constants
       */
      public function error($errormsg, $lvl = E_USER_WARNING) {
          // an undefined MAGPIE_DEBUG counts as "debugging off"
          if (defined('MAGPIE_DEBUG') && MAGPIE_DEBUG) {
              trigger_error($errormsg, $lvl);
          }
          else {
              error_log($errormsg, 0);
          }

          $notices = E_USER_NOTICE|E_NOTICE;
          if ($lvl & $notices) {
              $this->WARNING = $errormsg;
          }
          else {
              $this->ERROR = $errormsg;
          }
      }
  } // end class RSS
  /**
   * Render one attribute as name="value" text.
   *
   * Markup-significant characters in the value are escaped so that a value
   * containing a double quote, ampersand or angle bracket cannot break the
   * generated tag. Plain byte replacement is used so the result does not
   * depend on the character set of the value.
   *
   * @param string $k attribute name
   * @param string $v attribute value
   * @return string
   */
  function map_attrs($k, $v) {
      // '&' must be replaced first so the entities added below stay intact
      $escaped = str_replace(
          array('&', '"', '<', '>'),
          array('&amp;', '&quot;', '&lt;', '&gt;'),
          $v
      );
      return "$k=\"$escaped\"";
  }
  // patch to support medieval versions of PHP4.1.x,
  // courtesy, Ryan Currie, ryan@digibliss.com
  //
  // Only defined when PHP itself does not provide array_change_key_case().
  if (!function_exists('array_change_key_case')) {
      define("CASE_UPPER", 1);
      define("CASE_LOWER", 0);

      /**
       * Return a copy of $array with every key lower-cased (default) or
       * upper-cased (CASE_UPPER).
       */
      function array_change_key_case($array, $case = CASE_LOWER) {
          // compare, don't assign; and name the converter as a string callable
          $cmd = ($case == CASE_UPPER) ? 'strtoupper' : 'strtolower';
          $output = array();   // so an empty input still returns an array
          foreach ($array as $key => $value) {
              $output[$cmd($key)] = $value;
          }
          return $output;
      }
  }