rss_fetch.inc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. <?php
  2. /**
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_fetch.inc, a simple functional interface
  5. to fetching and parsing RSS files, via the
  6. function fetch_rss()
  7. * Author: Kellan Elliott-McCrea <kellan@protest.net>
  8. * License: GPL
  9. *
  10. * The latest version of MagpieRSS can be obtained from:
  11. * http://magpierss.sourceforge.net
  12. *
  13. * For questions, help, comments, discussion, etc., please join the
  14. * Magpie mailing list:
  15. * magpierss-general@lists.sourceforge.net
  16. * @package chamilo.include.rss
  17. */
  18. /**
  19. * Code
  20. */
  21. // Setup MAGPIE_DIR for use on hosts that don't include
  22. // the current path in include_path.
  23. // with thanks to rajiv and smarty
  24. if (!defined('DIR_SEP')) {
  25. define('DIR_SEP', DIRECTORY_SEPARATOR);
  26. }
  27. if (!defined('MAGPIE_DIR')) {
  28. define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
  29. }
  30. require_once( MAGPIE_DIR . 'rss_parse.inc' );
  31. require_once( MAGPIE_DIR . 'rss_cache.inc' );
  32. // for including 3rd party libraries
  33. define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
  34. require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
  35. define('MAGPIE_CACHE_DIR', api_get_path(SYS_ARCHIVE_PATH));
  36. /*
  37. * CONSTANTS - redefine these in your script to change the
  38. * behaviour of fetch_rss() currently, most options affect the cache
  39. *
  40. * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
  41. * For me a built in cache was essential to creating a "PHP-like"
  42. * feel to Magpie, see rss_cache.inc for rationale
  43. *
  44. *
  45. * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
  46. * This should be a location that the webserver can write to. If this
  47. * directory does not already exist Magpie will try to be smart and create
  48. * it. This will often fail for permissions reasons.
  49. *
  50. *
  51. * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
  52. *
  53. *
  54. * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
  55. * instead of returning stale object?
  56. *
  57. * MAGPIE_DEBUG - Display debugging notices?
  58. *
  59. */
  60. /*=======================================================================*\
  61. Function: fetch_rss:
  62. Purpose: return RSS object for the given url
  63. maintain the cache
  64. Input: url of RSS file
  65. Output: parsed RSS object (see rss_parse.inc)
  66. NOTES ON CACHEING:
  67. If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
  68. NOTES ON RETRIEVING REMOTE FILES:
  69. If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
  70. return a cached object, and touch the cache object upon receiving a
  71. 304.
  72. NOTES ON FAILED REQUESTS:
  73. If there is an HTTP error while fetching an RSS object, the cached
  74. version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
  75. \*=======================================================================*/
  76. define('MAGPIE_VERSION', '0.72');
  77. $MAGPIE_ERROR = "";
  78. function fetch_rss ($url) {
  79. // initialize constants
  80. init();
  81. if ( !isset($url) ) {
  82. error("fetch_rss called without a url");
  83. return false;
  84. }
  85. // if cache is disabled
  86. if ( !MAGPIE_CACHE_ON ) {
  87. // fetch file, and parse it
  88. $resp = _fetch_remote_file( $url );
  89. if ( is_success( $resp->status ) ) {
  90. return _response_to_rss( $resp );
  91. }
  92. else {
  93. error("Failed to fetch $url and cache is off");
  94. return false;
  95. }
  96. }
  97. // else cache is ON
  98. else {
  99. // Flow
  100. // 1. check cache
  101. // 2. if there is a hit, make sure its fresh
  102. // 3. if cached obj fails freshness check, fetch remote
  103. // 4. if remote fails, return stale object, or error
  104. $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );
  105. if (MAGPIE_DEBUG and $cache->ERROR) {
  106. debug($cache->ERROR, E_USER_WARNING);
  107. }
  108. $cache_status = 0; // response of check_cache
  109. $request_headers = array(); // HTTP headers to send with fetch
  110. $rss = 0; // parsed RSS object
  111. $errormsg = 0; // errors, if any
  112. // store parsed XML by desired output encoding
  113. // as character munging happens at parse time
  114. $cache_key = $url . MAGPIE_OUTPUT_ENCODING;
  115. if (!$cache->ERROR) {
  116. // return cache HIT, MISS, or STALE
  117. $cache_status = $cache->check_cache( $cache_key);
  118. }
  119. // if object cached, and cache is fresh, return cached obj
  120. if ( $cache_status == 'HIT' ) {
  121. $rss = $cache->get( $cache_key );
  122. if ( isset($rss) and $rss ) {
  123. // should be cache age
  124. $rss->from_cache = 1;
  125. if ( MAGPIE_DEBUG > 1) {
  126. debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
  127. }
  128. return $rss;
  129. }
  130. }
  131. // else attempt a conditional get
  132. // setup headers
  133. if ( $cache_status == 'STALE' ) {
  134. $rss = $cache->get( $cache_key );
  135. if ( $rss and $rss->etag and $rss->last_modified ) {
  136. $request_headers['If-None-Match'] = $rss->etag;
  137. $request_headers['If-Last-Modified'] = $rss->last_modified;
  138. }
  139. }
  140. $resp = _fetch_remote_file( $url, $request_headers );
  141. if (isset($resp) and $resp) {
  142. if ($resp->status == '304' ) {
  143. // we have the most current copy
  144. if ( MAGPIE_DEBUG > 1) {
  145. debug("Got 304 for $url");
  146. }
  147. // reset cache on 304 (at minutillo insistent prodding)
  148. $cache->set($cache_key, $rss);
  149. return $rss;
  150. }
  151. elseif ( is_success( $resp->status ) ) {
  152. $rss = _response_to_rss( $resp );
  153. if ( $rss ) {
  154. if (MAGPIE_DEBUG > 1) {
  155. debug("Fetch successful");
  156. }
  157. // add object to cache
  158. $cache->set( $cache_key, $rss );
  159. return $rss;
  160. }
  161. }
  162. else {
  163. $errormsg = "Failed to fetch $url ";
  164. if ( $resp->status == '-100' ) {
  165. $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
  166. }
  167. elseif ( $resp->error ) {
  168. # compensate for Snoopy's annoying habbit to tacking
  169. # on '\n'
  170. $http_error = substr($resp->error, 0, -2);
  171. $errormsg .= "(HTTP Error: $http_error)";
  172. }
  173. else {
  174. $errormsg .= "(HTTP Response: " . $resp->response_code .')';
  175. }
  176. }
  177. }
  178. else {
  179. $errormsg = "Unable to retrieve RSS file for unknown reasons.";
  180. }
  181. // else fetch failed
  182. // attempt to return cached object
  183. if ($rss) {
  184. if ( MAGPIE_DEBUG ) {
  185. debug("Returning STALE object for $url");
  186. }
  187. return $rss;
  188. }
  189. // else we totally failed
  190. //hide the error
  191. //error( $errormsg );
  192. return false;
  193. } // end if ( !MAGPIE_CACHE_ON ) {
  194. } // end fetch_rss()
  195. /*=======================================================================*\
  196. Function: error
  197. Purpose: set MAGPIE_ERROR, and trigger error
  198. \*=======================================================================*/
  199. function error ($errormsg, $lvl=E_USER_WARNING) {
  200. global $MAGPIE_ERROR;
  201. // append PHP's error message if track_errors enabled
  202. if ( isset($php_errormsg) ) {
  203. $errormsg .= " ($php_errormsg)";
  204. }
  205. if ( $errormsg ) {
  206. $errormsg = "MagpieRSS: $errormsg";
  207. $MAGPIE_ERROR = $errormsg;
  208. trigger_error( $errormsg, $lvl);
  209. }
  210. }
  211. function debug ($debugmsg, $lvl=E_USER_NOTICE) {
  212. trigger_error("MagpieRSS [debug] $debugmsg", $lvl);
  213. }
  214. /*=======================================================================*\
  215. Function: magpie_error
  216. Purpose: accessor for the magpie error variable
  217. \*=======================================================================*/
  218. function magpie_error ($errormsg="") {
  219. global $MAGPIE_ERROR;
  220. if ( isset($errormsg) and $errormsg ) {
  221. $MAGPIE_ERROR = $errormsg;
  222. }
  223. return $MAGPIE_ERROR;
  224. }
  225. /*=======================================================================*\
  226. Function: _fetch_remote_file
  227. Purpose: retrieve an arbitrary remote file
  228. Input: url of the remote file
  229. headers to send along with the request (optional)
  230. Output: an HTTP response object (see Snoopy.class.inc)
  231. \*=======================================================================*/
  232. function _fetch_remote_file ($url, $headers = "" ) {
  233. // Snoopy is an HTTP client in PHP
  234. $client = new Snoopy();
  235. $client->agent = MAGPIE_USER_AGENT;
  236. $client->read_timeout = MAGPIE_FETCH_TIME_OUT;
  237. $client->use_gzip = MAGPIE_USE_GZIP;
  238. if (is_array($headers) ) {
  239. $client->rawheaders = $headers;
  240. }
  241. @$client->fetch($url);
  242. return $client;
  243. }
  244. /*=======================================================================*\
  245. Function: _response_to_rss
  246. Purpose: parse an HTTP response object into an RSS object
  247. Input: an HTTP response object (see Snoopy)
  248. Output: parsed RSS object (see rss_parse)
  249. \*=======================================================================*/
  250. function _response_to_rss ($resp) {
  251. $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );
  252. // if RSS parsed successfully
  253. if ( $rss and !$rss->ERROR) {
  254. // find Etag, and Last-Modified
  255. foreach($resp->headers as $h) {
  256. // 2003-03-02 - Nicola Asuni (www.tecnick.com) - fixed bug "Undefined offset: 1"
  257. if (strpos($h, ": ")) {
  258. list($field, $val) = explode(": ", $h, 2);
  259. }
  260. else {
  261. $field = $h;
  262. $val = "";
  263. }
  264. if ( $field == 'ETag' ) {
  265. $rss->etag = $val;
  266. }
  267. if ( $field == 'Last-Modified' ) {
  268. $rss->last_modified = $val;
  269. }
  270. }
  271. return $rss;
  272. } // else construct error message
  273. else {
  274. $errormsg = "Failed to parse RSS file.";
  275. if ($rss) {
  276. $errormsg .= " (" . $rss->ERROR . ")";
  277. }
  278. error($errormsg,E_USER_NOTICE);
  279. return false;
  280. } // end if ($rss and !$rss->error)
  281. }
  282. /*=======================================================================*\
  283. Function: init
  284. Purpose: setup constants with default values
  285. check for user overrides
  286. \*=======================================================================*/
  287. function init () {
  288. if ( defined('MAGPIE_INITALIZED') ) {
  289. return;
  290. }
  291. else {
  292. define('MAGPIE_INITALIZED', true);
  293. }
  294. if ( !defined('MAGPIE_CACHE_ON') ) {
  295. define('MAGPIE_CACHE_ON', true);
  296. }
  297. if ( !defined('MAGPIE_CACHE_DIR') ) {
  298. define('MAGPIE_CACHE_DIR', './cache');
  299. }
  300. if ( !defined('MAGPIE_CACHE_AGE') ) {
  301. define('MAGPIE_CACHE_AGE', 60*60); // one hour
  302. }
  303. if ( !defined('MAGPIE_CACHE_FRESH_ONLY') ) {
  304. define('MAGPIE_CACHE_FRESH_ONLY', false);
  305. }
  306. if ( !defined('MAGPIE_OUTPUT_ENCODING') ) {
  307. define('MAGPIE_OUTPUT_ENCODING', 'UTF-8');
  308. }
  309. if ( !defined('MAGPIE_INPUT_ENCODING') ) {
  310. define('MAGPIE_INPUT_ENCODING', null);
  311. }
  312. if ( !defined('MAGPIE_DETECT_ENCODING') ) {
  313. define('MAGPIE_DETECT_ENCODING', true);
  314. }
  315. if ( !defined('MAGPIE_DEBUG') ) {
  316. define('MAGPIE_DEBUG', 0);
  317. }
  318. if ( !defined('MAGPIE_USER_AGENT') ) {
  319. $ua = 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net';
  320. if ( MAGPIE_CACHE_ON ) {
  321. $ua = $ua . ')';
  322. }
  323. else {
  324. $ua = $ua . '; No cache)';
  325. }
  326. define('MAGPIE_USER_AGENT', $ua);
  327. }
  328. if ( !defined('MAGPIE_FETCH_TIME_OUT') ) {
  329. define('MAGPIE_FETCH_TIME_OUT', 5); // 5 second timeout
  330. }
  331. // use gzip encoding to fetch rss files if supported?
  332. if ( !defined('MAGPIE_USE_GZIP') ) {
  333. define('MAGPIE_USE_GZIP', true);
  334. }
  335. }
  336. // NOTE: the following code should really be in Snoopy, or at least
  337. // somewhere other than rss_fetch!
  338. /*=======================================================================*\
  339. HTTP STATUS CODE PREDICATES
  340. These functions attempt to classify an HTTP status code
  341. based on RFC 2616 and RFC 2518.
  342. All of them take an HTTP status code as input, and return true or false
  343. All this code is adapted from LWP's HTTP::Status.
  344. \*=======================================================================*/
  345. /*=======================================================================*\
  346. Function: is_info
  347. Purpose: return true if Informational status code
  348. \*=======================================================================*/
  349. function is_info ($sc) {
  350. return $sc >= 100 && $sc < 200;
  351. }
  352. /*=======================================================================*\
  353. Function: is_success
  354. Purpose: return true if Successful status code
  355. \*=======================================================================*/
  356. function is_success ($sc) {
  357. return $sc >= 200 && $sc < 300;
  358. }
  359. /*=======================================================================*\
  360. Function: is_redirect
  361. Purpose: return true if Redirection status code
  362. \*=======================================================================*/
  363. function is_redirect ($sc) {
  364. return $sc >= 300 && $sc < 400;
  365. }
  366. /*=======================================================================*\
  367. Function: is_error
  368. Purpose: return true if Error status code
  369. \*=======================================================================*/
  370. function is_error ($sc) {
  371. return $sc >= 400 && $sc < 600;
  372. }
  373. /*=======================================================================*\
  374. Function: is_client_error
  375. Purpose: return true if Error status code, and its a client error
  376. \*=======================================================================*/
  377. function is_client_error ($sc) {
  378. return $sc >= 400 && $sc < 500;
  379. }
  380. /*=======================================================================*\
  381. Function: is_client_error
  382. Purpose: return true if Error status code, and its a server error
  383. \*=======================================================================*/
  384. function is_server_error ($sc) {
  385. return $sc >= 500 && $sc < 600;
  386. }