http_resource.class.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662
  1. <?php
  /**
   * An HTTP resource. In most cases an HTML document.
   *
   * Wraps a url and lazily exposes what can be derived from it: parts of the
   * url itself (domain, query parameters, name, extension), the HTTP metadata
   * returned by the server (mime type, charset) and, when the content could be
   * parsed, the DOM (title, <meta> and <link> tags). Every value that needs a
   * request or a parse is computed on first use and memoized on the instance.
   *
   * @copyright (c) 2011 University of Geneva
   * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
   * @author Laurent Opprecht
   */
  class HttpResource
  {

      /**
       * Fetch the content and metadata of an url.
       *
       * If the content type is not parsable, i.e. it is not made of text,
       * only fetch the metadata and not the content. This is mostly done to
       * avoid downloading big files - videos, images, etc - which is unnecessary.
       *
       * Results are memoized for the duration of the script, per url and per
       * fetch mode, so that a header-only result is never returned to a caller
       * that explicitly asked for the content.
       *
       * @param string $url the url to fetch
       * @param boolean|null $fetch_content null: fetch the headers first and the
       *                     content only if the content type looks parsable;
       *                     false: fetch the headers only; any other value:
       *                     always fetch the content.
       * @return array the array returned by curl_getinfo() plus two keys:
       *               'content' (result of curl_exec()) and 'error' (curl_error())
       */
      public static function fetch($url, $fetch_content = null)
      {
          static $cache = array();

          if ($fetch_content === false)
          {
              $mode = 'head';
          }
          else if (is_null($fetch_content))
          {
              $mode = 'auto';
          }
          else
          {
              $mode = 'full';
          }

          if (isset($cache[$mode][$url]))
          {
              return $cache[$mode][$url];
          }
          // A complete download already answers an automatic request.
          if ($mode === 'auto' && isset($cache['full'][$url]))
          {
              return $cache['full'][$url];
          }

          if ($mode !== 'full')
          {
              $info = self::curl_request($url, true);
              if ($mode === 'head')
              {
                  return $cache[$mode][$url] = $info;
              }
              if (isset($info['content_type']) && !self::is_parsable_type($info['content_type']))
              {
                  return $cache[$mode][$url] = $info;
              }
          }

          return $cache[$mode][$url] = self::curl_request($url, false);
      }

      /**
       * Fetch an url and decode its content as JSON.
       *
       * @param string $url
       * @return array the decoded value cast to an array, or an empty array
       *               if nothing was downloaded or the content is not valid JSON
       */
      public static function fetch_json($url)
      {
          $info = self::fetch($url, true);
          $content = isset($info['content']) ? $info['content'] : false;
          if (!$content)
          {
              return array();
          }
          // json_decode() returns null on invalid input; (array) null is array().
          return (array) json_decode($content);
      }

      /**
       * Perform one cURL request.
       *
       * @param string $url
       * @param boolean $head_only true to request the headers only (CURLOPT_NOBODY)
       * @return array curl_getinfo() result plus the 'content' and 'error' keys
       */
      private static function curl_request($url, $head_only)
      {
          $ch = curl_init();
          curl_setopt($ch, CURLOPT_URL, $url);
          curl_setopt($ch, CURLOPT_HEADER, false);
          curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
          // NOTE(review): certificate verification is disabled, as in the
          // original code. This accepts any certificate - confirm it is intended.
          curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
          if ($head_only)
          {
              curl_setopt($ch, CURLOPT_NOBODY, true);
          }
          $content = curl_exec($ch);
          $error = curl_error($ch);
          $info = curl_getinfo($ch);
          // close cURL resource, and free up system resources
          curl_close($ch);

          $info['content'] = $content;
          $info['error'] = $error;
          return $info;
      }

      /**
       * Tell whether a Content-Type header denotes content worth downloading
       * for parsing. XML types are accepted in addition to text types because
       * is_rss() and is_gadget() need the body of XML documents.
       *
       * @param string $content_type
       * @return boolean
       */
      private static function is_parsable_type($content_type)
      {
          $content_type = strtolower((string) $content_type);
          return strpos($content_type, 'text') !== false || strpos($content_type, 'xml') !== false;
      }

      /**
       * Flatten one level of arrays in an argument list so that
       * f('a', 'b') and f(array('a', 'b')) can be treated alike.
       *
       * @param array $args
       * @return array
       */
      private static function flatten_args($args)
      {
          $result = array();
          foreach ($args as $arg)
          {
              if (is_array($arg))
              {
                  foreach ($arg as $item)
                  {
                      $result[] = $item;
                  }
              }
              else
              {
                  $result[] = $arg;
              }
          }
          return $result;
      }

      protected $url;
      protected $url_params = null;
      protected $info = null;
      protected $source = null;
      protected $metadata = null;
      protected $links = null;
      protected $title = null;
      protected $mime = null;
      protected $doc = null;
      protected $config = array();

      /**
       * @param string $url url of the resource
       * @param array $config free-form configuration, read through config()
       */
      public function __construct($url, $config = array())
      {
          $this->url = $url;
          $this->config = $config;
      }

      /**
       * Read a configuration value.
       *
       * @param string $key
       * @param mixed $default returned when the key is missing or its value is null
       * @return mixed
       */
      public function config($key = '', $default = null)
      {
          return isset($this->config[$key]) ? $this->config[$key] : $default;
      }

      /**
       * Url of the resource
       *
       * @return string
       */
      public function url()
      {
          return $this->url;
      }

      /**
       * Host part of the url (including the port or user info when present).
       *
       * Only a leading "scheme://" or "//" is stripped, so an url embedded in
       * the query string does not change the result.
       *
       * @return string
       */
      public function url_domain()
      {
          $url = (string) $this->url();
          $url = preg_replace('~^(?:[a-z][a-z0-9+.\-]*:)?//~i', '', $url);
          $url = ltrim((string) $url, '/');
          // The host stops at the first path, query or fragment delimiter.
          return (string) substr($url, 0, strcspn($url, '/?#'));
      }

      /**
       * Return true if the url contains one of the parts, case-insensitively.
       * Overloaded:
       *
       * $res->url_match('youtube');
       * $res->url_match('youtube', 'vimeo');
       * $res->url_match(array('youtube', 'vimeo'));
       *
       * Empty parts are ignored.
       *
       * @param array|string $part
       * @return boolean
       */
      public function url_match($part)
      {
          $args = func_get_args();
          $url = strtolower((string) $this->url());
          foreach (self::flatten_args($args) as $needle)
          {
              $needle = strtolower((string) $needle);
              if ($needle !== '' && strpos($url, $needle) !== false)
              {
                  return true;
              }
          }
          return false;
      }

      /**
       * Query string parameters of the url as a name => value map.
       *
       * Values are returned as written in the url (not url-decoded). A
       * parameter without "=" maps to null. The fragment is not part of the
       * query string.
       *
       * @return array
       */
      public function url_params()
      {
          if (!is_null($this->url_params))
          {
              return $this->url_params;
          }
          $url = (string) $this->url();
          $url = (string) substr($url, 0, strcspn($url, '#'));
          $result = array();
          $start = strpos($url, '?');
          if ($start !== false)
          {
              $pairs = explode('&', (string) substr($url, $start + 1));
              foreach ($pairs as $pair)
              {
                  if ($pair === '')
                  {
                      continue;
                  }
                  // limit 2: keep any further "=" inside the value
                  $parts = explode('=', $pair, 2);
                  $result[$parts[0]] = isset($parts[1]) ? $parts[1] : null;
              }
          }
          return $this->url_params = $result;
      }

      /**
       * Value of one query string parameter.
       *
       * @param string $name
       * @param mixed $default returned when the parameter is missing or has no value
       * @return mixed
       */
      public function url_param($name, $default = false)
      {
          $params = $this->url_params();
          return isset($params[$name]) ? $params[$name] : $default;
      }

      /**
       * Last path segment of the url, without query string and fragment.
       * Empty when the url has no path or ends with a slash.
       *
       * @return string
       */
      private function url_file()
      {
          $url = (string) $this->url();
          $url = (string) substr($url, 0, strcspn($url, '?#'));
          // Drop "scheme://host" so that the host is never taken for a file name.
          $url = (string) preg_replace('~^(?:[a-z][a-z0-9+.\-]*:)?//[^/]*~i', '', $url);
          $pos = strrpos($url, '/');
          return $pos === false ? $url : (string) substr($url, $pos + 1);
      }

      /**
       * The name of the resource. I.e. the last part of the url up to its
       * first dot.
       *
       * @return string
       */
      public function name()
      {
          $file = $this->url_file();
          $pos = strpos($file, '.');
          return $pos === false ? $file : (string) substr($file, 0, $pos);
      }

      /**
       * Extension of the url, lowercased. I.e. what follows the last dot of
       * the last path segment, or the empty string if there is none.
       *
       * @return string
       */
      public function ext()
      {
          $file = $this->url_file();
          $pos = strrpos($file, '.');
          return $pos === false ? '' : strtolower((string) substr($file, $pos + 1));
      }

      /**
       * Return true if the object has one of the extensions. Overloaded:
       *
       * $res->has_ext('pdf');
       * $res->has_ext('pdf', 'doc');
       * $res->has_ext(array('pdf', 'doc'));
       *
       * @param array|string $_
       * @return boolean true if the resource has one of the extensions passed
       */
      public function has_ext($_)
      {
          $args = func_get_args();
          $ext = $this->ext();
          foreach (self::flatten_args($args) as $candidate)
          {
              // strict comparison: "1e1" and "10" must not be considered equal
              if (strtolower((string) $candidate) === $ext)
              {
                  return true;
              }
          }
          return false;
      }

      /**
       * Charset declared in the Content-Type header, lowercased.
       *
       * @return string|null null if the header is missing or declares no charset
       */
      public function charset()
      {
          $info = $this->info();
          $content_type = isset($info['content_type']) ? $info['content_type'] : '';
          if (empty($content_type))
          {
              return null;
          }
          $items = explode(';', $content_type);
          foreach ($items as $item)
          {
              $parts = explode('=', $item, 2);
              // Parameters usually follow "; " so the name must be trimmed.
              if (count($parts) == 2 && strtolower(trim($parts[0])) === 'charset')
              {
                  return strtolower(trim($parts[1], " \t\"'"));
              }
          }
          return null;
      }

      /**
       * The mime type of the resource or the empty string if none has been specified
       *
       * @return string
       */
      public function mime()
      {
          if (!is_null($this->mime))
          {
              return $this->mime;
          }
          $info = $this->info();
          $content_type = isset($info['content_type']) ? $info['content_type'] : '';
          if (!$content_type)
          {
              return $this->mime = '';
          }
          $parts = explode(';', $content_type);
          return $this->mime = strtolower(trim($parts[0]));
      }

      /**
       * True if the mime type mentions xml or, when no mime type is known,
       * if the url has the xml extension.
       *
       * @return boolean
       */
      public function is_xml()
      {
          $mime = $this->mime();
          if (!empty($mime))
          {
              return strpos($mime, 'xml') !== false;
          }
          return $this->ext() == 'xml';
      }

      /**
       * True if the mime type mentions image or, when no mime type is known,
       * if the url has a known image extension.
       *
       * @return boolean
       */
      public function is_image()
      {
          $mime = $this->mime();
          if ($mime)
          {
              return strpos($mime, 'image') !== false;
          }
          $formats = array('gif', 'jpeg', 'jpg', 'jpe', 'pjpeg', 'png', 'svg', 'tiff', 'ico');
          return in_array($this->ext(), $formats, true);
      }

      /**
       * True if the mime type mentions video or, when no mime type is known,
       * if the url has a known video extension.
       *
       * @return boolean
       */
      public function is_video()
      {
          $mime = $this->mime();
          if ($mime)
          {
              return strpos($mime, 'video') !== false;
          }
          $formats = array('mpeg', 'mp4', 'ogg', 'wmv', 'mkv');
          return in_array($this->ext(), $formats, true);
      }

      /**
       * True if the mime type mentions audio or, when no mime type is known,
       * if the url has a known audio extension.
       *
       * @return boolean
       */
      public function is_audio()
      {
          $mime = $this->mime();
          if ($mime)
          {
              return strpos($mime, 'audio') !== false;
          }
          $formats = array('mp3');
          return in_array($this->ext(), $formats, true);
      }

      /**
       * True if the resource is an XML document containing an <rss> element.
       *
       * @return boolean
       */
      public function is_rss()
      {
          return $this->has_xml_element('rss');
      }

      /**
       * True if the resource is an XML document containing a <ModulePrefs> element.
       *
       * @return boolean
       */
      public function is_gadget()
      {
          return $this->has_xml_element('ModulePrefs');
      }

      /**
       * True if the resource is XML, could be parsed, and contains at least
       * one element with the given tag name.
       *
       * @param string $tag
       * @return boolean
       */
      private function has_xml_element($tag)
      {
          if (!$this->is_xml())
          {
              return false;
          }
          $doc = $this->doc();
          if (!$doc)
          {
              // nothing downloaded or not parsable
              return false;
          }
          return $doc->getElementsByTagName($tag)->length != 0;
      }

      /**
       * Turn an url found in the document into an absolute url, using the url
       * of the resource as the base.
       *
       * Handles absolute urls (returned unchanged), protocol-relative urls
       * ("//host/x"), root-relative urls ("/x"), query-only ("?x") and
       * fragment-only ("#x") references and document-relative urls ("x").
       * Dot segments ("./", "../") are not resolved.
       *
       * @param string $src
       * @return string
       */
      public function canonic_url($src)
      {
          $src = (string) $src;
          // Anything starting with a scheme (http:, https:, mailto:, data:, ...)
          // is already absolute.
          if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $src))
          {
              return $src;
          }

          $url = (string) $this->url();
          $without_fragment = (string) substr($url, 0, strcspn($url, '#'));
          $base = (string) substr($url, 0, strcspn($url, '?#'));

          // Split the base into root ("scheme://host") and path ("/dir/file").
          $scheme = 'http';
          $root = '';
          if (preg_match('~^([a-z][a-z0-9+.\-]*)://~i', $base, $match))
          {
              $scheme = $match[1];
              $root = $match[0];
              $base = (string) substr($base, strlen($root));
          }
          $slash = strpos($base, '/');
          if ($slash === false)
          {
              $root .= $base;
              $path = '/';
          }
          else
          {
              $root .= substr($base, 0, $slash);
              $path = substr($base, $slash);
          }

          if (strpos($src, '//') === 0) //same protocol as the document
          {
              return $scheme . ':' . $src;
          }
          if (strpos($src, '/') === 0) //relative url to the root
          {
              return $root . $src;
          }
          if (strpos($src, '#') === 0) //fragment of the document itself
          {
              return $without_fragment . $src;
          }
          if (strpos($src, '?') === 0) //same document, other query string
          {
              return $root . $path . $src;
          }
          //relative url to the document: $path always starts with a slash
          $directory = substr($path, 0, strrpos($path, '/') + 1);
          return $root . $directory . $src;
      }

      /**
       * Content of the resource.
       *
       * @return string|boolean the 'content' entry of info(), false if missing
       */
      public function source()
      {
          if (!is_null($this->source))
          {
              return $this->source;
          }
          $info = $this->info();
          return $this->source = isset($info['content']) ? $info['content'] : false;
      }

      /**
       * Array of arrays containing the page's metadata.
       * One entry per <meta> element, each an attribute name => value map.
       *
       * @return array
       */
      public function metadata()
      {
          if (!is_null($this->metadata))
          {
              return $this->metadata;
          }
          return $this->metadata = $this->get_metadata();
      }

      /**
       * Title of the document.
       *
       * @return string|boolean see get_title()
       */
      public function title()
      {
          if (!is_null($this->title))
          {
              return $this->title;
          }
          return $this->title = $this->get_title();
      }

      /**
       * The parsed document, or false if there is nothing to parse or the
       * content could not be parsed.
       *
       * @return DOMDocument|boolean
       */
      public function doc()
      {
          if (!is_null($this->doc))
          {
              return $this->doc;
          }
          return $this->doc = $this->get_doc($this->source());
      }

      /**
       * Content of the first <meta> element whose "name" or "property"
       * attribute equals $name, case-insensitively.
       *
       * @param string $name
       * @return string|boolean false if there is no such element or if it has
       *                        no content attribute
       */
      public function get_meta($name)
      {
          $name = strtolower((string) $name);
          foreach ($this->metadata() as $attributes)
          {
              foreach (array('name', 'property') as $attribute)
              {
                  if (isset($attributes[$attribute]) && strtolower($attributes[$attribute]) === $name)
                  {
                      return isset($attributes['content']) ? $attributes['content'] : false;
                  }
              }
          }
          return false;
      }

      /**
       * Attributes of the first <link> element whose attribute $key equals
       * $value, case-insensitively.
       *
       * @param string $key attribute name, for example 'rel'
       * @param string $value expected attribute value
       * @return array|boolean attribute name => value map, false if none matches
       */
      public function get_link($key, $value)
      {
          $key = strtolower((string) $key);
          $value = strtolower((string) $value);
          foreach ($this->links() as $attributes)
          {
              if (isset($attributes[$key]) && strtolower($attributes[$key]) === $value)
              {
                  return $attributes;
              }
          }
          return false;
      }

      /**
       * Array of arrays containing the page's links.
       * One entry per <link> element, each an attribute name => value map.
       *
       * @return array
       */
      public function links()
      {
          if (!is_null($this->links))
          {
              return $this->links;
          }
          return $this->links = $this->get_links();
      }

      /**
       * Markup of the first node matching an XPath query.
       *
       * @param string $query dom xpath
       * @return string the serialized node, or the empty string if there is no
       *                document, the query is invalid or nothing matches
       */
      public function findx($query)
      {
          $doc = $this->doc();
          if (empty($doc))
          {
              return '';
          }
          $xpath = new DOMXPath($doc);
          $nodes = $xpath->query($query);
          // query() returns false for a malformed expression
          if ($nodes === false || $nodes->length == 0)
          {
              return '';
          }
          return $doc->saveXML($nodes->item(0));
      }

      /**
       * Result of fetch() for the url of the resource, memoized.
       *
       * @return array
       */
      protected function info()
      {
          if (!is_null($this->info))
          {
              return $this->info;
          }
          return $this->info = self::fetch($this->url());
      }

      /**
       * Parse a source as XML if the resource is XML, as HTML otherwise.
       * Parser messages are collected internally and discarded.
       *
       * @param string $source
       * @return boolean|DOMDocument false if the source is empty or cannot be parsed
       */
      protected function get_doc($source)
      {
          if (empty($source))
          {
              return false;
          }
          $result = new DOMDocument();
          // Silence parser warnings for this parse only and restore the caller's
          // setting afterwards: the flag is global to the process.
          $previous = libxml_use_internal_errors(true);
          if ($this->is_xml())
          {
              $success = $result->loadXML($source);
          }
          else
          {
              $success = $result->loadHTML($source);
          }
          libxml_clear_errors();
          libxml_use_internal_errors($previous);
          return $success ? $result : false;
      }

      /**
       * Collect the attributes of every element with the given tag name.
       *
       * @param string $tag
       * @return array list of attribute name => value maps, in document order;
       *               empty if there is no document
       */
      private function get_elements_attributes($tag)
      {
          $doc = $this->doc();
          if ($doc == false)
          {
              return array();
          }
          $result = array();
          foreach ($doc->getElementsByTagName($tag) as $element)
          {
              $values = array();
              foreach ($element->attributes as $attribute)
              {
                  $values[$attribute->name] = $attribute->value;
              }
              $result[] = $values;
          }
          return $result;
      }

      /**
       * Attributes of all the <meta> elements of the document.
       *
       * @return array
       */
      protected function get_metadata()
      {
          return $this->get_elements_attributes('meta');
      }

      /**
       * Text of the first <title> element.
       *
       * @return string|boolean the empty string if there is no document, false
       *                        if the document has no <title> element
       */
      protected function get_title()
      {
          $doc = $this->doc();
          if ($doc == false)
          {
              return '';
          }
          $titles = $doc->getElementsByTagName('title');
          if ($titles->length == 0)
          {
              return false;
          }
          return $titles->item(0)->nodeValue;
      }

      /**
       * Attributes of all the <link> elements of the document.
       *
       * @return array
       */
      protected function get_links()
      {
          return $this->get_elements_attributes('link');
      }

  }