<?php
/**
 * An HTTP resource. In most cases an HTML document.
 *
 * Wraps a url and lazily exposes what can be derived from it: url parts
 * (domain, parameters, name, extension), transport information (mime type,
 * charset) and, for parsable documents, the DOM, title, meta and link tags.
 * Everything that requires a network access goes through fetch() and is
 * memoized on the instance.
 *
 * @copyright (c) 2011 University of Geneva
 * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
 * @author Laurent Opprecht
 */
class HttpResource
{

    /**
     * Fetch the content and metadata of an url.
     *
     * If the content type is not parsable, i.e. it is not made of text,
     * only fetch the metadata and not the content. This is mostly done to
     * avoid downloading big files - videos, images, etc - which is unnecessary.
     *
     * Results are cached for the duration of the request. The cache is keyed
     * by url AND by mode so that a metadata-only answer is never returned to
     * a caller that explicitly asked for the content.
     *
     * @param string $url the url to fetch
     * @param boolean|null $fetch_content null: fetch the content only if the
     *                     content type looks textual; false: metadata only;
     *                     anything else: always fetch the content
     * @return array the curl_getinfo() array extended with the keys 'content'
     *               (body, or curl_exec() result) and 'error' (curl_error())
     */
    public static function fetch($url, $fetch_content = null)
    {
        static $cache = array();

        if (is_null($fetch_content))
        {
            $mode = 'auto';
        }
        else
        {
            $mode = ($fetch_content === false) ? 'head' : 'full';
        }

        $key = $mode . ' ' . $url;
        if (isset($cache[$key]))
        {
            return $cache[$key];
        }

        if ($mode !== 'full')
        {
            $info = self::request($url, true);
            if ($mode === 'head')
            {
                return $cache[$key] = $info;
            }
            // auto mode: stop here when the server announces a non textual type
            if (isset($info['content_type']) && !self::is_textual($info['content_type']))
            {
                return $cache[$key] = $info;
            }
        }

        return $cache[$key] = self::request($url, false);
    }

    /**
     * Perform a single cURL request.
     *
     * @param string $url
     * @param boolean $head_only true to request the headers only (CURLOPT_NOBODY)
     * @return array curl_getinfo() extended with 'content' and 'error'
     */
    private static function request($url, $head_only)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): peer verification is disabled, as in the original
        // code. This accepts any certificate; confirm this is still wanted.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        if ($head_only)
        {
            curl_setopt($ch, CURLOPT_NOBODY, true);
        }
        $content = curl_exec($ch);
        $error = curl_error($ch);
        $info = curl_getinfo($ch);
        // close cURL resource, and free up system resources
        curl_close($ch);

        $info['content'] = $content;
        $info['error'] = $error;
        return $info;
    }

    /**
     * True if the content type announces something worth downloading and
     * parsing: text/*, xml based formats (rss, atom, gadgets) or json.
     *
     * @param string $content_type raw Content-Type header value
     * @return boolean
     */
    private static function is_textual($content_type)
    {
        foreach (array('text', 'xml', 'json') as $marker)
        {
            if (stripos((string) $content_type, $marker) !== false)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Fetch an url and decode its body as JSON.
     *
     * @param string $url
     * @return array the decoded top level value cast to an array, or an empty
     *               array if there is no body or if it cannot be decoded
     */
    public static function fetch_json($url)
    {
        $info = self::fetch($url, true);
        $content = isset($info['content']) ? $info['content'] : false;
        if (!$content)
        {
            return array();
        }
        return (array) json_decode($content);
    }

    protected $url;
    protected $url_params = null;
    protected $info = null;
    protected $source = null;
    protected $metadata = null;
    protected $links = null;
    protected $title = null;
    protected $mime = null;
    protected $doc = null;
    protected $config = array();

    /**
     * @param string $url url of the resource
     * @param array $config free form options, read back with config()
     */
    public function __construct($url, $config = array())
    {
        $this->url = $url;
        $this->config = $config;
    }

    /**
     * Read a configuration value.
     *
     * @param string $key
     * @param mixed $default returned when the key is not set
     * @return mixed
     */
    public function config($key = '', $default = null)
    {
        return isset($this->config[$key]) ? $this->config[$key] : $default;
    }

    /**
     * Url of the resource
     *
     * @return string
     */
    public function url()
    {
        return $this->url;
    }

    /**
     * The part of the url located between the scheme and the path, i.e.
     * "example.com" for "http://example.com/a/b?c=d".
     *
     * @return string
     */
    public function url_domain()
    {
        $url = trim((string) $this->url(), '/');
        // drop a leading "scheme://" if any
        $url = preg_replace('#^[a-z][a-z0-9+.\-]*://#i', '', $url);
        // the domain stops at the path, query or fragment - whichever is first
        return (string) substr($url, 0, strcspn($url, '/?#'));
    }

    /**
     * True if the url contains one of the parts, case insensitive. Overloaded:
     *
     *      $res->url_match('youtube');
     *      $res->url_match('youtube', 'vimeo');
     *      $res->url_match(array('youtube', 'vimeo'));
     *
     * Empty parts never match.
     *
     * @param array|string $part
     * @return boolean
     */
    public function url_match($part)
    {
        $url = strtolower((string) $this->url());
        foreach (func_get_args() as $arg)
        {
            foreach ((array) $arg as $needle)
            {
                $needle = strtolower((string) $needle);
                if ($needle !== '' && strpos($url, $needle) !== false)
                {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Parameters of the query string as an array name => raw value.
     *
     * Values are not url-decoded. A parameter without "=" is present with
     * a null value. Only the first "=" separates the name from the value.
     *
     * @return array
     */
    public function url_params()
    {
        if (!is_null($this->url_params))
        {
            return $this->url_params;
        }

        $url = (string) $this->url();
        if (strpos($url, '?') === false)
        {
            return $this->url_params = array();
        }

        $pieces = explode('?', $url);
        $query = end($pieces);
        // the fragment is not part of the query string
        $query = (string) substr($query, 0, strcspn($query, '#'));

        $result = array();
        foreach (explode('&', $query) as $pair)
        {
            if ($pair === '')
            {
                continue;
            }
            $key_value = explode('=', $pair, 2);
            $result[$key_value[0]] = isset($key_value[1]) ? $key_value[1] : null;
        }
        return $this->url_params = $result;
    }

    /**
     * A single parameter of the query string.
     *
     * @param string $name
     * @param mixed $default returned when the parameter is missing or has no value
     * @return mixed
     */
    public function url_param($name, $default = false)
    {
        $params = $this->url_params();
        return isset($params[$name]) ? $params[$name] : $default;
    }

    /**
     * Last segment of the url - what follows the last "/" - without its
     * query string and fragment.
     *
     * @return string
     */
    private function url_basename()
    {
        $segments = explode('/', (string) $this->url());
        $last = end($segments);
        return (string) substr($last, 0, strcspn($last, '?#'));
    }

    /**
     * The name of the resource. I.e. the last part of the url without the ext
     *
     * @return string
     */
    public function name()
    {
        $parts = explode('.', $this->url_basename());
        return reset($parts);
    }

    /**
     * Extension of the url, lower case, or the empty string if the last
     * segment of the url contains no dot.
     *
     * @return string
     */
    public function ext()
    {
        $basename = $this->url_basename();
        $dot = strrpos($basename, '.');
        if ($dot === false)
        {
            return '';
        }
        return strtolower((string) substr($basename, $dot + 1));
    }

    /**
     * Return true if the object has one of the extensions. Overloaded:
     *
     *      $res->has_ext('pdf');
     *      $res->has_ext('pdf', 'doc');
     *      $res->has_ext(array('pdf', 'doc'));
     *
     * @param array|string $_
     * @return boolean true if the resource has one of the extensions passed
     */
    public function has_ext($_)
    {
        $candidates = is_array($_) ? $_ : func_get_args();
        $ext = $this->ext();
        foreach ($candidates as $candidate)
        {
            if (strtolower((string) $candidate) === $ext)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Charset announced in the Content-Type header, lower case.
     *
     * @return string|null null if no charset is announced
     */
    public function charset()
    {
        $info = $this->info();
        $content_type = isset($info['content_type']) ? $info['content_type'] : '';
        if (empty($content_type))
        {
            return null;
        }

        foreach (explode(';', $content_type) as $item)
        {
            $parts = explode('=', $item, 2);
            // parameters are usually preceded by a space: "text/html; charset=UTF-8"
            if (count($parts) === 2 && strtolower(trim($parts[0])) === 'charset')
            {
                return strtolower(trim($parts[1], " \t\"'"));
            }
        }
        return null;
    }

    /**
     * The mime type of the resource or the empty string if none has been specified
     *
     * @return string
     */
    public function mime()
    {
        if (!is_null($this->mime))
        {
            return $this->mime;
        }

        $info = $this->info();
        $content_type = isset($info['content_type']) ? $info['content_type'] : '';
        if (!$content_type)
        {
            return $this->mime = '';
        }

        $parts = explode(';', $content_type);
        return $this->mime = strtolower(trim($parts[0]));
    }

    /**
     * True if the mime type - or the extension when there is no mime type -
     * designates an xml document.
     *
     * @return boolean
     */
    public function is_xml()
    {
        $mime = $this->mime();
        if (!empty($mime))
        {
            return strpos($mime, 'xml') !== false;
        }
        return $this->ext() === 'xml';
    }

    /**
     * True if the mime type - or the extension when there is no mime type -
     * designates an image.
     *
     * @return boolean
     */
    public function is_image()
    {
        $formats = array('gif', 'jpeg', 'jpg', 'jpe', 'pjpeg', 'png', 'svg', 'tiff', 'ico');
        return $this->is_media('image', $formats);
    }

    /**
     * True if the mime type - or the extension when there is no mime type -
     * designates a video.
     *
     * @return boolean
     */
    public function is_video()
    {
        return $this->is_media('video', array('mpeg', 'mp4', 'ogg', 'wmv', 'mkv'));
    }

    /**
     * True if the mime type - or the extension when there is no mime type -
     * designates an audio file.
     *
     * @return boolean
     */
    public function is_audio()
    {
        return $this->is_media('audio', array('mp3'));
    }

    /**
     * Shared implementation of is_image(), is_video() and is_audio().
     *
     * @param string $family text looked up in the mime type
     * @param array $formats extensions accepted when there is no mime type
     * @return boolean
     */
    private function is_media($family, $formats)
    {
        $mime = $this->mime();
        if ($mime)
        {
            return strpos($mime, $family) !== false;
        }
        return in_array($this->ext(), $formats, true);
    }

    /**
     * True if the resource is an xml document containing a rss element.
     *
     * @return boolean
     */
    public function is_rss()
    {
        return $this->xml_has_element('rss');
    }

    /**
     * True if the resource is an xml document containing a ModulePrefs element.
     *
     * @return boolean
     */
    public function is_gadget()
    {
        return $this->xml_has_element('ModulePrefs');
    }

    /**
     * True if the resource is xml, could be parsed and contains the element.
     *
     * @param string $tag_name
     * @return boolean
     */
    private function xml_has_element($tag_name)
    {
        if (!$this->is_xml())
        {
            return false;
        }
        $doc = $this->doc();
        // doc() returns false when there is no content or it cannot be parsed
        if (!($doc instanceof DOMDocument))
        {
            return false;
        }
        return $doc->getElementsByTagName($tag_name)->length != 0;
    }

    /**
     * Turn an url found in the document into an absolute url.
     *
     *  - "scheme://..." urls are returned unchanged
     *  - "//host/path" urls inherit the scheme of the resource
     *  - "/path" urls are resolved against the root of the resource's host
     *  - anything else is appended to the directory of the resource
     *
     * If the url of the resource has no scheme, "http" is assumed. Dot
     * segments ("./", "../") are not resolved.
     *
     * @param string $src
     * @return string
     */
    public function canonic_url($src)
    {
        $src = (string) $src;
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $src))
        {
            return $src;
        }

        // split the url of the resource into scheme, authority and the rest
        $url = (string) $this->url();
        $scheme = 'http';
        $rest = $url;
        if (preg_match('#^([a-z][a-z0-9+.\-]*)://(.*)$#is', $url, $matches))
        {
            $scheme = $matches[1];
            $rest = $matches[2];
        }
        $authority = (string) substr($rest, 0, strcspn($rest, '/?#'));

        if (strpos($src, '//') === 0) //relative to the protocol
        {
            return $scheme . ':' . $src;
        }
        if (strpos($src, '/') === 0) //relative url to the root
        {
            return $scheme . '://' . $authority . $src;
        }

        //relative url to the document: keep the path up to its last slash
        $path = (string) substr($rest, strlen($authority));
        $path = (string) substr($path, 0, strcspn($path, '?#'));
        $last_slash = strrpos($path, '/');
        $dir = ($last_slash === false) ? '/' : substr($path, 0, $last_slash + 1);
        return $scheme . '://' . $authority . $dir . $src;
    }

    /**
     * Content of the resource.
     *
     * @return string|boolean whatever fetch() stored under 'content', false if nothing
     */
    public function source()
    {
        if (!is_null($this->source))
        {
            return $this->source;
        }
        $info = $this->info();
        return $this->source = isset($info['content']) ? $info['content'] : false;
    }

    /**
     * Array of arrays containing the page's metadata.
     *
     * @return array one array attribute name => value per meta element
     */
    public function metadata()
    {
        if (is_null($this->metadata))
        {
            $this->metadata = $this->get_metadata();
        }
        return $this->metadata;
    }

    /**
     * Title of the document.
     *
     * @return string|boolean see get_title()
     */
    public function title()
    {
        if (is_null($this->title))
        {
            $this->title = $this->get_title();
        }
        return $this->title;
    }

    /**
     * The parsed document, false if there is nothing to parse or if parsing failed.
     *
     * @return DOMDocument|boolean
     */
    public function doc()
    {
        if (is_null($this->doc))
        {
            $this->doc = $this->get_doc($this->source());
        }
        return $this->doc;
    }

    /**
     * Content of the first meta element whose "name" or "property" attribute
     * equals $name, case insensitive.
     *
     * @param string $name
     * @return string|boolean false if there is no such meta or if it has no content
     */
    public function get_meta($name)
    {
        $name = strtolower((string) $name);
        foreach ($this->metadata() as $attributes)
        {
            foreach (array('name', 'property') as $attribute)
            {
                if (isset($attributes[$attribute]) && strtolower($attributes[$attribute]) === $name)
                {
                    return isset($attributes['content']) ? $attributes['content'] : false;
                }
            }
        }
        return false;
    }

    /**
     * Attributes of the first link element whose attribute $key equals
     * $value, case insensitive.
     *
     * @param string $key attribute name, i.e. "rel"
     * @param string $value attribute value, i.e. "icon"
     * @return array|boolean false if there is no such link
     */
    public function get_link($key, $value)
    {
        $key = strtolower((string) $key);
        $value = strtolower((string) $value);
        foreach ($this->links() as $attributes)
        {
            if (isset($attributes[$key]) && strtolower($attributes[$key]) === $value)
            {
                return $attributes;
            }
        }
        return false;
    }

    /**
     * Array of arrays containing the attributes of the document's link elements.
     *
     * @return array one array attribute name => value per link element
     */
    public function links()
    {
        if (is_null($this->links))
        {
            $this->links = $this->get_links();
        }
        return $this->links;
    }

    /**
     * Serialized markup of the first node matching an xpath query.
     *
     * @param string $query dom xpath
     * @return string the empty string if there is no document, if the query
     *                is invalid or if nothing matches
     */
    public function findx($query)
    {
        $doc = $this->doc();
        if (!($doc instanceof DOMDocument))
        {
            return '';
        }

        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query($query);
        // query() returns false on a malformed expression
        if ($nodes === false || $nodes->length == 0)
        {
            return '';
        }
        return $doc->saveXML($nodes->item(0));
    }

    /**
     * Transport information about the resource, fetched once.
     *
     * @return array see fetch()
     */
    protected function info()
    {
        if (is_null($this->info))
        {
            $this->info = self::fetch($this->url());
        }
        return $this->info;
    }

    /**
     * Parse a source as XML if the resource is xml, as HTML otherwise.
     *
     * Parser messages are collected internally and discarded; the previous
     * libxml error handling mode is restored before returning.
     *
     * @param string $source
     * @return boolean|DOMDocument false if the source is empty or cannot be loaded
     */
    protected function get_doc($source)
    {
        if (!is_string($source) || trim($source) === '')
        {
            return false;
        }

        $result = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        if ($this->is_xml())
        {
            $success = $result->loadXML($source);
        }
        else
        {
            $success = $result->loadHTML($source);
        }
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $success ? $result : false;
    }

    /**
     * Attributes of every meta element of the document.
     *
     * @return array
     */
    protected function get_metadata()
    {
        return $this->element_attributes('meta');
    }

    /**
     * Text of the first title element.
     *
     * @return string|boolean the empty string if there is no document,
     *                        false if the document has no title element
     */
    protected function get_title()
    {
        $doc = $this->doc();
        if (!($doc instanceof DOMDocument))
        {
            return '';
        }
        $titles = $doc->getElementsByTagName('title');
        if ($titles->length == 0)
        {
            return false;
        }
        return $titles->item(0)->nodeValue;
    }

    /**
     * Attributes of every link element of the document.
     *
     * @return array
     */
    protected function get_links()
    {
        return $this->element_attributes('link');
    }

    /**
     * Collect, for every element with the given tag name, its attributes as
     * an array name => value.
     *
     * @param string $tag_name
     * @return array empty if there is no document or no such element
     */
    private function element_attributes($tag_name)
    {
        $doc = $this->doc();
        if (!($doc instanceof DOMDocument))
        {
            return array();
        }

        $result = array();
        foreach ($doc->getElementsByTagName($tag_name) as $element)
        {
            $values = array();
            foreach ($element->attributes as $attribute)
            {
                $values[$attribute->name] = $attribute->value;
            }
            $result[] = $values;
        }
        return $result;
    }

}
|