]*>/i', "\n", $string);
$string = preg_replace('/<\/?(div|p|h[1-6]|table|ol|ul|blockquote)[^>]*>/i', "\n", $string);
$string = preg_replace('/<\/(tr|li)[^>]*>/i', "\n", $string);
$string = preg_replace('/<\/(td|th)[^>]*>/i', "\t", $string);
$string = strip_tags($string);
// Line endings unification and cleaning.
$string = str_replace(array("\r\n", "\n\r", "\r"), "\n", $string);
$string = preg_replace('/\s*\n/', "\n", $string);
$string = preg_replace('/\n+/', "\n", $string);
return trim($string);
}
/**
* Detects encoding of html-formatted text.
* @param string $string The input html-formatted text.
* @return string Returns the detected encoding.
*/
static function api_detect_encoding_html($string) {
    // Step 1: find a <meta ... content=...> tag located inside the <head> section.
    // The capturing group must wrap the whole meta tag, because step 2 searches
    // only inside $matches[1]. Both quantifiers are greedy, so the last such tag
    // before </head> is the one that gets captured.
    if (@preg_match('/<head.*(<meta[^>]*content=[^>]*>).*<\/head>/si', $string, $matches)) {
        // Step 2: extract whatever follows "charset=" up to the closing quote or
        // semicolon of that tag; $matches is deliberately reused for the result.
        if (@preg_match('/<meta[^>]*charset=(.*)["\';][^>]*>/si', $matches[1], $matches)) {
            return api_refine_encoding_id(trim($matches[1]));
        }
    }
    // No charset declaration was found: fall back to detecting the encoding
    // from the plain-text rendition of the document.
    return api_detect_encoding(self::api_html_to_text($string));
}
/**
* Converts the text of a html-document to a given encoding, the meta-tag is changed accordingly.
* @param string $string The input full-html document.
* @param string $encoding The new encoding value to be set.
*/
static function api_set_encoding_html(&$string, $encoding) {
    // The current encoding has to be detected before the markup is modified,
    // because detection reads the very meta tag that is rewritten below.
    $old_encoding = self::api_detect_encoding_html($string);
    // Declaration used whenever the document has no charset declaration yet.
    $charset_meta = '<meta http-equiv="Content-Type" content="text/html; charset=' . $encoding . '"/>';
    // Groups: 1 = everything up to the meta tag, 2 = the meta tag itself,
    // 3 = the remainder of the document (from after the tag, through </head>, to the end).
    if (@preg_match('/(.*<head.*)(<meta[^>]*content=[^>]*>)(.*<\/head>.*)/si', $string, $matches)) {
        $meta = $matches[2];
        // Groups: 1 = tag text up to and including "charset=", 2 = old value,
        // 3 = closing quote/semicolon and the rest of the tag.
        if (@preg_match("/(<meta[^>]*charset=)(.*)([\"';][^>]*>)/si", $meta, $matches1)) {
            $meta = $matches1[1] . $encoding . $matches1[3];
            $string = $matches[1] . $meta . $matches[3];
        } else {
            // The matched meta tag carries no charset: it is replaced by a full declaration.
            $string = $matches[1] . $charset_meta . $matches[3];
        }
    } else {
        // No suitable meta tag inside <head>: insert the declaration right after
        // the first "<head>" (case-insensitive). An explicit position is used
        // because the count argument of str_ireplace() is output-only and cannot
        // limit the number of replacements.
        $head_pos = stripos($string, '<head>');
        if ($head_pos !== false) {
            $string = substr_replace($string, $charset_meta, $head_pos + 6, 0);
        }
    }
    // Finally convert the document bytes themselves to the new encoding.
    $string = api_convert_encoding($string, $encoding, $old_encoding);
}
/**
* Returns the title of a html document.
* @param string $string The contents of the input document.
* @param string $output_encoding The encoding of the retrieved title. If the value is not set, the system encoding is assumed.
* @param string $input_encoding The encoding of the input document. If the value is not set, it is detected.
* @return string The retrieved title, html-entities and extra-whitespace between the words are cleaned.
*/
static function api_get_title_html(&$string, $output_encoding = null, $input_encoding = null)
{
    // The pattern has to start at the opening title tag (attributes allowed);
    // group 1 is the raw title text, which may span several lines (/s).
    if (@preg_match('/<title[^>]*>(.*)<\/title>/msi', $string, $matches)) {
        if (empty($output_encoding)) {
            $output_encoding = api_get_system_encoding();
        }
        if (empty($input_encoding)) {
            $input_encoding = self::api_detect_encoding_html($string);
        }
        // Convert to the output encoding first, then decode html-entities in that
        // encoding, and finally collapse every run of whitespace into one space.
        $title = api_convert_encoding($matches[1], $output_encoding, $input_encoding);
        $title = api_html_entity_decode($title, ENT_QUOTES, $output_encoding);
        return trim(@preg_replace('/\s+/', ' ', $title));
    }
    // No title element in the document.
    return '';
}
/**
* Detects encoding of xml-formatted text.
* @param string $string The input xml-formatted text.
* @param string $default_encoding This is the default encoding to be returned if there is no way to detect the encoding of the xml-text. If it is not specified, the system encoding is assumed.
* @return string Returns the detected encoding.
* @todo The second parameter is to be eliminated. See api_detect_encoding_html().
*/
static function api_detect_encoding_xml($string, $default_encoding = null) {
    // 1. An encoding named by the xml-declaration wins.
    $declaration = array();
    if (preg_match(_PCRE_XML_ENCODING, $string, $declaration)) {
        return api_refine_encoding_id($declaration[1]);
    }

    // 2. Otherwise, text that validates as UTF-8 is reported as UTF-8.
    if (api_is_valid_utf8($string)) {
        return 'UTF-8';
    }

    // 3. Last resort: the caller's default or, when none was given,
    //    the internal encoding.
    $fallback = empty($default_encoding) ? _api_mb_internal_encoding() : $default_encoding;

    return api_refine_encoding_id($fallback);
}
/**
* Converts character encoding of a xml-formatted text. If inside the text the encoding is declared, it is modified accordingly.
* @param string $string The text being converted.
* @param string $to_encoding The encoding that text is being converted to.
* @param string $from_encoding (optional) The encoding that text is being converted from. If it is omitted, an attempt is made to detect it.
* @return string Returns the converted xml-text.
*/
static function api_convert_encoding_xml($string, $to_encoding, $from_encoding = null) {
    // Public entry point: all the work is done by the shared internal converter.
    $converted = self::_api_convert_encoding_xml($string, $to_encoding, $from_encoding);

    return $converted;
}
/**
* Converts character encoding of a xml-formatted text into UTF-8. If inside the text the encoding is declared, it is set to UTF-8.
* @param string $string The text being converted.
* @param string $from_encoding (optional) The encoding that text is being converted from. If it is omitted, an attempt is made to detect it.
* @return string Returns the converted xml-text.
*/
static function api_utf8_encode_xml($string, $from_encoding = null) {
    // Same as the generic converter, with the target encoding fixed to UTF-8.
    $target_encoding = 'UTF-8';

    return self::_api_convert_encoding_xml($string, $target_encoding, $from_encoding);
}
/**
* Converts character encoding of a xml-formatted text from UTF-8 into a specified encoding. If inside the text the encoding is declared, it is modified accordingly.
* @param string $string The text being converted.
* @param string $to_encoding (optional) The encoding that text is being converted to. If it is omitted, the platform character set is assumed.
* @return string Returns the converted xml-text.
*/
static function api_utf8_decode_xml($string, $to_encoding = null) {
    // An empty target encoding falls back to the internal encoding.
    $target_encoding = empty($to_encoding) ? _api_mb_internal_encoding() : $to_encoding;

    // The source encoding is always UTF-8 for this helper.
    return self::_api_convert_encoding_xml($string, $target_encoding, 'UTF-8');
}
/**
* Converts character encoding of a xml-formatted text. If inside the text the encoding is declared, it is modified accordingly.
* @param string $string The text being converted.
* @param string $to_encoding The encoding that text is being converted to.
* @param string $from_encoding (optional) The encoding that text is being converted from. If the value is empty, it is tried to be detected then.
* @return string Returns the converted xml-text.
*/
static function _api_convert_encoding_xml(&$string, $to_encoding, $from_encoding) {
    if (empty($from_encoding)) {
        $from_encoding = self::api_detect_encoding_xml($string);
    }
    $to_encoding = api_refine_encoding_id($to_encoding);

    // Case 1: the text has no xml-declaration at all. A declaration naming the
    // target encoding is prepended, so that the converted text states its encoding.
    if (!preg_match('/<\?xml.*\?>/m', $string, $matches)) {
        return api_convert_encoding('<?xml version="1.0" encoding="' . $to_encoding . '"?>' . "\n" . $string, $to_encoding, $from_encoding);
    }

    // Case 2: there is a declaration, but it does not name an encoding.
    if (!preg_match(_PCRE_XML_ENCODING, $string)) {
        if (strpos($matches[0], 'standalone') !== false) {
            // The encoding option should precede the standalone option, otherwise DOMDocument fails to load the document.
            $replace = str_replace('standalone', ' encoding="' . $to_encoding . '" standalone', $matches[0]);
        } else {
            // Append the encoding option just before the end of the declaration.
            $replace = str_replace('?>', ' encoding="' . $to_encoding . '"?>', $matches[0]);
        }
        return api_convert_encoding(str_replace($matches[0], $replace, $string), $to_encoding, $from_encoding);
    }

    // Case 3: the declaration already names an encoding. The callback rewrites
    // it; the new value is handed over through a global variable because the
    // callback receives only the match array.
    global $_api_encoding;
    $_api_encoding = api_refine_encoding_id($to_encoding);
    return api_convert_encoding(preg_replace_callback(_PCRE_XML_ENCODING, array('Text', '_api_convert_encoding_xml_callback'), $string), $to_encoding, $from_encoding);
}
/**
* A callback for serving the function _api_convert_encoding_xml().
* @param array $matches Input array of matches corresponding to the xml-declaration.
* @return string Returns the xml-declaration with modified encoding.
*/
static function _api_convert_encoding_xml_callback($matches) {
    // The replacement encoding is supplied by the caller through this global.
    global $_api_encoding;

    $declaration = $matches[0];
    $declared_encoding = $matches[1];

    // Swap the old encoding name for the new one inside the matched declaration.
    return str_replace($declared_encoding, $_api_encoding, $declaration);
}
/* CSV processing functions */
/**
* Parses CSV data (one line) into an array. This function is not affected by the OS-locale settings.
* @param string $string The input string.
* @param string $delimiter (optional) The field delimiter, one character only. The default delimiter character is comma (,).
* @param string $enclosure (optional) The field enclosure, one character only. The default enclosure character is quote (").
* @param string $escape (optional) The escape character, one character only. The default escape character is backslash (\).
* @return array Returns an array containing the fields read.
* Note: For this function to work correctly with UTF-8, the restriction on the parameters $delimiter, $enclosure and $escape
* must be respected: these parameters should be single ASCII characters only. This also keeps the implementation fast.
* @link http://php.net/manual/en/function.str-getcsv.php (exists as of PHP 5 >= 5.3.0)
*/
static function & api_str_getcsv(& $string, $delimiter = ',', $enclosure = '"', $escape = '\\') {
    // Each control parameter is reduced to a single byte: when more than one
    // byte is supplied, only the first one (offset 0) is used.
    $delimiter = (string) $delimiter;
    if (api_byte_count($delimiter) > 1) {
        $delimiter = $delimiter[0];
    }
    $enclosure = (string) $enclosure;
    if (api_byte_count($enclosure) > 1) {
        $enclosure = $enclosure[0];
    }
    $escape = (string) $escape;
    if (api_byte_count($escape) > 1) {
        $escape = $escape[0];
    }

    // The input is scanned byte by byte.
    $str = (string) $string;
    $len = api_byte_count($str);
    $enclosed = false;  // TRUE while the scanner is inside an enclosed field.
    $escaped = false;   // TRUE right after an escape character has been skipped.
    $value = '';        // The field being accumulated.
    $result = array();

    for ($i = 0; $i < $len; $i++) {
        $char = $str[$i];
        if ($char == $escape) {
            if (!$escaped) {
                // First escape character of a pair: skip it, remember that it was seen.
                $escaped = true;
                continue;
            }
        }
        // NOTE(review): the flag is cleared before the character is examined, so
        // an escape character only lets a doubled escape through as one literal;
        // it does not make a following delimiter or enclosure literal. Confirm
        // whether that is the intended behaviour.
        $escaped = false;
        switch ($char) {
            case $enclosure:
                // A doubled enclosure inside an enclosed field stands for one
                // literal enclosure character. The bounds check prevents reading
                // past the end when the enclosure is the very last byte.
                if ($enclosed && $i + 1 < $len && $str[$i + 1] == $enclosure) {
                    $value .= $char;
                    $i++;
                } else {
                    $enclosed = !$enclosed;
                }
                break;
            case $delimiter:
                if (!$enclosed) {
                    // End of field.
                    $result[] = $value;
                    $value = '';
                } else {
                    // A delimiter inside an enclosed field is ordinary data.
                    $value .= $char;
                }
                break;
            default:
                $value .= $char;
                break;
        }
    }

    // The last field is appended unless it is the empty string. A strict
    // comparison is required here: empty() would also discard the value "0".
    if ($value !== '') {
        $result[] = $value;
    }
    return $result;
}
/**
* Reads a line from a file pointer and parses it for CSV fields. This function is not affected by the OS-locale settings.
* @param resource $handle The file pointer, it must be valid and must point to a file successfully opened by fopen().
* @param int $length (optional) Reading ends when length - 1 bytes have been read, on a newline (which is included in the return value), or on EOF (whichever comes first).
* If no length is specified, it will keep reading from the stream until it reaches the end of the line.
* @param string $delimiter (optional) The field delimiter, one character only. The default delimiter character is comma (,).
* @param string $enclosure (optional) The field enclosure, one character only. The default enclosure character is quote (").
* @param string $escape (optional) The escape character, one character only. The default escape character is backslash (\).
* @return array Returns an array containing the fields read.
* Note: For this function to work correctly with UTF-8, the restriction on the parameters $delimiter, $enclosure and $escape
* must be respected: these parameters should be single ASCII characters only.
* @link http://php.net/manual/en/function.fgetcsv.php
*/
static function api_fgetcsv($handle, $length = null, $delimiter = ',', $enclosure = '"', $escape = '\\') {
    // Read one line, passing the length limit only when the caller supplied one.
    if (is_null($length)) {
        $line = fgets($handle);
    } else {
        $line = fgets($handle, $length);
    }

    // Nothing could be read.
    if ($line === false) {
        return false;
    }

    // Strip the trailing line terminator before parsing the fields.
    $line = rtrim($line, "\r\n");

    return self::api_str_getcsv($line, $delimiter, $enclosure, $escape);
}
/* Functions for supporting ASCIIMathML mathematical formulas and ASCIIsvg mathematical graphics */
/**
* Detects ASCIIMathML formula presence within a given html text.
* @param string $html The input html text.
* @return bool Returns TRUE when there is a formula found or FALSE otherwise.
*/
static function api_contains_asciimathml($html) {
    // Collect the value of every class attribute that the pattern finds.
    // NOTE(review): the pattern begins with "]*", which looks like the tail of
    // a lost "<tag[^>]*" prefix; as written it accepts a class attribute
    // anywhere in the text. Confirm the intended tag name.
    $found = array();
    if (!preg_match_all('/]*class\s*=\s*[\'"](.*?)[\'"][^>]*>/mi', $html, $found)) {
        return false;
    }

    foreach ($found[1] as $class_attribute) {
        // Commas count as separators too; the padding lets a class name at
        // either end of the list match the whitespace-delimited test below.
        $padded = ' ' . str_replace(',', ' ', $class_attribute) . ' ';
        if (preg_match('/\sAM\s/m', $padded)) {
            return true;
        }
    }

    return false;
}
/**
* Detects ASCIIsvg graphics presence within a given html text.
* @param string $html The input html text.
* @return bool Returns TRUE when there is a graph found or FALSE otherwise.
*/
static function api_contains_asciisvg($html) {
if (!preg_match_all('/