]*>/i', "\n", $string);
$string = preg_replace('/<\/?(div|p|h[1-6]|table|ol|ul|blockquote)[^>]*>/i', "\n", $string);
$string = preg_replace('/<\/(tr|li)[^>]*>/i', "\n", $string);
$string = preg_replace('/<\/(td|th)[^>]*>/i', "\t", $string);
$string = strip_tags($string);
// Line endings unification and cleaning.
$string = str_replace(array("\r\n", "\n\r", "\r"), "\n", $string);
$string = preg_replace('/\s*\n/', "\n", $string);
$string = preg_replace('/\n+/', "\n", $string);
return trim($string);
}
/**
 * Detects encoding of html-formatted text.
 *
 * The head section of the document is searched for a meta-tag that declares a charset,
 * either in the classic form (http-equiv="Content-Type" content="text/html; charset=...")
 * or in the short form (charset="..."). When no such declaration is found, the encoding is
 * detected from the plain text of the document.
 *
 * @param string $string The input html-formatted text.
 * @return string Returns the detected encoding.
 */
function api_detect_encoding_html($string)
{
    // Isolate the head section first, so that a "charset=" that occurs in the body is never picked up.
    if (@preg_match('/<head\b.*?<\/head>/si', $string, $head)) {
        // Group 1 is the bare charset value: quotes, whitespace, ";" and the tag end are excluded.
        if (@preg_match('/<meta\b[^>]*\bcharset\s*=\s*["\']?\s*([^"\'\s;\/>]+)/si', $head[0], $charset)) {
            return api_refine_encoding_id(trim($charset[1]));
        }
    }
    // No usable declaration: fall back to detection over the text with the markup removed.
    return api_detect_encoding(api_html_to_text($string));
}
/**
 * Converts the text of a html-document to a given encoding, the meta-tag is changed accordingly.
 *
 * If the head section already declares a charset in a meta-tag, only the declared value is
 * replaced. If the head section declares no charset, a Content-Type meta-tag is inserted right
 * before the closing head tag. A document without a head section gets no declaration; it is
 * only converted.
 *
 * @param string $string The input full-html document. It is passed by reference and gets overwritten with the converted document.
 * @param string $encoding The new encoding value to be set.
 * @return void
 */
function api_set_encoding_html(&$string, $encoding)
{
    // The source encoding has to be detected before the declaration inside the document is modified.
    $old_encoding = api_detect_encoding_html($string);
    if (@preg_match('/<head\b.*?<\/head>/si', $string, $head, PREG_OFFSET_CAPTURE)) {
        $head_html = $head[0][0];
        $head_offset = $head[0][1];
        if (@preg_match('/<meta\b[^>]*\bcharset\s*=\s*["\']?\s*([^"\'\s;\/>]+)/si', $head_html, $charset, PREG_OFFSET_CAPTURE)) {
            // Overwrite the declared value in place; the rest of the meta-tag stays untouched.
            $string = substr_replace($string, $encoding, $head_offset + $charset[1][1], strlen($charset[1][0]));
        } else {
            // Nothing declared yet: add a declaration at the very end of the head section.
            $meta = '<meta http-equiv="Content-Type" content="text/html; charset='.$encoding.'"/>';
            $insert_at = $head_offset + strlen($head_html) - strlen('</head>');
            $string = substr_replace($string, $meta, $insert_at, 0);
        }
    }
    $string = api_convert_encoding($string, $encoding, $old_encoding);
}
/**
 * Returns the title of a html document.
 *
 * The content of the first title element is taken, converted from the input encoding to the
 * output encoding, its html-entities are decoded, and every run of whitespace is collapsed
 * into a single space.
 *
 * @param string $string The contents of the input document.
 * @param string $output_encoding The encoding of the retrieved title. If the value is not set, the system encoding is assumed.
 * @param string $input_encoding The encoding of the input document. If the value is not set, it is detected.
 * @return string The retrieved title, html-entities and extra-whitespace between the words are cleaned. An empty string is returned when no title element is found.
 */
function api_get_title_html(&$string, $output_encoding = null, $input_encoding = null)
{
    // The lazy quantifier stops at the first closing title tag, so a later title element
    // (for example one inside inline SVG) cannot stretch the match.
    if (@preg_match('/<title\b[^>]*>(.*?)<\/title>/msi', $string, $matches)) {
        if (empty($output_encoding)) {
            $output_encoding = api_get_system_encoding();
        }
        if (empty($input_encoding)) {
            $input_encoding = api_detect_encoding_html($string);
        }
        $title = api_convert_encoding($matches[1], $output_encoding, $input_encoding);
        $title = api_html_entity_decode($title, ENT_QUOTES, $output_encoding);
        return trim(@preg_replace('/\s+/', ' ', $title));
    }
    return '';
}
/* XML processing functions */
// A regular expression for accessing declared encoding within xml-formatted text.
// Group 1 captures the quoted value of the encoding attribute of the xml-declaration.
// The "m" modifier is set and the "s" modifier is not, so the "." wildcards do not cross
// line breaks: the declaration has to sit on a single line for the pattern to match.
// Published by Steve Minutillo,
// http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss/
define('_PCRE_XML_ENCODING', '/<\?xml.*encoding=[\'"](.*?)[\'"].*\?>/m');
/**
 * Detects encoding of xml-formatted text.
 *
 * Detection order: the encoding declared in the xml-declaration, then 'UTF-8' when the text
 * passes the UTF-8 validity check, then the supplied default encoding.
 *
 * @param string $string The input xml-formatted text.
 * @param string $default_encoding The encoding to be returned when the encoding of the xml-text cannot be detected. If it is not specified, the value returned by _api_mb_internal_encoding() is used.
 * @return string Returns the detected encoding.
 * @todo The second parameter is to be eliminated. See api_detect_encoding_html().
 */
function api_detect_encoding_xml($string, $default_encoding = null) {
    $declared = array();
    if (preg_match(_PCRE_XML_ENCODING, $string, $declared)) {
        // An explicit declaration always wins.
        return api_refine_encoding_id($declared[1]);
    }
    if (api_is_valid_utf8($string)) {
        return 'UTF-8';
    }
    $fallback = empty($default_encoding) ? _api_mb_internal_encoding() : $default_encoding;
    return api_refine_encoding_id($fallback);
}
/**
 * Converts character encoding of a xml-formatted text. If the encoding is declared inside the text, the declaration is modified accordingly.
 *
 * This is a public wrapper that delegates to _api_convert_encoding_xml().
 *
 * @param string $string The text being converted.
 * @param string $to_encoding The encoding that text is being converted to.
 * @param string $from_encoding (optional) The encoding that text is being converted from. If it is omitted, detection is attempted.
 * @return string Returns the converted xml-text.
 */
function api_convert_encoding_xml($string, $to_encoding, $from_encoding = null) {
    $converted = _api_convert_encoding_xml($string, $to_encoding, $from_encoding);
    return $converted;
}
/**
 * Converts character encoding of a xml-formatted text into UTF-8. If the encoding is declared inside the text, the declaration is set to UTF-8.
 *
 * This is a public wrapper that delegates to _api_convert_encoding_xml() with 'UTF-8' as the target.
 *
 * @param string $string The text being converted.
 * @param string $from_encoding (optional) The encoding that text is being converted from. If it is omitted, detection is attempted.
 * @return string Returns the converted xml-text.
 */
function api_utf8_encode_xml($string, $from_encoding = null) {
    $target_encoding = 'UTF-8';
    return _api_convert_encoding_xml($string, $target_encoding, $from_encoding);
}
/**
 * Converts character encoding of a xml-formatted text from UTF-8 into a specified encoding. If the encoding is declared inside the text, the declaration is modified accordingly.
 *
 * This is a public wrapper that delegates to _api_convert_encoding_xml() with 'UTF-8' as the source.
 *
 * @param string $string The text being converted.
 * @param string $to_encoding (optional) The encoding that text is being converted to. If it is omitted, 'UTF-8' is used.
 * @return string Returns the converted xml-text.
 */
function api_utf8_decode_xml($string, $to_encoding = 'UTF-8') {
    $source_encoding = 'UTF-8';
    return _api_convert_encoding_xml($string, $to_encoding, $source_encoding);
}
/**
 * Converts character encoding of a xml-formatted text. If the encoding is declared inside the text, the declaration is modified accordingly.
 *
 * Three cases are handled:
 * - the text has no xml-declaration: a declaration naming the target encoding is prepended;
 * - the text has a xml-declaration without an encoding option: the option is added to it;
 * - the text has a xml-declaration with an encoding option: the declared value is replaced.
 *
 * @param string $string The text being converted. It is passed by reference, but it is not modified here.
 * @param string $to_encoding The encoding that text is being converted to.
 * @param string $from_encoding (optional) The encoding that text is being converted from. If the value is empty, detection is attempted.
 * @return string Returns the converted xml-text.
 */
function _api_convert_encoding_xml(&$string, $to_encoding, $from_encoding) {
    if (empty($from_encoding)) {
        $from_encoding = api_detect_encoding_xml($string);
    }
    $to_encoding = api_refine_encoding_id($to_encoding);
    if (!preg_match('/<\?xml.*\?>/m', $string, $matches)) {
        // No declaration at all: prepend one, so that the result states its own encoding.
        return api_convert_encoding('<?xml version="1.0" encoding="'.$to_encoding.'"?>'."\n".$string, $to_encoding, $from_encoding);
    }
    if (!preg_match(_PCRE_XML_ENCODING, $string)) {
        if (strpos($matches[0], 'standalone') !== false) {
            // The encoding option should precede the standalone option, otherwise DOMDocument fails to load the document.
            $replace = str_replace('standalone', ' encoding="'.$to_encoding.'" standalone', $matches[0]);
        } else {
            $replace = str_replace('?>', ' encoding="'.$to_encoding.'"?>', $matches[0]);
        }
        return api_convert_encoding(str_replace($matches[0], $replace, $string), $to_encoding, $from_encoding);
    }
    // The callback has no other way to receive the target encoding than through this global variable.
    global $_api_encoding;
    $_api_encoding = api_refine_encoding_id($to_encoding);
    return api_convert_encoding(preg_replace_callback(_PCRE_XML_ENCODING, '_api_convert_encoding_xml_callback', $string), $to_encoding, $from_encoding);
}
/**
 * A callback for serving the function _api_convert_encoding_xml().
 *
 * The new encoding is read from the global variable $_api_encoding. Every occurrence of the
 * previously declared encoding value inside the matched declaration is replaced by it.
 *
 * @param array $matches Input array of matches corresponding to the xml-declaration: index 0 is the whole declaration, index 1 is the declared encoding value.
 * @return string Returns the xml-declaration with modified encoding.
 */
function _api_convert_encoding_xml_callback($matches) {
    global $_api_encoding;
    $declaration = $matches[0];
    $declared_encoding = $matches[1];
    return str_replace($declared_encoding, $_api_encoding, $declaration);
}
/* Functions for supporting ASCIIMathML mathematical formulas and ASCIIsvg mathematical graphics */
/**
 * Detects ASCIIMathML formula presence within a given html text.
 *
 * Every class attribute value found in the text is inspected. Commas inside the value are
 * treated as separators, and a formula is reported when one of the resulting class names is
 * exactly "AM" (case-sensitive).
 *
 * NOTE(review): the pattern begins with "]*", which looks like the remnant of a tag-name
 * prefix. As written it accepts a class attribute on any element - confirm the intended tag.
 *
 * @param string $html The input html text.
 * @return bool Returns TRUE when there is a formula found or FALSE otherwise.
 */
function api_contains_asciimathml($html) {
    $found = array();
    $hits = preg_match_all('/]*class\s*=\s*[\'"](.*?)[\'"][^>]*>/mi', $html, $found);
    if (!$hits) {
        return false;
    }
    foreach ($found[1] as $class_value) {
        // Padding with spaces lets one pattern cover a class name at the start, in the middle or at the end.
        $padded = ' '.str_replace(',', ' ', $class_value).' ';
        if (preg_match('/\sAM\s/m', $padded)) {
            return true;
        }
    }
    return false;
}
/**
* Dectects ASCIIsvg graphics presence within a given html text.
* @param string $html The input html text.
* @return bool Returns TRUE when there is a graph found or FALSE otherwise.
*/
function api_contains_asciisvg($html)
{
if (!preg_match_all('/