123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- <?php
- /**
- * Utf8 encoding class. Provides utility function to deal with UTF8 encoding.
- *
- * @license see /license.txt
- * @author Laurent Opprecht <laurent@opprecht.info> for the University of Geneva
- * @author More authors, mentioned in the corresponding fragments of this source.
- */
class Utf8 extends Encoding
{
    /**
     * PCRE pattern matching any character that is neither printable
     * (POSIX [:print:], i.e. the visible characters and the space) nor a hyphen.
     */
    const PATTERN_NOT_VISIBLE_CHARS = '/[^[:print:]-]/';

    /**
     * The UTF-8 byte order mark (three bytes).
     *
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     */
    const BOM = "\xEF\xBB\xBF";

    /**
     * Name under which this encoding is registered with the parent class.
     */
    const NAME = 'UTF-8';

    /**
     * Returns the shared Utf8 object, creating it on the first call.
     *
     * @return Utf8
     */
    public static function instance()
    {
        static $result = null;
        if ($result === null) {
            $result = new self();
        }
        return $result;
    }

    /**
     * Returns true if encoding is UTF8.
     *
     * The comparison is made on the string form of $encoding and ignores case.
     *
     * @param string|Encoding $encoding
     * @return bool
     */
    public function is($encoding)
    {
        return strcasecmp((string) $encoding, self::NAME) === 0;
    }

    /**
     * Protected: instances are obtained through instance().
     */
    protected function __construct()
    {
        parent::__construct(self::NAME);
    }

    /**
     * Returns the name of the encoding.
     *
     * @return string
     */
    public function name()
    {
        return self::NAME;
    }

    /**
     * Returns the byte order mark of the encoding.
     *
     * @return string
     */
    public function bom()
    {
        return self::BOM;
    }

    /**
     * Returns the hexadecimal representation of an utf8 string. Useful to understand
     * what is going on - not printable chars, rare patterns such as e' for é, etc.
     *
     * Each character is written as the lower-case hexadecimal dump of its bytes,
     * followed by one space (the last character included). For instance "é" gives "c3a9 ".
     *
     * The encoding is passed explicitly to the mbstring functions, so the
     * process-wide mb_internal_encoding() setting is left untouched.
     *
     * @param string $text
     * @return string
     */
    public function to_hex($text)
    {
        $result = '';
        $count = mb_strlen($text, self::NAME);
        for ($i = 0; $i < $count; $i++) {
            $result .= bin2hex(mb_substr($text, $i, 1, self::NAME)) . ' ';
        }
        return $result;
    }

    /**
     * Trim the BOM from an utf-8 string
     *
     * Only a BOM located at the very beginning of the text is removed. A text
     * that does not start with the BOM is returned unchanged.
     *
     * @param string $text
     * @return string
     */
    public function trim($text)
    {
        $bom = self::BOM;
        if (strlen($text) < strlen($bom)) {
            return $text;
        }
        if (substr($text, 0, 3) !== $bom) {
            return $text;
        }
        // The cast guarantees an empty string when the text is nothing but the
        // BOM: substr() returned false in that case before PHP 7.0.
        return (string) substr($text, 3);
    }

    /**
     * Checks a string for UTF-8 validity.
     *
     * The string is accepted only if it is made of well-formed UTF-8 sequences
     * as defined by RFC 3629 / the Unicode standard:
     *
     *   00..7F                                    one byte (ASCII)
     *   C2..DF  80..BF                            two bytes
     *   E0      A0..BF  80..BF                    three bytes, no overlong form
     *   E1..EC  80..BF  80..BF
     *   ED      80..9F  80..BF                    no UTF-16 surrogates
     *   EE..EF  80..BF  80..BF
     *   F0      90..BF  80..BF  80..BF            four bytes, no overlong form
     *   F1..F3  80..BF  80..BF  80..BF
     *   F4      80..8F  80..BF  80..BF            nothing above U+10FFFF
     *
     * Anything else is rejected: stray continuation bytes, truncated sequences,
     * overlong forms (C0, C1), lead bytes F5..FF and therefore the obsolete
     * 5- and 6-byte forms.
     *
     * mb_detect_encoding() is deliberately not used: it has been observed to
     * report a single-byte cyrillic letter as UTF-8 (Ivan Tcholakov, 05-OCT-2008).
     *
     * The argument is taken by reference for historical reasons; it is never modified.
     *
     * @param string $string The string to be tested.
     * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
     * @link http://en.wikipedia.org/wiki/UTF-8
     * @author see internationalization.lib.php
     */
    public static function is_valid(&$string)
    {
        $str = (string) $string;
        // api_byte_count() is defined outside this class; it is used here as the
        // length of $str in bytes (presumably to stay byte-accurate when
        // strlen() is overloaded by mbstring - to be confirmed).
        $len = api_byte_count($str);
        $i = 0;
        while ($i < $len) {
            // Here the current character begins. Its size and the range allowed
            // for its second byte are determined by the first byte.
            $lead = ord($str[$i++]);
            if ($lead < 0x80) {
                // 0xxxxxxx: a single-byte character.
                continue;
            }
            if ($lead >= 0xC2 && $lead <= 0xDF) {
                // 110xxxxx 10xxxxxx (C0 and C1 would only give overlong forms).
                $trailing = 1;
                $min = 0x80;
                $max = 0xBF;
            } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                $trailing = 2;
                $min = ($lead === 0xE0) ? 0xA0 : 0x80; // E0 80..9F is overlong.
                $max = ($lead === 0xED) ? 0x9F : 0xBF; // ED A0..BF are surrogates.
            } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $trailing = 3;
                $min = ($lead === 0xF0) ? 0x90 : 0x80; // F0 80..8F is overlong.
                $max = ($lead === 0xF4) ? 0x8F : 0xBF; // F4 90..BF is above U+10FFFF.
            } else {
                // A stray continuation byte (80..BF) or a byte that can never
                // start a character (C0, C1, F5..FF).
                return false;
            }
            if ($i + $trailing > $len) {
                return false; // Here the string ends unexpectedly.
            }
            $second = ord($str[$i++]);
            if ($second < $min || $second > $max) {
                return false; // Invalid second byte, invalid string.
            }
            for ($k = 1; $k < $trailing; $k++) {
                if ((ord($str[$i++]) & 0xC0) !== 0x80) {
                    return false; // Not a continuation byte (10xxxxxx).
                }
            }
            // Here the current character is valid, the next one is to be examined.
        }
        return true; // Empty strings are valid too.
    }

    /**
     * Returns a new Utf8Decoder built for the encoding $to.
     *
     * @param string|Encoding|null $to Any false-like value (the default) is
     *                                 replaced by Encoding::system().
     * @return Utf8Decoder
     */
    public function decoder($to = null)
    {
        if (!$to) {
            $to = Encoding::system();
        }
        return new Utf8Decoder($to);
    }

    /**
     * Returns a new Utf8Encoder built for the encoding $from.
     *
     * @param string|Encoding|null $from Any false-like value (the default) is
     *                                   replaced by Encoding::system().
     * @return Utf8Encoder
     */
    public function encoder($from = null)
    {
        if (!$from) {
            $from = Encoding::system();
        }
        return new Utf8Encoder($from);
    }
}
|