utf8.class.php 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. <?php
  2. /**
  3. * Utf8 encoding class. Provides utility functions for dealing with the UTF-8 encoding.
  4. *
  5. * @license see /license.txt
  6. * @author Laurent Opprecht <laurent@opprecht.info> for the University of Geneva
  7. * @author More authors, mentioned in the corresponding fragments of this source.
  8. */
  9. class Utf8 extends Encoding
  10. {
  11. const PATTERN_NOT_VISIBLE_CHARS = '/[^[:print:]-]/'; //Visible characters and the space character
  12. /**
  13. * @see http://en.wikipedia.org/wiki/Byte_order_mark
  14. */
  15. const BOM = "\xEF\xBB\xBF";
  16. const NAME = 'UTF-8';
  17. /**
  18. *
  19. * @return Utf8
  20. */
  21. public static function instance()
  22. {
  23. static $result = null;
  24. if (empty($result)) {
  25. $result = new self();
  26. }
  27. return $result;
  28. }
  29. /**
  30. * Returns true if encoding is UTF8.
  31. *
  32. * @param string|Encoding $encoding
  33. * @return bool
  34. */
  35. function is($encoding)
  36. {
  37. $encoding = (string) $encoding;
  38. return strtolower($encoding) == strtolower(self::NAME);
  39. }
  40. protected function __construct()
  41. {
  42. parent::__construct(self::NAME);
  43. }
  44. function name()
  45. {
  46. return self::NAME;
  47. }
  48. function bom()
  49. {
  50. return self::BOM;
  51. }
  52. /**
  53. * Returns the hexa decimal representation of an utf8 string. Usefull to understand
  54. * what is going on - not printable chars, rare patterns such as e' for é, etc.
  55. *
  56. * @param type $text
  57. * @return string
  58. */
  59. function to_hex($text)
  60. {
  61. $result = '';
  62. mb_internal_encoding('utf-8');
  63. for ($i = 0, $n = mb_strlen($text); $i < $n; $i++) {
  64. $char = mb_substr($text, $i, 1);
  65. $num = strlen($char);
  66. for ($j = 0; $j < $num; $j++) {
  67. $result .= sprintf('%02x', ord($char[$j]));
  68. }
  69. $result .= ' ';
  70. }
  71. return $result;
  72. }
  73. /**
  74. * Trim the BOM from an utf-8 string
  75. *
  76. * @param string $text
  77. * @return string
  78. */
  79. function trim($text)
  80. {
  81. $bom = self::BOM;
  82. if (strlen($text) < strlen($bom)) {
  83. return $text;
  84. }
  85. if (substr($text, 0, 3) == $bom) {
  86. return substr($text, 3);
  87. }
  88. return $text;
  89. }
  90. /**
  91. * Checks a string for UTF-8 validity.
  92. *
  93. * @param string $string The string to be tested.
  94. * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE othewise.
  95. * @link http://en.wikipedia.org/wiki/UTF-8
  96. * @author see internationalization.lib.php
  97. */
  98. static function is_valid(&$string)
  99. {
  100. //return @mb_detect_encoding($string, 'UTF-8', true) == 'UTF-8' ? true : false;
  101. // Ivan Tcholakov, 05-OCT-2008: I do not trust mb_detect_encoding(). I have
  102. // found a string with a single cyrillic letter (single byte), that is
  103. // wrongly detected as UTF-8. Possibly, there would be problems with other
  104. // languages too. An alternative implementation will be used.
  105. $str = (string) $string;
  106. $len = api_byte_count($str);
  107. $i = 0;
  108. while ($i < $len) {
  109. $byte1 = ord($str[$i++]); // Here the current character begins. Its size is
  110. // determined by the senior bits in the first byte.
  111. if (($byte1 & 0x80) == 0x00) { // 0xxxxxxx
  112. // &
  113. // 10000000
  114. // --------
  115. // 00000000
  116. // This is s valid character and it contains a single byte.
  117. } elseif (($byte1 & 0xE0) == 0xC0) { // 110xxxxx 10xxxxxx
  118. // & &
  119. // 11100000 11000000
  120. // -------- --------
  121. // 11000000 10000000
  122. // The character contains two bytes.
  123. if ($i == $len) {
  124. return false; // Here the string ends unexpectedly.
  125. }
  126. if (!((ord($str[$i++]) & 0xC0) == 0x80))
  127. return false; // Invalid second byte, invalid string.
  128. }
  129. elseif (($byte1 & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
  130. // & & &
  131. // 11110000 11000000 11000000
  132. // -------- -------- --------
  133. // 11100000 10000000 10000000
  134. // This is a character of three bytes.
  135. if ($i == $len) {
  136. return false; // Unexpected end of the string.
  137. }
  138. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  139. return false; // Invalid second byte.
  140. }
  141. if ($i == $len) {
  142. return false; // Unexpected end of the string.
  143. }
  144. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  145. return false; // Invalid third byte, invalid string.
  146. }
  147. } elseif (($byte1 & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  148. // & & & &
  149. // 11111000 11000000 11000000 11000000
  150. // -------- -------- -------- --------
  151. // 11110000 10000000 10000000 10000000
  152. // This is a character of four bytes.
  153. if ($i == $len) {
  154. return false;
  155. }
  156. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  157. return false;
  158. }
  159. if ($i == $len) {
  160. return false;
  161. }
  162. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  163. return false;
  164. }
  165. if ($i == $len) {
  166. return false;
  167. }
  168. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  169. return false;
  170. }
  171. } elseif (($byte1 & 0xFC) == 0xF8) { // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  172. // & & & & &
  173. // 11111100 11000000 11000000 11000000 11000000
  174. // -------- -------- -------- -------- --------
  175. // 11111000 10000000 10000000 10000000 10000000
  176. // This is a character of five bytes.
  177. if ($i == $len) {
  178. return false;
  179. }
  180. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  181. return false;
  182. }
  183. if ($i == $len) {
  184. return false;
  185. }
  186. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  187. return false;
  188. }
  189. if ($i == $len) {
  190. return false;
  191. }
  192. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  193. return false;
  194. }
  195. if ($i == $len) {
  196. return false;
  197. }
  198. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  199. return false;
  200. }
  201. } elseif (($byte1 & 0xFE) == 0xFC) { // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  202. // & & & & & &
  203. // 11111110 11000000 11000000 11000000 11000000 11000000
  204. // -------- -------- -------- -------- -------- --------
  205. // 11111100 10000000 10000000 10000000 10000000 10000000
  206. // This is a character of six bytes.
  207. if ($i == $len) {
  208. return false;
  209. }
  210. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  211. return false;
  212. }
  213. if ($i == $len) {
  214. return false;
  215. }
  216. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  217. return false;
  218. }
  219. if ($i == $len) {
  220. return false;
  221. }
  222. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  223. return false;
  224. }
  225. if ($i == $len) {
  226. return false;
  227. }
  228. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  229. return false;
  230. }
  231. if ($i == $len) {
  232. return false;
  233. }
  234. if (!((ord($str[$i++]) & 0xC0) == 0x80)) {
  235. return false;
  236. }
  237. } else {
  238. return false; // In any other case the character is invalid.
  239. }
  240. // Here the current character is valid, it
  241. // matches to some of the cases above.
  242. // The next character is to be examinated.
  243. }
  244. return true; // Empty strings are valid too.
  245. }
  246. /**
  247. *
  248. * @param type $to
  249. * @return Utf8Decoder
  250. */
  251. public function decoder($to = null)
  252. {
  253. $to = $to ? $to : Encoding::system();
  254. return new Utf8Decoder($to);
  255. }
  256. /**
  257. *
  258. * @param type $from
  259. * @return Utf8Encoder
  260. */
  261. public function encoder($from = null)
  262. {
  263. $from = $from ? $from : Encoding::system();
  264. return new Utf8Encoder($from);
  265. }
  266. }