Iconv.php 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. <?php
  2. /**
  3. * Zend Framework (http://framework.zend.com/)
  4. *
  5. * @link http://github.com/zendframework/zf2 for the canonical source repository
  6. * @copyright Copyright (c) 2005-2014 Zend Technologies USA Inc. (http://www.zend.com)
  7. * @license http://framework.zend.com/license/new-bsd New BSD License
  8. */
  9. namespace Zend\Stdlib\StringWrapper;
  10. use Zend\Stdlib\Exception;
  /**
   * String wrapper that delegates multibyte-aware string operations to PHP's
   * "iconv" extension.
   *
   * The working encoding and the convert encoding are read through
   * getEncoding() / getConvertEncoding(), which this class does not define
   * itself (it calls them on $this; presumably inherited from
   * AbstractStringWrapper).
   */
  class Iconv extends AbstractStringWrapper
  {
      /**
       * List of supported character sets (upper case)
       *
       * Every name is stored in upper case so that a lookup which upper-cases
       * its input can match it.
       *
       * @var string[]
       * @link http://www.gnu.org/software/libiconv/
       */
      protected static $encodings = array(
          // European languages
          'ASCII',
          'ISO-8859-1',
          'ISO-8859-2',
          'ISO-8859-3',
          'ISO-8859-4',
          'ISO-8859-5',
          'ISO-8859-7',
          'ISO-8859-9',
          'ISO-8859-10',
          'ISO-8859-13',
          'ISO-8859-14',
          'ISO-8859-15',
          'ISO-8859-16',
          'KOI8-R',
          'KOI8-U',
          'KOI8-RU',
          'CP1250',
          'CP1251',
          'CP1252',
          'CP1253',
          'CP1254',
          'CP1257',
          'CP850',
          'CP866',
          'CP1131',
          'MACROMAN',
          'MACCENTRALEUROPE',
          'MACICELAND',
          'MACCROATIAN',
          'MACROMANIA',
          'MACCYRILLIC',
          'MACUKRAINE',
          'MACGREEK',
          'MACTURKISH',
          'MACINTOSH',

          // Semitic languages
          'ISO-8859-6',
          'ISO-8859-8',
          'CP1255',
          'CP1256',
          'CP862',
          'MACHEBREW',
          'MACARABIC',

          // Japanese
          'EUC-JP',
          'SHIFT_JIS',
          'CP932',
          'ISO-2022-JP',
          'ISO-2022-JP-2',
          'ISO-2022-JP-1',

          // Chinese
          'EUC-CN',
          'HZ',
          'GBK',
          'CP936',
          'GB18030',
          'EUC-TW',
          'BIG5',
          'CP950',
          'BIG5-HKSCS',
          'BIG5-HKSCS:2004',
          'BIG5-HKSCS:2001',
          'BIG5-HKSCS:1999',
          'ISO-2022-CN',
          'ISO-2022-CN-EXT',

          // Korean
          'EUC-KR',
          'CP949',
          'ISO-2022-KR',
          'JOHAB',

          // Armenian
          'ARMSCII-8',

          // Georgian
          'GEORGIAN-ACADEMY',
          'GEORGIAN-PS',

          // Tajik
          'KOI8-T',

          // Kazakh
          'PT154',
          'RK1048',

          // Thai
          'ISO-8859-11',
          'TIS-620',
          'CP874',
          'MACTHAI',

          // Laotian
          'MULELAO-1',
          'CP1133',

          // Vietnamese
          'VISCII',
          'TCVN',
          'CP1258',

          // Platform specifics
          'HP-ROMAN8',
          'NEXTSTEP',

          // Full Unicode
          'UTF-8',
          'UCS-2',
          'UCS-2BE',
          'UCS-2LE',
          'UCS-4',
          'UCS-4BE',
          'UCS-4LE',
          'UTF-16',
          'UTF-16BE',
          'UTF-16LE',
          'UTF-32',
          'UTF-32BE',
          'UTF-32LE',
          'UTF-7',
          'C99',
          'JAVA',

          /* Commented out because these are internal encodings that do not exist in the real world
          // Full Unicode, in terms of uint16_t or uint32_t (with machine dependent endianness and alignment)
          'UCS-2-INTERNAL',
          'UCS-4-INTERNAL',

          // Locale dependent, in terms of `char' or `wchar_t' (with machine dependent endianness and alignment,
          // and with OS and locale dependent semantics)
          'char',
          'wchar_t',
          '', // The empty encoding name is equivalent to "char": it denotes the locale dependent character encoding.
          */

          // When configured with the option --enable-extra-encodings,
          // it also provides support for a few extra encodings:

          // European languages
          'CP437',
          'CP737',
          'CP775',
          'CP852',
          'CP853',
          'CP855',
          'CP857',
          'CP858',
          'CP860',
          'CP861',
          'CP863',
          'CP865',
          'CP869',
          'CP1125',

          // Semitic languages
          'CP864',

          // Japanese
          'EUC-JISX0213',
          'SHIFT_JISX0213', // upper-cased to follow the "upper case" rule of this list
          'ISO-2022-JP-3',

          // Chinese
          'BIG5-2003', // (experimental)

          // Turkmen
          'TDS565',

          // Platform specifics
          'ATARIST',
          'RISCOS-LATIN1',
      );

      /**
       * Get a list of supported character encodings
       *
       * @return string[] The encoding names declared in static::$encodings
       */
      public static function getSupportedEncodings()
      {
          return static::$encodings;
      }

      /**
       * Constructor
       *
       * Refuses to instantiate the wrapper when the "iconv" extension is missing,
       * because every method below calls an iconv function.
       *
       * @throws Exception\ExtensionNotLoadedException
       */
      public function __construct()
      {
          if (extension_loaded('iconv')) {
              return;
          }

          throw new Exception\ExtensionNotLoadedException(
              'PHP extension "iconv" is required for this wrapper'
          );
      }

      /**
       * Returns the length of the given string
       *
       * The length is counted by iconv_strlen() using the current encoding.
       *
       * @param string $str
       * @return int|false
       */
      public function strlen($str)
      {
          $encoding = $this->getEncoding();

          return iconv_strlen($str, $encoding);
      }

      /**
       * Returns the portion of string specified by the start and length parameters
       *
       * @param string   $str
       * @param int      $offset
       * @param int|null $length NULL means "up to the end of the string"
       * @return string|false
       */
      public function substr($str, $offset = 0, $length = null)
      {
          $encoding = $this->getEncoding();

          if ($length === null) {
              // iconv_substr() only accepts NULL as "rest of the string" since
              // PHP 8.0; older versions coerce NULL to 0. Passing the full
              // character length is equivalent because iconv_substr() stops at
              // the end of the string.
              $length = iconv_strlen($str, $encoding);
              if ($length === false) {
                  // The string could not be measured in this encoding.
                  return false;
              }
          }

          return iconv_substr($str, $offset, $length, $encoding);
      }

      /**
       * Find the position of the first occurrence of a substring in a string
       *
       * The search is performed by iconv_strpos() using the current encoding.
       *
       * @param string $haystack
       * @param string $needle
       * @param int    $offset
       * @return int|false
       */
      public function strpos($haystack, $needle, $offset = 0)
      {
          $encoding = $this->getEncoding();

          return iconv_strpos($haystack, $needle, $offset, $encoding);
      }

      /**
       * Convert a string from defined encoding to the defined convert encoding
       *
       * With $reverse set, the direction is swapped: the string is converted
       * from the convert encoding back to the defined encoding. When both
       * encodings are identical the string is returned untouched.
       *
       * @param string $str
       * @param bool   $reverse
       * @return string|false
       * @throws Exception\LogicException If no convert encoding is defined
       */
      public function convert($str, $reverse = false)
      {
          $encoding        = $this->getEncoding();
          $convertEncoding = $this->getConvertEncoding();

          if ($convertEncoding === null) {
              throw new Exception\LogicException(
                  'No convert encoding defined'
              );
          }

          if ($encoding === $convertEncoding) {
              return $str;
          }

          if ($reverse) {
              $fromEncoding = $convertEncoding;
              $toEncoding   = $encoding;
          } else {
              $fromEncoding = $encoding;
              $toEncoding   = $convertEncoding;
          }

          // automatically add "//IGNORE" to not stop converting on invalid characters
          // invalid characters triggers a notice anyway
          return iconv($fromEncoding, $toEncoding . '//IGNORE', $str);
      }
  }