EncoderTest.php 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. <?php
  2. class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
  3. {
  4. protected $_entity_lookup;
  5. function setUp() {
  6. $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
  7. parent::setUp();
  8. }
  9. function assertCleanUTF8($string, $expect = null) {
  10. if ($expect === null) $expect = $string;
  11. $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
  12. $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
  13. }
  14. function test_cleanUTF8() {
  15. $this->assertCleanUTF8('Normal string.');
  16. $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
  17. $this->assertCleanUTF8("null byte: \0", 'null byte: ');
  18. $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
  19. $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
  20. $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
  21. $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
  22. $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
  23. // invalid codepoints
  24. $this->assertCleanUTF8("\xED\xB0\x80", '');
  25. }
  26. function test_convertToUTF8_noConvert() {
  27. // UTF-8 means that we don't touch it
  28. $this->assertIdentical(
  29. HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
  30. "\xF6", // this is invalid
  31. 'Expected identical [Binary: F6]'
  32. );
  33. }
  34. function test_convertToUTF8_spuriousEncoding() {
  35. if (!HTMLPurifier_Encoder::iconvAvailable()) return;
  36. $this->config->set('Core.Encoding', 'utf99');
  37. $this->expectError('Invalid encoding utf99');
  38. $this->assertIdentical(
  39. HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
  40. ''
  41. );
  42. }
  43. function test_convertToUTF8_iso8859_1() {
  44. $this->config->set('Core.Encoding', 'ISO-8859-1');
  45. $this->assertIdentical(
  46. HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
  47. "\xC3\xB6"
  48. );
  49. }
  50. function test_convertToUTF8_withoutIconv() {
  51. $this->config->set('Core.Encoding', 'ISO-8859-1');
  52. $this->config->set('Test.ForceNoIconv', true);
  53. $this->assertIdentical(
  54. HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
  55. "\xC3\xB6"
  56. );
  57. }
  58. function getZhongWen() {
  59. return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
  60. }
  61. function test_convertFromUTF8_utf8() {
  62. // UTF-8 means that we don't touch it
  63. $this->assertIdentical(
  64. HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
  65. "\xC3\xB6"
  66. );
  67. }
  68. function test_convertFromUTF8_iso8859_1() {
  69. $this->config->set('Core.Encoding', 'ISO-8859-1');
  70. $this->assertIdentical(
  71. HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
  72. "\xF6",
  73. 'Expected identical [Binary: F6]'
  74. );
  75. }
  76. function test_convertFromUTF8_iconvNoChars() {
  77. if (!HTMLPurifier_Encoder::iconvAvailable()) return;
  78. $this->config->set('Core.Encoding', 'ISO-8859-1');
  79. $this->assertIdentical(
  80. HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
  81. " (Chinese)"
  82. );
  83. }
  84. function test_convertFromUTF8_phpNormal() {
  85. // Plain PHP implementation has slightly different behavior
  86. $this->config->set('Core.Encoding', 'ISO-8859-1');
  87. $this->config->set('Test.ForceNoIconv', true);
  88. $this->assertIdentical(
  89. HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
  90. "\xF6",
  91. 'Expected identical [Binary: F6]'
  92. );
  93. }
  94. function test_convertFromUTF8_phpNoChars() {
  95. $this->config->set('Core.Encoding', 'ISO-8859-1');
  96. $this->config->set('Test.ForceNoIconv', true);
  97. $this->assertIdentical(
  98. HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
  99. "?? (Chinese)"
  100. );
  101. }
  102. function test_convertFromUTF8_withProtection() {
  103. // Preserve the characters!
  104. $this->config->set('Core.Encoding', 'ISO-8859-1');
  105. $this->config->set('Core.EscapeNonASCIICharacters', true);
  106. $this->assertIdentical(
  107. HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
  108. "&#20013;&#25991; (Chinese)"
  109. );
  110. }
  111. function test_convertFromUTF8_withProtectionButUtf8() {
  112. // Preserve the characters!
  113. $this->config->set('Core.EscapeNonASCIICharacters', true);
  114. $this->assertIdentical(
  115. HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
  116. "&#20013;&#25991; (Chinese)"
  117. );
  118. }
  119. function test_convertToASCIIDumbLossless() {
  120. // Uppercase thorn letter
  121. $this->assertIdentical(
  122. HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
  123. "&#222;orn"
  124. );
  125. $this->assertIdentical(
  126. HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
  127. "an"
  128. );
  129. // test up to four bytes
  130. $this->assertIdentical(
  131. HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
  132. "&#917536;"
  133. );
  134. }
  135. function assertASCIISupportCheck($enc, $ret) {
  136. $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
  137. if ($test === false) return;
  138. $this->assertIdentical(
  139. HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
  140. $ret
  141. );
  142. $this->assertIdentical(
  143. HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
  144. $ret
  145. );
  146. }
  147. function test_testEncodingSupportsASCII() {
  148. if (HTMLPurifier_Encoder::iconvAvailable()) {
  149. $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
  150. $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
  151. }
  152. $this->assertASCIISupportCheck('ISO-8859-1', array());
  153. $this->assertASCIISupportCheck('dontexist', array()); // canary
  154. }
  155. function testShiftJIS() {
  156. if (!HTMLPurifier_Encoder::iconvAvailable()) return;
  157. $this->config->set('Core.Encoding', 'Shift_JIS');
  158. // This actually looks like a Yen, but we're going to treat it differently
  159. $this->assertIdentical(
  160. HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
  161. '\\~'
  162. );
  163. $this->assertIdentical(
  164. HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
  165. '\\~'
  166. );
  167. }
  168. function testIconvTruncateBug() {
  169. if (!HTMLPurifier_Encoder::iconvAvailable()) return;
  170. if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) return;
  171. $this->config->set('Core.Encoding', 'ISO-8859-1');
  172. $this->assertIdentical(
  173. HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . str_repeat('a', 10000), $this->config, $this->context),
  174. str_repeat('a', 10000)
  175. );
  176. }
  177. function testIconvChunking() {
  178. if (!HTMLPurifier_Encoder::iconvAvailable()) return;
  179. if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) return;
  180. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
  181. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
  182. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
  183. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
  184. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
  185. $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
  186. }
  187. }
  188. // vim: et sw=4 sts=4