LexerTest.php 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  1. <?php
  /**
   * Tests for HTMLPurifier_Lexer and its implementations.
   *
   * Covers, in order: the HTMLPurifier_Lexer::create() factory, the
   * parseData() and extractBody() helpers of the base lexer, and
   * tokenizeHTML() as implemented by DirectLex, DOMLex and PH5P.
   *
   * $this->config and $this->context are used throughout but are not set up
   * here; presumably the HTMLPurifier_Harness parent provides them -- confirm
   * against the harness.
   */
  class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
  {

      /**
       * Not read or written anywhere in this class.
       * NOTE(review): looks vestigial, kept in case the harness or a subclass
       * refers to it -- confirm before removing.
       */
      protected $_has_pear = false;

      /**
       * Loads the optional PH5P lexer when the test configuration asks for it.
       */
      public function __construct()
      {
          parent::__construct();
          // !empty() instead of a bare index so that a harness which never
          // sets the flag does not trigger an undefined-index notice.
          if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
              require_once 'HTMLPurifier/Lexer/PH5P.php';
          }
      }

      // HTMLPurifier_Lexer::create() --------------------------------------------

      /** With line-number tracking requested, create() hands back DirectLex. */
      public function test_create()
      {
          $this->config->set('Core.MaintainLineNumbers', true);
          $lexer = HTMLPurifier_Lexer::create($this->config);
          $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
      }

      /** Core.LexerImpl may be given a lexer object instead of a name. */
      public function test_create_objectLexerImpl()
      {
          $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
          $lexer = HTMLPurifier_Lexer::create($this->config);
          $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
      }

      /** An unrecognized lexer name is rejected with an exception. */
      public function test_create_unknownLexer()
      {
          $this->config->set('Core.LexerImpl', 'AsdfAsdf');
          // The expectation has to be registered before the call that throws.
          $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
          HTMLPurifier_Lexer::create($this->config);
      }

      /** DOMLex combined with Core.MaintainLineNumbers is rejected. */
      public function test_create_incompatibleLexer()
      {
          $this->config->set('Core.LexerImpl', 'DOMLex');
          $this->config->set('Core.MaintainLineNumbers', true);
          $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
          HTMLPurifier_Lexer::create($this->config);
      }

      // HTMLPurifier_Lexer->parseData() -----------------------------------------

      /**
       * Asserts that parseData() turns $input into $expect.
       *
       * @param string      $input  Raw character data handed to parseData().
       * @param string|true $expect Expected result; true (the default) means
       *                            "identical to $input".
       */
      public function assertParseData($input, $expect = true)
      {
          if ($expect === true) {
              $expect = $input;
          }
          $lexer = new HTMLPurifier_Lexer();
          $this->assertIdentical($expect, $lexer->parseData($input));
      }

      public function test_parseData_plainText()
      {
          $this->assertParseData('asdf');
      }

      public function test_parseData_ampersandEntity()
      {
          $this->assertParseData('&amp;', '&');
      }

      public function test_parseData_quotEntity()
      {
          $this->assertParseData('&quot;', '"');
      }

      public function test_parseData_aposNumericEntity()
      {
          $this->assertParseData('&#039;', "'");
      }

      public function test_parseData_aposCompactNumericEntity()
      {
          $this->assertParseData('&#39;', "'");
      }

      public function test_parseData_adjacentAmpersandEntities()
      {
          $this->assertParseData('&amp;&amp;&amp;', '&&&');
      }

      public function test_parseData_trailingUnescapedAmpersand()
      {
          $this->assertParseData('&amp;&', '&&');
      }

      public function test_parseData_internalUnescapedAmpersand()
      {
          $this->assertParseData('Procter & Gamble');
      }

      /** This test expects the hex entity to come back unchanged. */
      public function test_parseData_improperEntityFaultToleranceTest()
      {
          $this->assertParseData('&#x2D;');
      }

      // HTMLPurifier_Lexer->extractBody() ---------------------------------------

      /**
       * Asserts that extractBody() reduces $text to $extract.
       *
       * @param string      $text    Document text handed to extractBody().
       * @param string|true $extract Expected result; true (the default) means
       *                             "identical to $text".
       */
      public function assertExtractBody($text, $extract = true)
      {
          $lexer = new HTMLPurifier_Lexer();
          $result = $lexer->extractBody($text);
          if ($extract === true) {
              $extract = $text;
          }
          $this->assertIdentical($extract, $result);
      }

      public function test_extractBody_noBodyTags()
      {
          $this->assertExtractBody('<b>Bold</b>');
      }

      public function test_extractBody_lowercaseBodyTags()
      {
          $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
      }

      public function test_extractBody_uppercaseBodyTags()
      {
          $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
      }

      public function test_extractBody_realisticUseCase()
      {
          // The string literals are deliberately flush-left: any leading
          // whitespace on the continuation lines would become part of the data.
          $this->assertExtractBody(
'<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>xyz</title>
</head>
<body>
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
</body>
</html>',
'
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
');
      }

      public function test_extractBody_bodyWithAttributes()
      {
          $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
      }

      public function test_extractBody_preserveUnclosedBody()
      {
          $this->assertExtractBody('<body>asdf'); // not closed, don't accept
      }

      public function test_extractBody_useLastBody()
      {
          $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
      }

      // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------

      /**
       * Tokenizes $input with every available lexer and checks the result.
       *
       * DirectLex always runs. DOMLex runs when the DOMDocument class exists.
       * PH5P runs when DOMDocument exists and the PH5P lexer class is loaded
       * or autoloadable.
       *
       * @param string $input      HTML to tokenize.
       * @param array  $expect     Token list expected from every lexer that has
       *                           no entry in $alt_expect.
       * @param array  $alt_expect Per-lexer overrides keyed by 'DirectLex',
       *                           'DOMLex' or 'PH5P'. An array replaces $expect
       *                           for that lexer; false skips the assertion for
       *                           that lexer (the input is still tokenized).
       */
      public function assertTokenization($input, $expect, $alt_expect = array())
      {
          $lexers = array();
          $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
          if (class_exists('DOMDocument')) {
              $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
              // The constructor only loads PH5P.php when the harness flag is
              // set, so check for the class instead of assuming it is there;
              // class_exists() still autoloads it when that is possible.
              if (class_exists('HTMLPurifier_Lexer_PH5P')) {
                  $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
              }
          }
          foreach ($lexers as $name => $lexer) {
              $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
              $t_expect = $expect;
              if (isset($alt_expect[$name])) {
                  if ($alt_expect[$name] === false) {
                      continue;
                  }
                  $t_expect = $alt_expect[$name];
              }
              $this->assertIdentical($result, $t_expect, "$name: %s");
              // On a (loose) mismatch, dump the actual tokens via the
              // printTokens() helper, which is defined outside this file.
              if ($t_expect != $result) {
                  printTokens($result);
              }
          }
      }

      public function test_tokenizeHTML_emptyInput()
      {
          $this->assertTokenization('', array());
      }

      public function test_tokenizeHTML_plainText()
      {
          $this->assertTokenization(
              'This is regular text.',
              array(
                  new HTMLPurifier_Token_Text('This is regular text.')
              )
          );
      }

      public function test_tokenizeHTML_textAndTags()
      {
          $this->assertTokenization(
              'This is <b>bold</b> text',
              array(
                  new HTMLPurifier_Token_Text('This is '),
                  new HTMLPurifier_Token_Start('b', array()),
                  new HTMLPurifier_Token_Text('bold'),
                  new HTMLPurifier_Token_End('b'),
                  new HTMLPurifier_Token_Text(' text'),
              )
          );
      }

      public function test_tokenizeHTML_normalizeCase()
      {
          $this->assertTokenization(
              '<DIV>Totally rad dude. <b>asdf</b></div>',
              array(
                  new HTMLPurifier_Token_Start('DIV', array()),
                  new HTMLPurifier_Token_Text('Totally rad dude. '),
                  new HTMLPurifier_Token_Start('b', array()),
                  new HTMLPurifier_Token_Text('asdf'),
                  new HTMLPurifier_Token_End('b'),
                  new HTMLPurifier_Token_End('div'),
              )
          );
      }

      public function test_tokenizeHTML_notWellFormed()
      {
          // DOMLex and PH5P are expected to produce the same repaired stream.
          $alt = array(
              new HTMLPurifier_Token_Empty('asdf'),
              new HTMLPurifier_Token_Empty('d'),
              new HTMLPurifier_Token_Start('pooloka'),
              new HTMLPurifier_Token_Start('poolasdf'),
              new HTMLPurifier_Token_Empty('ds'),
              new HTMLPurifier_Token_End('poolasdf'),
              new HTMLPurifier_Token_End('pooloka'),
          );
          $this->assertTokenization(
              '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
              array(
                  new HTMLPurifier_Token_Start('asdf'),
                  new HTMLPurifier_Token_End('asdf'),
                  new HTMLPurifier_Token_Start('d'),
                  new HTMLPurifier_Token_End('d'),
                  new HTMLPurifier_Token_Start('poOloka'),
                  new HTMLPurifier_Token_Start('poolasdf'),
                  new HTMLPurifier_Token_Start('ds'),
                  new HTMLPurifier_Token_End('asdf'),
                  new HTMLPurifier_Token_End('ASDF'),
              ),
              array(
                  'DOMLex' => $alt,
                  'PH5P' => $alt,
              )
          );
      }

      public function test_tokenizeHTML_whitespaceInTag()
      {
          $this->assertTokenization(
              '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
              array(
                  new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
                  new HTMLPurifier_Token_Text('Link to '),
                  new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
                  new HTMLPurifier_Token_Text('foobar'),
                  new HTMLPurifier_Token_End('b'),
                  new HTMLPurifier_Token_End('a'),
              )
          );
      }

      public function test_tokenizeHTML_singleAttribute()
      {
          $this->assertTokenization(
              '<br style="&amp;" />',
              array(
                  new HTMLPurifier_Token_Empty('br', array('style' => '&'))
              )
          );
      }

      public function test_tokenizeHTML_emptyTag()
      {
          $this->assertTokenization(
              '<br />',
              array( new HTMLPurifier_Token_Empty('br') )
          );
      }

      public function test_tokenizeHTML_comment()
      {
          $this->assertTokenization(
              '<!-- Comment -->',
              array( new HTMLPurifier_Token_Comment(' Comment ') )
          );
      }

      public function test_tokenizeHTML_malformedComment()
      {
          $this->assertTokenization(
              '<!-- not so well formed --->',
              array( new HTMLPurifier_Token_Comment(' not so well formed -') )
          );
      }

      public function test_tokenizeHTML_unterminatedTag()
      {
          $this->assertTokenization(
              '<a href=""',
              array( new HTMLPurifier_Token_Text('<a href=""') ),
              array(
                  // I like our behavior better, but it's non-standard
                  'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
                  'PH5P' => false, // total barfing, grabs scaffolding too
              )
          );
      }

      public function test_tokenizeHTML_specialEntities()
      {
          $this->assertTokenization(
              '&lt;b&gt;',
              array(
                  new HTMLPurifier_Token_Text('<b>')
              ),
              array(
                  // some parsers will separate entities out
                  'PH5P' => array(
                      new HTMLPurifier_Token_Text('<'),
                      new HTMLPurifier_Token_Text('b'),
                      new HTMLPurifier_Token_Text('>'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_earlyQuote()
      {
          $this->assertTokenization(
              '<a "=>',
              array( new HTMLPurifier_Token_Empty('a') ),
              array(
                  // we barf on this input
                  'DirectLex' => array(
                      new HTMLPurifier_Token_Start('a', array('"' => ''))
                  ),
                  'PH5P' => false, // behavior varies; handle this personally
              )
          );
      }

      /**
       * PH5P-specific counterpart of test_tokenizeHTML_earlyQuote(): which
       * token is expected depends on whether 'PH5PError' is set in the context.
       */
      public function test_tokenizeHTML_earlyQuote_PH5P()
      {
          // Skip when PH5P cannot run: it is only loaded on request (see the
          // constructor), so its class may be missing even with DOMDocument.
          if (!class_exists('DOMDocument') || !class_exists('HTMLPurifier_Lexer_PH5P')) {
              return;
          }
          $lexer = new HTMLPurifier_Lexer_PH5P();
          $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
          if ($this->context->get('PH5PError', true)) {
              $this->assertIdentical(array(
                  new HTMLPurifier_Token_Start('a', array('"' => ''))
              ), $result);
          } else {
              $this->assertIdentical(array(
                  new HTMLPurifier_Token_Empty('a', array('"' => ''))
              ), $result);
          }
      }

      public function test_tokenizeHTML_unescapedQuote()
      {
          $this->assertTokenization(
              '"',
              array( new HTMLPurifier_Token_Text('"') )
          );
      }

      public function test_tokenizeHTML_escapedQuote()
      {
          $this->assertTokenization(
              '&quot;',
              array( new HTMLPurifier_Token_Text('"') )
          );
      }

      public function test_tokenizeHTML_cdata()
      {
          $this->assertTokenization(
              '<![CDATA[You <b>can&#39;t</b> get me!]]>',
              array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
              array(
                  'PH5P' => array(
                      new HTMLPurifier_Token_Text('You '),
                      new HTMLPurifier_Token_Text('<'),
                      new HTMLPurifier_Token_Text('b'),
                      new HTMLPurifier_Token_Text('>'),
                      new HTMLPurifier_Token_Text('can'),
                      new HTMLPurifier_Token_Text('&'),
                      new HTMLPurifier_Token_Text('#39;t'),
                      new HTMLPurifier_Token_Text('<'),
                      new HTMLPurifier_Token_Text('/b'),
                      new HTMLPurifier_Token_Text('>'),
                      new HTMLPurifier_Token_Text(' get me!'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_characterEntity()
      {
          $this->assertTokenization(
              '&theta;',
              array( new HTMLPurifier_Token_Text("\xCE\xB8") )
          );
      }

      public function test_tokenizeHTML_characterEntityInCDATA()
      {
          $this->assertTokenization(
              '<![CDATA[&rarr;]]>',
              array( new HTMLPurifier_Token_Text("&rarr;") ),
              array(
                  'PH5P' => array(
                      new HTMLPurifier_Token_Text('&'),
                      new HTMLPurifier_Token_Text('rarr;'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_entityInAttribute()
      {
          $this->assertTokenization(
              '<a href="index.php?title=foo&amp;id=bar">Link</a>',
              array(
                  new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
                  new HTMLPurifier_Token_Text('Link'),
                  new HTMLPurifier_Token_End('a'),
              )
          );
      }

      public function test_tokenizeHTML_preserveUTF8()
      {
          $this->assertTokenization(
              "\xCE\xB8",
              array( new HTMLPurifier_Token_Text("\xCE\xB8") )
          );
      }

      public function test_tokenizeHTML_specialEntityInAttribute()
      {
          $this->assertTokenization(
              '<br test="x &lt; 6" />',
              array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
          );
      }

      public function test_tokenizeHTML_emoticonProtection()
      {
          $this->assertTokenization(
              '<b>Whoa! <3 That\'s not good >.></b>',
              array(
                  new HTMLPurifier_Token_Start('b'),
                  new HTMLPurifier_Token_Text('Whoa! '),
                  new HTMLPurifier_Token_Text('<'),
                  new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
                  new HTMLPurifier_Token_End('b')
              ),
              array(
                  // text is absorbed together
                  'DOMLex' => array(
                      new HTMLPurifier_Token_Start('b'),
                      new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
                      new HTMLPurifier_Token_End('b'),
                  ),
                  'PH5P' => array( // interesting grouping
                      new HTMLPurifier_Token_Start('b'),
                      new HTMLPurifier_Token_Text('Whoa! '),
                      new HTMLPurifier_Token_Text('<'),
                      new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
                      new HTMLPurifier_Token_End('b'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_commentWithFunkyChars()
      {
          $this->assertTokenization(
              '<!-- This >< comment --><br />',
              array(
                  new HTMLPurifier_Token_Comment(' This >< comment '),
                  new HTMLPurifier_Token_Empty('br'),
              )
          );
      }

      public function test_tokenizeHTML_unterminatedComment()
      {
          $this->assertTokenization(
              '<!-- This >< comment',
              array( new HTMLPurifier_Token_Comment(' This >< comment') ),
              array(
                  'DOMLex' => false,
                  'PH5P' => false,
              )
          );
      }

      public function test_tokenizeHTML_scriptCDATAContents()
      {
          $this->config->set('HTML.Trusted', true);
          $this->assertTokenization(
              'Foo: <script>alert("<foo>");</script>',
              array(
                  new HTMLPurifier_Token_Text('Foo: '),
                  new HTMLPurifier_Token_Start('script'),
                  new HTMLPurifier_Token_Text('alert("<foo>");'),
                  new HTMLPurifier_Token_End('script'),
              ),
              array(
                  // PH5P, for some reason, bubbles the script to <head>
                  'PH5P' => false,
              )
          );
      }

      public function test_tokenizeHTML_entitiesInComment()
      {
          $this->assertTokenization(
              '<!-- This comment < &lt; & -->',
              array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') )
          );
      }

      public function test_tokenizeHTML_attributeWithSpecialCharacters()
      {
          $this->assertTokenization(
              '<a href="><>">',
              array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
              array(
                  'DirectLex' => array(
                      new HTMLPurifier_Token_Start('a', array('href' => '')),
                      new HTMLPurifier_Token_Text('<'),
                      new HTMLPurifier_Token_Text('">'),
                  )
              )
          );
      }

      public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
      {
          $this->assertTokenization(
              '<param name="src" value="http://example.com/video.wmv" />',
              array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
          );
      }

      public function test_tokenizeHTML_style()
      {
          $extra = array(
              // PH5P doesn't seem to like style tags
              'PH5P' => false,
              // DirectLex defers to RemoveForeignElements for textification
              'DirectLex' => array(
                  new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
                  new HTMLPurifier_Token_Comment("\ndiv {}\n"),
                  new HTMLPurifier_Token_End('style'),
              ),
          );
          if (!defined('LIBXML_VERSION')) {
              // LIBXML_VERSION is missing in early versions of PHP
              // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
              // this translates to 5.0.x. In such cases, punt the test entirely.
              return;
          } elseif (LIBXML_VERSION < 20628) {
              // libxml's behavior is wrong prior to this version, so make
              // appropriate accommodations
              $extra['DOMLex'] = $extra['DirectLex'];
          }
          // The input literal is flush-left so that no indentation leaks into it.
          $this->assertTokenization(
'<style type="text/css"><!--
div {}
--></style>',
              array(
                  new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
                  new HTMLPurifier_Token_Text("\ndiv {}\n"),
                  new HTMLPurifier_Token_End('style'),
              ),
              $extra
          );
      }

      public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
      {
          $alt_expect = array(
              // Technically this is invalid, but it won't be a
              // problem with invalid element removal; also, this
              // mimics Mozilla's parsing of the tag.
              new HTMLPurifier_Token_Start('a@'),
              new HTMLPurifier_Token_Text('>'),
          );
          $this->assertTokenization(
              '<a@>>',
              array(
                  new HTMLPurifier_Token_Start('a'),
                  new HTMLPurifier_Token_Text('>'),
                  new HTMLPurifier_Token_End('a'),
              ),
              array(
                  'DirectLex' => $alt_expect,
              )
          );
      }

      public function test_tokenizeHTML_emoticonHeart()
      {
          $this->assertTokenization(
              '<br /><3<br />',
              array(
                  new HTMLPurifier_Token_Empty('br'),
                  new HTMLPurifier_Token_Text('<'),
                  new HTMLPurifier_Token_Text('3'),
                  new HTMLPurifier_Token_Empty('br'),
              ),
              array(
                  'DOMLex' => array(
                      new HTMLPurifier_Token_Empty('br'),
                      new HTMLPurifier_Token_Text('<3'),
                      new HTMLPurifier_Token_Empty('br'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_emoticonShiftyEyes()
      {
          $this->assertTokenization(
              '<b><<</b>',
              array(
                  new HTMLPurifier_Token_Start('b'),
                  new HTMLPurifier_Token_Text('<'),
                  new HTMLPurifier_Token_Text('<'),
                  new HTMLPurifier_Token_End('b'),
              ),
              array(
                  'DOMLex' => array(
                      new HTMLPurifier_Token_Start('b'),
                      new HTMLPurifier_Token_Text('<<'),
                      new HTMLPurifier_Token_End('b'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_eon1996()
      {
          $this->assertTokenization(
              '< <b>test</b>',
              array(
                  new HTMLPurifier_Token_Text('<'),
                  new HTMLPurifier_Token_Text(' '),
                  new HTMLPurifier_Token_Start('b'),
                  new HTMLPurifier_Token_Text('test'),
                  new HTMLPurifier_Token_End('b'),
              ),
              array(
                  'DOMLex' => array(
                      new HTMLPurifier_Token_Text('< '),
                      new HTMLPurifier_Token_Start('b'),
                      new HTMLPurifier_Token_Text('test'),
                      new HTMLPurifier_Token_End('b'),
                  ),
              )
          );
      }

      public function test_tokenizeHTML_bodyInCDATA()
      {
          $alt_tokens = array(
              new HTMLPurifier_Token_Text('<'),
              new HTMLPurifier_Token_Text('body'),
              new HTMLPurifier_Token_Text('>'),
              new HTMLPurifier_Token_Text('Foo'),
              new HTMLPurifier_Token_Text('<'),
              new HTMLPurifier_Token_Text('/body'),
              new HTMLPurifier_Token_Text('>'),
          );
          $this->assertTokenization(
              '<![CDATA[<body>Foo</body>]]>',
              array(
                  new HTMLPurifier_Token_Text('<body>Foo</body>'),
              ),
              array(
                  'PH5P' => $alt_tokens,
              )
          );
      }

      /**
       * Self-closing tag nested inside a start/end pair.
       * NOTE(review): the method name has no descriptive suffix; it is kept
       * as-is so that the set of discovered tests does not change.
       */
      public function test_tokenizeHTML_()
      {
          $this->assertTokenization(
              '<a><img /></a>',
              array(
                  new HTMLPurifier_Token_Start('a'),
                  new HTMLPurifier_Token_Empty('img'),
                  new HTMLPurifier_Token_End('a'),
              )
          );
      }

      public function test_tokenizeHTML_ignoreIECondComment()
      {
          $this->assertTokenization(
              '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
              array()
          );
      }

      public function test_tokenizeHTML_removeProcessingInstruction()
      {
          $this->config->set('Core.RemoveProcessingInstructions', true);
          $this->assertTokenization(
              '<?xml blah blah ?>',
              array()
          );
      }

      public function test_tokenizeHTML_removeNewline()
      {
          $this->config->set('Core.NormalizeNewlines', true);
          $this->assertTokenization(
              "plain\rtext\r\n",
              array(
                  new HTMLPurifier_Token_Text("plain\ntext\n")
              )
          );
      }

      public function test_tokenizeHTML_noRemoveNewline()
      {
          $this->config->set('Core.NormalizeNewlines', false);
          $this->assertTokenization(
              "plain\rtext\r\n",
              array(
                  new HTMLPurifier_Token_Text("plain\rtext\r\n")
              )
          );
      }

      public function test_tokenizeHTML_conditionalCommentUngreedy()
      {
          $this->assertTokenization(
              '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
              array(
                  new HTMLPurifier_Token_Text("b")
              )
          );
      }

      public function test_tokenizeHTML_imgTag()
      {
          // DirectLex is expected to report a Start token here, because the
          // tag is not written in self-closing form.
          $start = array(
              new HTMLPurifier_Token_Start('img',
                  array(
                      'src' => 'img_11775.jpg',
                      'alt' => '[Img #11775]',
                      'id' => 'EMBEDDED_IMG_11775',
                  )
              )
          );
          $this->assertTokenization(
              '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
              array(
                  new HTMLPurifier_Token_Empty('img',
                      array(
                          'src' => 'img_11775.jpg',
                          'alt' => '[Img #11775]',
                          'id' => 'EMBEDDED_IMG_11775',
                      )
                  )
              ),
              array(
                  'DirectLex' => $start,
              )
          );
      }

      /*
      Template for a new tokenizeHTML() test case:

      function test_tokenizeHTML_() {
          $this->assertTokenization(
              ,
              array(
              )
          );
      }
      */

  }
  // vim: et sw=4 sts=4