MakeWellFormed.php 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. <?php
  2. /**
  3. * Takes tokens makes them well-formed (balance end tags, etc.)
  4. *
  5. * Specification of the armor attributes this strategy uses:
  6. *
  7. * - MakeWellFormed_TagClosedError: This armor field is used to
  8. * suppress tag closed errors for certain tokens [TagClosedSuppress],
  9. * in particular, if a tag was generated automatically by HTML
  10. * Purifier, we may rely on our infrastructure to close it for us
  11. * and shouldn't report an error to the user [TagClosedAuto].
  12. */
  13. class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
  14. {
  15. /**
  16. * Array stream of tokens being processed.
  17. */
  18. protected $tokens;
  19. /**
  20. * Current index in $tokens.
  21. */
  22. protected $t;
  23. /**
  24. * Current nesting of elements.
  25. */
  26. protected $stack;
  27. /**
  28. * Injectors active in this stream processing.
  29. */
  30. protected $injectors;
  31. /**
  32. * Current instance of HTMLPurifier_Config.
  33. */
  34. protected $config;
  35. /**
  36. * Current instance of HTMLPurifier_Context.
  37. */
  38. protected $context;
  39. public function execute($tokens, $config, $context) {
  40. $definition = $config->getHTMLDefinition();
  41. // local variables
  42. $generator = new HTMLPurifier_Generator($config, $context);
  43. $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
  44. // used for autoclose early abortion
  45. $global_parent_allowed_elements = array();
  46. if (isset($definition->info[$definition->info_parent])) {
  47. // may be unset under testing circumstances
  48. $global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
  49. }
  50. $e = $context->get('ErrorCollector', true);
  51. $t = false; // token index
  52. $i = false; // injector index
  53. $token = false; // the current token
  54. $reprocess = false; // whether or not to reprocess the same token
  55. $stack = array();
  56. // member variables
  57. $this->stack =& $stack;
  58. $this->t =& $t;
  59. $this->tokens =& $tokens;
  60. $this->config = $config;
  61. $this->context = $context;
  62. // context variables
  63. $context->register('CurrentNesting', $stack);
  64. $context->register('InputIndex', $t);
  65. $context->register('InputTokens', $tokens);
  66. $context->register('CurrentToken', $token);
  67. // -- begin INJECTOR --
  68. $this->injectors = array();
  69. $injectors = $config->getBatch('AutoFormat');
  70. $def_injectors = $definition->info_injector;
  71. $custom_injectors = $injectors['Custom'];
  72. unset($injectors['Custom']); // special case
  73. foreach ($injectors as $injector => $b) {
  74. // XXX: Fix with a legitimate lookup table of enabled filters
  75. if (strpos($injector, '.') !== false) continue;
  76. $injector = "HTMLPurifier_Injector_$injector";
  77. if (!$b) continue;
  78. $this->injectors[] = new $injector;
  79. }
  80. foreach ($def_injectors as $injector) {
  81. // assumed to be objects
  82. $this->injectors[] = $injector;
  83. }
  84. foreach ($custom_injectors as $injector) {
  85. if (!$injector) continue;
  86. if (is_string($injector)) {
  87. $injector = "HTMLPurifier_Injector_$injector";
  88. $injector = new $injector;
  89. }
  90. $this->injectors[] = $injector;
  91. }
  92. // give the injectors references to the definition and context
  93. // variables for performance reasons
  94. foreach ($this->injectors as $ix => $injector) {
  95. $error = $injector->prepare($config, $context);
  96. if (!$error) continue;
  97. array_splice($this->injectors, $ix, 1); // rm the injector
  98. trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
  99. }
  100. // -- end INJECTOR --
  101. // a note on reprocessing:
  102. // In order to reduce code duplication, whenever some code needs
  103. // to make HTML changes in order to make things "correct", the
  104. // new HTML gets sent through the purifier, regardless of its
  105. // status. This means that if we add a start token, because it
  106. // was totally necessary, we don't have to update nesting; we just
  107. // punt ($reprocess = true; continue;) and it does that for us.
  108. // isset is in loop because $tokens size changes during loop exec
  109. for (
  110. $t = 0;
  111. $t == 0 || isset($tokens[$t - 1]);
  112. // only increment if we don't need to reprocess
  113. $reprocess ? $reprocess = false : $t++
  114. ) {
  115. // check for a rewind
  116. if (is_int($i) && $i >= 0) {
  117. // possibility: disable rewinding if the current token has a
  118. // rewind set on it already. This would offer protection from
  119. // infinite loop, but might hinder some advanced rewinding.
  120. $rewind_to = $this->injectors[$i]->getRewind();
  121. if (is_int($rewind_to) && $rewind_to < $t) {
  122. if ($rewind_to < 0) $rewind_to = 0;
  123. while ($t > $rewind_to) {
  124. $t--;
  125. $prev = $tokens[$t];
  126. // indicate that other injectors should not process this token,
  127. // but we need to reprocess it
  128. unset($prev->skip[$i]);
  129. $prev->rewind = $i;
  130. if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
  131. elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
  132. }
  133. }
  134. $i = false;
  135. }
  136. // handle case of document end
  137. if (!isset($tokens[$t])) {
  138. // kill processing if stack is empty
  139. if (empty($this->stack)) break;
  140. // peek
  141. $top_nesting = array_pop($this->stack);
  142. $this->stack[] = $top_nesting;
  143. // send error [TagClosedSuppress]
  144. if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
  145. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
  146. }
  147. // append, don't splice, since this is the end
  148. $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
  149. // punt!
  150. $reprocess = true;
  151. continue;
  152. }
  153. $token = $tokens[$t];
  154. //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
  155. //flush();
  156. // quick-check: if it's not a tag, no need to process
  157. if (empty($token->is_tag)) {
  158. if ($token instanceof HTMLPurifier_Token_Text) {
  159. foreach ($this->injectors as $i => $injector) {
  160. if (isset($token->skip[$i])) continue;
  161. if ($token->rewind !== null && $token->rewind !== $i) continue;
  162. $injector->handleText($token);
  163. $this->processToken($token, $i);
  164. $reprocess = true;
  165. break;
  166. }
  167. }
  168. // another possibility is a comment
  169. continue;
  170. }
  171. if (isset($definition->info[$token->name])) {
  172. $type = $definition->info[$token->name]->child->type;
  173. } else {
  174. $type = false; // Type is unknown, treat accordingly
  175. }
  176. // quick tag checks: anything that's *not* an end tag
  177. $ok = false;
  178. if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
  179. // claims to be a start tag but is empty
  180. $token = new HTMLPurifier_Token_Empty($token->name, $token->attr, $token->line, $token->col, $token->armor);
  181. $ok = true;
  182. } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
  183. // claims to be empty but really is a start tag
  184. $this->swap(new HTMLPurifier_Token_End($token->name));
  185. $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor));
  186. // punt (since we had to modify the input stream in a non-trivial way)
  187. $reprocess = true;
  188. continue;
  189. } elseif ($token instanceof HTMLPurifier_Token_Empty) {
  190. // real empty token
  191. $ok = true;
  192. } elseif ($token instanceof HTMLPurifier_Token_Start) {
  193. // start tag
  194. // ...unless they also have to close their parent
  195. if (!empty($this->stack)) {
  196. // Performance note: you might think that it's rather
  197. // inefficient, recalculating the autoclose information
  198. // for every tag that a token closes (since when we
  199. // do an autoclose, we push a new token into the
  200. // stream and then /process/ that, before
  201. // re-processing this token.) But this is
  202. // necessary, because an injector can make an
  203. // arbitrary transformations to the autoclosing
  204. // tokens we introduce, so things may have changed
  205. // in the meantime. Also, doing the inefficient thing is
  206. // "easy" to reason about (for certain perverse definitions
  207. // of "easy")
  208. $parent = array_pop($this->stack);
  209. $this->stack[] = $parent;
  210. if (isset($definition->info[$parent->name])) {
  211. $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  212. $autoclose = !isset($elements[$token->name]);
  213. } else {
  214. $autoclose = false;
  215. }
  216. if ($autoclose && $definition->info[$token->name]->wrap) {
  217. // Check if an element can be wrapped by another
  218. // element to make it valid in a context (for
  219. // example, <ul><ul> needs a <li> in between)
  220. $wrapname = $definition->info[$token->name]->wrap;
  221. $wrapdef = $definition->info[$wrapname];
  222. $elements = $wrapdef->child->getAllowedElements($config);
  223. $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  224. if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
  225. $newtoken = new HTMLPurifier_Token_Start($wrapname);
  226. $this->insertBefore($newtoken);
  227. $reprocess = true;
  228. continue;
  229. }
  230. }
  231. $carryover = false;
  232. if ($autoclose && $definition->info[$parent->name]->formatting) {
  233. $carryover = true;
  234. }
  235. if ($autoclose) {
  236. // check if this autoclose is doomed to fail
  237. // (this rechecks $parent, which his harmless)
  238. $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
  239. if (!$autoclose_ok) {
  240. foreach ($this->stack as $ancestor) {
  241. $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
  242. if (isset($elements[$token->name])) {
  243. $autoclose_ok = true;
  244. break;
  245. }
  246. if ($definition->info[$token->name]->wrap) {
  247. $wrapname = $definition->info[$token->name]->wrap;
  248. $wrapdef = $definition->info[$wrapname];
  249. $wrap_elements = $wrapdef->child->getAllowedElements($config);
  250. if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
  251. $autoclose_ok = true;
  252. break;
  253. }
  254. }
  255. }
  256. }
  257. if ($autoclose_ok) {
  258. // errors need to be updated
  259. $new_token = new HTMLPurifier_Token_End($parent->name);
  260. $new_token->start = $parent;
  261. if ($carryover) {
  262. $element = clone $parent;
  263. // [TagClosedAuto]
  264. $element->armor['MakeWellFormed_TagClosedError'] = true;
  265. $element->carryover = true;
  266. $this->processToken(array($new_token, $token, $element));
  267. } else {
  268. $this->insertBefore($new_token);
  269. }
  270. // [TagClosedSuppress]
  271. if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
  272. if (!$carryover) {
  273. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
  274. } else {
  275. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
  276. }
  277. }
  278. } else {
  279. $this->remove();
  280. }
  281. $reprocess = true;
  282. continue;
  283. }
  284. }
  285. $ok = true;
  286. }
  287. if ($ok) {
  288. foreach ($this->injectors as $i => $injector) {
  289. if (isset($token->skip[$i])) continue;
  290. if ($token->rewind !== null && $token->rewind !== $i) continue;
  291. $injector->handleElement($token);
  292. $this->processToken($token, $i);
  293. $reprocess = true;
  294. break;
  295. }
  296. if (!$reprocess) {
  297. // ah, nothing interesting happened; do normal processing
  298. $this->swap($token);
  299. if ($token instanceof HTMLPurifier_Token_Start) {
  300. $this->stack[] = $token;
  301. } elseif ($token instanceof HTMLPurifier_Token_End) {
  302. throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
  303. }
  304. }
  305. continue;
  306. }
  307. // sanity check: we should be dealing with a closing tag
  308. if (!$token instanceof HTMLPurifier_Token_End) {
  309. throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
  310. }
  311. // make sure that we have something open
  312. if (empty($this->stack)) {
  313. if ($escape_invalid_tags) {
  314. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
  315. $this->swap(new HTMLPurifier_Token_Text(
  316. $generator->generateFromToken($token)
  317. ));
  318. } else {
  319. $this->remove();
  320. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
  321. }
  322. $reprocess = true;
  323. continue;
  324. }
  325. // first, check for the simplest case: everything closes neatly.
  326. // Eventually, everything passes through here; if there are problems
  327. // we modify the input stream accordingly and then punt, so that
  328. // the tokens get processed again.
  329. $current_parent = array_pop($this->stack);
  330. if ($current_parent->name == $token->name) {
  331. $token->start = $current_parent;
  332. foreach ($this->injectors as $i => $injector) {
  333. if (isset($token->skip[$i])) continue;
  334. if ($token->rewind !== null && $token->rewind !== $i) continue;
  335. $injector->handleEnd($token);
  336. $this->processToken($token, $i);
  337. $this->stack[] = $current_parent;
  338. $reprocess = true;
  339. break;
  340. }
  341. continue;
  342. }
  343. // okay, so we're trying to close the wrong tag
  344. // undo the pop previous pop
  345. $this->stack[] = $current_parent;
  346. // scroll back the entire nest, trying to find our tag.
  347. // (feature could be to specify how far you'd like to go)
  348. $size = count($this->stack);
  349. // -2 because -1 is the last element, but we already checked that
  350. $skipped_tags = false;
  351. for ($j = $size - 2; $j >= 0; $j--) {
  352. if ($this->stack[$j]->name == $token->name) {
  353. $skipped_tags = array_slice($this->stack, $j);
  354. break;
  355. }
  356. }
  357. // we didn't find the tag, so remove
  358. if ($skipped_tags === false) {
  359. if ($escape_invalid_tags) {
  360. $this->swap(new HTMLPurifier_Token_Text(
  361. $generator->generateFromToken($token)
  362. ));
  363. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
  364. } else {
  365. $this->remove();
  366. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
  367. }
  368. $reprocess = true;
  369. continue;
  370. }
  371. // do errors, in REVERSE $j order: a,b,c with </a></b></c>
  372. $c = count($skipped_tags);
  373. if ($e) {
  374. for ($j = $c - 1; $j > 0; $j--) {
  375. // notice we exclude $j == 0, i.e. the current ending tag, from
  376. // the errors... [TagClosedSuppress]
  377. if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
  378. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
  379. }
  380. }
  381. }
  382. // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
  383. $replace = array($token);
  384. for ($j = 1; $j < $c; $j++) {
  385. // ...as well as from the insertions
  386. $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
  387. $new_token->start = $skipped_tags[$j];
  388. array_unshift($replace, $new_token);
  389. if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
  390. // [TagClosedAuto]
  391. $element = clone $skipped_tags[$j];
  392. $element->carryover = true;
  393. $element->armor['MakeWellFormed_TagClosedError'] = true;
  394. $replace[] = $element;
  395. }
  396. }
  397. $this->processToken($replace);
  398. $reprocess = true;
  399. continue;
  400. }
  401. $context->destroy('CurrentNesting');
  402. $context->destroy('InputTokens');
  403. $context->destroy('InputIndex');
  404. $context->destroy('CurrentToken');
  405. unset($this->injectors, $this->stack, $this->tokens, $this->t);
  406. return $tokens;
  407. }
  408. /**
  409. * Processes arbitrary token values for complicated substitution patterns.
  410. * In general:
  411. *
  412. * If $token is an array, it is a list of tokens to substitute for the
  413. * current token. These tokens then get individually processed. If there
  414. * is a leading integer in the list, that integer determines how many
  415. * tokens from the stream should be removed.
  416. *
  417. * If $token is a regular token, it is swapped with the current token.
  418. *
  419. * If $token is false, the current token is deleted.
  420. *
  421. * If $token is an integer, that number of tokens (with the first token
  422. * being the current one) will be deleted.
  423. *
  424. * @param $token Token substitution value
  425. * @param $injector Injector that performed the substitution; default is if
  426. * this is not an injector related operation.
  427. */
  428. protected function processToken($token, $injector = -1) {
  429. // normalize forms of token
  430. if (is_object($token)) $token = array(1, $token);
  431. if (is_int($token)) $token = array($token);
  432. if ($token === false) $token = array(1);
  433. if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector');
  434. if (!is_int($token[0])) array_unshift($token, 1);
  435. if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
  436. // $token is now an array with the following form:
  437. // array(number nodes to delete, new node 1, new node 2, ...)
  438. $delete = array_shift($token);
  439. $old = array_splice($this->tokens, $this->t, $delete, $token);
  440. if ($injector > -1) {
  441. // determine appropriate skips
  442. $oldskip = isset($old[0]) ? $old[0]->skip : array();
  443. foreach ($token as $object) {
  444. $object->skip = $oldskip;
  445. $object->skip[$injector] = true;
  446. }
  447. }
  448. }
  449. /**
  450. * Inserts a token before the current token. Cursor now points to
  451. * this token. You must reprocess after this.
  452. */
  453. private function insertBefore($token) {
  454. array_splice($this->tokens, $this->t, 0, array($token));
  455. }
  456. /**
  457. * Removes current token. Cursor now points to new token occupying previously
  458. * occupied space. You must reprocess after this.
  459. */
  460. private function remove() {
  461. array_splice($this->tokens, $this->t, 1);
  462. }
  463. /**
  464. * Swap current token with new token. Cursor points to new token (no
  465. * change). You must reprocess after this.
  466. */
  467. private function swap($token) {
  468. $this->tokens[$this->t] = $token;
  469. }
  470. }
  471. // vim: et sw=4 sts=4