MakeWellFormed.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. <?php
  2. /**
  3. * Takes tokens makes them well-formed (balance end tags, etc.)
  4. */
  5. class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
  6. {
  7. /**
  8. * Array stream of tokens being processed.
  9. */
  10. protected $tokens;
  11. /**
  12. * Current index in $tokens.
  13. */
  14. protected $t;
  15. /**
  16. * Current nesting of elements.
  17. */
  18. protected $stack;
  19. /**
  20. * Injectors active in this stream processing.
  21. */
  22. protected $injectors;
  23. /**
  24. * Current instance of HTMLPurifier_Config.
  25. */
  26. protected $config;
  27. /**
  28. * Current instance of HTMLPurifier_Context.
  29. */
  30. protected $context;
  31. public function execute($tokens, $config, $context) {
  32. $definition = $config->getHTMLDefinition();
  33. // local variables
  34. $generator = new HTMLPurifier_Generator($config, $context);
  35. $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
  36. $e = $context->get('ErrorCollector', true);
  37. $t = false; // token index
  38. $i = false; // injector index
  39. $token = false; // the current token
  40. $reprocess = false; // whether or not to reprocess the same token
  41. $stack = array();
  42. // member variables
  43. $this->stack =& $stack;
  44. $this->t =& $t;
  45. $this->tokens =& $tokens;
  46. $this->config = $config;
  47. $this->context = $context;
  48. // context variables
  49. $context->register('CurrentNesting', $stack);
  50. $context->register('InputIndex', $t);
  51. $context->register('InputTokens', $tokens);
  52. $context->register('CurrentToken', $token);
  53. // -- begin INJECTOR --
  54. $this->injectors = array();
  55. $injectors = $config->getBatch('AutoFormat');
  56. $def_injectors = $definition->info_injector;
  57. $custom_injectors = $injectors['Custom'];
  58. unset($injectors['Custom']); // special case
  59. foreach ($injectors as $injector => $b) {
  60. // XXX: Fix with a legitimate lookup table of enabled filters
  61. if (strpos($injector, '.') !== false) continue;
  62. $injector = "HTMLPurifier_Injector_$injector";
  63. if (!$b) continue;
  64. $this->injectors[] = new $injector;
  65. }
  66. foreach ($def_injectors as $injector) {
  67. // assumed to be objects
  68. $this->injectors[] = $injector;
  69. }
  70. foreach ($custom_injectors as $injector) {
  71. if (!$injector) continue;
  72. if (is_string($injector)) {
  73. $injector = "HTMLPurifier_Injector_$injector";
  74. $injector = new $injector;
  75. }
  76. $this->injectors[] = $injector;
  77. }
  78. // give the injectors references to the definition and context
  79. // variables for performance reasons
  80. foreach ($this->injectors as $ix => $injector) {
  81. $error = $injector->prepare($config, $context);
  82. if (!$error) continue;
  83. array_splice($this->injectors, $ix, 1); // rm the injector
  84. trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
  85. }
  86. // -- end INJECTOR --
  87. // a note on punting:
  88. // In order to reduce code duplication, whenever some code needs
  89. // to make HTML changes in order to make things "correct", the
  90. // new HTML gets sent through the purifier, regardless of its
  91. // status. This means that if we add a start token, because it
  92. // was totally necessary, we don't have to update nesting; we just
  93. // punt ($reprocess = true; continue;) and it does that for us.
  94. // isset is in loop because $tokens size changes during loop exec
  95. for (
  96. $t = 0;
  97. $t == 0 || isset($tokens[$t - 1]);
  98. // only increment if we don't need to reprocess
  99. $reprocess ? $reprocess = false : $t++
  100. ) {
  101. // check for a rewind
  102. if (is_int($i) && $i >= 0) {
  103. // possibility: disable rewinding if the current token has a
  104. // rewind set on it already. This would offer protection from
  105. // infinite loop, but might hinder some advanced rewinding.
  106. $rewind_to = $this->injectors[$i]->getRewind();
  107. if (is_int($rewind_to) && $rewind_to < $t) {
  108. if ($rewind_to < 0) $rewind_to = 0;
  109. while ($t > $rewind_to) {
  110. $t--;
  111. $prev = $tokens[$t];
  112. // indicate that other injectors should not process this token,
  113. // but we need to reprocess it
  114. unset($prev->skip[$i]);
  115. $prev->rewind = $i;
  116. if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->stack);
  117. elseif ($prev instanceof HTMLPurifier_Token_End) $this->stack[] = $prev->start;
  118. }
  119. }
  120. $i = false;
  121. }
  122. // handle case of document end
  123. if (!isset($tokens[$t])) {
  124. // kill processing if stack is empty
  125. if (empty($this->stack)) break;
  126. // peek
  127. $top_nesting = array_pop($this->stack);
  128. $this->stack[] = $top_nesting;
  129. // send error
  130. if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
  131. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
  132. }
  133. // append, don't splice, since this is the end
  134. $tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
  135. // punt!
  136. $reprocess = true;
  137. continue;
  138. }
  139. $token = $tokens[$t];
  140. //echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
  141. //flush();
  142. // quick-check: if it's not a tag, no need to process
  143. if (empty($token->is_tag)) {
  144. if ($token instanceof HTMLPurifier_Token_Text) {
  145. foreach ($this->injectors as $i => $injector) {
  146. if (isset($token->skip[$i])) continue;
  147. if ($token->rewind !== null && $token->rewind !== $i) continue;
  148. $injector->handleText($token);
  149. $this->processToken($token, $i);
  150. $reprocess = true;
  151. break;
  152. }
  153. }
  154. // another possibility is a comment
  155. continue;
  156. }
  157. if (isset($definition->info[$token->name])) {
  158. $type = $definition->info[$token->name]->child->type;
  159. } else {
  160. $type = false; // Type is unknown, treat accordingly
  161. }
  162. // quick tag checks: anything that's *not* an end tag
  163. $ok = false;
  164. if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
  165. // claims to be a start tag but is empty
  166. $token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
  167. $ok = true;
  168. } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
  169. // claims to be empty but really is a start tag
  170. $this->swap(new HTMLPurifier_Token_End($token->name));
  171. $this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
  172. // punt (since we had to modify the input stream in a non-trivial way)
  173. $reprocess = true;
  174. continue;
  175. } elseif ($token instanceof HTMLPurifier_Token_Empty) {
  176. // real empty token
  177. $ok = true;
  178. } elseif ($token instanceof HTMLPurifier_Token_Start) {
  179. // start tag
  180. // ...unless they also have to close their parent
  181. if (!empty($this->stack)) {
  182. $parent = array_pop($this->stack);
  183. $this->stack[] = $parent;
  184. if (isset($definition->info[$parent->name])) {
  185. $elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  186. $autoclose = !isset($elements[$token->name]);
  187. } else {
  188. $autoclose = false;
  189. }
  190. if ($autoclose && $definition->info[$token->name]->wrap) {
  191. // Check if an element can be wrapped by another
  192. // element to make it valid in a context (for
  193. // example, <ul><ul> needs a <li> in between)
  194. $wrapname = $definition->info[$token->name]->wrap;
  195. $wrapdef = $definition->info[$wrapname];
  196. $elements = $wrapdef->child->getAllowedElements($config);
  197. $parent_elements = $definition->info[$parent->name]->child->getAllowedElements($config);
  198. if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
  199. $newtoken = new HTMLPurifier_Token_Start($wrapname);
  200. $this->insertBefore($newtoken);
  201. $reprocess = true;
  202. continue;
  203. }
  204. }
  205. $carryover = false;
  206. if ($autoclose && $definition->info[$parent->name]->formatting) {
  207. $carryover = true;
  208. }
  209. if ($autoclose) {
  210. // errors need to be updated
  211. $new_token = new HTMLPurifier_Token_End($parent->name);
  212. $new_token->start = $parent;
  213. if ($carryover) {
  214. $element = clone $parent;
  215. $element->armor['MakeWellFormed_TagClosedError'] = true;
  216. $element->carryover = true;
  217. $this->processToken(array($new_token, $token, $element));
  218. } else {
  219. $this->insertBefore($new_token);
  220. }
  221. if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
  222. if (!$carryover) {
  223. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
  224. } else {
  225. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
  226. }
  227. }
  228. $reprocess = true;
  229. continue;
  230. }
  231. }
  232. $ok = true;
  233. }
  234. if ($ok) {
  235. foreach ($this->injectors as $i => $injector) {
  236. if (isset($token->skip[$i])) continue;
  237. if ($token->rewind !== null && $token->rewind !== $i) continue;
  238. $injector->handleElement($token);
  239. $this->processToken($token, $i);
  240. $reprocess = true;
  241. break;
  242. }
  243. if (!$reprocess) {
  244. // ah, nothing interesting happened; do normal processing
  245. $this->swap($token);
  246. if ($token instanceof HTMLPurifier_Token_Start) {
  247. $this->stack[] = $token;
  248. } elseif ($token instanceof HTMLPurifier_Token_End) {
  249. throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
  250. }
  251. }
  252. continue;
  253. }
  254. // sanity check: we should be dealing with a closing tag
  255. if (!$token instanceof HTMLPurifier_Token_End) {
  256. throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
  257. }
  258. // make sure that we have something open
  259. if (empty($this->stack)) {
  260. if ($escape_invalid_tags) {
  261. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
  262. $this->swap(new HTMLPurifier_Token_Text(
  263. $generator->generateFromToken($token)
  264. ));
  265. } else {
  266. $this->remove();
  267. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
  268. }
  269. $reprocess = true;
  270. continue;
  271. }
  272. // first, check for the simplest case: everything closes neatly.
  273. // Eventually, everything passes through here; if there are problems
  274. // we modify the input stream accordingly and then punt, so that
  275. // the tokens get processed again.
  276. $current_parent = array_pop($this->stack);
  277. if ($current_parent->name == $token->name) {
  278. $token->start = $current_parent;
  279. foreach ($this->injectors as $i => $injector) {
  280. if (isset($token->skip[$i])) continue;
  281. if ($token->rewind !== null && $token->rewind !== $i) continue;
  282. $injector->handleEnd($token);
  283. $this->processToken($token, $i);
  284. $this->stack[] = $current_parent;
  285. $reprocess = true;
  286. break;
  287. }
  288. continue;
  289. }
  290. // okay, so we're trying to close the wrong tag
  291. // undo the pop previous pop
  292. $this->stack[] = $current_parent;
  293. // scroll back the entire nest, trying to find our tag.
  294. // (feature could be to specify how far you'd like to go)
  295. $size = count($this->stack);
  296. // -2 because -1 is the last element, but we already checked that
  297. $skipped_tags = false;
  298. for ($j = $size - 2; $j >= 0; $j--) {
  299. if ($this->stack[$j]->name == $token->name) {
  300. $skipped_tags = array_slice($this->stack, $j);
  301. break;
  302. }
  303. }
  304. // we didn't find the tag, so remove
  305. if ($skipped_tags === false) {
  306. if ($escape_invalid_tags) {
  307. $this->swap(new HTMLPurifier_Token_Text(
  308. $generator->generateFromToken($token)
  309. ));
  310. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
  311. } else {
  312. $this->remove();
  313. if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
  314. }
  315. $reprocess = true;
  316. continue;
  317. }
  318. // do errors, in REVERSE $j order: a,b,c with </a></b></c>
  319. $c = count($skipped_tags);
  320. if ($e) {
  321. for ($j = $c - 1; $j > 0; $j--) {
  322. // notice we exclude $j == 0, i.e. the current ending tag, from
  323. // the errors...
  324. if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
  325. $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
  326. }
  327. }
  328. }
  329. // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
  330. $replace = array($token);
  331. for ($j = 1; $j < $c; $j++) {
  332. // ...as well as from the insertions
  333. $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
  334. $new_token->start = $skipped_tags[$j];
  335. array_unshift($replace, $new_token);
  336. if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
  337. $element = clone $skipped_tags[$j];
  338. $element->carryover = true;
  339. $element->armor['MakeWellFormed_TagClosedError'] = true;
  340. $replace[] = $element;
  341. }
  342. }
  343. $this->processToken($replace);
  344. $reprocess = true;
  345. continue;
  346. }
  347. $context->destroy('CurrentNesting');
  348. $context->destroy('InputTokens');
  349. $context->destroy('InputIndex');
  350. $context->destroy('CurrentToken');
  351. unset($this->injectors, $this->stack, $this->tokens, $this->t);
  352. return $tokens;
  353. }
  354. /**
  355. * Processes arbitrary token values for complicated substitution patterns.
  356. * In general:
  357. *
  358. * If $token is an array, it is a list of tokens to substitute for the
  359. * current token. These tokens then get individually processed. If there
  360. * is a leading integer in the list, that integer determines how many
  361. * tokens from the stream should be removed.
  362. *
  363. * If $token is a regular token, it is swapped with the current token.
  364. *
  365. * If $token is false, the current token is deleted.
  366. *
  367. * If $token is an integer, that number of tokens (with the first token
  368. * being the current one) will be deleted.
  369. *
  370. * @param $token Token substitution value
  371. * @param $injector Injector that performed the substitution; default is if
  372. * this is not an injector related operation.
  373. */
  374. protected function processToken($token, $injector = -1) {
  375. // normalize forms of token
  376. if (is_object($token)) $token = array(1, $token);
  377. if (is_int($token)) $token = array($token);
  378. if ($token === false) $token = array(1);
  379. if (!is_array($token)) throw new HTMLPurifier_Exception('Invalid token type from injector');
  380. if (!is_int($token[0])) array_unshift($token, 1);
  381. if ($token[0] === 0) throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
  382. // $token is now an array with the following form:
  383. // array(number nodes to delete, new node 1, new node 2, ...)
  384. $delete = array_shift($token);
  385. $old = array_splice($this->tokens, $this->t, $delete, $token);
  386. if ($injector > -1) {
  387. // determine appropriate skips
  388. $oldskip = isset($old[0]) ? $old[0]->skip : array();
  389. foreach ($token as $object) {
  390. $object->skip = $oldskip;
  391. $object->skip[$injector] = true;
  392. }
  393. }
  394. }
  395. /**
  396. * Inserts a token before the current token. Cursor now points to this token
  397. */
  398. private function insertBefore($token) {
  399. array_splice($this->tokens, $this->t, 0, array($token));
  400. }
  401. /**
  402. * Removes current token. Cursor now points to new token occupying previously
  403. * occupied space.
  404. */
  405. private function remove() {
  406. array_splice($this->tokens, $this->t, 1);
  407. }
  408. /**
  409. * Swap current token with new token. Cursor points to new token (no change).
  410. */
  411. private function swap($token) {
  412. $this->tokens[$this->t] = $token;
  413. }
  414. }
  415. // vim: et sw=4 sts=4