AutoParagraph.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. <?php
  2. /**
  3. * Injector that auto paragraphs text in the root node based on
  4. * double-spacing.
  5. * @todo Ensure all states are unit tested, including variations as well.
  6. * @todo Make a graph of the flow control for this Injector.
  7. */
  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  {
      /** Name under which this injector is identified. */
      public $name = 'AutoParagraph';

      /**
       * Element names this injector depends on; it emits <p> tokens.
       * NOTE(review): presumably validated by the base injector - confirm.
       */
      public $needed = array('p');

      /**
       * Builds a fresh <p> start token carrying the
       * MakeWellFormed_TagClosedError armor flag.
       *
       * NOTE(review): the flag is presumably read by the MakeWellFormed
       * strategy; its exact effect is not visible in this file.
       *
       * @return HTMLPurifier_Token_Start
       */
      private function _pStart()
      {
          $start = new HTMLPurifier_Token_Start('p');
          $start->armor['MakeWellFormed_TagClosedError'] = true;
          return $start;
      }

      /**
       * Handles a text token. When paragraph markup has to be injected,
       * $token is overwritten (by reference) with an array of replacement
       * tokens; otherwise it is left untouched.
       *
       * @param $token Text token being processed, passed by reference
       */
      public function handleText(&$token)
      {
          $data = $token->data;

          if (!$this->allowsElement('p')) {
              // The current parent does not accept <p>. The only case we act
              // on is when that parent is itself a <p>.
              if (empty($this->currentNesting)) {
                  return;
              }
              $parent = $this->currentNesting[count($this->currentNesting) - 1];
              if ($parent->name == 'p') {
                  // State 3.1: ...<p>PAR1
                  // State 3.2: ...<p>PAR1\n\nPAR2
                  // Start from an empty list so _splitText knows no start tag
                  // was injected by us.
                  $pieces = array();
                  $this->_splitText($data, $pieces);
                  $token = $pieces;
              }
              // Otherwise:
              // State 4.1: ...<b>PAR1
              // State 4.2: ...<b>PAR1\n\nPAR2
              // Nothing to do.
              return;
          }

          $hasDoubleNewline = strpos($data, "\n\n") !== false;

          if (!empty($this->currentNesting) && !$hasDoubleNewline) {
              // State 2: <div>PAR1... (similar to 1.4)
              // We are inside an element that allows paragraphs, but this text
              // alone does not tell us whether one is needed, so look ahead.
              if ($this->_pLookAhead()) {
                  // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                  // This will always be the first child, since any previous
                  // inline element would have triggered this very routine and
                  // found the double newline. A comment is one possible
                  // exception.
                  $token = array($this->_pStart(), $token);
              }
              // Otherwise:
              // State 2.2.1: <div>PAR1<div>
              // State 2.2.2: <div>PAR1<b>PAR1</b></div>
              return;
          }

          // State 1: text in the anonymous root node, or text that contains a
          // double newline. If the text has a double newline the treatment is
          // the same in both situations.
          $i = null;
          $next = null;
          $nesting = null;
          // Always performed: $next is needed below for whitespace tokens.
          $hasNext = $this->forwardUntilEndToken($i, $next, $nesting);

          if ($token->is_whitespace) {
              if (!$hasNext) {
                  // State 1.1: whitespace, then document end.
                  // This is a degenerate case.
                  return;
              }
              if (!$this->_isInline($next)) {
                  // State 1.5: \n<hr />
                  return;
              }
          }

          // State 1.2: PAR1
          // State 1.3: PAR1\n\nPAR2
          // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
          $pieces = array($this->_pStart());
          $this->_splitText($data, $pieces);
          $token = $pieces;
      }

      /**
       * Handles a start/empty element token. When a paragraph has to be
       * opened in front of it, or a separating double newline is called for,
       * $token is overwritten (by reference) with an array of tokens.
       *
       * We don't have to check whether we're already inside a <p> for block
       * tokens, because the tag would have been autoclosed by MakeWellFormed.
       *
       * @param $token Element token being processed, passed by reference
       */
      public function handleElement(&$token)
      {
          if (!$this->allowsElement('p')) {
              // State 2.2: <ul><li>
              // State 2.4: <p><b>
              return;
          }

          $inline = $this->_isInline($token);

          if (empty($this->currentNesting)) {
              // We are directly in the root node.
              if ($inline) {
                  // State 3.1: <b>
                  // This is where the {p} tag is inserted; it is not yet
                  // reflected in inputTokens, however.
                  $token = array($this->_pStart(), $token);
              }
              // Otherwise State 3.2: <div>

              $i = null;
              $prev = null;
              $hasPrev = $this->backward($i, $prev);
              if ($hasPrev && !($prev instanceof HTMLPurifier_Token_Text)) {
                  // State 3.1.1: ...</p>{p}<b>
                  // State 3.2.1: ...</p><div>
                  // Prefix a double newline to separate from what came before.
                  if (!is_array($token)) {
                      $token = array($token);
                  }
                  array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
              }
              // Otherwise:
              // State 3.1.2: ...</p>\n\n{p}<b>
              // State 3.2.2: ...</p>\n\n<div>
              // PAR<ELEM> cannot occur because PAR would already have been
              // wrapped in <p> tags.
              return;
          }

          if (!$inline) {
              // State 2.3: ...<div>
              return;
          }

          // State 1: <div>...<b>
          // Inspect the token immediately before this one to find out whether
          // this element is adjacent to its parent's start tag.
          $i = null;
          $prev = null;
          $this->backward($i, $prev);

          if ($prev instanceof HTMLPurifier_Token_Start) {
              // State 1.2.1: <div><b>
              // Look ahead to see whether a <p> is needed.
              // State 1.3.1: <div><b>PAR1\n\nPAR2          (needed)
              // State 1.3.2: <div><b>PAR1</b></div>        (not needed)
              // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div> (not needed)
              $needsParagraph = $this->_pLookAhead();
          } else {
              // Token wasn't adjacent to the parent.
              // State 1.1.4: <div><p>PAR1</p>\n\n<b>       (needed)
              //   Quite frankly, this should be handled by _splitText.
              // State 1.1.1: <div><p>PAR1</p><b>           (not needed)
              // State 1.1.2: <div><br /><b>                (not needed)
              // State 1.1.3: <div>PAR<b>                   (not needed)
              $needsParagraph = $prev instanceof HTMLPurifier_Token_Text
                  && substr($prev->data, -2) === "\n\n";
          }

          if ($needsParagraph) {
              $token = array($this->_pStart(), $token);
          }
      }

      /**
       * Splits text on double newlines into paragraph tokens and appends
       * them to the token list that will replace the original text token.
       *
       * @param $data   String text data that will be processed into
       *                paragraphs
       * @param $result Reference to the array of tokens the generated
       *                tokens are appended onto; an empty array signals
       *                that the caller did not inject a <p> start token
       */
      private function _splitText($data, &$result)
      {
          $chunks = explode("\n\n", $data);
          $total = count($chunks);

          if ($total == 1) {
              // No double newline present: pass the text through unchanged.
              // handleText can reach this for text without a double newline.
              $result[] = new HTMLPurifier_Token_Text($data);
              return;
          }

          $paragraphs = array(); // chunks that are not pure whitespace
          $needs_start = false;
          $needs_end = false;

          foreach ($chunks as $index => $chunk) {
              if (trim($chunk) !== '') {
                  $paragraphs[] = $chunk;
                  continue;
              }

              if ($index == 0) {
                  // Double newline at the front.
                  if (empty($result)) {
                      // An empty result means the injector added no start
                      // paragraph token, i.e. we have been inside a paragraph
                      // for a while and the newline asks for a new one.
                      $result[] = new HTMLPurifier_Token_End('p');
                      $result[] = new HTMLPurifier_Token_Text("\n\n");
                      // The start token is only added if real paragraphs
                      // follow in this text; if none do, the next start tag is
                      // left to the next call to the injector.
                      $needs_start = true;
                  } else {
                      // We just started a new paragraph. Reinstate the double
                      // newline for presentation's sake, since it was in the
                      // source.
                      array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                  }
              } elseif ($index + 1 == $total) {
                  // Double newline at the end: a trailing </p> is wanted once
                  // we are done.
                  $needs_end = true;
              }
          }

          if (empty($paragraphs)) {
              // The text was nothing but whitespace.
              return;
          }

          if ($needs_start) {
              // Start tag indicated by the \n\n at the beginning of $data.
              $result[] = $this->_pStart();
          }

          $lastIndex = count($paragraphs) - 1;
          foreach ($paragraphs as $position => $paragraph) {
              $isLast = ($position == $lastIndex);

              $result[] = new HTMLPurifier_Token_Text($paragraph);

              // Every paragraph but the last is closed here. The last one is
              // closed only when the text ended with a double newline;
              // otherwise it stays open for MakeWellFormed to close later.
              if (!$isLast || $needs_end) {
                  $result[] = new HTMLPurifier_Token_End('p');
                  $result[] = new HTMLPurifier_Token_Text("\n\n");
              }

              // No start token after the final paragraph: the injector deals
              // with that later if it turns out to be needed, which avoids a
              // lookahead here at the cost of a lookbehind later.
              if (!$isLast) {
                  $result[] = $this->_pStart();
              }
          }
      }

      /**
       * Returns true if the passed token is inline (and, ergo, allowed in
       * paragraph tags), judged by whether its name is among the child
       * elements the HTML definition lists for <p>.
       *
       * @param $token Token to test
       * @return bool
       */
      private function _isInline($token)
      {
          return isset(
              $this->htmlDefinition->info['p']->child->elements[$token->name]
          );
      }

      /**
       * Looks ahead in the token list and determines whether or not a <p>
       * tag has to be inserted.
       *
       * @return bool True if a paragraph is needed, false otherwise
       */
      private function _pLookAhead()
      {
          $i = null;
          $current = null;
          $this->current($i, $current);

          // Start counting from inside the current element when it opens one.
          $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;

          while ($this->forwardUntilEndToken($i, $current, $nesting)) {
              $verdict = $this->_checkNeedsP($current);
              if ($verdict !== null) {
                  // The first conclusive token decides.
                  return $verdict;
              }
          }

          return false;
      }

      /**
       * Determines whether a particular token requires an earlier inline
       * token to get a paragraph. Meant to be used while scanning with
       * forwardUntilEndToken.
       *
       * @param $current Token being inspected
       * @return bool|null True if a paragraph is needed, false if
       *                   definitely not, null if this token is
       *                   inconclusive
       */
      private function _checkNeedsP($current)
      {
          if ($current instanceof HTMLPurifier_Token_Start) {
              // <div>PAR1<div>
              // A block element terminates the search early; an inline one
              // tells us nothing.
              return $this->_isInline($current) ? null : false;
          }

          if (
              $current instanceof HTMLPurifier_Token_Text
              && strpos($current->data, "\n\n") !== false
          ) {
              // <div>PAR1<b>PAR1\n\nPAR2
              return true;
          }

          // <div>PAR1<b>PAR1... (or any other token type): keep looking.
          return null;
      }
  }
  305. // vim: et sw=4 sts=4