kses_original.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. <?php
  2. # kses 0.2.2 - HTML/XHTML filter that only allows some elements and attributes
  3. # Copyright (C) 2002, 2003, 2005 Ulf Harnhammar
  4. #
  5. # This program is free software and open source software; you can redistribute
  6. # it and/or modify it under the terms of the GNU General Public License as
  7. # published by the Free Software Foundation; either version 2 of the License,
  8. # or (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful, but WITHOUT
  11. # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12. # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  13. # more details.
  14. #
  15. # You should have received a copy of the GNU General Public License along
  16. # with this program; if not, write to the Free Software Foundation, Inc.,
  17. # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or visit
  18. # http://www.gnu.org/licenses/gpl.html
  19. #
  20. # *** CONTACT INFORMATION ***
  21. #
  22. # E-mail: metaur at users dot sourceforge dot net
  23. # Web page: http://sourceforge.net/projects/kses
  24. # Paper mail: Ulf Harnhammar
  25. # Ymergatan 17 C
  26. # 753 25 Uppsala
  27. # SWEDEN
  28. #
  29. # [kses strips evil scripts!]
  function kses($string, $allowed_html, $allowed_protocols =
                array('http', 'https', 'ftp', 'news', 'nntp', 'telnet',
                      'gopher', 'mailto'))
  ###############################################################################
  # Main entry point of the filter. The text is first cleaned (NULL characters,
  # Netscape 4 JavaScript entities, entity normalisation, user hook) and is then
  # handed to kses_split(), which filters every tag against $allowed_html and
  # every URL-like attribute value against $allowed_protocols.
  #
  # $string            - the text to filter; any slashes added by PHP's magic
  #                      quotes must already have been removed by the caller
  # $allowed_html      - array of element name => array of attribute name =>
  #                      checks; keys are matched case-insensitively
  # $allowed_protocols - list of URL protocols that may appear in values
  #
  # Returns the filtered text.
  ###############################################################################
  {
    # Text-level clean-up, in the same order every time
    $cleaned = kses_no_null($string);
    $cleaned = kses_js_entities($cleaned);
    $cleaned = kses_normalize_entities($cleaned);
    $cleaned = kses_hook($cleaned);

    # Tag-level filtering works on a lower-cased copy of the whitelist
    return kses_split($cleaned, kses_array_lc($allowed_html),
                      $allowed_protocols);
  } # function kses
  function kses_hook($string)
  ###############################################################################
  # Extension point: kses() passes the text through this function after entity
  # normalisation and before the tags are filtered. No hooks are installed by
  # default, so the text is returned untouched.
  ###############################################################################
  {
    $hooked = $string; # nothing to do unless a hook is added here
    return $hooked;
  } # function kses_hook
  function kses_version()
  ###############################################################################
  # Reports the version number of this kses release as a string.
  ###############################################################################
  {
    $version = '0.2.2';
    return $version;
  } # function kses_version
  function kses_split($string, $allowed_html, $allowed_protocols)
  ###############################################################################
  # This function searches for HTML tags, no matter how malformed. It also
  # matches stray ">" characters. Every match is replaced by whatever
  # kses_split2() returns for it.
  #
  # $string            - the text to scan
  # $allowed_html      - element/attribute whitelist with lower-case keys
  # $allowed_protocols - list of allowed URL protocols
  #
  # Returns the text with every tag and stray ">" filtered.
  #
  # The original implementation used preg_replace() with the /e modifier. That
  # modifier was deprecated in PHP 5.5 and removed in PHP 7.0, where the call
  # fails and returns NULL. preg_replace_callback() does the same job without
  # evaluating code built from a string.
  ###############################################################################
  {
    $pattern = '%(<'.     # EITHER: <
               '[^>]*'.   # things that aren't >
               '(>|$)'.   # > or end of string
               '|>)%';    # OR: just a >

    return preg_replace_callback(
      $pattern,
      function ($match) use ($allowed_html, $allowed_protocols)
      {
        # /e used to hand the match over with a backslash in front of every
        # double quote, and kses_split2() undoes exactly that with
        # kses_stripslashes(). Re-create the escaping here so that a literal
        # \" in the input is not mistaken for an escaped quote.
        $tag = str_replace('"', '\\"', $match[1]);
        return kses_split2($tag, $allowed_html, $allowed_protocols);
      },
      $string);
  } # function kses_split
  function kses_split2($string, $allowed_html, $allowed_protocols)
  ###############################################################################
  # Filters one match found by kses_split(): either a tag or a stray ">".
  #
  # - a stray ">" becomes "&gt;"
  # - a tag that is too malformed to parse (like <:::>) becomes ''
  # - a tag whose element is not a key of $allowed_html becomes ''
  # - a closing tag is rebuilt without any attributes
  # - any other tag is split into element and attribute list and passed on to
  #   kses_attr()
  ###############################################################################
  {
    # The tag text arrives with \" in place of ", so undo that first
    $tag = kses_stripslashes($string);

    if (substr($tag, 0, 1) != '<')
      return '&gt;'; # it matched a ">" character

    $parts = array();
    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $tag, $parts))
      return ''; # it's seriously malformed

    $closing = trim($parts[1]); # '/' for a closing tag, '' otherwise
    $name = $parts[2];
    $attributes = $parts[3];

    if (!@isset($allowed_html[strtolower($name)]))
      return ''; # they are using a not allowed HTML element

    if ($closing != '')
      return '<'.$closing.$name.'>'; # closing elements never get attributes

    return kses_attr($closing.$name, $attributes, $allowed_html,
                     $allowed_protocols);
  } # function kses_split2
  function kses_attr($element, $attr, $allowed_html, $allowed_protocols)
  ###############################################################################
  # Rebuilds an opening tag for $element from its raw attribute list $attr.
  #
  # If no attributes are allowed for the element, all of them are dropped.
  # Otherwise kses_hair() splits the list, and only attributes that are listed
  # in $allowed_html for this element (and that pass every check configured for
  # them) are kept. Any "<" or ">" left in the kept attributes is removed, and
  # a closing XHTML slash in the input is carried over to the output.
  #
  # Returns the rebuilt tag as a string.
  ###############################################################################
  {
    $elem_key = strtolower($element);

    # Is there a closing XHTML slash at the end of the attributes?
    $xhtml_slash = preg_match('%\s/\s*$%', $attr) ? ' /' : '';

    # Are any attributes allowed at all for this element?
    if (@count($allowed_html[$elem_key]) == 0)
      return "<$element$xhtml_slash>";

    # Keep the allowed attributes, in their original order
    $attr2 = '';
    foreach (kses_hair($attr, $allowed_protocols) as $item)
    {
      $attr_key = strtolower($item['name']);

      if (!@isset($allowed_html[$elem_key][$attr_key]))
        continue; # the attribute is not allowed

      $checks = $allowed_html[$elem_key][$attr_key];

      # A non-array entry means "allowed, no checks"; an array lists the
      # checks, and the first failing one rejects the attribute
      $passes = true;
      if (is_array($checks))
      {
        foreach ($checks as $check_name => $check_arg)
        {
          if (!kses_check_attr_val($item['value'], $item['vless'],
                                   $check_name, $check_arg))
          {
            $passes = false;
            break;
          }
        }
      }

      if ($passes)
        $attr2 .= ' '.$item['whole'];
    } # foreach

    # Remove any "<" or ">" characters
    $attr2 = preg_replace('/[<>]/', '', $attr2);

    return "<$element$attr2$xhtml_slash>";
  } # function kses_attr
  function kses_hair($attr, $allowed_protocols)
  ###############################################################################
  # Parses a raw attribute list into an array of attributes. Each entry is an
  # array with the keys
  #   'name'  - the attribute name as written
  #   'value' - the value after kses_bad_protocol() ('' if valueless)
  #   'whole' - the attribute rebuilt as HTML code
  #   'vless' - 'y' for a valueless attribute ("selected"), otherwise 'n'
  #
  # The parser is a small state machine: state 0 expects an attribute name,
  # state 1 an equals sign (or whitespace, which makes the attribute
  # valueless), state 2 a value. Whenever a state cannot make progress, the
  # offending text is skipped with kses_html_error() and parsing restarts in
  # state 0. Unquoted values get double quotes in 'whole'.
  ###############################################################################
  {
    $attrarr = array();
    $mode = 0;
    $attrname = '';

    # The accepted value notations, tried in this order. Each row holds the
    # pattern that captures the value, the pattern that removes it from the
    # input, and the quote character used when the attribute is rebuilt.
    $value_forms = array(
      array('/^"([^"]*)"(\s+|$)/', '/^"[^"]*"(\s+|$)/', '"'),   # "value"
      array("/^'([^']*)'(\s+|$)/", "/^'[^']*'(\s+|$)/", "'"),   # 'value'
      array("%^([^\s\"']+)(\s+|$)%", "%^[^\s\"']+(\s+|$)%", '"'), # value
    );

    # Loop through the whole attribute list
    while (strlen($attr) != 0)
    {
      $progressed = false; # did this round consume anything?

      if ($mode == 0)
      {
        # attribute name, href for instance
        if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
        {
          $attrname = $match[1];
          $mode = 1;
          $progressed = true;
          $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
        }
      }
      elseif ($mode == 1)
      {
        # equals sign or valueless ("selected")
        if (preg_match('/^\s*=\s*/', $attr))
        {
          $mode = 2;
          $progressed = true;
          $attr = preg_replace('/^\s*=\s*/', '', $attr);
        }
        elseif (preg_match('/^\s+/', $attr))
        {
          $attrarr[] = array
            ('name' => $attrname,
             'value' => '',
             'whole' => $attrname,
             'vless' => 'y');
          $mode = 0;
          $progressed = true;
          $attr = preg_replace('/^\s+/', '', $attr);
        }
      }
      elseif ($mode == 2)
      {
        # attribute value, a URL after href= for instance
        foreach ($value_forms as $form)
        {
          if (!preg_match($form[0], $attr, $match))
            continue;

          $thisval = kses_bad_protocol($match[1], $allowed_protocols);
          $attrarr[] = array
            ('name' => $attrname,
             'value' => $thisval,
             'whole' => $attrname.'='.$form[2].$thisval.$form[2],
             'vless' => 'n');
          $mode = 0;
          $progressed = true;
          $attr = preg_replace($form[1], '', $attr);
          break; # only the first notation that fits is used
        }
      }

      if (!$progressed) # not well formed, remove and try again
      {
        $attr = kses_html_error($attr);
        $mode = 0;
      }
    } # while

    if ($mode == 1)
    {
      # special case, for when the attribute list ends with a valueless
      # attribute like "selected"
      $attrarr[] = array
        ('name' => $attrname,
         'value' => '',
         'whole' => $attrname,
         'vless' => 'y');
    }

    return $attrarr;
  } # function kses_hair
  function kses_check_attr_val($value, $vless, $checkname, $checkvalue)
  ###############################################################################
  # Runs one named check against an attribute value and reports whether the
  # value passes. The check name is matched case-insensitively.
  #
  # $value      - the attribute value
  # $vless      - 'y' if the attribute was written without a value, else 'n'
  # $checkname  - "maxlen", "minlen", "maxval", "minval" or "valueless"
  # $checkvalue - the limit (or the y/n flag) the check compares against
  #
  # Returns true if the value passes; an unknown check name always passes.
  ###############################################################################
  {
    $check = strtolower($checkname);

    # Shape required by maxval/minval: a non-negative integer of at most six
    # digits with at most six whitespace characters on either side
    $int_pattern = '/^\s{0,6}[0-9]{1,6}\s{0,6}$/';

    if ($check == 'maxlen')
    {
      # the value must not be longer than the limit
      return !(strlen($value) > $checkvalue);
    }

    if ($check == 'minlen')
    {
      # the value must not be shorter than the limit
      return !(strlen($value) < $checkvalue);
    }

    if ($check == 'maxval')
    {
      # a well-formed integer that is not greater than the limit
      return preg_match($int_pattern, $value) && !($value > $checkvalue);
    }

    if ($check == 'minval')
    {
      # a well-formed integer that is not smaller than the limit
      return preg_match($int_pattern, $value) && !($value < $checkvalue);
    }

    if ($check == 'valueless')
    {
      # "y"/"Y": the attribute must not have a value (<option selected>);
      # "n"/"N": it must have one (<a href="blah">)
      return strtolower($checkvalue) == $vless;
    }

    return true; # no such check
  } # function kses_check_attr_val
  function kses_bad_protocol($string, $allowed_protocols)
  ###############################################################################
  # Strips every non-allowed protocol from the beginning of $string. NULL
  # characters and \xad bytes are removed first, and then
  # kses_bad_protocol_once() is applied over and over until the text stops
  # changing, so stacked protocols like "javascript:javascript:alert(57)"
  # cannot slip through.
  #
  # Returns the cleaned string.
  ###############################################################################
  {
    $current = kses_no_null($string);
    $current = preg_replace('/\xad+/', '', $current); # deals with Opera "feature"

    # At least one pass is always made; stop once a pass changes nothing
    do
    {
      $previous = $current;
      $current = kses_bad_protocol_once($current, $allowed_protocols);
    }
    while ($current != $previous);

    return $current;
  } # function kses_bad_protocol
  function kses_no_null($string)
  ###############################################################################
  # Removes NULL characters from $string: first every real NUL byte, then every
  # literal two-character sequence of a backslash followed by "0".
  ###############################################################################
  {
    $without_bytes = preg_replace('/\0+/', '', $string);
    return preg_replace('/(\\\\0)+/', '', $without_bytes);
  } # function kses_no_null
  function kses_stripslashes($string)
  ###############################################################################
  # Turns every \" in $string into a plain " and leaves all other backslashes
  # alone. kses_split2() runs each matched tag through this function because
  # the tag text reaches it with its double quotes backslash-escaped.
  ###############################################################################
  {
    $escaped_quote = '%\\\\"%';
    $plain_quote = '"';
    return preg_replace($escaped_quote, $plain_quote, $string);
  } # function kses_stripslashes
  function kses_array_lc($inarray)
  ###############################################################################
  # Returns a copy of a two-level array (element => attribute => checks) in
  # which the keys of both levels are lower case. The values stored under the
  # second-level keys are copied unchanged.
  ###############################################################################
  {
    $lowered = array();

    foreach ($inarray as $element => $attributes)
    {
      # Rebuild the attribute list of this element with lower-case names
      $inner = array();
      foreach ($attributes as $attr_name => $attr_checks)
        $inner[strtolower($attr_name)] = $attr_checks;

      $lowered[strtolower($element)] = $inner;
    } # foreach $inarray

    return $lowered;
  } # function kses_array_lc
  function kses_js_entities($string)
  ###############################################################################
  # Removes "&{...};" JavaScript entities (a feature of early Netscape 4
  # versions) from $string. An entity that is never closed is removed up to
  # the end of the string.
  ###############################################################################
  {
    $js_entity = '%&\s*\{[^}]*(\}\s*;?|$)%';
    return preg_replace($js_entity, '', $string);
  } # function kses_js_entities
  function kses_html_error($string)
  ###############################################################################
  # Recovery step for parsing errors in kses_hair(): drops everything from the
  # start of $string up to and including the next run of whitespace. Quoted and
  # apostrophed sections are skipped as a whole, so whitespace inside them does
  # not end the skipped part.
  ###############################################################################
  {
    $skip = '/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/';
    return preg_replace($skip, '', $string);
  } # function kses_html_error
  function kses_bad_protocol_once($string, $allowed_protocols)
  ###############################################################################
  # Performs one protocol-stripping pass. $string is split at the first colon
  # (written as ":", "&#58;" or "&#x3a;"). Unless the part before it contains
  # the character sequence "/?", that part is treated as a protocol, replaced
  # by the result of kses_bad_protocol_once2(), and put in front of the trimmed
  # remainder. Without a colon the string is returned unchanged.
  ###############################################################################
  {
    $pieces = preg_split('/:|&#58;|&#x3a;/i', $string, 2);

    if (!isset($pieces[1]))
      return $string; # no colon, so no protocol

    if (preg_match('%/\?%', $pieces[0]))
      return $string; # the part before the colon is not taken as a protocol

    return kses_bad_protocol_once2($pieces[0], $allowed_protocols).
           trim($pieces[1]);
  } # function kses_bad_protocol_once
  function kses_bad_protocol_once2($string, $allowed_protocols)
  ###############################################################################
  # Decides what to do with one protocol candidate. The candidate is decoded
  # (numeric entities), stripped of whitespace, NULL characters and \xad bytes,
  # and lower-cased before it is compared with the whitelist.
  #
  # Returns the cleaned protocol followed by ":" if it is in
  # $allowed_protocols, and '' if it is not.
  ###############################################################################
  {
    $candidate = kses_decode_entities($string);
    $candidate = preg_replace('/\s/', '', $candidate);
    $candidate = kses_no_null($candidate);
    $candidate = preg_replace('/\xad+/', '', $candidate);
    # deals with Opera "feature"
    $candidate = strtolower($candidate);

    foreach ($allowed_protocols as $one_protocol)
    {
      if (strtolower($one_protocol) == $candidate)
        return "$candidate:"; # whitelisted
    }

    return ''; # not in the whitelist
  } # function kses_bad_protocol_once2
  function kses_normalize_entities($string)
  ###############################################################################
  # This function normalizes HTML entities. It will convert "AT&T" to the correct
  # "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
  #
  # $string - the text to normalize
  #
  # Returns the text in which every "&" that does not start a well-formed
  # entity has been turned into "&amp;".
  #
  # The decimal entities used to be handled by preg_replace() with the /e
  # modifier. That modifier was deprecated in PHP 5.5 and removed in PHP 7.0,
  # where the call fails and returns NULL, so preg_replace_callback() is used
  # instead.
  ###############################################################################
  {
    # Disarm all entities by converting & to &amp;
    $string = str_replace('&', '&amp;', $string);

    # Change back the allowed entities in our entity whitelist

    # named entities: a letter followed by up to 19 letters or digits
    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/',
                           '&\\1;', $string);

    # decimal entities: leading zeroes are dropped, up to five digits remain
    $string = preg_replace_callback(
      '/&amp;#0*([0-9]{1,5});/',
      function ($match)
      {
        return kses_normalize_entities2($match[1]);
      },
      $string);

    # hexadecimal entities: leading zeroes are dropped, two or four hex
    # digits remain
    $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/',
                           '&#\\1\\2;', $string);

    return $string;
  } # function kses_normalize_entities
  function kses_normalize_entities2($i)
  ###############################################################################
  # This function helps kses_normalize_entities() to only accept 16 bit values
  # and nothing more for &#number; entities.
  #
  # $i - the digits of a decimal entity, without leading zeroes
  #
  # Returns "&#$i;" if the number fits in 16 bits, otherwise the disarmed
  # form "&amp;#$i;".
  ###############################################################################
  {
    return (($i > 65535) ? "&amp;#$i;" : "&#$i;");
  } # function kses_normalize_entities2
  function kses_decode_entities($string)
  ###############################################################################
  # This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
  # do anything with other entities like &auml;, but we don't need them in the
  # URL protocol whitelisting system anyway.
  #
  # $string - the text to decode
  #
  # Returns the text with every numeric entity replaced by the single byte
  # whose value is the entity's number modulo 256.
  #
  # The original code used preg_replace() with the /e modifier, which was
  # deprecated in PHP 5.5 and removed in PHP 7.0 (the call then fails and
  # returns NULL). preg_replace_callback() is used instead. The code point is
  # reduced modulo 256 explicitly, which is what chr() did implicitly with
  # out-of-range numbers, and fmod() keeps very long digit runs from
  # overflowing the integer conversion.
  ###############################################################################
  {
    # decimal entities
    $string = preg_replace_callback(
      '/&#([0-9]+);/',
      function ($match)
      {
        return chr((int) fmod((float) $match[1], 256.0));
      },
      $string);

    # hexadecimal entities
    $string = preg_replace_callback(
      '/&#[Xx]([0-9A-Fa-f]+);/',
      function ($match)
      {
        return chr((int) fmod(hexdec($match[1]), 256.0));
      },
      $string);

    return $string;
  } # function kses_decode_entities
  451. ?>