EntityParser.php 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. <?php
  2. // If we want to implement error collecting here, we'll need to use some
  3. // sort of global data (probably trigger_error), because it's impossible to
  4. // pass $config or $context to the callback functions.
  5. /**
  6. * Handles referencing and derefencing character entities
  7. */
  8. class HTMLPurifier_EntityParser
  9. {
  10. /**
  11. * Reference to entity lookup table.
  12. */
  13. protected $_entity_lookup;
  14. /**
  15. * Callback regex string for parsing entities.
  16. */
  17. protected $_substituteEntitiesRegex =
  18. '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
  19. // 1. hex 2. dec 3. string (XML style)
  20. /**
  21. * Decimal to parsed string conversion table for special entities.
  22. */
  23. protected $_special_dec2str =
  24. array(
  25. 34 => '"',
  26. 38 => '&',
  27. 39 => "'",
  28. 60 => '<',
  29. 62 => '>'
  30. );
  31. /**
  32. * Stripped entity names to decimal conversion table for special entities.
  33. */
  34. protected $_special_ent2dec =
  35. array(
  36. 'quot' => 34,
  37. 'amp' => 38,
  38. 'lt' => 60,
  39. 'gt' => 62
  40. );
  41. /**
  42. * Substitutes non-special entities with their parsed equivalents. Since
  43. * running this whenever you have parsed character is t3h 5uck, we run
  44. * it before everything else.
  45. *
  46. * @param $string String to have non-special entities parsed.
  47. * @returns Parsed string.
  48. */
  49. public function substituteNonSpecialEntities($string) {
  50. // it will try to detect missing semicolons, but don't rely on it
  51. return preg_replace_callback(
  52. $this->_substituteEntitiesRegex,
  53. array($this, 'nonSpecialEntityCallback'),
  54. $string
  55. );
  56. }
  57. /**
  58. * Callback function for substituteNonSpecialEntities() that does the work.
  59. *
  60. * @param $matches PCRE matches array, with 0 the entire match, and
  61. * either index 1, 2 or 3 set with a hex value, dec value,
  62. * or string (respectively).
  63. * @returns Replacement string.
  64. */
  65. protected function nonSpecialEntityCallback($matches) {
  66. // replaces all but big five
  67. $entity = $matches[0];
  68. $is_num = (@$matches[0][1] === '#');
  69. if ($is_num) {
  70. $is_hex = (@$entity[2] === 'x');
  71. $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
  72. // abort for special characters
  73. if (isset($this->_special_dec2str[$code])) return $entity;
  74. return HTMLPurifier_Encoder::unichr($code);
  75. } else {
  76. if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
  77. if (!$this->_entity_lookup) {
  78. $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
  79. }
  80. if (isset($this->_entity_lookup->table[$matches[3]])) {
  81. return $this->_entity_lookup->table[$matches[3]];
  82. } else {
  83. return $entity;
  84. }
  85. }
  86. }
  87. /**
  88. * Substitutes only special entities with their parsed equivalents.
  89. *
  90. * @notice We try to avoid calling this function because otherwise, it
  91. * would have to be called a lot (for every parsed section).
  92. *
  93. * @param $string String to have non-special entities parsed.
  94. * @returns Parsed string.
  95. */
  96. public function substituteSpecialEntities($string) {
  97. return preg_replace_callback(
  98. $this->_substituteEntitiesRegex,
  99. array($this, 'specialEntityCallback'),
  100. $string);
  101. }
  102. /**
  103. * Callback function for substituteSpecialEntities() that does the work.
  104. *
  105. * This callback has same syntax as nonSpecialEntityCallback().
  106. *
  107. * @param $matches PCRE-style matches array, with 0 the entire match, and
  108. * either index 1, 2 or 3 set with a hex value, dec value,
  109. * or string (respectively).
  110. * @returns Replacement string.
  111. */
  112. protected function specialEntityCallback($matches) {
  113. $entity = $matches[0];
  114. $is_num = (@$matches[0][1] === '#');
  115. if ($is_num) {
  116. $is_hex = (@$entity[2] === 'x');
  117. $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
  118. return isset($this->_special_dec2str[$int]) ?
  119. $this->_special_dec2str[$int] :
  120. $entity;
  121. } else {
  122. return isset($this->_special_ent2dec[$matches[3]]) ?
  123. $this->_special_ent2dec[$matches[3]] :
  124. $entity;
  125. }
  126. }
  127. }
  128. // vim: et sw=4 sts=4