FontFamily.php 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. <?php
  2. /**
  3. * Validates a font family list according to CSS spec
  4. */
  5. class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  6. {
  7. protected $mask = null;
  8. public function __construct() {
  9. $this->mask = '_- ';
  10. for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
  11. for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
  12. for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
  13. // special bytes used by UTF-8
  14. for ($i = 0x80; $i <= 0xFF; $i++) {
  15. // We don't bother excluding invalid bytes in this range,
  16. // because the our restriction of well-formed UTF-8 will
  17. // prevent these from ever occurring.
  18. $this->mask .= chr($i);
  19. }
  20. /*
  21. PHP's internal strcspn implementation is
  22. O(length of string * length of mask), making it inefficient
  23. for large masks. However, it's still faster than
  24. preg_match 8)
  25. for (p = s1;;) {
  26. spanp = s2;
  27. do {
  28. if (*spanp == c || p == s1_end) {
  29. return p - s1;
  30. }
  31. } while (spanp++ < (s2_end - 1));
  32. c = *++p;
  33. }
  34. */
  35. // possible optimization: invert the mask.
  36. }
  37. public function validate($string, $config, $context) {
  38. static $generic_names = array(
  39. 'serif' => true,
  40. 'sans-serif' => true,
  41. 'monospace' => true,
  42. 'fantasy' => true,
  43. 'cursive' => true
  44. );
  45. $allowed_fonts = $config->get('CSS.AllowedFonts');
  46. // assume that no font names contain commas in them
  47. $fonts = explode(',', $string);
  48. $final = '';
  49. foreach($fonts as $font) {
  50. $font = trim($font);
  51. if ($font === '') continue;
  52. // match a generic name
  53. if (isset($generic_names[$font])) {
  54. if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
  55. $final .= $font . ', ';
  56. }
  57. continue;
  58. }
  59. // match a quoted name
  60. if ($font[0] === '"' || $font[0] === "'") {
  61. $length = strlen($font);
  62. if ($length <= 2) continue;
  63. $quote = $font[0];
  64. if ($font[$length - 1] !== $quote) continue;
  65. $font = substr($font, 1, $length - 2);
  66. }
  67. $font = $this->expandCSSEscape($font);
  68. // $font is a pure representation of the font name
  69. if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
  70. continue;
  71. }
  72. if (ctype_alnum($font) && $font !== '') {
  73. // very simple font, allow it in unharmed
  74. $final .= $font . ', ';
  75. continue;
  76. }
  77. // bugger out on whitespace. form feed (0C) really
  78. // shouldn't show up regardless
  79. $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
  80. // Here, there are various classes of characters which need
  81. // to be treated differently:
  82. // - Alphanumeric characters are essentially safe. We
  83. // handled these above.
  84. // - Spaces require quoting, though most parsers will do
  85. // the right thing if there aren't any characters that
  86. // can be misinterpreted
  87. // - Dashes rarely occur, but they fairly unproblematic
  88. // for parsing/rendering purposes.
  89. // The above characters cover the majority of Western font
  90. // names.
  91. // - Arbitrary Unicode characters not in ASCII. Because
  92. // most parsers give little thought to Unicode, treatment
  93. // of these codepoints is basically uniform, even for
  94. // punctuation-like codepoints. These characters can
  95. // show up in non-Western pages and are supported by most
  96. // major browsers, for example: "MS 明朝" is a
  97. // legitimate font-name
  98. // <http://ja.wikipedia.org/wiki/MS_明朝>. See
  99. // the CSS3 spec for more examples:
  100. // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
  101. // You can see live samples of these on the Internet:
  102. // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
  103. // However, most of these fonts have ASCII equivalents:
  104. // for example, 'MS Mincho', and it's considered
  105. // professional to use ASCII font names instead of
  106. // Unicode font names. Thanks Takeshi Terada for
  107. // providing this information.
  108. // The following characters, to my knowledge, have not been
  109. // used to name font names.
  110. // - Single quote. While theoretically you might find a
  111. // font name that has a single quote in its name (serving
  112. // as an apostrophe, e.g. Dave's Scribble), I haven't
  113. // been able to find any actual examples of this.
  114. // Internet Explorer's cssText translation (which I
  115. // believe is invoked by innerHTML) normalizes any
  116. // quoting to single quotes, and fails to escape single
  117. // quotes. (Note that this is not IE's behavior for all
  118. // CSS properties, just some sort of special casing for
  119. // font-family). So a single quote *cannot* be used
  120. // safely in the font-family context if there will be an
  121. // innerHTML/cssText translation. Note that Firefox 3.x
  122. // does this too.
  123. // - Double quote. In IE, these get normalized to
  124. // single-quotes, no matter what the encoding. (Fun
  125. // fact, in IE8, the 'content' CSS property gained
  126. // support, where they special cased to preserve encoded
  127. // double quotes, but still translate unadorned double
  128. // quotes into single quotes.) So, because their
  129. // fixpoint behavior is identical to single quotes, they
  130. // cannot be allowed either. Firefox 3.x displays
  131. // single-quote style behavior.
  132. // - Backslashes are reduced by one (so \\ -> \) every
  133. // iteration, so they cannot be used safely. This shows
  134. // up in IE7, IE8 and FF3
  135. // - Semicolons, commas and backticks are handled properly.
  136. // - The rest of the ASCII punctuation is handled properly.
  137. // We haven't checked what browsers do to unadorned
  138. // versions, but this is not important as long as the
  139. // browser doesn't /remove/ surrounding quotes (as IE does
  140. // for HTML).
  141. //
  142. // With these results in hand, we conclude that there are
  143. // various levels of safety:
  144. // - Paranoid: alphanumeric, spaces and dashes(?)
  145. // - International: Paranoid + non-ASCII Unicode
  146. // - Edgy: Everything except quotes, backslashes
  147. // - NoJS: Standards compliance, e.g. sod IE. Note that
  148. // with some judicious character escaping (since certain
  149. // types of escaping doesn't work) this is theoretically
  150. // OK as long as innerHTML/cssText is not called.
  151. // We believe that international is a reasonable default
  152. // (that we will implement now), and once we do more
  153. // extensive research, we may feel comfortable with dropping
  154. // it down to edgy.
  155. // Edgy: alphanumeric, spaces, dashes, underscores and Unicode. Use of
  156. // str(c)spn assumes that the string was already well formed
  157. // Unicode (which of course it is).
  158. if (strspn($font, $this->mask) !== strlen($font)) {
  159. continue;
  160. }
  161. // Historical:
  162. // In the absence of innerHTML/cssText, these ugly
  163. // transforms don't pose a security risk (as \\ and \"
  164. // might--these escapes are not supported by most browsers).
  165. // We could try to be clever and use single-quote wrapping
  166. // when there is a double quote present, but I have choosen
  167. // not to implement that. (NOTE: you can reduce the amount
  168. // of escapes by one depending on what quoting style you use)
  169. // $font = str_replace('\\', '\\5C ', $font);
  170. // $font = str_replace('"', '\\22 ', $font);
  171. // $font = str_replace("'", '\\27 ', $font);
  172. // font possibly with spaces, requires quoting
  173. $final .= "'$font', ";
  174. }
  175. $final = rtrim($final, ', ');
  176. if ($final === '') return false;
  177. return $final;
  178. }
  179. }
  180. // vim: et sw=4 sts=4