generate-entity-file.php 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. #!/usr/bin/php
  2. <?php
  // Make this script's own directory the working directory so that the
  // relative paths used below resolve from here.
  chdir(dirname(__FILE__));
  require_once 'common.php';
  // NOTE(review): assertCli() is defined outside this file (presumably in
  // common.php); by its name it guards against non-CLI invocation - confirm.
  assertCli();

  /**
   * @file
   * Reads the *.ent files, builds an entity lookup table from them,
   * serializes that table and writes it to a file. The written file acts
   * as a cache, so this script does not have to be run in normal use; it
   * should rarely, if ever, be needed, because HTML's entities are fairly
   * immutable.
   */

  // Destination file for the serialized lookup table.
  $output_file = '../library/HTMLPurifier/EntityLookup/entities.ser';

  // Directory containing the entity files, relative to the location of this
  // PHP file (the working directory). The trailing slash is required.
  $entity_dir = '../docs/entities/';
  18. // courtesy of a PHP manual comment
  /**
   * Encodes a Unicode code point as a UTF-8 byte string.
   *
   * Code points below U+10000 produce the same 1 to 3 byte sequences as
   * before. Code points from U+10000 upward are now encoded as 4-byte
   * sequences; previously they fell into the 3-byte branch and produced
   * an incorrect string.
   *
   * No range validation is performed: the caller is expected to pass a
   * value between 0 and 0x10FFFF.
   *
   * @param int|string $dec Code point in decimal. Numeric strings (for
   *                        example regex captures) are accepted and cast
   *                        to int.
   * @return string The UTF-8 encoding of the code point.
   */
  function unichr($dec) {
      $code = (int) $dec;

      // 1 byte: 0xxxxxxx
      if ($code < 0x80) {
          return chr($code);
      }

      // 2 bytes: 110xxxxx 10xxxxxx
      if ($code < 0x800) {
          return chr(0xC0 | ($code >> 6))
              . chr(0x80 | ($code & 0x3F));
      }

      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      if ($code < 0x10000) {
          return chr(0xE0 | ($code >> 12))
              . chr(0x80 | (($code >> 6) & 0x3F))
              . chr(0x80 | ($code & 0x3F));
      }

      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return chr(0xF0 | ($code >> 18))
          . chr(0x80 | (($code >> 12) & 0x3F))
          . chr(0x80 | (($code >> 6) & 0x3F))
          . chr(0x80 | ($code & 0x3F));
  }
  // Main script body: collect the *.ent files, parse their entity
  // declarations into a name => UTF-8 character table, and write the
  // serialized table to $output_file.
  //
  // Every failure is reported on STDERR and ends the script with exit
  // status 1; exit("message") would print the text but still report
  // success (status 0) to the calling shell.

  if (!is_dir($entity_dir)) {
      fwrite(STDERR, "Fatal Error: Can't find entity directory.\n");
      exit(1);
  }
  // An existing output file is never overwritten; it has to be removed by
  // hand before the table can be regenerated.
  if (file_exists($output_file)) {
      fwrite(STDERR, "Fatal Error: $output_file already exists.\n");
      exit(1);
  }

  $dh = opendir($entity_dir);
  if (!$dh) {
      fwrite(STDERR, "Fatal Error: Cannot read entity directory.\n");
      exit(1);
  }

  // Keep only visible files whose extension is exactly "ent".
  $entity_files = array();
  while (($file = readdir($dh)) !== false) {
      if ($file[0] === '.') {
          continue;
      }
      if (pathinfo($file, PATHINFO_EXTENSION) !== 'ent') {
          continue;
      }
      $entity_files[] = $file;
  }
  closedir($dh);

  if (!$entity_files) {
      fwrite(STDERR, "Fatal Error: No entity files to parse.\n");
      exit(1);
  }

  // readdir() returns entries in filesystem order. Sorting makes the key
  // order of the table, and therefore the serialized output, reproducible.
  sort($entity_files);

  $entity_table = array();
  // Matches <!ENTITY name "&#NNN;"> as well as the double-escaped form
  // <!ENTITY name "&#38;#NNN;">. Names may contain digits after the first
  // letter (frac12, sup2, there4, ...); a letters-only pattern skips them.
  $regexp = '/<!ENTITY\s+([A-Za-z][A-Za-z0-9]*)\s+"&#(?:38;#)?([0-9]+);">/';
  foreach ($entity_files as $file) {
      $contents = file_get_contents($entity_dir . $file);
      if ($contents === false) {
          fwrite(STDERR, "Fatal Error: Cannot read entity file $file.\n");
          exit(1);
      }
      $matches = array();
      preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
      foreach ($matches as $match) {
          // $match[1] is the entity name, $match[2] its decimal code point.
          $entity_table[$match[1]] = unichr((int) $match[2]);
      }
  }

  // Writing an empty table would leave behind a useless file that also
  // blocks the next run through the "already exists" check above.
  if (!$entity_table) {
      fwrite(STDERR, "Fatal Error: No entities found in entity files.\n");
      exit(1);
  }

  $output = serialize($entity_table);
  // file_put_contents() returns the number of bytes written or false, so
  // this comparison catches both a failed and a short write.
  if (file_put_contents($output_file, $output) !== strlen($output)) {
      fwrite(STDERR, "Fatal Error: Cannot write $output_file.\n");
      exit(1);
  }
  echo "Completed successfully.";
  59. // vim: et sw=4 sts=4