langstats_file_builder.php 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  <?php
  /* For licensing terms, see /license.txt */
  /**
   * This script generates a directory based on the English language variables
   * but only composed of the 10,000 (can be configured) most frequent words
   * used in Chamilo. This implies first using the langstats.php script, which
   * in turn implies configuring an additional variable in configuration.php
   * (see langstats.php for more info).
   * When running the language_builder, please make sure this parameter is
   * set to 0 in the configuration.php file, otherwise it will take *ages*.
   *
   * Outline of what the script does:
   *  1. asks the langstats object for the most popular terms;
   *  2. keeps terms until the cumulated English word count passes $words_limit;
   *  3. scans every file of the English language folder and copies only the
   *     lines defining one of the kept terms into <archive>/langstats/english/;
   *  4. prints a summary and the list of kept terms not found in any file.
   */
  require_once '../../inc/global.inc.php';
  require_once 'langstats.class.php';
  global $_configuration;
  // Switch the frequency measurement off for the duration of this run.
  // NOTE(review): presumably this is the parameter the header comment refers
  // to - confirm that get_lang() reads it at call time.
  $_configuration['language_measure_frequency'] = 0;
  $langstats = new langstats();
  $orig_lang = 'english';

  /**
   * Init.
   */
  $words_limit = 10000; //change this if you want more words
  $terms_limit = 3000; //change this if you think you'll need more terms
  $terms = $langstats->get_popular_terms($terms_limit);
  $words_counter = 0;
  $i = 0;
  $terms_in_limit = [];
  $lang_dir = api_get_path(SYS_LANG_PATH);
  $arch_dir = api_get_path(SYS_ARCHIVE_PATH);
  // Folder receiving the reduced language files.
  $out_dir = $arch_dir.'/langstats/'.$orig_lang;

  /**
   * Code run.
   */
  // Keep terms, in the order returned by get_popular_terms(), until the running
  // word total has gone past the limit. The check is done before a term is
  // added, so the final total may exceed $words_limit by the last term's words.
  foreach ($terms as $row) {
      if ($words_counter > $words_limit) {
          break;
      }
      $words = str_word_count(get_lang($row['term_name'], null, $orig_lang));
      $words_counter += $words;
      $terms_in_limit[$row['term_name']] = $i;
      $i++;
  }

  echo "Reached ".count($terms_in_limit)." terms for the $words_counter most-used words<br /><br />\n";
  echo "Scanning English files, trying to find these terms...<br />\n";

  // Create the whole output path in one go, so that an already existing
  // "langstats" folder lacking the language sub-folder is handled as well.
  // The trailing is_dir() covers a folder created concurrently.
  if (!is_dir($out_dir) && !mkdir($out_dir, 0777, true) && !is_dir($out_dir)) {
      echo "Could not create directory ".$out_dir."<br />\n";
      exit;
  }

  $list_files = scandir($lang_dir.'/'.$orig_lang);
  if ($list_files === false) {
      echo "Could not read directory ".$lang_dir.'/'.$orig_lang."<br />\n";
      $list_files = [];
  }

  $terms_found = [];
  $words_found = 0;
  $global_var = []; //keep the combination of all vars
  // From here on $terms_in_limit maps position => term name (used as a plain
  // list of names by the final array_diff())...
  $terms_in_limit = array_flip($terms_in_limit);
  // ...while $terms_lookup maps term name => position for constant-time
  // membership tests inside the scanning loop.
  $terms_lookup = array_flip($terms_in_limit);

  foreach ($list_files as $file) {
      // Skip ".", ".." and hidden entries.
      if (substr($file, 0, 1) == '.') {
          continue;
      }
      $source_path = $lang_dir.'/'.$orig_lang.'/'.$file;
      // Sub-directories cannot be read line by line; ignore them.
      if (!is_file($source_path)) {
          continue;
      }
      $vars = file($source_path);
      if ($vars === false) {
          echo "Could not read ".$source_path."<br />\n";
          continue;
      }

      $local_var = [];
      $file_string = '<?php'."\n";
      foreach ($vars as $line) {
          $var = [];
          // Only lines starting with a PHP variable ("$Something") matter.
          if (preg_match('/^(\$\w*)/', $line, $var) < 1) {
              continue;
          }
          // "$Tools" -> "Tools"
          $plain_name = (string) substr($var[1], 1);
          // "$langTools" -> "Tools": drops the "$" and the next four
          // characters, whatever they are.
          $unprefixed_name = (string) substr($var[1], 5);

          if (isset($terms_lookup[$plain_name])) {
              $term = $plain_name;
          } elseif (isset($terms_lookup[$unprefixed_name])) {
              $term = $unprefixed_name;
          } else {
              continue; // not one of the kept terms
          }

          $local_var[$var[1]] = $line;
          $file_string .= $line;
          $terms_found[] = $term;
          // Always look the term up by its bare name (no "$", no prefix), the
          // same form used when the words were counted in the first loop.
          $words_found += str_word_count(get_lang($term, null, $orig_lang));
      }

      echo "Writing ".$out_dir.'/'.$file."<br />\n";
      if (file_put_contents($out_dir.'/'.$file, $file_string) === false) {
          echo "Could not write ".$out_dir.'/'.$file."<br />\n";
      }
      // Array union: a variable already recorded from an earlier file keeps
      // its first definition.
      $global_var += $local_var;
  }

  $terms_diff = count($global_var) - count($terms_in_limit);
  echo count(
      $global_var
  )." terms found in English files (summing up to $words_found words). Some terms ($terms_diff in this case) might have appeared in two different files<br />";

  /**
   * Display results.
   */
  echo "Difference between filtered and found in English:<br />";
  // Kept terms for which no defining line was found in any scanned file.
  echo "<pre>".print_r(array_diff($terms_in_limit, $terms_found), true)."</pre>";
  echo "#";