XapianIndexer.class.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. <?php
  2. /* For licensing terms, see /license.txt */
  3. /**
  4. * @package chamilo.include.search
  5. */
  6. /**
  7. * Code
  8. */
  9. require_once 'xapian.php';
  10. require_once dirname(__FILE__) . '/../IndexableChunk.class.php';
  /**
   * Abstract base class wrapping a writable Xapian database.
   *
   * It keeps a queue of chunks (objects exposing "terms", "data" and
   * "xapian_data" properties), turns each of them into a Xapian document and
   * offers helpers to read, update, replace and delete documents.
   *
   * Error conventions (kept as-is for backward compatibility):
   * - connectDb(), add_term_to_doc(), remove_term_from_doc() and
   *   replace_document() return the integer 1 when a Xapian exception occurs;
   * - the read/update helpers return FALSE on error;
   * - index() returns NULL on error.
   *
   * @package chamilo.include.search
   */
  abstract class XapianIndexer {
      /* XapianWritableDatabase, or NULL while not connected */
      protected $db = NULL;
      /* IndexableChunk[] chunks queued by addChunk() and consumed by index() */
      protected $chunks = array();
      /* XapianTermGenerator, or NULL while not connected */
      public $indexer = NULL;
      /* XapianStem, or NULL while not connected */
      public $stemmer = NULL;

      /**
       * Lists the stemming languages known to this class.
       *
       * Keys are short codes, values are the language names handed to
       * XapianStem. connectDb() validates its $lang argument against the
       * VALUES of this array.
       *
       * @return array Array of language codes -> Xapian language names
       */
      public final function xapian_languages() {
          /* http://xapian.org/docs/apidoc/html/classXapian_1_1Stem.html */
          return array(
              'none' => 'none', // don't stem terms
              'da' => 'danish',
              'nl' => 'dutch',
              /* Martin Porter's 2002 revision of his stemmer */
              'en' => 'english',
              /* Lovin's stemmer */
              'lovins' => 'english_lovins',
              /* Porter's stemmer as described in his 1980 paper */
              'porter' => 'english_porter',
              'fi' => 'finnish',
              'fr' => 'french',
              'de' => 'german',
              'it' => 'italian',
              'no' => 'norwegian',
              'pt' => 'portuguese',
              'ru' => 'russian',
              'es' => 'spanish',
              'sv' => 'swedish',
          );
      }

      /**
       * Connects to the database, creating it if it doesn't exist.
       *
       * An already opened database is reused and returned immediately. The
       * db, indexer and stemmer properties are only assigned once all of them
       * were built successfully, so a failure never leaves the object in a
       * half-connected state.
       *
       * @param string $path   Database directory; a NULL/empty value selects
       *                       api_get_path(SYS_PATH) . 'searchdb/'
       * @param int    $dbMode Xapian open mode; NULL selects
       *                       Xapian::DB_CREATE_OR_OPEN
       * @param string $lang   Xapian language name (a value of
       *                       xapian_languages()); anything else falls back
       *                       to 'english'
       * @return mixed The XapianWritableDatabase, or 1 if Xapian threw
       */
      public function connectDb($path = NULL, $dbMode = NULL, $lang = 'english') {
          if ($this->db != NULL) {
              return $this->db;
          }
          // Loose comparisons are intentional: they keep the historical
          // handling of empty values for both optional parameters.
          if ($dbMode == NULL) {
              $dbMode = Xapian::DB_CREATE_OR_OPEN;
          }
          if ($path == NULL) {
              $path = api_get_path(SYS_PATH) . 'searchdb/';
          }
          // Strict check so that a non-string value can never be loosely
          // matched against one of the language names.
          if (!in_array($lang, $this->xapian_languages(), true)) {
              $lang = 'english';
          }
          try {
              $database = new XapianWritableDatabase($path, $dbMode);
              $termGenerator = new XapianTermGenerator();
              $stem = new XapianStem($lang);
              $termGenerator->set_stemmer($stem);
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return 1;
          }
          $this->db = $database;
          $this->indexer = $termGenerator;
          $this->stemmer = $stem;
          return $this->db;
      }

      /**
       * Simple getter for the db attribute
       *
       * @return object The db attribute (NULL while not connected)
       */
      public function getDb() {
          return $this->db;
      }

      /**
       * Queues a chunk for the next call to index().
       *
       * @param object $chunk Chunk exposing "terms", "data" and "xapian_data"
       * @return void
       */
      public function addChunk($chunk) {
          $this->chunks[] = $chunk;
      }

      /**
       * Indexes every queued chunk as a new Xapian document.
       *
       * All queued chunks are written (not only the first one), the database
       * is flushed once at the end and the queue is emptied so that a later
       * call cannot index the same chunks twice. No connection is opened
       * implicitly here, because the stemming language chosen in connectDb()
       * determines how the text is indexed.
       *
       * @return integer ID of the first document created by this call, or
       *                 NULL if nothing was queued, the database is not
       *                 connected, or Xapian reported an error
       */
      public function index() {
          if (empty($this->chunks)) {
              return NULL;
          }
          if (!is_object($this->db) || !is_object($this->indexer)) {
              return NULL;
          }
          try {
              $firstDid = NULL;
              foreach ($this->chunks as $chunk) {
                  $doc = new XapianDocument();
                  $this->indexer->set_document($doc);
                  if (!empty($chunk->terms)) {
                      foreach ($chunk->terms as $term) {
                          /* FIXME: think of getting weight */
                          $doc->add_term($term['flag'] . $term['name'], 1);
                      }
                  }
                  // free-form index all data array (title, content, etc)
                  if (!empty($chunk->data)) {
                      foreach ($chunk->data as $value) {
                          $this->indexer->index_text($value, 1);
                      }
                  }
                  // set_data() takes the payload only
                  $doc->set_data($chunk->xapian_data);
                  $did = $this->db->add_document($doc);
                  if ($firstDid === NULL) {
                      $firstDid = $did;
                  }
              }
              // write to disk
              $this->db->flush();
              $this->chunks = array();
              return $firstDid;
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return NULL;
          }
      }

      /**
       * Get a specific document from xapian db
       *
       * Connects with the default settings first if no database is open.
       *
       * @param int $did Xapian::docid
       * @return mixed XapianDocument, or false on error
       */
      public function get_document($did) {
          if ($this->db == NULL) {
              $this->connectDb();
          }
          // connectDb() leaves db unset when it fails; do not dereference it.
          if (!is_object($this->db)) {
              return false;
          }
          try {
              return $this->db->get_document($did);
          } catch (Exception $e) {
              return false;
          }
      }

      /**
       * Get document data on a xapian document
       *
       * @param XapianDocument $doc xapian document to read the data from
       * @return mixed xapian document data or FALSE if error
       */
      public function get_document_data($doc) {
          // Validate first so that an invalid argument never opens a database.
          if (!($doc instanceof XapianDocument)) {
              return FALSE;
          }
          if ($this->db == NULL) {
              $this->connectDb();
          }
          try {
              return $doc->get_data();
          } catch (Exception $e) {
              return false;
          }
      }

      /**
       * Replace all terms of a document in xapian db
       *
       * @param int    $did    Xapian::docid
       * @param array  $terms  New terms of the document
       * @param string $prefix Prefix used to categorize the doc (usually 'T' for title, 'A' for author)
       * @return boolean true on success, false on error
       */
      public function update_terms($did, $terms, $prefix) {
          $doc = $this->get_document($did);
          if ($doc === false) {
              return false;
          }
          try {
              $doc->clear_terms();
              foreach ($terms as $term) {
                  // add directly
                  $doc->add_term($prefix . $term, 1);
              }
              $this->db->replace_document($did, $doc);
              $this->db->flush();
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return false;
          }
          return true;
      }

      /**
       * Remove a document from xapian db
       *
       * Nothing happens when $did is not a positive number or when the
       * document cannot be fetched.
       *
       * @param int $did Xapian::docid
       * @return void
       */
      public function remove_document($did) {
          if ($this->db == NULL) {
              $this->connectDb();
          }
          if (!is_numeric($did) || $did <= 0) {
              return;
          }
          // get_document() returns FALSE both for unknown ids and when no
          // database could be opened.
          if ($this->get_document($did) !== FALSE) {
              $this->db->delete_document($did);
              $this->db->flush();
          }
      }

      /**
       * Adds a term to the document specified
       *
       * @param string         $term The term to add
       * @param XapianDocument $doc  The xapian document where to add the term
       * @return mixed NULL on success, FALSE if $doc is not a XapianDocument,
       *               1 if Xapian threw an exception
       */
      public function add_term_to_doc($term, $doc) {
          if (!($doc instanceof XapianDocument)) {
              return FALSE;
          }
          try {
              $doc->add_term($term);
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return 1;
          }
      }

      /**
       * Remove a term from the document specified
       *
       * @param string         $term The term to remove
       * @param XapianDocument $doc  The xapian document to remove the term from
       * @return mixed NULL on success, FALSE if $doc is not a XapianDocument,
       *               1 if Xapian threw an exception
       */
      public function remove_term_from_doc($term, $doc) {
          if (!($doc instanceof XapianDocument)) {
              return FALSE;
          }
          try {
              $doc->remove_term($term);
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return 1;
          }
      }

      /**
       * Replace a document in the actual db
       *
       * @param XapianDocument $doc xapian document to push into the db
       * @param Xapian::docid  $did xapian document id of the document to replace
       * @return mixed NULL on success, FALSE if $doc is not a XapianDocument,
       *               1 if no database could be opened or Xapian threw
       */
      public function replace_document($doc, $did) {
          if (!($doc instanceof XapianDocument)) {
              return FALSE;
          }
          if ($this->db == NULL) {
              $this->connectDb();
          }
          // connectDb() already reported its own failure; just propagate the
          // usual error code instead of dereferencing a missing database.
          if (!is_object($this->db)) {
              return 1;
          }
          try {
              $this->db->replace_document((int) $did, $doc);
              $this->db->flush();
          } catch (Exception $e) {
              Display::display_error_message($e->getMessage());
              return 1;
          }
      }

      /**
       * Class constructor
       */
      public function __construct() {
          $this->db = NULL;
          $this->stemmer = NULL;
      }

      /**
       * Class destructor
       */
      public function __destruct() {
          unset($this->db);
          unset($this->stemmer);
      }
  }