XapianIndexer.class.php 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. <?php
  2. /* For licensing terms, see /license.txt */
  3. /**
  4. * @package chamilo.include.search
  5. */
  6. require_once 'xapian.php';
  7. require_once __DIR__.'/../IndexableChunk.class.php';
  8. /**
  9. * Abstract helper class
  10. * @package chamilo.include.search
  11. */
  12. abstract class XapianIndexer
  13. {
  14. /* XapianWritableDatabase */
  15. protected $db;
  16. /* IndexableChunk[] */
  17. protected $chunks;
  18. /* XapianTermGenerator */
  19. public $indexer;
  20. /* XapianStem */
  21. public $stemmer;
  22. /**
  23. * Class contructor
  24. */
  25. public function __construct()
  26. {
  27. $this->db = null;
  28. $this->stemmer = null;
  29. }
  30. /**
  31. * Generates a list of languages Xapian manages
  32. *
  33. * This method enables the definition of more matches between
  34. * Chamilo languages and Xapian languages (through hardcoding)
  35. * @return array Array of languages codes -> Xapian languages
  36. */
  37. public final function xapian_languages()
  38. {
  39. /* http://xapian.org/docs/apidoc/html/classXapian_1_1Stem.html */
  40. return array(
  41. 'none' => 'none', //don't stem terms
  42. 'da' => 'danish',
  43. 'nl' => 'dutch',
  44. /* Martin Porter's 2002 revision of his stemmer */
  45. 'en' => 'english',
  46. /* Lovin's stemmer */
  47. 'lovins' => 'english_lovins',
  48. /* Porter's stemmer as described in his 1980 paper */
  49. 'porter' => 'english_porter',
  50. 'fi' => 'finnish',
  51. 'fr' => 'french',
  52. 'de' => 'german',
  53. 'it' => 'italian',
  54. 'no' => 'norwegian',
  55. 'pt' => 'portuguese',
  56. 'ru' => 'russian',
  57. 'es' => 'spanish',
  58. 'sv' => 'swedish',
  59. );
  60. }
  61. /**
  62. * Connect to the database, and create it if it doesn't exist
  63. */
  64. function connectDb($path = null, $dbMode = null, $lang = 'english')
  65. {
  66. if ($this->db != null) {
  67. return $this->db;
  68. }
  69. if ($dbMode == null) {
  70. $dbMode = Xapian::DB_CREATE_OR_OPEN;
  71. }
  72. if ($path == null) {
  73. $path = api_get_path(SYS_UPLOAD_PATH).'plugins/xapian/searchdb/';
  74. }
  75. try {
  76. $this->db = new XapianWritableDatabase($path, $dbMode);
  77. $this->indexer = new XapianTermGenerator();
  78. if (!in_array($lang, $this->xapian_languages())) {
  79. $lang = 'english';
  80. }
  81. $this->stemmer = new XapianStem($lang);
  82. $this->indexer->set_stemmer($this->stemmer);
  83. return $this->db;
  84. } catch (Exception $e) {
  85. echo Display::return_message($e->getMessage(), 'error');
  86. return 1;
  87. }
  88. }
  89. /**
  90. * Simple getter for the db attribute
  91. * @return object The db attribute
  92. */
  93. function getDb()
  94. {
  95. return $this->db;
  96. }
  97. /**
  98. * Add this chunk to the chunk array attribute
  99. * @param string Chunk of text
  100. * @return void
  101. */
  102. function addChunk($chunk)
  103. {
  104. $this->chunks[] = $chunk;
  105. }
  106. /**
  107. * Actually index the current data
  108. *
  109. * @return integer New Xapian document ID or null upon failure
  110. */
  111. function index()
  112. {
  113. try {
  114. if (!empty($this->chunks)) {
  115. foreach ($this->chunks as $chunk) {
  116. $doc = new XapianDocument();
  117. $this->indexer->set_document($doc);
  118. if (!empty($chunk->terms)) {
  119. foreach ($chunk->terms as $term) {
  120. /* FIXME: think of getting weight */
  121. $doc->add_term($term['flag'].$term['name'], 1);
  122. }
  123. }
  124. // free-form index all data array (title, content, etc)
  125. if (!empty($chunk->data)) {
  126. foreach ($chunk->data as $key => $value) {
  127. $this->indexer->index_text($value, 1);
  128. }
  129. }
  130. $doc->set_data($chunk->xapian_data, 1);
  131. $did = $this->db->add_document($doc);
  132. //write to disk
  133. $this->db->flush();
  134. return $did;
  135. }
  136. }
  137. } catch (Exception $e) {
  138. echo Display::return_message($e->getMessage(), 'error');
  139. exit(1);
  140. }
  141. }
  142. /**
  143. * Get a specific document from xapian db
  144. *
  145. * @param int did Xapian::docid
  146. * @return mixed XapianDocument, or false on error
  147. */
  148. function get_document($did)
  149. {
  150. if ($this->db == null) {
  151. $this->connectDb();
  152. }
  153. try {
  154. $docid = $this->db->get_document($did);
  155. } catch (Exception $e) {
  156. //echo Display::return_message($e->getMessage(), 'error');
  157. return false;
  158. }
  159. return $docid;
  160. }
  161. /**
  162. * Get document data on a xapian document
  163. *
  164. * @param XapianDocument $doc xapian document to push into the db
  165. * @return mixed xapian document data or FALSE if error
  166. */
  167. function get_document_data($doc)
  168. {
  169. if ($this->db == null) {
  170. $this->connectDb();
  171. }
  172. try {
  173. if (!is_a($doc, 'XapianDocument')) {
  174. return false;
  175. }
  176. $doc_data = $doc->get_data();
  177. return $doc_data;
  178. } catch (Exception $e) {
  179. //echo Display::return_message($e->getMessage(), 'error');
  180. return false;
  181. }
  182. }
  183. /**
  184. * Replace all terms of a document in xapian db
  185. *
  186. * @param int $did Xapian::docid
  187. * @param array $terms New terms of the document
  188. * @param string $prefix Prefix used to categorize the doc
  189. * (usually 'T' for title, 'A' for author)
  190. * @return boolean false on error
  191. */
  192. function update_terms($did, $terms, $prefix)
  193. {
  194. $doc = $this->get_document($did);
  195. if ($doc === false) {
  196. return false;
  197. }
  198. $doc->clear_terms();
  199. foreach ($terms as $term) {
  200. //add directly
  201. $doc->add_term($prefix.$term, 1);
  202. }
  203. $this->db->replace_document($did, $doc);
  204. $this->db->flush();
  205. return true;
  206. }
  207. /**
  208. * Remove a document from xapian db
  209. *
  210. * @param int did Xapian::docid
  211. */
  212. function remove_document($did)
  213. {
  214. if ($this->db == null) {
  215. $this->connectDb();
  216. }
  217. if (is_numeric($did) && $did > 0) {
  218. $doc = $this->get_document($did);
  219. if ($doc !== false) {
  220. $this->db->delete_document($did);
  221. $this->db->flush();
  222. }
  223. }
  224. }
  225. /**
  226. * Adds a term to the document specified
  227. *
  228. * @param string $term The term to add
  229. * @param XapianDocument $doc The xapian document where to add the term
  230. * @return mixed XapianDocument, or false on error
  231. */
  232. function add_term_to_doc($term, $doc)
  233. {
  234. if (!is_a($doc, 'XapianDocument')) {
  235. return false;
  236. }
  237. try {
  238. $doc->add_term($term);
  239. } catch (Exception $e) {
  240. echo Display::return_message($e->getMessage(), 'error');
  241. return 1;
  242. }
  243. }
  244. /**
  245. * Remove a term from the document specified
  246. *
  247. * @param string $term The term to add
  248. * @param XapianDocument $doc The xapian document where to add the term
  249. * @return mixed XapianDocument, or false on error
  250. */
  251. function remove_term_from_doc($term, $doc)
  252. {
  253. if (!is_a($doc, 'XapianDocument')) {
  254. return false;
  255. }
  256. try {
  257. $doc->remove_term($term);
  258. } catch (Exception $e) {
  259. echo Display::return_message($e->getMessage(), 'error');
  260. return 1;
  261. }
  262. }
  263. /**
  264. * Replace a document in the actual db
  265. *
  266. * @param XapianDocument $doc xapian document to push into the db
  267. * @param int $did xapian document id of the document to replace
  268. * @return mixed
  269. */
  270. function replace_document($doc, $did)
  271. {
  272. if (!is_a($doc, 'XapianDocument')) {
  273. return false;
  274. }
  275. if ($this->db == null) {
  276. $this->connectDb();
  277. }
  278. try {
  279. $this->getDb()->replace_document((int) $did, $doc);
  280. $this->getDb()->flush();
  281. } catch (Exception $e) {
  282. echo Display::return_message($e->getMessage(), 'error');
  283. return 1;
  284. }
  285. }
  286. /**
  287. * Class destructor
  288. */
  289. function __destruct()
  290. {
  291. unset($this->db);
  292. unset($this->stemmer);
  293. }
  294. }