XapianIndexer.class.php 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. <?php
  2. /* For licensing terms, see /license.txt */
  3. //@todo add setting to add xapian.php
  4. //require_once 'xapian.php';
  5. /**
  6. * Abstract helper class.
  7. */
  8. abstract class XapianIndexer
  9. {
  10. /* XapianTermGenerator */
  11. public $indexer;
  12. /* XapianStem */
  13. public $stemmer;
  14. /* XapianWritableDatabase */
  15. protected $db;
  16. /* IndexableChunk[] */
  17. protected $chunks;
  18. /**
  19. * Class contructor.
  20. */
  21. public function __construct()
  22. {
  23. $this->db = null;
  24. $this->stemmer = null;
  25. }
  26. /**
  27. * Class destructor.
  28. */
  29. public function __destruct()
  30. {
  31. unset($this->db);
  32. unset($this->stemmer);
  33. }
  34. /**
  35. * Generates a list of languages Xapian manages.
  36. *
  37. * This method enables the definition of more matches between
  38. * Chamilo languages and Xapian languages (through hardcoding)
  39. *
  40. * @return array Array of languages codes -> Xapian languages
  41. */
  42. final public function xapian_languages()
  43. {
  44. /* http://xapian.org/docs/apidoc/html/classXapian_1_1Stem.html */
  45. return [
  46. 'none' => 'none', //don't stem terms
  47. 'da' => 'danish',
  48. 'nl' => 'dutch',
  49. /* Martin Porter's 2002 revision of his stemmer */
  50. 'en' => 'english',
  51. /* Lovin's stemmer */
  52. 'lovins' => 'english_lovins',
  53. /* Porter's stemmer as described in his 1980 paper */
  54. 'porter' => 'english_porter',
  55. 'fi' => 'finnish',
  56. 'fr' => 'french',
  57. 'de' => 'german',
  58. 'it' => 'italian',
  59. 'no' => 'norwegian',
  60. 'pt' => 'portuguese',
  61. 'ru' => 'russian',
  62. 'es' => 'spanish',
  63. 'sv' => 'swedish',
  64. ];
  65. }
  66. /**
  67. * Connect to the database, and create it if it doesn't exist.
  68. */
  69. public function connectDb($path = null, $dbMode = null, $lang = 'english')
  70. {
  71. if ($this->db != null) {
  72. return $this->db;
  73. }
  74. if ($dbMode == null) {
  75. $dbMode = Xapian::DB_CREATE_OR_OPEN;
  76. }
  77. if ($path == null) {
  78. $path = api_get_path(SYS_UPLOAD_PATH).'plugins/xapian/searchdb/';
  79. }
  80. try {
  81. $this->db = new XapianWritableDatabase($path, $dbMode);
  82. $this->indexer = new XapianTermGenerator();
  83. if (!in_array($lang, $this->xapian_languages())) {
  84. $lang = 'english';
  85. }
  86. $this->stemmer = new XapianStem($lang);
  87. $this->indexer->set_stemmer($this->stemmer);
  88. return $this->db;
  89. } catch (Exception $e) {
  90. echo Display::return_message($e->getMessage(), 'error');
  91. return 1;
  92. }
  93. }
  94. /**
  95. * Simple getter for the db attribute.
  96. *
  97. * @return object The db attribute
  98. */
  99. public function getDb()
  100. {
  101. return $this->db;
  102. }
  103. /**
  104. * Add this chunk to the chunk array attribute.
  105. *
  106. * @param string Chunk of text
  107. */
  108. public function addChunk($chunk)
  109. {
  110. $this->chunks[] = $chunk;
  111. }
  112. /**
  113. * Actually index the current data.
  114. *
  115. * @return int New Xapian document ID or null upon failure
  116. */
  117. public function index()
  118. {
  119. try {
  120. if (!empty($this->chunks)) {
  121. foreach ($this->chunks as $chunk) {
  122. $doc = new XapianDocument();
  123. $this->indexer->set_document($doc);
  124. if (!empty($chunk->terms)) {
  125. foreach ($chunk->terms as $term) {
  126. /* FIXME: think of getting weight */
  127. $doc->add_term($term['flag'].$term['name'], 1);
  128. }
  129. }
  130. // free-form index all data array (title, content, etc)
  131. if (!empty($chunk->data)) {
  132. foreach ($chunk->data as $key => $value) {
  133. $this->indexer->index_text($value, 1);
  134. }
  135. }
  136. $doc->set_data($chunk->xapian_data, 1);
  137. $did = $this->db->add_document($doc);
  138. //write to disk
  139. $this->db->flush();
  140. return $did;
  141. }
  142. }
  143. } catch (Exception $e) {
  144. echo Display::return_message($e->getMessage(), 'error');
  145. exit(1);
  146. }
  147. }
  148. /**
  149. * Get a specific document from xapian db.
  150. *
  151. * @param int did Xapian::docid
  152. *
  153. * @return mixed XapianDocument, or false on error
  154. */
  155. public function get_document($did)
  156. {
  157. if ($this->db == null) {
  158. $this->connectDb();
  159. }
  160. try {
  161. $docid = $this->db->get_document($did);
  162. } catch (Exception $e) {
  163. //echo Display::return_message($e->getMessage(), 'error');
  164. return false;
  165. }
  166. return $docid;
  167. }
  168. /**
  169. * Get document data on a xapian document.
  170. *
  171. * @param XapianDocument $doc xapian document to push into the db
  172. *
  173. * @return mixed xapian document data or FALSE if error
  174. */
  175. public function get_document_data($doc)
  176. {
  177. if ($this->db == null) {
  178. $this->connectDb();
  179. }
  180. try {
  181. if (!is_a($doc, 'XapianDocument')) {
  182. return false;
  183. }
  184. $doc_data = $doc->get_data();
  185. return $doc_data;
  186. } catch (Exception $e) {
  187. //echo Display::return_message($e->getMessage(), 'error');
  188. return false;
  189. }
  190. }
  191. /**
  192. * Replace all terms of a document in xapian db.
  193. *
  194. * @param int $did Xapian::docid
  195. * @param array $terms New terms of the document
  196. * @param string $prefix Prefix used to categorize the doc
  197. * (usually 'T' for title, 'A' for author)
  198. *
  199. * @return bool false on error
  200. */
  201. public function update_terms($did, $terms, $prefix)
  202. {
  203. $doc = $this->get_document($did);
  204. if ($doc === false) {
  205. return false;
  206. }
  207. $doc->clear_terms();
  208. foreach ($terms as $term) {
  209. //add directly
  210. $doc->add_term($prefix.$term, 1);
  211. }
  212. $this->db->replace_document($did, $doc);
  213. $this->db->flush();
  214. return true;
  215. }
  216. /**
  217. * Remove a document from xapian db.
  218. *
  219. * @param int did Xapian::docid
  220. */
  221. public function remove_document($did)
  222. {
  223. if ($this->db == null) {
  224. $this->connectDb();
  225. }
  226. $did = (int) $did;
  227. if ($did > 0) {
  228. $doc = $this->get_document($did);
  229. if ($doc !== false) {
  230. $this->db->delete_document($did);
  231. $this->db->flush();
  232. }
  233. }
  234. }
  235. /**
  236. * Adds a term to the document specified.
  237. *
  238. * @param string $term The term to add
  239. * @param XapianDocument $doc The xapian document where to add the term
  240. *
  241. * @return mixed XapianDocument, or false on error
  242. */
  243. public function add_term_to_doc($term, $doc)
  244. {
  245. if (!is_a($doc, 'XapianDocument')) {
  246. return false;
  247. }
  248. try {
  249. $doc->add_term($term);
  250. } catch (Exception $e) {
  251. echo Display::return_message($e->getMessage(), 'error');
  252. return 1;
  253. }
  254. }
  255. /**
  256. * Remove a term from the document specified.
  257. *
  258. * @param string $term The term to add
  259. * @param XapianDocument $doc The xapian document where to add the term
  260. *
  261. * @return mixed XapianDocument, or false on error
  262. */
  263. public function remove_term_from_doc($term, $doc)
  264. {
  265. if (!is_a($doc, 'XapianDocument')) {
  266. return false;
  267. }
  268. try {
  269. $doc->remove_term($term);
  270. } catch (Exception $e) {
  271. echo Display::return_message($e->getMessage(), 'error');
  272. return 1;
  273. }
  274. }
  275. /**
  276. * Replace a document in the actual db.
  277. *
  278. * @param XapianDocument $doc xapian document to push into the db
  279. * @param int $did xapian document id of the document to replace
  280. *
  281. * @return mixed
  282. */
  283. public function replace_document($doc, $did)
  284. {
  285. if (!is_a($doc, 'XapianDocument')) {
  286. return false;
  287. }
  288. if ($this->db == null) {
  289. $this->connectDb();
  290. }
  291. try {
  292. $this->getDb()->replace_document((int) $did, $doc);
  293. $this->getDb()->flush();
  294. } catch (Exception $e) {
  295. echo Display::return_message($e->getMessage(), 'error');
  296. return 1;
  297. }
  298. }
  299. }