/********************************************************************** * * mgsearch.cpp -- * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: mgsearch.cpp 539 1999-09-07 22:52:52Z rjmcnab $ * *********************************************************************/ /* $Log$ Revision 1.19 1999/09/07 22:52:52 rjmcnab Seems to be an error in mg for retrieving documents using a paragraph based index for some cases. Just added a work around (loads the default index every time). Revision 1.18 1999/09/07 04:57:22 sjboddie added gpl notice Revision 1.17 1999/08/31 22:42:41 rjmcnab A couple of minor things. Revision 1.16 1999/08/25 04:51:06 sjboddie small change to allow for searching using boolean operators Revision 1.15 1999/07/16 08:35:03 rjmcnab Fixed a weird bug to do with a faulty case statement. Revision 1.14 1999/07/16 03:42:22 sjboddie changed isApprox Revision 1.13 1999/07/16 00:12:46 sjboddie removed all the old post-processing stuff Revision 1.12 1999/07/07 06:17:47 rjmcnab broke search_index into index+subcollection+language within mgsearch Revision 1.11 1999/07/05 21:06:43 rjmcnab Disabled quoted strings. Revision 1.10 1999/07/01 09:29:19 rjmcnab Changes for better reporting of number documents which match a query. Changes should still work as before with older versions of mg. Revision 1.9 1999/07/01 03:54:48 rjmcnab Added code to plug in the equivalent terms of each of the query terms. Also added a function to get a raw utf8 encoded mg document (for speeding up a phrase matching function) Revision 1.8 1999/06/30 04:04:12 rjmcnab made stemming functions available from mgsearch and made the stems for the query terms available in queryinfo Revision 1.7 1999/06/27 22:07:27 sjboddie got rid of all the old functions for dealing with dir indexes Revision 1.6 1999/06/09 00:41:32 sjboddie phrase searching now uses case-folding if it's turned on Revision 1.5 1999/02/21 22:31:35 rjmcnab Removed locateinfo. Revision 1.4 1999/02/03 01:13:27 sjboddie Got interface to handle subcollections and language subcollections - committed changes made to some of the collections Revision 1.3 1999/01/19 01:38:17 rjmcnab Made the source more portable. Revision 1.2 1999/01/12 01:51:02 rjmcnab Standard header. Revision 1.1 1999/01/08 09:02:16 rjmcnab Moved from src/library. */ #include "gsdlconf.h" #include "mgsearch.h" #include "fileutil.h" #include #include #include #include #if defined(GSDL_USE_OBJECTSPACE) # include #elif defined(GSDL_USE_IOS_H) # include #else # include #endif #if defined(__WIN32__) // gdbm stuff # include "autoconf.h" # include "systems.h" # include "gdbmconst.h" # include "gdbm.h" #else # include #endif #include #include "mgq.h" // #include "locateinfo.h" #include "gsdlunicode.h" #include "unitool.h" ///////////// // globals // ///////////// static char *tempdoc = NULL; static int templen = 0; ////////////////////// // useful functions // ////////////////////// // input and output are in utf8 text_t mgsearch_stemword (const text_t &word) { // allocate working stem space int maxstemlen = mgq_getmaxstemlen (); unsigned char *word_stem = new unsigned char [maxstemlen + 2]; if (word_stem == NULL) return ""; // copy word to word_stem int len = 0; text_t::const_iterator here = word.begin(); text_t::const_iterator end = word.end(); while (len < maxstemlen && here != end) { word_stem[len+1] = (unsigned char)(*here); len++; here++; } word_stem[len+1] = '\0'; word_stem[0] = len; mgq_stemword (word_stem); // copy word_stem back to tempstr text_t tempstr; tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]); delete [] word_stem; return tempstr; } //////////////////////// // callback functions // //////////////////////// // This routine is called for each document found in a search // it assumes that cache_num is set up correctly to point to // a suitable result cache int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum, float Weight, void *info) { queryresultsclass *queryresults = (queryresultsclass * )info; // append this entry to the document results docresultclass docresult; docresult.docnum = DocNum; docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg... docresult.docweight = Weight - docresult.num_query_terms_matched*100; queryresults->docs.docset[DocNum] = docresult; queryresults->docs.docorder.push_back(DocNum); return 0; } int termequivcallback(char *Word, int ULen, int /*Freq*/, float /*Weight*/, void *info) { text_tset *equivterms = (text_tset *)info; if (equivterms == NULL) return 0; text_t thisterm; thisterm.setcarr(Word, ULen); equivterms->insert(thisterm); return 0; } void mgsearch_equivterms (const text_t &word, text_tset &equivterms) { // allocate working stem space int maxstemlen = mgq_getmaxstemlen (); unsigned char *word_stem = new unsigned char [maxstemlen + 2]; if (word_stem == NULL) return; // copy word to word_stem int len = 0; text_t::const_iterator here = word.begin(); text_t::const_iterator end = word.end(); while (len < maxstemlen && here != end) { word_stem[len+1] = (unsigned char)(*here); len++; here++; } word_stem[len+1] = '\0'; word_stem[0] = len; // get the equivalent terms mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms)); delete [] word_stem; return; } text_tset utf8equivterms; // kept as utf8 string for fast matching // This callback is called once for each term in the query int termfreqcallback(char *Word, int ULen, int Freq, float /*Weight*/, void *info) { queryresultsclass *queryresults = (queryresultsclass *)info; if (queryresults == NULL) return 0; text_t term; term.setcarr(Word, ULen); termfreqclass termfreq; termfreq.termstr = to_uni(term); text_t utf8termstem = mgsearch_stemword (term); termfreq.termstemstr = to_uni (utf8termstem); mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms); termfreq.termfreq = Freq; queryresults->orgterms.push_back(termfreq); return 0; } // this callback is called once for each variation of each term int termvariantscallback(char *Word, int ULen, int /*Freq*/, float /*Weight*/, void *info) { text_t term; term.setcarr(Word, ULen); queryresultsclass *queryresults = (queryresultsclass *)info; queryresults->termvariants.insert(to_uni(term)); return 0; } // This callback is for getting document text int doctextcallback(char *Doc, int ULen, int /*Freq*/, float /*Weight*/, void * /*info*/) { tempdoc = Doc; templen = ULen; return 0; } static text_t getindexsuffix (const text_t &collection, const text_t &index) { text_t indexsuffix = "index"; indexsuffix = filename_cat (indexsuffix, index); indexsuffix = filename_cat (indexsuffix, collection); return indexsuffix; } //////////////////// // mgsearch class // //////////////////// mgsearchclass::mgsearchclass () { cache = new querycache (RESULTCACHESIZE); } mgsearchclass::~mgsearchclass () { if (cache != NULL) { delete cache; cache = NULL; } } void mgsearchclass::setcollectdir (const text_t &thecollectdir) { collectdir = thecollectdir; } // you only need to use this function before doing any stemming // casefolding and stemming will be set if values for them are // provided (0 or 1). // makeindexcurrent returns true if it was able to load the database bool mgsearchclass::makeindexcurrent (const text_t &index, const text_t &subcollection, const text_t &language, const text_t &collection, int casefolding, int stemming) { bool databaseloaded = true; // get the names of the collection, index and text suffixes char *ccollection = collection.getcstr(); assert (ccollection != NULL); char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr(); assert (idxsuffix != NULL); char *txtsuffix = (getindexsuffix (collection, "text")).getcstr(); assert (txtsuffix != NULL); #ifdef __WIN32__ char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); #else char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); #endif if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) { if (casefolding == 0) mgq_ask(".set casefold off"); else if (casefolding > 0) mgq_ask(".set casefold on"); if (stemming == 0) mgq_ask(".set stem off"); else if (stemming > 0) mgq_ask(".set stem on"); } else databaseloaded = false; // free up the c strings delete ccollection; delete idxsuffix; delete txtsuffix; delete ccollectdir; return databaseloaded; } // stem word uses the values set in the last call to makeindexcurrent // to stem the word. It is assumed that word is in unicode text_t mgsearchclass::stemword (const text_t &word) { return to_uni (mgsearch_stemword (to_utf8 (word))); } text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) { return to_uni (mgsearch_stemword (to_utf8 (here, end))); } bool mgsearchclass::search(const queryparamclass &queryparams, queryresultsclass &queryresults) { assert (cache != NULL); queryresults.clear(); // first check the cache if (cache->find(queryparams, queryresults)) return true; // make sure there is a query to be processed text_t::const_iterator queryhere = queryparams.querystring.begin(); text_t::const_iterator queryend = queryparams.querystring.end(); while (queryhere != queryend) { if (is_unicode_letdig (*queryhere)) break; queryhere++; } // if we reached the end of the query string without finding // any alphanumeric characters then return no results (and say // the database was loaded) if (queryhere == queryend) return true; if (makeindexcurrent (queryparams.index, queryparams.subcollection, queryparams.language, queryparams.collection)) { setsearchmode (queryparams); submitquery (queryparams); getresults (queryparams, queryresults); return true; } return false; } void mgsearchclass::setsearchmode (const queryparamclass &queryparams) { mgq_ask(".set expert true"); mgq_ask(".set sorted_terms true"); mgq_ask(".set accumulator_method list"); mgq_ask(".set max_accumulators 500000"); mgq_ask(".set maxparas 500000"); mgq_ask(".set verbatim true"); mgq_ask(".unset skip_dump"); mgq_ask(".set mode docnums"); switch (queryparams.search_type) { case 0: mgq_ask(".set query boolean"); break; case 1: mgq_ask(".set query ranked"); break; } switch (queryparams.casefolding) { case 1: mgq_ask(".set casefold on"); break; case 0: mgq_ask(".set casefold off"); break; } switch (queryparams.stemming) { case 1: mgq_ask(".set stem on"); break; case 0: mgq_ask(".set stem off"); break; } mgq_ask(".set heads_length 150"); if (queryparams.maxdocs == -1) { mgq_ask(".set maxdocs all"); } else { char maxdocstr[32]; sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs); mgq_ask(maxdocstr); } } void mgsearchclass::submitquery (const queryparamclass &queryparams) { // sort out the query string text_t ttquerystring = queryparams.querystring; filterquery (ttquerystring); char *querystring = to_utf8(ttquerystring).getcstr(); // submit the query mgq_ask(querystring); delete querystring; } void mgsearchclass::getresults (const queryparamclass &queryparams, queryresultsclass &queryresults) { mgq_results(result_docnums, 0, MAXNUMDOCS, ourquerycallback, (void *)(&queryresults)); // get the term frequencies mgq_results(result_termfreqs, 0, MAXNUMTERMS, termfreqcallback, (void *)(&queryresults)); queryresults.sortuniqqueryterms(); // get term variants mgq_results(result_terms, 0, MAXNUMTERMS, termvariantscallback, (void *)(&queryresults)); // get the number of documents retrieved int total_retrieved = 0, is_approx = 0; mgq_docsretrieved (&total_retrieved, &is_approx); if (total_retrieved == 0) { // not available (or really was zero) queryresults.docs_matched = queryresults.docs.docset.size(); if (queryresults.docs_matched < queryparams.maxdocs) queryresults.is_approx = Exact; else queryresults.is_approx = MoreThan; } else { queryresults.docs_matched = total_retrieved; if (is_approx) queryresults.is_approx = Approximate; else queryresults.is_approx = Exact; } } void mgsearchclass::filterquery (text_t &ttquerystring) { text_t::iterator ithere = ttquerystring.begin (); text_t::iterator itend = ttquerystring.end (); // remove all non alphanumeric characters (except // boolean operators while (ithere != itend) { if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') && (*ithere != '&') && (*ithere != '|') && (*ithere != '(') && (*ithere != ')')) (*ithere) = ' '; ithere++; } } // the document text for 'docnum' is placed in 'output' // docTargetDocument returns 'true' if it was able to // try to get a document // collection is needed to see if an index from the // collection is loaded. If no index has been loaded // defaultindex is needed to load one bool mgsearchclass::docTargetDocument(const text_t &defaultindex, const text_t &defaultsubcollection, const text_t &defaultlanguage, const text_t &collection, int docnum, text_t &output) { output.clear(); // get the mg version of the document char *mgdoc = NULL; int doclen = 0; if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage, collection, docnum, mgdoc, doclen)) return false; if (mgdoc == NULL) return false; // replace all control-Cs with spaces char *mgdoc_here = mgdoc; char *mgdoc_end = mgdoc + doclen; while (mgdoc_here < mgdoc_end) { if (*mgdoc_here == '\x3') *mgdoc_here = ' '; mgdoc_here++; } // convert this document to unicode utf8inconvertclass inconvert; convertclass::status_t status; inconvert.reset (); inconvert.setinput (mgdoc, doclen); inconvert.convert (output, status); return true; } bool mgsearchclass::mgdocument (const text_t &defaultindex, const text_t &defaultsubcollection, const text_t &defaultlanguage, const text_t &collection, int docnum, char *&UDoc, int &ULen) { int databaseloaded = 0; UDoc = NULL; ULen = 0; // see if we can make an appropriate database current // char *ccollection = collection.getcstr(); // assert (ccollection != NULL); // databaseloaded = load_text_database (ccollection); // delete ccollection; // try and load the database // if (!databaseloaded) databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection, defaultlanguage, collection); if (databaseloaded) { // retrieve the document from mg char docstr[32]; sprintf(docstr, "%i", docnum); mgq_ask(".set mode text"); mgq_ask(".set query docnums"); mgq_ask(docstr); tempdoc = NULL; templen = 0; mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL); UDoc = tempdoc; ULen = templen; } return (bool)databaseloaded; }