/**********************************************************************
 *
 * mgsearch.cpp --
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "gsdlconf.h"
#include "mgsearch.h"
#include "fileutil.h"

// NOTE(review): the names of the system headers in this file (here, in the
// iostream selection below, the non-Windows gdbm include and the assert
// include) were missing from the reviewed copy and have been restored from
// what the code uses (memcpy, sprintf, assert, gdbm) -- confirm them
// against the repository version.
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\iostream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#else
#  include <iostream>
#endif

#if defined(__WIN32__)
// gdbm stuff
#  include "autoconf.h"
#  include "systems.h"
#  include "gdbmconst.h"
#  include "gdbm.h"
#else
#  include <gdbm.h>
#endif

#include <assert.h>

#include "mgq.h"
// #include "locateinfo.h"
#include "gsdlunicode.h"
#include "unitool.h"


/////////////
// globals //
/////////////

// Filled in by doctextcallback and collected by mgsearchclass::mgdocument,
// which hands the buffer on to its caller.
static char *tempdoc = NULL;
static int templen = 0;


//////////////////////
// useful functions //
//////////////////////

// Builds the working buffer handed to mgq_stemword and mgq_equivterms:
// byte 0 holds the word length, bytes 1..len hold the word and a '\0'
// follows it. The word is truncated to mgq_getmaxstemlen() bytes, and
// never to more than 255 because the length prefix is a single byte.
//
// Returns NULL if the allocation failed; otherwise the caller owns the
// array and must release it with delete [].
static unsigned char *make_stem_buffer (const text_t &word) {
  // allocate working stem space (a negative limit is treated as zero so
  // that the terminator always fits inside the buffer)
  int maxstemlen = mgq_getmaxstemlen ();
  if (maxstemlen < 0) maxstemlen = 0;

  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
  if (word_stem == NULL) return NULL;

  // the length is stored in one unsigned char, so never copy more than
  // that byte is able to describe
  int maxcopy = maxstemlen;
  if (maxcopy > 255) maxcopy = 255;

  // copy word to word_stem
  int len = 0;
  text_t::const_iterator here = word.begin();
  text_t::const_iterator end = word.end();
  while (len < maxcopy && here != end) {
    word_stem[len+1] = (unsigned char)(*here);
    ++len; ++here;
  }
  word_stem[len+1] = '\0';
  word_stem[0] = (unsigned char)len;

  return word_stem;
}

// Stems 'word' with mgq_stemword and returns the result.
// input and output are in utf8
// An empty string is returned if the working buffer cannot be allocated.
text_t mgsearch_stemword (const text_t &word) {
  unsigned char *word_stem = make_stem_buffer (word);
  if (word_stem == NULL) return "";

  // the stemmer works in place and leaves the new length in byte 0
  mgq_stemword (word_stem);

  // copy word_stem back to tempstr
  text_t tempstr;
  tempstr.setcarr((char *)(&word_stem[1]), word_stem[0]);

  delete [] word_stem;

  return tempstr;
}


////////////////////////
// callback functions //
////////////////////////

// This routine is called for each document found in a search.
// 'info' must point to the queryresultsclass that collects the results;
// the call is ignored when it is NULL (as the other callbacks do).
int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
		     float Weight, void *info) {

  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  // append this entry to the document results
  docresultclass docresult;
  docresult.docnum = DocNum;
  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
  docresult.docweight = Weight - docresult.num_query_terms_matched*100;

  queryresults->docs.docset[DocNum] = docresult;
  queryresults->docs.docorder.push_back(DocNum);

  return 0;
}

// Called once for each term equivalent to the one passed to
// mgq_equivterms; 'info' points to the text_tset that receives the terms.
int termequivcallback(char *Word, int ULen, int /*Freq*/,
		      float /*Weight*/, void *info) {
  text_tset *equivterms = (text_tset *)info;
  if (equivterms == NULL) return 0;

  text_t thisterm;
  thisterm.setcarr(Word, ULen);

  equivterms->insert(thisterm);

  return 0;
}

// Inserts into 'equivterms' every term that mgq_equivterms reports for
// 'word'. Nothing is added if the working buffer cannot be allocated.
void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
  unsigned char *word_stem = make_stem_buffer (word);
  if (word_stem == NULL) return;

  // get the equivalent terms
  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));

  delete [] word_stem;

  return;
}

text_tset utf8equivterms; // kept as utf8 string for fast matching


// This callback is called once for each term in the query
// 'info' points to the queryresultsclass whose orgterms list is extended.
int termfreqcallback(char *Word, int ULen, int Freq,
		     float /*Weight*/, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);

  termfreqclass termfreq;
  termfreq.termstr = to_uni(term);

  // the stem and its equivalent terms are worked out on the utf8 form
  text_t utf8termstem = mgsearch_stemword (term);
  termfreq.termstemstr = to_uni (utf8termstem);

  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);

  termfreq.termfreq = Freq;
  queryresults->orgterms.push_back(termfreq);

  return 0;
}

// this callback is called once for each variation of each term
// 'info' points to the queryresultsclass whose termvariants set is
// extended; the call is ignored when it is NULL.
int termvariantscallback(char *Word, int ULen, int /*Freq*/,
			 float /*Weight*/, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);

  queryresults->termvariants.insert(to_uni(term));

  return 0;
}

// This callback is for getting document text
// The text is copied into a new array stored in 'tempdoc' (its length in
// 'templen') so that the database can be unloaded without losing it.
// When there is no text to copy, 'tempdoc' is left alone and 'templen'
// is set to 0.
int doctextcallback(char *Doc, int ULen, int /*Freq*/,
		    float /*Weight*/, void * /*info*/) {
  templen = 0;

  if (Doc != NULL && ULen >= 0) {
    // Make a copy of this string so we can unload the database without
    // losing it. The text is delimited by ULen, so it is copied by length
    // (it may contain '\0' bytes or lack a terminator) and then terminated.
    tempdoc = new char[ULen + 1];
    memcpy (tempdoc, Doc, ULen);
    tempdoc[ULen] = '\0';
    templen = ULen;
  }

  return 0;
}


// Returns the path "index/<index>/<stem>", where <stem> is the index stem
// set with set_indexstem or, if none was set, the collection name.
text_t mgsearchclass::getindexsuffix (const text_t &collection,
				      const text_t &index) {

  text_t indexsuffix = "index";
  indexsuffix = filename_cat (indexsuffix, index);
  if (indexstem.empty()) {
    // no index stem, use the coll name
    indexsuffix = filename_cat (indexsuffix, collection);
  } else {
    indexsuffix = filename_cat (indexsuffix, indexstem);
  }
  return indexsuffix;
}


////////////////////
// mgsearch class //
////////////////////

mgsearchclass::mgsearchclass ()
  : searchclass() {

}

mgsearchclass::~mgsearchclass () {
  if (cache != NULL) {
    delete cache;
    cache = NULL;
  }
}

void mgsearchclass::set_indexstem(const text_t &stem) {
  indexstem = stem;
}

// you only need to use this function before doing any stemming
// casefolding and stemming will be set if values for them are
// provided (0 or 1).
// makeindexcurrent returns true if it was able to load the database
bool mgsearchclass::makeindexcurrent (const text_t &index,
				      const text_t &subcollection,
				      const text_t &language,
				      const text_t &collection,
				      int casefolding,
				      int stemming) {
  bool databaseloaded = true;

  // get the names of the collection, index and text suffixes
  char *ccollection = collection.getcstr();
  assert (ccollection != NULL);
  char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
  assert (idxsuffix != NULL);
  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
  assert (txtsuffix != NULL);

#ifdef __WIN32__
  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
#else
  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
#endif

  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
    // values other than 0 and "greater than 0" leave the setting untouched
    if (casefolding == 0) mgq_ask(".set casefold off");
    else if (casefolding > 0) mgq_ask(".set casefold on");
    if (stemming == 0) mgq_ask(".set stem off");
    else if (stemming > 0) mgq_ask(".set stem on");

  } else databaseloaded = false;

  // free up the c strings
  delete []ccollection;
  delete []idxsuffix;
  delete []txtsuffix;
  delete []ccollectdir;

  return databaseloaded;
}


// stem word uses the values set in the last call to makeindexcurrent
// to stem the word. It is assumed that word is in unicode
text_t mgsearchclass::stemword (const text_t &word) {
  return to_uni (mgsearch_stemword (to_utf8 (word)));
}

text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
  return to_uni (mgsearch_stemword (to_utf8 (here, end)));
}

/**
 * search directs the whole execution of the search; a number of other
 * functions in this class are called as a result, and precondition
 * checks are also made
 *
 * Returns true if the results came from the cache, if the query string
 * contains no letter or digit (the results are then left empty) or if
 * the query was run; returns false if the index could not be loaded.
 */
bool mgsearchclass::search(const queryparamclass &queryparams,
			   queryresultsclass &queryresults) {
  // assert (cache != NULL);

  // clear any previous results
  queryresults.clear();

  // first check the cache
  if (cache != NULL) {
    if (cache->find(queryparams, queryresults)) return true;
  }

  // make sure there is a query to be processed
  if (!has_unicode_letdig(queryparams.querystring)) return true;

  if (makeindexcurrent (queryparams.index, queryparams.subcollection,
			queryparams.language, queryparams.collection)) {
    // initialise the form of results
    setsearchmode (queryparams);

    // execute the query
    submitquery (queryparams);

    // retrieve the results
    getresults (queryparams, queryresults);

    unload_database();  // Important that local library doesn't leave any files open
    return true;
  }

  return false;
}

/* accumulator_method has been changed to use array rather than list.
list appears to be broken somewhat - for some ranked queries, it returned
fewer results than it should have (eg 45 instead of 50). The three other
methods (array, splay_tree, hash_table) all return the same number of
documents, in the same order, with the same ranks. list returns what
appears to be the same documents (but less of them), but with different ranks,
and in a different order. Minimal time tests dont show any speed improvement
of list over array (maybe because its broken??). [02/2001, kjm18]

... [sjboddie, also 02/2001] turns out that changing the accumulator_method
introduced a more serious bug than it fixed (i.e. occasionally when doing a
ranked search for a very common word you get no results at all). I've
changed it back to list for now, one day we should play with other
accumulator_methods but for now I don't have time and don't want to risk
introducing bugs (better the devil you know ;)
*/
void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
{
  mgq_ask(".set expert true");
  mgq_ask(".set sorted_terms true");
  mgq_ask(".set accumulator_method list");
  mgq_ask(".set max_accumulators 500000");
  mgq_ask(".set maxparas 500000");
  mgq_ask(".set verbatim true");
  mgq_ask(".unset skip_dump");
  mgq_ask(".set mode docnums");

  // any other value leaves the corresponding setting as it is
  switch (queryparams.search_type) {
  case 0: mgq_ask(".set query boolean"); break;
  case 1: mgq_ask(".set query ranked"); break;
  }
  switch (queryparams.casefolding) {
  case 1: mgq_ask(".set casefold on"); break;
  case 0: mgq_ask(".set casefold off"); break;
  }
  switch (queryparams.stemming) {
  case 1: mgq_ask(".set stem on"); break;
  case 0: mgq_ask(".set stem off"); break;
  }
  mgq_ask(".set heads_length 150");

  if (queryparams.maxdocs == -1) {
    mgq_ask(".set maxdocs all");
  } else {
    // 32 bytes is enough for the 13 character prefix, a 32 bit integer
    // (at most 11 characters with its sign) and the terminator
    char maxdocstr[32];
    sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs);
    mgq_ask(maxdocstr);
  }

  // as above: a 16 character prefix plus the integer and the terminator
  char maxnumericstr[32];
  sprintf(maxnumericstr, ".set maxnumeric %i", queryparams.maxnumeric);
  mgq_ask(maxnumericstr);
}

/**
 * submitquery constructs the query string (into UTF8 encoding)
 * and submits it using mgq_ask to the mg search engine. Most
 * of the processing will be done inside Greenstone
 */
void mgsearchclass::submitquery (const queryparamclass &queryparams)
{
  // sort out the query string; copy it, remove all special characters
  // and then convert it to a string in UTF8 format
  text_t ttquerystring = queryparams.querystring;
  filterquery (ttquerystring);
  char *querystring = to_utf8(ttquerystring).getcstr();

  // submit the query
  mgq_ask(querystring);

  // destroy the temporary character array
  delete []querystring;
}

/**
 * getresults is called to retrieve the required data on the docs
 * which responded to the query submitted in submitquery above.
 *
 * It calls the local mgquery (mgq) interface to MG several times,
 * to obtain the document numbers, term frequencies, term variants
 * etc. All processing of the query will be done by Greenstone
 * thereafter
 */
void mgsearchclass::getresults (const queryparamclass &queryparams,
				queryresultsclass &queryresults) {
  // get the configuration for the maximum number of documents to
  // retrieve
  int howmany = queryparams.maxdocs;
  if (howmany == -1) howmany = MAXNUMDOCS;
  mgq_results(result_docnums, 0, howmany,
	      ourquerycallback, (void *)(&queryresults));

  // get the term frequencies
  mgq_results(result_termfreqs, 0, MAXNUMTERMS,
	      termfreqcallback, (void *)(&queryresults));
  queryresults.sortuniqqueryterms();

  // get term variants
  mgq_results(result_terms, 0, MAXNUMTERMS,
	      termvariantscallback, (void *)(&queryresults));

  // get the number of documents retrieved
  int total_retrieved = 0, is_approx = 0;
  mgq_docsretrieved (&total_retrieved, &is_approx);

  if (total_retrieved == 0) {
    // not available (or really was zero)
    // fall back on the number of documents collected above
    queryresults.docs_matched = queryresults.docs.docset.size();
    if ((queryparams.maxdocs == -1) ||
	(queryresults.docs_matched < queryparams.maxdocs))
      queryresults.is_approx = Exact;
    else
      queryresults.is_approx = MoreThan;
  } else {
    queryresults.docs_matched = total_retrieved;
    if (is_approx) queryresults.is_approx = Approximate;
    else queryresults.is_approx = Exact;
  }
}

/**
 * Tidies the given querystring, removing special characters
 */
void mgsearchclass::filterquery (text_t &ttquerystring) {
  text_t::iterator ithere = ttquerystring.begin ();
  text_t::iterator itend = ttquerystring.end ();

  // remove all non alphanumeric characters (except
  // boolean operators); each one is replaced by a space
  while (ithere != itend) {
    if ((!is_unicode_letdig(*ithere)) && (*ithere != '!') &&
	(*ithere != '&') && (*ithere != '|') && (*ithere != '(') &&
	(*ithere != ')')) (*ithere) = ' ';
    ++ithere;
  }
}


// the document text for 'docnum' is placed in 'output'
// docTargetDocument returns 'true' if it was able to
// try to get a document
// collection is needed to see if an index from the
// collection is loaded. If no index has been loaded
// defaultindex is needed to load one
bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
				      const text_t &defaultsubcollection,
				      const text_t &defaultlanguage,
				      const text_t &collection,
				      int docnum,
				      text_t &output) {
  output.clear();

  // get the mg version of the document
  char *mgdoc = NULL;
  int doclen = 0;
  if (!mgdocument (defaultindex, defaultsubcollection, defaultlanguage,
		   collection, docnum, mgdoc, doclen)) return false;
  if (mgdoc == NULL) return false;

  // replace all control-Cs with spaces
  char *mgdoc_here = mgdoc;
  char *mgdoc_end = mgdoc + doclen;
  while (mgdoc_here < mgdoc_end) {
    if (*mgdoc_here == '\x3') *mgdoc_here = ' ';
    ++mgdoc_here;
  }

  // convert this document to unicode
  utf8inconvertclass inconvert;
  convertclass::status_t status;
  inconvert.reset ();
  inconvert.setinput (mgdoc, doclen);
  inconvert.convert (output, status);

  // the buffer returned by mgdocument belongs to us
  delete[] mgdoc;

  return true;
}


// Retrieves the raw mg text of document 'docnum'.
// Returns true if the index could be loaded. In that case 'UDoc' is either
// NULL or a new array that the caller must release with delete [], and
// 'ULen' is the length of the text in it. The database is unloaded again
// before returning.
bool mgsearchclass::mgdocument (const text_t &defaultindex,
				const text_t &defaultsubcollection,
				const text_t &defaultlanguage,
				const text_t &collection,
				int docnum,
				char *&UDoc, int &ULen) {
  bool databaseloaded = false;

  UDoc = NULL; ULen = 0;

  // see if we can make an appropriate database current
  //    char *ccollection = collection.getcstr();
  //    assert (ccollection != NULL);
  //    databaseloaded = load_text_database (ccollection);
  //    delete []ccollection;

  // try and load the database
  //    if (!databaseloaded)
  databaseloaded = makeindexcurrent (defaultindex, defaultsubcollection,
				     defaultlanguage, collection);

  if (databaseloaded) {
    // retrieve the document from mg
    char docstr[32];
    sprintf(docstr, "%i", docnum);

    mgq_ask(".set mode text");
    mgq_ask(".set query docnums");
    mgq_ask(docstr);

    // doctextcallback leaves its copy of the text in tempdoc/templen
    tempdoc = NULL;
    templen = 0;
    mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
    UDoc = tempdoc;
    ULen = templen;
  }

  unload_database();  // Important that local library doesn't leave any files open
  return databaseloaded;
}

// unload_database simply calls mgq's close_all_databases function to clear
// any cached databases - this is useful when attempting to completely
// remove all trace of a collectionserver at runtime (when using a
// persistent version of Greenstone like the windows local library)
void mgsearchclass::unload_database () {
  close_all_databases();
}