#include #include #include #include #ifdef __GNUG__ # include # include #else # ifndef USE_OBJECTSPACE # include # else # include # endif // gdbm stuff # include "autoconf.h" # include "systems.h" # include "gdbmconst.h" # include "gdbm.h" #endif #include #include "mgq.h" #include "mgsearch.h" #include "locateinfo.h" #include "gsdlunicode.h" #include "unitool.h" ///////////// // globals // ///////////// static char *quotedquery = NULL; //////////////////////// // callback functions // //////////////////////// // This routine is called for each document found in a search // it assumes that cache_num is set up correctly to point to // a suitable result cache int ourquerycallback(char *UDoc, int ULen, int DocNum, float Weight, void *info) { queryresultsclass *queryresults = (queryresultsclass * )info; // check the returned document for the presence of the // quoted part of the query, if there was one if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0; // append this entry to the document results docresultclass docresult; docresult.docnum = DocNum; docresult.docweight = Weight; queryresults->docs.push_back(docresult); return 0; } // This callback is called once for each term in the query int termfreqcallback(char *Word, int ULen, int Freq, float Weight, void *info) { queryresultsclass *queryresults = (queryresultsclass *)info; termfreqclass termfreq; termfreq.termstr.setcarr(Word, ULen); termfreq.termfreq = Freq; queryresults->terms.push_back(termfreq); return 0; } // this callback is called once for each variation of each term int termscallback(char *Word, int ULen, int Freq, float Weight, void *info) { // convert term from utf8 to unicode text_t term; utf8inconvertclass inconvert; convertclass::status_t status; inconvert.reset (); inconvert.setinput (Word, ULen); inconvert.convert (term, status); queryresultsclass *queryresults = (queryresultsclass *)info; queryresults->termvariants.push_back(term); return 0; } // This callback is for getting document text int doctextcallback(char *Word, int ULen, int Freq, float Weight, void *info) { text_t *output = (text_t *)info; if (output == NULL) return 0; output->clear(); utf8inconvertclass inconvert; convertclass::status_t status; inconvert.reset (); inconvert.setinput (Word, ULen); inconvert.convert (*output, status); // replace all control-Cs with spaces text_t::iterator here = output->begin(); text_t::iterator end = output->end(); while (here != end) { if (*here == '\x3') *here = ' '; here++; } return 0; } //////////////////// // mgsearch class // //////////////////// mgsearchclass::mgsearchclass () { cache = new querycache (RESULTCACHESIZE); } mgsearchclass::~mgsearchclass () { if (cache != NULL) { delete cache; cache = NULL; } } void mgsearchclass::setcollectdir (const text_t &thecollectdir) { collectdir = thecollectdir; } bool mgsearchclass::search(const queryparamclass &queryparams, queryresultsclass &queryresults) { bool databaseloaded = true; assert (cache != NULL); queryresults.clear(); // first check the cache if (cache->find(queryparams, queryresults)) return true; // make sure there is a query to be processed text_t::const_iterator queryhere = queryparams.querystring.begin(); text_t::const_iterator queryend = queryparams.querystring.end(); while (queryhere != queryend) { if (is_unicode_letdig (*queryhere)) break; queryhere++; } // if we reached the end of the query string without finding // any alphanumeric characters then return no results (and say // the database was loaded) if (queryhere == queryend) return true; // get the names of the index and text suffixes char *idxsuffix = (getindexsuffix (queryparams.collection, queryparams.search_index)).getcstr(); assert (idxsuffix != NULL); char *txtsuffix = (gettextsuffix (queryparams.collection)).getcstr(); assert (txtsuffix != NULL); #ifdef __WIN32__ char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); #else char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); #endif if (load_database(ccollectdir, idxsuffix, txtsuffix)) { setsearchmode (queryparams); submitquery (queryparams); getresults (queryresults); } else databaseloaded = false; // free up the c strings delete idxsuffix; delete txtsuffix; delete ccollectdir; return databaseloaded; } void mgsearchclass::setsearchmode (const queryparamclass &queryparams) { mgq_ask(".set expert true"); mgq_ask(".set accumulator_method list"); mgq_ask(".set max_accumulators 50000"); mgq_ask(".set verbatim true"); mgq_ask(".unset skip_dump"); mgq_ask(".set mode docnums"); switch (queryparams.search_type) { case 0: mgq_ask(".set query boolean"); break; case 1: mgq_ask(".set query ranked"); break; } switch (queryparams.casefolding) { case 1: mgq_ask(".set casefold on"); break; case 0: mgq_ask(".set casefold off"); break; } switch (queryparams.stemming) { case 1: mgq_ask(".set stem on"); break; case 0: mgq_ask(".set stem off"); break; } mgq_ask(".set heads_length 150"); char maxdocstr[32]; sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs); mgq_ask(maxdocstr); } void mgsearchclass::submitquery (const queryparamclass &queryparams) { // sort out the query string text_t ttquerystring = queryparams.querystring; text_t ttquotedquery; extractquoted (ttquerystring, ttquotedquery); filterquery (ttquerystring); // turn the strings into c strings for mg if (quotedquery != NULL) // quotedquery is a global { delete quotedquery; quotedquery = NULL; } // quotedquery will be deleted on the next call to this function quotedquery = ttquotedquery.getcstr (); char *querystring = ttquerystring.getcstr(); // submit the query mgq_ask(querystring); delete querystring; } void mgsearchclass::getresults (queryresultsclass &queryresults) { if (quotedquery[0] == '\0') { // don't need the text mgq_results(result_docnums, 0, MAXNUMDOCS, ourquerycallback, (void *)(&queryresults)); } else { // we need the text for this one mgq_results(result_docs, 0, MAXNUMDOCS, ourquerycallback, (void *)(&queryresults)); } // get the term frequencies mgq_results(result_termfreqs, 0, MAXNUMTERMS, termfreqcallback, (void *)(&queryresults)); mgq_results(result_terms, 0, MAXNUMTERMS, termscallback, (void *)(&queryresults)); queryresults.sortqueryterms(); queryresults.uniqqueryterms(); } void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery) { ttquotedquery.clear(); text_t::iterator ithere = ttquerystring.begin (); text_t::iterator itend = ttquerystring.end (); bool inquote = false; while (ithere != itend) { if ((*ithere) == '\"') { if (!inquote) ttquotedquery.clear (); inquote = !inquote; *ithere = ' '; // delete the quote } else if (inquote) { ttquotedquery.push_back(*ithere); *ithere = ' '; } ithere++; } } void mgsearchclass::filterquery (text_t &ttquerystring) { text_t::iterator ithere = ttquerystring.begin (); text_t::iterator itend = ttquerystring.end (); unsigned short c; // remove all non alphanumeric characters below 127 while (ithere != itend) { c = *ithere; // if ((c <= 127) && !((c >= '0' && c <= '9') || // (c >= 'A' && c <= 'Z') || // (c >= 'a' && c <= 'z'))) if (!(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)) || ((c >= 192) && (c <= 214)) || ((c >= 216) && (c <= 246)) || ((c >= 248) && (c <= 255)) || ((c >= '0') && (c <= '9')) || (c == 176))) (*ithere) = ' '; ithere++; } } // the document text for 'docnum' is placed in 'output' // docTargetDocument returns 'true' if it was able to // try to get a document // collection is needed to see if an index from the // collection is loaded. If no index has been loaded // defaultindex is needed to load one bool mgsearchclass::docTargetDocument(const text_t &defaultindex, const text_t &collection, int docnum, text_t &output) { bool databaseloaded = true; output.clear(); // make sure index is level 2 ////// this changed with new naming scheme in new building software ///// i.e paragraph level index no longer contain number '3' but begin ///// with letter 'p' text_t db_loaded = db_loaded_name; if (!db_loaded.empty()) { text_t::const_iterator here = db_loaded.begin(); text_t::const_iterator end = db_loaded.end(); //while (here != end) { // if (*here == '3') // databaseloaded = false; // here ++; //} char separator = '/'; text_t db; int found = 0; #ifdef __WIN32__ separator = '\\'; #endif; // strip away path to db and following collection name end --; while (end != here) { if (*end == separator) { if (found) break; else {db.clear(); found = 1; end--; continue;} } db.push_back(*end); end --; } // string will have been reversed above so see if last // character is 'p' if (db[db.size()-1] == 'p') databaseloaded = false; } // find out if the database is already loaded // this is needed because a different index (but valid one) // might be already loaded. // this comparison is needed because 'load_database' // is now more oriented towards indexes if (databaseloaded == true) { text_t::const_iterator here = collection.begin(); text_t::const_iterator end = collection.end(); char *dbhere = &db_loaded_name[strlen(db_loaded_name) - collection.size()]; // assumes collection shorter than db_loaded_name while (here != end) { if (*here != *dbhere) { databaseloaded = false; break; } here++; dbhere++; } } // try and load the database if (!databaseloaded) { // get the names of the index and text suffixes char *idxsuffix = (getindexsuffix (collection, defaultindex)).getcstr(); assert (idxsuffix != NULL); char *txtsuffix = (gettextsuffix (collection)).getcstr(); assert (txtsuffix != NULL); #ifdef __WIN32__ char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); #else char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); #endif if (load_database(ccollectdir, idxsuffix, txtsuffix)) databaseloaded = true; else databaseloaded = false; // free up the c strings delete idxsuffix; delete txtsuffix; delete ccollectdir; } if (databaseloaded) { // retrieve the document from mg char docstr[32]; sprintf(docstr, "%i", docnum); mgq_ask(".set mode text"); mgq_ask(".set query docnums"); mgq_ask(docstr); mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output); } return databaseloaded; }