/**********************************************************************
 *
 * mgsearch.cpp --
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * PUT COPYRIGHT NOTICE HERE
 *
 * $Id: mgsearch.cpp 265 1999-06-09 00:41:32Z sjboddie $
 *
 *********************************************************************/

/*
   $Log$
   Revision 1.6  1999/06/09 00:41:32  sjboddie
   phrase searching now uses case-folding if it's turned on

   Revision 1.5  1999/02/21 22:31:35  rjmcnab
   Removed locateinfo.

   Revision 1.4  1999/02/03 01:13:27  sjboddie
   Got interface to handle subcollections and language subcollections -
   committed changes made to some of the collections

   Revision 1.3  1999/01/19 01:38:17  rjmcnab
   Made the source more portable.

   Revision 1.2  1999/01/12 01:51:02  rjmcnab
   Standard header.

   Revision 1.1  1999/01/08 09:02:16  rjmcnab
   Moved from src/library.

 */


#include "gsdlconf.h"
#include "mgsearch.h"
#include "fileutil.h"

// NOTE(review): the names of the angle-bracket headers in this file were
// lost (only bare "#include" tokens survived).  The names below were
// restored from what the code actually uses (strstr, sprintf, tolower,
// assert, gdbm) -- confirm against the project's history.
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace/std/iostream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#else
#  include <iostream>
#endif

#if defined(__WIN32__)
// gdbm stuff
#  include "autoconf.h"
#  include "systems.h"
#  include "gdbmconst.h"
#  include "gdbm.h"
#else
#  include <gdbm.h>
#endif

#include <assert.h>

#include "mgq.h"
// #include "locateinfo.h"
#include "gsdlunicode.h"
#include "unitool.h"


/////////////
// globals //
/////////////

// utf-8 C string holding the quoted (phrase) part of the current query;
// set by mgsearchclass::submitquery and read by ourquerycallback
static char *quotedquery = NULL;

// non-zero when phrase matching in ourquerycallback should ignore case;
// set by mgsearchclass::search from queryparams.casefolding
static int casefold;


/////////////////////////
// index map functions //
/////////////////////////

// Splits a map entry of the form "real->dir" into its two halves.
// Everything before the first '-' is copied to realpart; the '-' and a
// directly following '>' are skipped; whatever remains is copied to
// dirpart.  Both outputs are cleared first, so an entry without a '-'
// yields an empty dirpart.
void getrealdir (const text_t &map, text_t &realpart, text_t &dirpart) {
  realpart.clear ();
  dirpart.clear();

  text_t::const_iterator here = map.begin();
  text_t::const_iterator end = map.end();

  // get the real index
  while (here != end && *here != '-') {
    realpart.push_back(*here);
    here++;
  }

  // skip the "->" separator
  if (here != end) here++;
  if (here != end && *here == '>') here++;

  // get the dir index
  while (here != end) {
    dirpart.push_back(*here);
    here++;
  }
}

// Builds a combined real index and a combined dir index by splitting the
// index, subcollection and language map entries with getrealdir and
// concatenating the real parts (into realindex) and the dir parts (into
// dirindex), in that order.
void getrealdirindex (const text_t &indexmap, const text_t &subcollectionmap,
                      const text_t &languagemap, text_t &realindex,
                      text_t &dirindex) {
  text_t real, dir;

  realindex.clear();
  dirindex.clear();

  getrealdir (indexmap, real, dir);
  realindex += real;
  dirindex += dir;

  getrealdir (subcollectionmap, real, dir);
  realindex += real;
  dirindex += dir;

  getrealdir (languagemap, real, dir);
  realindex += real;
  dirindex += dir;
}

//bool isdirindex (const text_tarray &indexmap, const text_t &dirindex) {
//  text_tarray::const_iterator here = indexmap.begin();
//  text_tarray::const_iterator end = indexmap.end();
//  text_t maprealindex, mapdirindex;
//  while (here != end) {
//    getrealdirindex (*here, maprealindex, mapdirindex);
//    if (mapdirindex == dirindex) return true;
//    here++;
//  }
//  return false;
//}

// Splits a ':'-separated real index into its index, subcollection and
// language parts.  The first two fields together ("a:b") form the index.
// With exactly three fields the third is the language if languagemap is
// non-empty, otherwise the subcollection.  With exactly four fields the
// third is the subcollection and the fourth the language.  With fewer
// than two fields all outputs are left empty.
void getrealindexparts (const text_tarray &/*indexmap*/,
                        const text_tarray &/*subcollectionmap*/,
                        const text_tarray &languagemap,
                        const text_t &realindex, text_t &index,
                        text_t &subcollection, text_t &language) {
  index.clear();
  subcollection.clear();
  language.clear();

  text_tarray parts;
  splitchar (realindex.begin(), realindex.end(), ':', parts);
  int numparts = parts.size();

  if (numparts >= 2) {
    index = parts[0] + ":" + parts[1];
    if (numparts == 3) {
      if (languagemap.empty()) subcollection = parts[2];
      else language = parts[2];
    } else if (numparts == 4) {
      subcollection = parts[2];
      language = parts[3];
    }
  }
}

// Splits a dir index into its index, subcollection and language parts.
// Only dir indexes of exactly 3, 5 or 7 characters are accepted; any other
// length leaves all outputs empty.  The first three characters are the
// index part.  In a 5 character dir index the last two characters are the
// language if languagemap is non-empty, otherwise the subcollection.  In a
// 7 character dir index characters 3-4 are the subcollection and 5-6 the
// language.
void getdirindexparts (const text_tarray &/*indexmap*/,
                       const text_tarray &/*subcollectionmap*/,
                       const text_tarray &languagemap,
                       const text_t &dirindex, text_t &index,
                       text_t &subcollection, text_t &language) {
  index.clear();
  subcollection.clear();
  language.clear();

  int indexsize = dirindex.size();
  if (indexsize != 3 && indexsize != 5 && indexsize != 7) return;

  text_t::const_iterator dibegin = dirindex.begin();
  text_t::const_iterator diend = dirindex.end();

  // first three characters make up index part
  index = substr(dibegin, dibegin+3);

  if (indexsize == 5) {
    if (languagemap.empty()) subcollection = substr(dibegin+3, dibegin+5);
    else language = substr(dibegin+3, dibegin+5);
  } else if (indexsize == 7) {
    subcollection = substr(dibegin+3, dibegin+5);
    language = substr(dibegin+5, diend);
  }
}

// Returns true if realindex names an index known to the maps: its index
// part must equal the real part of some indexmap entry and, when present,
// its subcollection and language parts must equal the real part of some
// subcollectionmap / languagemap entry respectively.
bool isrealindex (const text_tarray &indexmap,
                  const text_tarray &subcollectionmap,
                  const text_tarray &languagemap,
                  const text_t &realindex) {
  text_t index, subcollection, language, realpart, dirpart;
  getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
                     index, subcollection, language);

  // check index part
  text_tarray::const_iterator here = indexmap.begin();
  text_tarray::const_iterator end = indexmap.end();
  bool exists = false;
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (realpart == index) {exists = true; break;}
    here++;
  }
  if (!exists) return false;

  // check subcollection part if there is one
  if (!subcollection.empty()) {
    here = subcollectionmap.begin();
    end = subcollectionmap.end();
    exists = false;
    while (here != end) {
      getrealdir (*here, realpart, dirpart);
      if (realpart == subcollection) {exists = true; break;}
      here++;
    }
    if (!exists) return false;
  }

  // check language part if there is one
  if (!language.empty()) {
    here = languagemap.begin();
    end = languagemap.end();
    exists = false;
    while (here != end) {
      getrealdir (*here, realpart, dirpart);
      if (realpart == language) {exists = true; break;}
      here++;
    }
    if (!exists) return false;
  }

  return true;
}

// Translates a dir index into the corresponding real index using the
// three maps.  Returns an empty string if the index part of dirindex does
// not match the dir part of any indexmap entry.  The real parts of the
// first matching subcollectionmap and languagemap entries (if any) are
// appended, each preceded by ':'.
text_t dir2realindex (const text_tarray &indexmap,
                      const text_tarray &subcollectionmap,
                      const text_tarray &languagemap,
                      const text_t &dirindex) {
  text_t index, subcollection, language, realpart, dirpart, realindex;
  getdirindexparts (indexmap, subcollectionmap, languagemap, dirindex,
                    index, subcollection, language);

  // get index part
  text_tarray::const_iterator here = indexmap.begin();
  text_tarray::const_iterator end = indexmap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (dirpart == index) {realindex += realpart; break;}
    here++;
  }
  if (realindex.empty()) return "";

  // get subcollection part
  here = subcollectionmap.begin();
  end = subcollectionmap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (dirpart == subcollection) {realindex += ":" + realpart; break;}
    here++;
  }

  // get language part
  here = languagemap.begin();
  end = languagemap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (dirpart == language) {realindex += ":" + realpart; break;}
    here++;
  }

  return realindex;
}

// Translates a real index into the corresponding dir index using the
// three maps.  Returns an empty string if the index part of realindex
// does not match the real part of any indexmap entry (or if the matching
// entry has an empty dir part).  The dir parts of the first matching
// subcollectionmap and languagemap entries (if any) are appended directly,
// without a separator.
text_t real2dirindex (const text_tarray &indexmap,
                      const text_tarray &subcollectionmap,
                      const text_tarray &languagemap,
                      const text_t &realindex) {
  text_t index, subcollection, language, realpart, dirpart, dirindex;
  getrealindexparts (indexmap, subcollectionmap, languagemap, realindex,
                     index, subcollection, language);

  // get index part
  text_tarray::const_iterator here = indexmap.begin();
  text_tarray::const_iterator end = indexmap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (realpart == index) {dirindex += dirpart; break;}
    here++;
  }
  if (dirindex.empty()) return "";

  // get subcollection part
  here = subcollectionmap.begin();
  end = subcollectionmap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (realpart == subcollection) {dirindex += dirpart; break;}
    here++;
  }

  // get language part
  here = languagemap.begin();
  end = languagemap.end();
  while (here != end) {
    getrealdir (*here, realpart, dirpart);
    if (realpart == language) {dirindex += dirpart; break;}
    here++;
  }

  return dirindex;
}

// Returns a copy of realindex with every character that is not an ASCII
// digit or ASCII letter removed.
text_t real2macroindex (const text_t &realindex) {
  text_t macroindex;

  text_t::const_iterator here = realindex.begin();
  text_t::const_iterator end = realindex.end();
  unsigned short c;
  while (here != end) {
    c = *here;
    if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
        (c >= 'a' && c <= 'z')) {
      macroindex.push_back (*here);
    }
    here++;
  }

  return macroindex;
}

// Returns true if realindex starts with the string "document" (this
// includes realindex being exactly "document").
bool isdoclevelindex (const text_t &realindex) {
  const char *docstr = "document";

  text_t::const_iterator here = realindex.begin ();
  text_t::const_iterator end = realindex.end ();

  while (here != end && *docstr != '\0') {
    // compare at full character width so that a non-ASCII character whose
    // low byte happens to equal the expected letter does not match
    if (*here != (unsigned short)(unsigned char)(*docstr)) return false;
    docstr++;
    here++;
  }

  // a match only if all of "document" was consumed
  return (*docstr == '\0');
}

// Currently always returns an empty string; the lookup through the index
// map is commented out below.
text_t getdoclevelindex (const text_tarray &/*indexmap*/) {
  //text_tarray::const_iterator here = indexmap.begin();
  //text_tarray::const_iterator end = indexmap.end();
  //text_t maprealindex, mapdirindex;

  //  while (here != end) {
  //    getrealdirindex (*here, maprealindex, mapdirindex);
  //    if (isdoclevelindex (maprealindex)) return maprealindex;
  //    here++;
  //}

  return "";
}


////////////////////////
// callback functions //
////////////////////////

// This routine is called for each document found in a search
// it assumes that cache_num is set up correctly to point to
// a suitable result cache
//
// info must point to the queryresultsclass that collects the results.
// When a quoted phrase is active (global quotedquery is non-empty) and the
// document text UDoc is supplied, the document is only recorded if UDoc
// contains the phrase; with casefold set both strings are lower-cased in
// place before the comparison.  Always returns 0.
int ourquerycallback(char *UDoc, int /*ULen*/, int DocNum, float Weight, void *info) {
  queryresultsclass *queryresults = (queryresultsclass * )info;

  // check the returned document for the presence of the
  // quoted part of the query, if there was one

  //  if (UDoc != NULL && quotedquery != NULL &&
  //      quotedquery[0] != '\0' && strstr (UDoc, quotedquery) == NULL) return 0;

  if (UDoc != NULL && quotedquery != NULL && quotedquery[0] != '\0') {
    if (casefold) {
      int len;
      // tolower() requires a value representable as unsigned char (or
      // EOF); utf-8 bytes >= 0x80 are negative when char is signed, so
      // go through unsigned char first
      for (len = 0; quotedquery[len] != '\0'; len ++)
        quotedquery[len] = (char)tolower ((unsigned char)quotedquery[len]);
      for (len = 0; UDoc[len] != '\0'; len ++)
        UDoc[len] = (char)tolower ((unsigned char)UDoc[len]);
    }
    if (strstr (UDoc, quotedquery) == NULL) return 0;
  }

  // append this entry to the document results
  docresultclass docresult;
  docresult.docnum = DocNum;
  docresult.docweight = Weight;

  queryresults->docs.push_back(docresult);

  return 0;
}

// This callback is called once for each term in the query
//
// Records the term (converted with to_uni) and its frequency in the
// queryresultsclass pointed to by info.  Always returns 0.
int termfreqcallback(char *Word, int ULen, int Freq,
                     float /*Weight*/, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;

  text_t term;
  term.setcarr(Word, ULen);
  termfreqclass termfreq;
  termfreq.termstr = to_uni(term);
  termfreq.termfreq = Freq;
  queryresults->terms.push_back(termfreq);

  return 0;
}

// this callback is called once for each variation of
each term int termscallback(char *Word, int ULen, int /*Freq*/, float /*Weight*/, void *info) { text_t term; term.setcarr(Word, ULen); queryresultsclass *queryresults = (queryresultsclass *)info; queryresults->termvariants.push_back(to_uni(term)); return 0; } // This callback is for getting document text int doctextcallback(char *Word, int ULen, int /*Freq*/, float /*Weight*/, void *info) { text_t *output = (text_t *)info; if (output == NULL) return 0; output->clear(); utf8inconvertclass inconvert; convertclass::status_t status; inconvert.reset (); inconvert.setinput (Word, ULen); inconvert.convert (*output, status); // replace all control-Cs with spaces text_t::iterator here = output->begin(); text_t::iterator end = output->end(); while (here != end) { if (*here == '\x3') *here = ' '; here++; } return 0; } static text_t getindexsuffix (const text_t &collection, const text_t &index) { text_t indexsuffix = "index"; indexsuffix = filename_cat (indexsuffix, index); indexsuffix = filename_cat (indexsuffix, collection); return indexsuffix; } //////////////////// // mgsearch class // //////////////////// mgsearchclass::mgsearchclass () { cache = new querycache (RESULTCACHESIZE); } mgsearchclass::~mgsearchclass () { if (cache != NULL) { delete cache; cache = NULL; } } void mgsearchclass::setcollectdir (const text_t &thecollectdir) { collectdir = thecollectdir; } bool mgsearchclass::search(const queryparamclass &queryparams, queryresultsclass &queryresults) { bool databaseloaded = true; assert (cache != NULL); queryresults.clear(); // first check the cache if (cache->find(queryparams, queryresults)) return true; // make sure there is a query to be processed text_t::const_iterator queryhere = queryparams.querystring.begin(); text_t::const_iterator queryend = queryparams.querystring.end(); while (queryhere != queryend) { if (is_unicode_letdig (*queryhere)) break; queryhere++; } // if we reached the end of the query string without finding // any alphanumeric characters then 
return no results (and say // the database was loaded) if (queryhere == queryend) return true; casefold = queryparams.casefolding; // get the names of the collection, index and text suffixes char *ccollection = queryparams.collection.getcstr(); assert (ccollection != NULL); char *idxsuffix = (getindexsuffix (queryparams.collection, queryparams.search_index)).getcstr(); assert (idxsuffix != NULL); char *txtsuffix = (getindexsuffix (queryparams.collection, "text")).getcstr(); assert (txtsuffix != NULL); #ifdef __WIN32__ char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); #else char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); #endif if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) { setsearchmode (queryparams); submitquery (queryparams); getresults (queryresults); } else databaseloaded = false; // free up the c strings delete ccollection; delete idxsuffix; delete txtsuffix; delete ccollectdir; return databaseloaded; } void mgsearchclass::setsearchmode (const queryparamclass &queryparams) { mgq_ask(".set expert true"); mgq_ask(".set accumulator_method list"); mgq_ask(".set max_accumulators 50000"); mgq_ask(".set verbatim true"); mgq_ask(".unset skip_dump"); mgq_ask(".set mode docnums"); switch (queryparams.search_type) { case 0: mgq_ask(".set query boolean"); break; case 1: mgq_ask(".set query ranked"); break; } switch (queryparams.casefolding) { case 1: mgq_ask(".set casefold on"); break; case 0: mgq_ask(".set casefold off"); break; } switch (queryparams.stemming) { case 1: mgq_ask(".set stem on"); break; case 0: mgq_ask(".set stem off"); break; } mgq_ask(".set heads_length 150"); char maxdocstr[32]; sprintf(maxdocstr, ".set maxdocs %i", queryparams.maxdocs); mgq_ask(maxdocstr); } void mgsearchclass::submitquery (const queryparamclass &queryparams) { // sort out the query string text_t ttquerystring = queryparams.querystring; text_t ttquotedquery; extractquoted (ttquerystring, ttquotedquery); 
filterquery (ttquerystring); // turn the strings into c strings for mg if (quotedquery != NULL) // quotedquery is a global { delete quotedquery; quotedquery = NULL; } // quotedquery will be deleted on the next call to this function quotedquery = to_utf8(ttquotedquery).getcstr (); char *querystring = to_utf8(ttquerystring).getcstr(); // submit the query mgq_ask(querystring); delete querystring; } void mgsearchclass::getresults (queryresultsclass &queryresults) { if (quotedquery[0] == '\0') { // don't need the text mgq_results(result_docnums, 0, MAXNUMDOCS, ourquerycallback, (void *)(&queryresults)); } else { // we need the text for this one mgq_results(result_docs, 0, MAXNUMDOCS, ourquerycallback, (void *)(&queryresults)); } // get the term frequencies mgq_results(result_termfreqs, 0, MAXNUMTERMS, termfreqcallback, (void *)(&queryresults)); mgq_results(result_terms, 0, MAXNUMTERMS, termscallback, (void *)(&queryresults)); queryresults.sortqueryterms(); queryresults.uniqqueryterms(); } void mgsearchclass::extractquoted (text_t &ttquerystring, text_t &ttquotedquery) { ttquotedquery.clear(); text_t::iterator ithere = ttquerystring.begin (); text_t::iterator itend = ttquerystring.end (); bool inquote = false; while (ithere != itend) { if ((*ithere) == '\"') { if (!inquote) ttquotedquery.clear (); inquote = !inquote; *ithere = ' '; // delete the quote } else if (inquote) { ttquotedquery.push_back(*ithere); *ithere = ' '; } ithere++; } } void mgsearchclass::filterquery (text_t &ttquerystring) { text_t::iterator ithere = ttquerystring.begin (); text_t::iterator itend = ttquerystring.end (); // remove all non alphanumeric characters while (ithere != itend) { if (!is_unicode_letdig(*ithere)) (*ithere) = ' '; ithere++; } } // the document text for 'docnum' is placed in 'output' // docTargetDocument returns 'true' if it was able to // try to get a document // collection is needed to see if an index from the // collection is loaded. 
If no index has been loaded // defaultindex is needed to load one bool mgsearchclass::docTargetDocument(const text_t &defaultindex, const text_t &collection, int docnum, text_t &output) { int databaseloaded = 0; output.clear(); char *ccollection = collection.getcstr(); assert (ccollection != NULL); // see if we can make an appropriate database current databaseloaded = load_text_database (ccollection); // try and load the database if (!databaseloaded) { // get the names of the index and text suffixes char *idxsuffix = (getindexsuffix (collection, defaultindex)).getcstr(); assert (idxsuffix != NULL); char *txtsuffix = (getindexsuffix (collection, "text")).getcstr(); assert (txtsuffix != NULL); #ifdef __WIN32__ char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL); #else char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL); #endif databaseloaded = load_database(ccollection, ccollectdir, idxsuffix, txtsuffix); // free up the c strings delete idxsuffix; delete txtsuffix; delete ccollectdir; } // free up the c collection string delete ccollection; if (databaseloaded) { // retrieve the document from mg char docstr[32]; sprintf(docstr, "%i", docnum); mgq_ask(".set mode text"); mgq_ask(".set query docnums"); mgq_ask(docstr); mgq_results (result_docs, 0, 1, doctextcallback, (void *)&output); } return databaseloaded; }