/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

// NOTE(review): the template arguments on the vector<...> declarations
// throughout this file had been stripped in the checked-in copy (most
// likely angle-bracket/HTML mangling).  They are restored below, inferred
// from usage: phrases hold termfreqclassarray, query parameter lists hold
// queryparamclass, and docorder holds int document numbers.

#include "mgqueryfilter.h"
#include "fileutil.h"
#include "phrasesearch.h"
#include "mgsearch.h"
#include "phrases.h"


///////////////////////////////
// methods for resultsorderer_t
///////////////////////////////

resultsorderer_t::resultsorderer_t() {
  clear ();
}

// Reset to the default ordering: by document weight only, with no
// document set attached.
void resultsorderer_t::clear() {
  compare_phrase_match = false;
  compare_terms_match = false;
  compare_doc_weight = true;

  docset = NULL;
}

// Strict-weak-ordering comparator over mg document numbers, used with
// sort() on docresultsclass::docorder.  Sorts "better" documents first:
// optionally by number of phrase matches, then by number of query terms
// matched, then by document weight, with the raw document number as the
// final tie-breaker so the ordering is total and deterministic.
// Document numbers that are not in docset are pushed to the end.
bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
  if (docset == NULL) return t1>t2;

  docresultmap::iterator t1_here = docset->find(t1);
  docresultmap::iterator t2_here = docset->find(t2);
  docresultmap::iterator end = docset->end();

  // sort all the document numbers not in the document set to
  // the end of the list
  if (t1_here == end) {
    if (t2_here == end) return t1>t2;
    else return true;
  } else if (t2_here == end) return false;

  if (compare_phrase_match) {
    if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
  }

  if (compare_terms_match) {
    if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
  }

  if (compare_doc_weight) {
    if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
  }

  return t1>t2;
}


/////////////////////////////////
// functions for mgqueryfilterclass
/////////////////////////////////

// Handle mg-specific configuration keys on top of the generic
// queryfilterclass configuration.  "indexstem" is forwarded to the
// underlying mgsearchclass.
void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  queryfilterclass::configure (key, cfgline);

  if (key == "indexstem") {
    ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
  }
}

// loads up phrases data structure with any phrases (that's the quoted bits)
// occuring in the querystring
//
// Walks the query string character by character, counting term breaks so
// that term number "termcount" lines up with orgterms[termcount], and
// collects the terms that fall between a pair of double quotes.  Only
// quoted groups of two or more terms are recorded as phrases.
// NOTE(review): assumes orgterms holds the parsed query terms in the same
// order they appear in querystring -- confirm against the caller.
void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
					   const termfreqclassarray &orgterms,
					   vector<termfreqclassarray> &phrases) {
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  termfreqclassarray tmpterms;

  int termcount = 0;
  bool foundquote = false;
  bool foundbreak = false;
  bool start = true;
  while (here != end) {
    if (*here == '\"') {
      if (foundquote) {
	// closing quote: flush the term we are currently inside (if any)
	if (!foundbreak && !start) {
	  tmpterms.push_back (orgterms[termcount]);
	  ++termcount;
	}
	// only a quoted group of 2+ terms counts as a phrase
	if (tmpterms.size() > 1) {
	  phrases.push_back (tmpterms);
	}
	tmpterms.erase (tmpterms.begin(), tmpterms.end());

	foundquote = false;
	foundbreak = true;
      } else foundquote = true;
    } else if (!is_unicode_letdig(*here)) {
      // found a break between terms
      if (!foundbreak && !start) {
	if (foundquote) {
	  tmpterms.push_back (orgterms[termcount]);
	}
	++termcount;
      }
      foundbreak = true;
    } else {
      start = false;
      foundbreak = false;
    }
    ++here;
  }
}

// do aditional query processing
//
// Extracts any quoted phrases from the query string and, for each phrase,
// runs an exact phrase search over every matched document, incrementing
// that document's num_phrase_match count for each phrase it contains.
// Sets num_phrases as a side effect (read later by filter()).
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
				       queryresultsclass &queryresults) {
  // post-process the results if needed
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
      while (this_phrase != end_phrase) {

	// process each of the matched documents
	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	while (docs_here != docs_end) {
	  if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr,
				 queryparams.index, queryparams.subcollection,
				 queryparams.language, longindex,
				 queryparams.collection, *this_phrase,
				 (*docs_here).second.docnum)) {
	    ++docs_here->second.num_phrase_match;
	  }
	  ++docs_here;
	}
	++this_phrase;
      }
    }
  }
}

// do query that might involve multiple sub queries
// textsearchptr and db_ptr are assumed to be valid
//
// Runs each queryparamclass in query_params through mg, post-processes
// phrase matches where requested, and combines the per-query document
// sets into multiresults using each query's combinequery operator
// ("and"/"or"/"not").  Term info from all sub-queries is appended and
// later uniqued.  On search failure sets err = systemProblem and returns.
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;
    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	num_phrases = 0;
      }
      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;
      } else {
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	multiresults.docs_matched = multiresults.docs.docset.size();
	// a combined result is exact, whatever the sub-queries reported
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}

// Sort the matched documents, phrase matches first (then terms matched /
// weight / doc number via resultsorderer_t's defaults).
void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
					   docresultsclass &docs) {
  resultsorderer_t resultsorderer;
  resultsorderer.compare_phrase_match = true;
  resultsorderer.docset = &(docs.docset);

  // first get a list of document numbers
  docs.docnum_order();

  sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
}

mgqueryfilterclass::mgqueryfilterclass ()
  : queryfilterclass() {
  num_phrases = 0;
}

mgqueryfilterclass::~mgqueryfilterclass () {
}

// Main entry point: run the (possibly multi-part) query described by
// request, then assemble the matching-document and term-information
// sections of response.  Sets err on configuration or system problems.
void mgqueryfilterclass::filter (const FilterRequest_t &request,
				 FilterResponse_t &response,
				 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;

  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }

  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
    err = configurationError;
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
	   << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);

  // do any mg specific diddling with query parameters that may be required
  mg_parse_query_params (request, queryfilterparams, startresults,
			 endresults, phrasematch, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    // only want to sort the docs if we have done a ranked search or there were phrases
    if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
      sort_doc_results (request, queryresults.docs);
    }

    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // documents containing matching phrases will be sorted to the top so
    // we can break out once we're past those that match the PhraseMatch
    // option -- "all_phrases" = return only those documents containing all
    //                           phrases in query string
    //           "some_phrases" = return only those documents containing
    //                            at least 1 of the phrases in the document
    //           "all_docs" = return all documents regardless
    if (num_phrases > 0) {
      int numdocs = 0;
      while (docorder_here != docorder_end) {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
	if (((phrasematch == "all_phrases") &&
	     ((*docset_here).second.num_phrase_match < num_phrases)) ||
	    ((phrasematch == "some_phrases") &&
	     ((*docset_here).second.num_phrase_match < 1))) {
	  // past the documents that satisfy the PhraseMatch option;
	  // truncate the match count here
	  queryresults.docs_matched = numdocs;
	  break;
	}
	++numdocs;
	++docorder_here;
      }
    }

    if (endresults == -1) endresults = MAXNUMDOCS;
    docorder_here = queryresults.docs.docorder.begin();
    while (docorder_here != docorder_end) {
      if (resultnum > endresults || resultnum > queryresults.docs_matched) break;

      // translate the document number
      if (!translate(db_ptr, *docorder_here, trans_OID)) {
	logout << text_t2ascii
	       << "warning: could not translate mg document number \""
	       << *docorder_here << "\" to OID.\n\n";
      } else {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// see if there is a result for this number,
	// if it is in the request set (or the request set is empty)
	if (docset_here != queryresults.docs.docset.end() &&
	    (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
	  if (resultnum >= startresults) {
	    // add this document
	    resultdoc.OID = trans_OID;
	    resultdoc.result_num = resultnum;
	    resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
	    // these next two are not available on all versions of mg
	    resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	    resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

	    response.docInfo.push_back (resultdoc);
	  }
	  ++resultnum;
	}
      }

      ++docorder_here;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;

      // the term variants are only reported once, on the first term
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open

  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

// mg-specific rewriting of the parsed query parameters before the query
// is run.  For queries containing real (multi-word) phrases: switch to a
// boolean search, ask for all documents, and move the search to the
// finest-granularity index of the same type that exists (paragraph, then
// section).  The GSDL_BBC_COLLECTION section additionally strips
// non-alphanumeric characters for the ProgNumber index.
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
						vector<queryparamclass> &query_params,
						int &/*startresults*/,
						int &/*endresults*/,
						text_t &/*phrasematch*/,
						ostream &/*logout*/) {
  // outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
    // [kjdon 12/2005] we don't want to do a phrase search if the only
    // phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);
    if (phrases.size() > 0) {
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex;
      text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;

      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	if (!found && granularity == "document" &&
	    indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)
    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex;
    text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];

    if (indextype == "ProgNumber") {
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	if ((*here >= 'a' && *here <= 'z') ||
	    (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif

    ++query_here;
  }
}