[1324] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mgqueryfilter.cpp -- implementation of queryfilter for old mg
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
| 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
| 9 | *
|
---|
| 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
| 24 | *********************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "mgqueryfilter.h"
|
---|
| 27 | #include "fileutil.h"
|
---|
| 28 | #include "phrasesearch.h"
|
---|
| 29 | #include "mgsearch.h"
|
---|
[11002] | 30 | #include "phrases.h"
|
---|
[1324] | 31 |
|
---|
| 32 | ///////////////////////////////
|
---|
| 33 | // methods for resultsorderer_t
|
---|
| 34 | ///////////////////////////////
|
---|
| 35 |
|
---|
// Construct the orderer in its default state (see clear()): compare by
// document weight only, with no document set attached.
resultsorderer_t::resultsorderer_t() {
  clear ();
}
|
---|
| 39 |
|
---|
| 40 | void resultsorderer_t::clear() {
|
---|
| 41 | compare_phrase_match = false;
|
---|
| 42 | compare_terms_match = false;
|
---|
| 43 | compare_doc_weight = true;
|
---|
| 44 |
|
---|
| 45 | docset = NULL;
|
---|
| 46 | }
|
---|
| 47 |
|
---|
// Comparison operator used to rank document identifiers.  Returns true
// if t1 should be ordered before t2.  Enabled criteria are applied in
// order: number of phrase matches, number of query terms matched, then
// document weight; any remaining tie falls back to comparing the
// identifiers themselves so the ordering stays strict and deterministic.
bool resultsorderer_t::operator()(const text_t &t1, const text_t &t2) const {
  // with no document set attached we can only compare identifiers
  if (docset == NULL) return t1>t2;

  docresultmap::iterator t1_here = docset->find(t1);
  docresultmap::iterator t2_here = docset->find(t2);
  docresultmap::iterator end = docset->end();

  // sort all the document numbers not in the document set to
  // the end of the list
  // NOTE(review): returning true when t1 is missing orders t1 *before*
  // t2, which appears to put unknown documents at the front rather than
  // the end the comment above claims -- confirm the intended direction
  // before changing anything here.
  if (t1_here == end) {
    if (t2_here == end) return t1>t2;
    else return true;
  } else if (t2_here == end) return false;

  // more phrase matches ranks earlier
  if (compare_phrase_match) {
    if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
    if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
  }

  // more query terms matched ranks earlier
  if (compare_terms_match) {
    if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
    if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
  }

  // heavier (higher-weighted) documents rank earlier
  if (compare_doc_weight) {
    if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
    if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
  }

  // final tie-break on the identifiers themselves
  return t1>t2;
}
|
---|
| 79 |
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 | /////////////////////////////////
|
---|
| 84 | // functions for mgqueryfilterclass
|
---|
| 85 | /////////////////////////////////
|
---|
| 86 |
|
---|
[4193] | 87 |
|
---|
| 88 | void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
|
---|
| 89 | queryfilterclass::configure (key, cfgline);
|
---|
| 90 |
|
---|
[12314] | 91 | if (key == "indexstem") {
|
---|
[9937] | 92 | ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
|
---|
| 93 | }
|
---|
| 94 |
|
---|
[4193] | 95 | }
|
---|
| 96 |
|
---|
// Loads up the phrases data structure with any phrases (that's the
// quoted bits) occurring in the querystring.
//
// querystring -- the raw query text, scanned character by character
// orgterms    -- the query's terms, in the same order they occur in
//                querystring; termcount below indexes into this array
//                in lock-step with the term boundaries found in the text
//                (assumes the two stay in sync -- TODO confirm callers
//                guarantee this)
// phrases     -- output: one termfreqclassarray per quoted phrase that
//                contains at least two terms (single-word "phrases" are
//                deliberately dropped by the size() > 1 test)
void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
					   const termfreqclassarray &orgterms,
					   vector<termfreqclassarray> &phrases) {

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  // terms collected for the phrase currently being scanned
  termfreqclassarray tmpterms;

  int termcount = 0;          // index of the next term in orgterms
  bool foundquote = false;    // inside a quoted section?
  bool foundbreak = false;    // last char was a term separator?
  bool start = true;          // no term character seen yet at all
  while (here != end) {
    if (*here == '\"') {
      if (foundquote) {
	// closing quote: flush the term in progress (if any) ...
	if (!foundbreak && !start) {
	  tmpterms.push_back (orgterms[termcount]);
	  ++termcount;
	}
	// ... and record the phrase, but only if it has 2+ terms
	if (tmpterms.size() > 1) {
	  phrases.push_back (tmpterms);
	}
	tmpterms.erase (tmpterms.begin(), tmpterms.end());

	foundquote = false;
	foundbreak = true;
      } else foundquote = true;
    } else if (!is_unicode_letdig(*here)) {
      // found a break between terms
      if (!foundbreak && !start) {
	// the term just ended belongs to the current phrase only if we
	// are inside quotes; either way it consumes an orgterms slot
	if (foundquote) {
	  tmpterms.push_back (orgterms[termcount]);
	}
	++termcount;
      }
      foundbreak = true;
    } else {
      // an ordinary term character
      start = false;
      foundbreak = false;
    }
    ++here;
  }
}
|
---|
| 143 |
|
---|
| 144 | // do aditional query processing
|
---|
| 145 | void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
|
---|
| 146 | queryresultsclass &queryresults) {
|
---|
| 147 |
|
---|
| 148 | // post-process the results if needed
|
---|
| 149 | if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
|
---|
| 150 |
|
---|
| 151 | // get the terms between quotes (if any)
|
---|
| 152 | vector<termfreqclassarray> phrases;
|
---|
| 153 | get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
|
---|
| 154 |
|
---|
| 155 | num_phrases = phrases.size();
|
---|
| 156 | if (num_phrases > 0) {
|
---|
| 157 |
|
---|
| 158 | // get the long version of the index
|
---|
| 159 | text_t longindex;
|
---|
| 160 | indexmap.to2from (queryparams.index, longindex);
|
---|
| 161 |
|
---|
| 162 | vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
|
---|
| 163 | vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
|
---|
| 164 |
|
---|
| 165 | while (this_phrase != end_phrase) {
|
---|
| 166 |
|
---|
| 167 | // process each of the matched documents
|
---|
| 168 | docresultmap::iterator docs_here = queryresults.docs.docset.begin();
|
---|
| 169 | docresultmap::iterator docs_end = queryresults.docs.docset.end();
|
---|
| 170 | while (docs_here != docs_end) {
|
---|
[15558] | 171 | if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
|
---|
[1324] | 172 | queryparams.subcollection, queryparams.language,
|
---|
| 173 | longindex, queryparams.collection, *this_phrase,
|
---|
| 174 | (*docs_here).second.docnum)) {
|
---|
[9620] | 175 | ++docs_here->second.num_phrase_match;
|
---|
[1324] | 176 | }
|
---|
| 177 |
|
---|
[9620] | 178 | ++docs_here;
|
---|
[1324] | 179 | }
|
---|
[9620] | 180 | ++this_phrase;
|
---|
[1324] | 181 | }
|
---|
| 182 | }
|
---|
| 183 | }
|
---|
| 184 | }
|
---|
| 185 |
|
---|
| 186 |
|
---|
// Do a query that might involve multiple sub queries, combining the
// per-query results into multiresults according to each query's
// combinequery directive ("and"/"or"/"not").
// textsearchptr and db_ptr are assumed to be valid.
//
// request      -- the filter request (its filterResultOptions decide
//                 whether docs and/or term info are assembled)
// query_params -- one queryparamclass per sub query
// multiresults -- output: the combined results
// err          -- set to systemProblem if any sub search fails
// logout       -- stream for diagnostics
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);

  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed (phrase matching); note that
      // post_process also updates the num_phrases member
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	// NOTE(review): this resets num_phrases on every non-postprocessed
	// sub query, so a later sub query can clobber the count from an
	// earlier one -- confirm this is intended for multi-query requests
	num_phrases = 0;
      }

      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// merge this sub query into the running result set
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants (set insert keeps them unique)
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
|
---|
| 267 |
|
---|
| 268 |
|
---|
| 269 | void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 270 | docresultsclass &docs) {
|
---|
| 271 | resultsorderer_t resultsorderer;
|
---|
| 272 | resultsorderer.compare_phrase_match = true;
|
---|
| 273 | resultsorderer.docset = &(docs.docset);
|
---|
| 274 |
|
---|
| 275 | // first get a list of document numbers
|
---|
| 276 | docs.docnum_order();
|
---|
| 277 |
|
---|
| 278 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 |
|
---|
| 282 |
|
---|
| 283 | mgqueryfilterclass::mgqueryfilterclass ()
|
---|
| 284 | :queryfilterclass() {
|
---|
| 285 |
|
---|
| 286 | num_phrases = 0;
|
---|
[27064] | 287 |
|
---|
| 288 | FilterOption_t filtopt;
|
---|
| 289 | // -- onePerQuery PhraseMatch enumerated
|
---|
| 290 | filtopt.name = "PhraseMatch";
|
---|
| 291 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 292 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 293 | filtopt.defaultValue = "some_phrases";
|
---|
| 294 | filtopt.validValues.push_back ("all_phrases");
|
---|
| 295 | filtopt.validValues.push_back ("some_phrases");
|
---|
| 296 | filtopt.validValues.push_back ("all_docs");
|
---|
| 297 | filterOptions["PhraseMatch"] = filtopt;
|
---|
| 298 |
|
---|
[1324] | 299 | }
|
---|
| 300 |
|
---|
// Destructor -- nothing to release beyond what the base class handles.
mgqueryfilterclass::~mgqueryfilterclass () {
}
|
---|
| 303 |
|
---|
// Main entry point: run the query described by request and fill in
// response with matching documents and/or term information.
//
// request  -- the filter request (options, result flags, doc set)
// response -- output: docInfo, termInfo, numDocs, isApprox
// err      -- configurationError if db_ptr/textsearchptr are unset,
//             systemProblem if the database or a search fails
// logout   -- stream for diagnostics
void mgqueryfilterclass::filter (const FilterRequest_t &request,
				 FilterResponse_t &response,
				 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
    err = configurationError;
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
	   << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters, starting from the configured defaults
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);
  // do any mg specific diddling with query parameters that may be required
  //  mg_parse_query_params (request, queryfilterparams, startresults,
  //			 endresults, phrasematch, logout);


  // do query (may be several combined sub queries)
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    // only want to sort the docs if we have done a ranked search or there were phrases
    if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
      sort_doc_results (request, queryresults.docs);
    }
    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();

    // documents containing matching phrases will be sorted to the top so
    // we can break out once we're past those that match the PhraseMatch
    // option -- "all_phrases" = return only those documents containing all
    // phrases in query string
    // "some_phrases" = return only those documents containing
    // at least 1 of the phrases in the document
    // "all_docs" = return all documents regardless
    if (num_phrases > 0) {
      int numdocs = 0;
      while (docorder_here != docorder_end) {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// first doc failing the PhraseMatch criterion truncates the count
	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
	    ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
	  queryresults.docs_matched = numdocs;
	  break;
	}
	++numdocs;
	++docorder_here;
      }
    }

    if (endresults == -1) endresults = MAXNUMDOCS;
    docorder_here = queryresults.docs.docorder.begin();
    while (docorder_here != docorder_end) {
      if (resultnum > endresults || resultnum > queryresults.docs_matched) break;

      // translate the document number to an OID
      if (!translate(db_ptr, *docorder_here, trans_OID)) {
	// NOTE(review): log message is missing a space before "to OID"
	logout << text_t2ascii
	       << "warning: could not translate mg document number \""
	       << *docorder_here << "\"to OID.\n\n";

      } else {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// see if there is a result for this number,
	// if it is in the request set (or the request set is empty)
	if (docset_here != queryresults.docs.docset.end() &&
	    (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
	  if (resultnum >= startresults) {
	    // add this document
	    resultdoc.OID = trans_OID;
	    resultdoc.result_num = resultnum;
	    // scale the weight to an integer ranking, rounding to nearest
	    resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

	    // these next two are not available on all versions of mg
	    resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	    resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

	    response.docInfo.push_back (resultdoc);
	  }

	  ++resultnum;
	}
      }

      ++docorder_here;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // all term variants are attached to the first term entry only
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
|
---|
| 461 |
|
---|
// Parse the request's filter options into per-query parameters, then
// apply MG-specific adjustments: read the PhraseMatch option, and for
// queries containing real (multi-word) phrases switch to a boolean
// search on the finest-granularity index available with maxdocs = all.
//
// request      -- the filter request holding the raw options
// query_params -- in/out: filled by the base class, then adjusted here
// startresults, endresults -- in/out: result window (base class)
// phrasematch  -- out: value of the PhraseMatch option
// logout       -- stream for diagnostics
void mgqueryfilterclass::parse_query_params (const FilterRequest_t &request,
					     vector<queryparamclass> &query_params,
					     int &startresults, int &endresults,
					     text_t &phrasematch, ostream &logout) {

  queryfilterclass::parse_query_params (request, query_params,
					startresults, endresults, logout);

  phrasematch = filterOptions["PhraseMatch"].defaultValue;

  // is there a better way to do this than iterate through all the options again??
  OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
  OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
  while (options_here != options_end) {
    if ((*options_here).name == "PhraseMatch") {
      phrasematch = (*options_here).value;
      break;
    }
    ++options_here;
  }

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {

    // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);

    if (phrases.size() > 0) {
      // boolean search
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      // NOTE(review): splitindex[1] is read unchecked -- assumes the long
      // index name always contains a ':' separator; confirm with indexmap.
      text_t longindex; text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;
      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	// prefer paragraph granularity when it exists ...
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	// ... otherwise fall back from document to section granularity
	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)

    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex; text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];
    if (indextype == "ProgNumber") {
      // keep only ASCII letters and digits
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif
    ++query_here;
  }
}
|
---|
| 559 |
|
---|