/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
| 25 |
|
---|
| 26 | #include "mgqueryfilter.h"
|
---|
| 27 | #include "fileutil.h"
|
---|
| 28 | #include "phrasesearch.h"
|
---|
| 29 | #include <assert.h>
|
---|
| 30 | #include "mgsearch.h"
|
---|
[11002] | 31 | #include "phrases.h"
|
---|
[1324] | 32 |
|
---|
| 33 | ///////////////////////////////
|
---|
| 34 | // methods for resultsorderer_t
|
---|
| 35 | ///////////////////////////////
|
---|
| 36 |
|
---|
| 37 | resultsorderer_t::resultsorderer_t() {
|
---|
| 38 | clear ();
|
---|
| 39 | }
|
---|
| 40 |
|
---|
| 41 | void resultsorderer_t::clear() {
|
---|
| 42 | compare_phrase_match = false;
|
---|
| 43 | compare_terms_match = false;
|
---|
| 44 | compare_doc_weight = true;
|
---|
| 45 |
|
---|
| 46 | docset = NULL;
|
---|
| 47 | }
|
---|
| 48 |
|
---|
| 49 | bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
|
---|
| 50 | if (docset == NULL) return t1>t2;
|
---|
| 51 |
|
---|
| 52 | docresultmap::iterator t1_here = docset->find(t1);
|
---|
| 53 | docresultmap::iterator t2_here = docset->find(t2);
|
---|
| 54 | docresultmap::iterator end = docset->end();
|
---|
| 55 |
|
---|
| 56 | // sort all the document numbers not in the document set to
|
---|
| 57 | // the end of the list
|
---|
| 58 | if (t1_here == end) {
|
---|
| 59 | if (t2_here == end) return t1>t2;
|
---|
| 60 | else return true;
|
---|
| 61 | } else if (t2_here == end) return false;
|
---|
| 62 |
|
---|
| 63 | if (compare_phrase_match) {
|
---|
| 64 | if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
|
---|
| 65 | if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
|
---|
| 66 | }
|
---|
| 67 |
|
---|
| 68 | if (compare_terms_match) {
|
---|
| 69 | if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
|
---|
| 70 | if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 | if (compare_doc_weight) {
|
---|
| 74 | if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
|
---|
| 75 | if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
|
---|
| 76 | }
|
---|
| 77 |
|
---|
| 78 | return t1>t2;
|
---|
| 79 | }
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 |
|
---|
| 84 | /////////////////////////////////
|
---|
| 85 | // functions for mgqueryfilterclass
|
---|
| 86 | /////////////////////////////////
|
---|
| 87 |
|
---|
[4193] | 88 |
|
---|
| 89 | void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
|
---|
| 90 | queryfilterclass::configure (key, cfgline);
|
---|
| 91 |
|
---|
[12314] | 92 | if (key == "indexstem") {
|
---|
[9937] | 93 | ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
|
---|
| 94 | }
|
---|
| 95 |
|
---|
[4193] | 96 | }
|
---|
| 97 |
|
---|
[1324] | 98 | // loads up phrases data structure with any phrases (that's the quoted bits)
|
---|
| 99 | // occuring in the querystring
|
---|
| 100 | void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
|
---|
| 101 | const termfreqclassarray &orgterms,
|
---|
| 102 | vector<termfreqclassarray> &phrases) {
|
---|
| 103 |
|
---|
| 104 | text_t::const_iterator here = querystring.begin();
|
---|
| 105 | text_t::const_iterator end = querystring.end();
|
---|
| 106 |
|
---|
| 107 | termfreqclassarray tmpterms;
|
---|
| 108 |
|
---|
| 109 | int termcount = 0;
|
---|
| 110 | bool foundquote = false;
|
---|
| 111 | bool foundbreak = false;
|
---|
| 112 | bool start = true;
|
---|
| 113 | while (here != end) {
|
---|
| 114 | if (*here == '\"') {
|
---|
| 115 | if (foundquote) {
|
---|
| 116 | if (!foundbreak && !start) {
|
---|
| 117 | tmpterms.push_back (orgterms[termcount]);
|
---|
[9620] | 118 | ++termcount;
|
---|
[1324] | 119 | }
|
---|
| 120 | if (tmpterms.size() > 1) {
|
---|
| 121 | phrases.push_back (tmpterms);
|
---|
| 122 | }
|
---|
[11002] | 123 | tmpterms.erase (tmpterms.begin(), tmpterms.end());
|
---|
| 124 |
|
---|
[1324] | 125 | foundquote = false;
|
---|
| 126 | foundbreak = true;
|
---|
| 127 | } else foundquote = true;
|
---|
| 128 | } else if (!is_unicode_letdig(*here)) {
|
---|
| 129 | // found a break between terms
|
---|
| 130 | if (!foundbreak && !start) {
|
---|
[11002] | 131 | if (foundquote) {
|
---|
[1324] | 132 | tmpterms.push_back (orgterms[termcount]);
|
---|
[11002] | 133 | }
|
---|
[9620] | 134 | ++termcount;
|
---|
[1324] | 135 | }
|
---|
| 136 | foundbreak = true;
|
---|
| 137 | } else {
|
---|
| 138 | start = false;
|
---|
| 139 | foundbreak = false;
|
---|
| 140 | }
|
---|
[9620] | 141 | ++here;
|
---|
[1324] | 142 | }
|
---|
| 143 | }
|
---|
| 144 |
|
---|
// do additional query processing: phrase matching.
//
// For each quoted phrase in the query string, run a phrase search over
// every matched document and bump that document's num_phrase_match count
// on a hit.  Also sets the member num_phrases to the number of phrases
// found in the query string (read later by filter() to decide whether
// phrase-based sorting/culling is needed).
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
				       queryresultsclass &queryresults) {

  // post-process the results only when there is something to do: more
  // than one query term and a non-empty result set
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {

    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);

    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();

      while (this_phrase != end_phrase) {

	// process each of the matched documents
	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	while (docs_here != docs_end) {
	  // OID_phrase_search checks the document text itself for the
	  // phrase (external helper, see phrasesearch.h)
	  if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
				 queryparams.subcollection, queryparams.language,
				 longindex, queryparams.collection, *this_phrase,
				 (*docs_here).second.docnum)) {
	    ++docs_here->second.num_phrase_match;
	  }

	  ++docs_here;
	}
	++this_phrase;
      }
    }
  }
}
|
---|
| 186 |
|
---|
| 187 |
|
---|
// do query that might involve multiple sub queries
// mgsearchptr and db_ptr are assumed to be valid
//
// Runs each queryparamclass in query_params through the mg search engine
// and folds the per-query results into multiresults.  With a single
// query the results are copied straight across; with several, document
// sets are merged according to each query's combinequery mode
// ("and"/"or"/"not").  Term information is appended when requested.
// On a search failure err is set to systemProblem and the function
// returns early.
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process (phrase matching) only when not already done and
      // when there is more than one term and a non-empty result set
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	// no phrase post-processing for this query; clear the member so
	// filter() does not act on a stale count from an earlier query
	num_phrases = 0;
      }

      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// merge this query's documents into the running set according to
	// the query's combine mode
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	// after combining, the count is exact by construction
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants (set insert keeps them unique)
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
|
---|
| 267 |
|
---|
| 268 |
|
---|
| 269 | void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 270 | docresultsclass &docs) {
|
---|
| 271 | resultsorderer_t resultsorderer;
|
---|
| 272 | resultsorderer.compare_phrase_match = true;
|
---|
| 273 | resultsorderer.docset = &(docs.docset);
|
---|
| 274 |
|
---|
| 275 | // first get a list of document numbers
|
---|
| 276 | docs.docnum_order();
|
---|
| 277 |
|
---|
| 278 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 |
|
---|
| 282 |
|
---|
| 283 | mgqueryfilterclass::mgqueryfilterclass ()
|
---|
| 284 | :queryfilterclass() {
|
---|
| 285 |
|
---|
| 286 | num_phrases = 0;
|
---|
| 287 | }
|
---|
| 288 |
|
---|
// Nothing mg-specific to release; the base class destructor runs as usual.
mgqueryfilterclass::~mgqueryfilterclass () {
}
|
---|
| 291 |
|
---|
| 292 | void mgqueryfilterclass::filter (const FilterRequest_t &request,
|
---|
| 293 | FilterResponse_t &response,
|
---|
| 294 | comerror_t &err, ostream &logout) {
|
---|
| 295 | outconvertclass text_t2ascii;
|
---|
| 296 |
|
---|
| 297 | response.clear ();
|
---|
| 298 | err = noError;
|
---|
[15558] | 299 | if (db_ptr == NULL) {
|
---|
[1324] | 300 | // most likely a configuration problem
|
---|
| 301 | logout << text_t2ascii
|
---|
[15558] | 302 | << "configuration error: mgqueryfilter contains a null dbclass\n\n";
|
---|
[1324] | 303 | err = configurationError;
|
---|
| 304 | return;
|
---|
| 305 | }
|
---|
[8026] | 306 | if (textsearchptr == NULL) {
|
---|
[1324] | 307 | // most likely a configuration problem
|
---|
| 308 | logout << text_t2ascii
|
---|
[8026] | 309 | << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
|
---|
[1324] | 310 | err = configurationError;
|
---|
| 311 | return;
|
---|
| 312 | }
|
---|
| 313 |
|
---|
| 314 | // open the database
|
---|
[15558] | 315 | db_ptr->setlogout(&logout);
|
---|
| 316 | if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
|
---|
| 317 | // most likely a system problem (we have already checked that the database exists)
|
---|
[1324] | 318 | logout << text_t2ascii
|
---|
[15558] | 319 | << "system problem: open on database \"" << db_filename << "\" failed\n\n";
|
---|
[1324] | 320 | err = systemProblem;
|
---|
| 321 | return;
|
---|
| 322 | }
|
---|
| 323 |
|
---|
| 324 | // get the query parameters
|
---|
| 325 | int startresults = filterOptions["StartResults"].defaultValue.getint();
|
---|
| 326 | int endresults = filterOptions["EndResults"].defaultValue.getint();
|
---|
| 327 | text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
|
---|
| 328 |
|
---|
| 329 | vector<queryparamclass> queryfilterparams;
|
---|
| 330 | parse_query_params (request, queryfilterparams, startresults,
|
---|
[1662] | 331 | endresults, phrasematch, logout);
|
---|
| 332 | // do any mg specific diddling with query parameters that may be required
|
---|
| 333 | mg_parse_query_params (request, queryfilterparams, startresults,
|
---|
| 334 | endresults, phrasematch, logout);
|
---|
| 335 |
|
---|
| 336 |
|
---|
[1324] | 337 | // do query
|
---|
| 338 | queryresultsclass queryresults;
|
---|
| 339 | do_multi_query (request, queryfilterparams, queryresults, err, logout);
|
---|
| 340 | if (err != noError) return;
|
---|
| 341 |
|
---|
| 342 | // assemble document results
|
---|
| 343 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 344 | // sort the query results
|
---|
[5850] | 345 | // only want to sort the docs if we have done a ranked search or there were phrases
|
---|
| 346 | if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
|
---|
| 347 | sort_doc_results (request, queryresults.docs);
|
---|
| 348 | }
|
---|
[1324] | 349 | int resultnum = 1;
|
---|
| 350 | ResultDocInfo_t resultdoc;
|
---|
| 351 | text_t trans_OID;
|
---|
| 352 | vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
|
---|
| 353 | vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
|
---|
| 354 |
|
---|
[1662] | 355 | // documents containing matching phrases will be sorted to the top so
|
---|
| 356 | // we can break out once we're past those that match the PhraseMatch
|
---|
| 357 | // option -- "all_phrases" = return only those documents containing all
|
---|
| 358 | // phrases in query string
|
---|
| 359 | // "some_phrases" = return only those documents containing
|
---|
| 360 | // at least 1 of the phrases in the document
|
---|
| 361 | // "all_docs" = return all documents regardless
|
---|
| 362 | if (num_phrases > 0) {
|
---|
| 363 | int numdocs = 0;
|
---|
| 364 | while (docorder_here != docorder_end) {
|
---|
| 365 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 366 |
|
---|
| 367 | if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
|
---|
| 368 | ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
|
---|
| 369 | queryresults.docs_matched = numdocs;
|
---|
| 370 | break;
|
---|
| 371 | }
|
---|
[9620] | 372 | ++numdocs;
|
---|
| 373 | ++docorder_here;
|
---|
[1662] | 374 | }
|
---|
| 375 | }
|
---|
| 376 |
|
---|
[1324] | 377 | if (endresults == -1) endresults = MAXNUMDOCS;
|
---|
[1662] | 378 | docorder_here = queryresults.docs.docorder.begin();
|
---|
[1324] | 379 | while (docorder_here != docorder_end) {
|
---|
[1662] | 380 | if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
|
---|
[1324] | 381 |
|
---|
| 382 | // translate the document number
|
---|
[15558] | 383 | if (!translate(db_ptr, *docorder_here, trans_OID)) {
|
---|
[1324] | 384 | logout << text_t2ascii
|
---|
| 385 | << "warning: could not translate mg document number \""
|
---|
| 386 | << *docorder_here << "\"to OID.\n\n";
|
---|
| 387 |
|
---|
| 388 | } else {
|
---|
| 389 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 390 |
|
---|
| 391 | // see if there is a result for this number,
|
---|
| 392 | // if it is in the request set (or the request set is empty)
|
---|
| 393 | if (docset_here != queryresults.docs.docset.end() &&
|
---|
| 394 | (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
|
---|
| 395 | if (resultnum >= startresults) {
|
---|
| 396 | // add this document
|
---|
| 397 | resultdoc.OID = trans_OID;
|
---|
| 398 | resultdoc.result_num = resultnum;
|
---|
| 399 | resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
|
---|
| 400 |
|
---|
| 401 | // these next two are not available on all versions of mg
|
---|
| 402 | resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
|
---|
| 403 | resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
|
---|
| 404 |
|
---|
| 405 | response.docInfo.push_back (resultdoc);
|
---|
| 406 | }
|
---|
| 407 |
|
---|
[9620] | 408 | ++resultnum;
|
---|
[1324] | 409 | }
|
---|
| 410 | }
|
---|
| 411 |
|
---|
[9620] | 412 | ++docorder_here;
|
---|
[1324] | 413 | }
|
---|
| 414 | }
|
---|
| 415 |
|
---|
| 416 | // assemble the term results
|
---|
| 417 | if (need_term_info(request.filterResultOptions)) {
|
---|
| 418 | // note: the terms have already been sorted and uniqued
|
---|
| 419 |
|
---|
| 420 | TermInfo_t terminfo;
|
---|
| 421 | bool terms_first = true;
|
---|
| 422 | termfreqclassarray::iterator terms_here = queryresults.terms.begin();
|
---|
| 423 | termfreqclassarray::iterator terms_end = queryresults.terms.end();
|
---|
| 424 |
|
---|
| 425 | while (terms_here != terms_end) {
|
---|
| 426 | terminfo.clear();
|
---|
| 427 | terminfo.term = (*terms_here).termstr;
|
---|
| 428 | terminfo.freq = (*terms_here).termfreq;
|
---|
| 429 | if (terms_first) {
|
---|
| 430 | text_tset::iterator termvariants_here = queryresults.termvariants.begin();
|
---|
| 431 | text_tset::iterator termvariants_end = queryresults.termvariants.end();
|
---|
| 432 | while (termvariants_here != termvariants_end) {
|
---|
| 433 | terminfo.matchTerms.push_back (*termvariants_here);
|
---|
[9620] | 434 | ++termvariants_here;
|
---|
[1324] | 435 | }
|
---|
| 436 | }
|
---|
| 437 | terms_first = false;
|
---|
| 438 |
|
---|
| 439 | response.termInfo.push_back (terminfo);
|
---|
| 440 |
|
---|
[9620] | 441 | ++terms_here;
|
---|
[1324] | 442 | }
|
---|
| 443 | }
|
---|
| 444 |
|
---|
[15558] | 445 | db_ptr->closedatabase(); // Important that local library doesn't leave any files open
|
---|
[1324] | 446 | response.numDocs = queryresults.docs_matched;
|
---|
| 447 | response.isApprox = queryresults.is_approx;
|
---|
| 448 | }
|
---|
| 449 |
|
---|
// Do any mg-specific adjustment of the parsed query parameters before
// the query runs: switch phrase searches to a boolean search on the
// finest-granularity compatible index (with maxdocs unlimited), plus a
// BBC-collection-only query-string cleanup behind GSDL_BBC_COLLECTION.
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
						vector<queryparamclass> &query_params,
						int &/*startresults*/, int &/*endresults*/,
						text_t &/*phrasematch*/, ostream &/*logout*/) {

  // outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {

    // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);

    if (phrases.size() > 0) {
      // search_type 0 = boolean search
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex; text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      // NOTE(review): splitindex[1] is read unchecked -- assumes longindex
      // always has the form "granularity:type"; confirm against indexmap
      text_t &granularity = splitindex[0];
      text_t &indextype = splitindex[1];
      bool found = false;
      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	// prefer paragraph granularity if such an index was built
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	// otherwise a document-level query can still drop to section level
	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)

    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex; text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];
    if (indextype == "ProgNumber") {
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	// keep ASCII letters and digits only
	if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif
    ++query_here;
  }
}
|
---|
| 533 |
|
---|