/**********************************************************************
 *
 * mgqueryfilter.cpp -- implementation of queryfilter for old mg
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
|
---|
| 25 |
|
---|
#include "mgqueryfilter.h"
#include "fileutil.h"
#include "phrasesearch.h"
#include "mgsearch.h"
#include "phrases.h"
|
---|
[1324] | 31 |
|
---|
| 32 | ///////////////////////////////
|
---|
| 33 | // methods for resultsorderer_t
|
---|
| 34 | ///////////////////////////////
|
---|
| 35 |
|
---|
///////////////////////////////
// methods for resultsorderer_t
///////////////////////////////

resultsorderer_t::resultsorderer_t() {
  // start off with the default comparison settings (weight-only ordering)
  clear ();
}
---|
| 39 |
|
---|
// Reset the orderer to its default state: compare documents by their
// mg document weight only, with no phrase-match or terms-matched
// comparisons, and no document set attached.
void resultsorderer_t::clear() {
  compare_phrase_match = false;
  compare_terms_match = false;
  compare_doc_weight = true;

  docset = NULL;
}
|
---|
| 47 |
|
---|
| 48 | bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
|
---|
| 49 | if (docset == NULL) return t1>t2;
|
---|
| 50 |
|
---|
| 51 | docresultmap::iterator t1_here = docset->find(t1);
|
---|
| 52 | docresultmap::iterator t2_here = docset->find(t2);
|
---|
| 53 | docresultmap::iterator end = docset->end();
|
---|
| 54 |
|
---|
| 55 | // sort all the document numbers not in the document set to
|
---|
| 56 | // the end of the list
|
---|
| 57 | if (t1_here == end) {
|
---|
| 58 | if (t2_here == end) return t1>t2;
|
---|
| 59 | else return true;
|
---|
| 60 | } else if (t2_here == end) return false;
|
---|
| 61 |
|
---|
| 62 | if (compare_phrase_match) {
|
---|
| 63 | if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
|
---|
| 64 | if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
|
---|
| 65 | }
|
---|
| 66 |
|
---|
| 67 | if (compare_terms_match) {
|
---|
| 68 | if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
|
---|
| 69 | if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
|
---|
| 70 | }
|
---|
| 71 |
|
---|
| 72 | if (compare_doc_weight) {
|
---|
| 73 | if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
|
---|
| 74 | if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
|
---|
| 75 | }
|
---|
| 76 |
|
---|
| 77 | return t1>t2;
|
---|
| 78 | }
|
---|
| 79 |
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 | /////////////////////////////////
|
---|
| 84 | // functions for mgqueryfilterclass
|
---|
| 85 | /////////////////////////////////
|
---|
| 86 |
|
---|
[4193] | 87 |
|
---|
| 88 | void mgqueryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
|
---|
| 89 | queryfilterclass::configure (key, cfgline);
|
---|
| 90 |
|
---|
[12314] | 91 | if (key == "indexstem") {
|
---|
[9937] | 92 | ((mgsearchclass *)textsearchptr)->set_indexstem (cfgline[0]);
|
---|
| 93 | }
|
---|
| 94 |
|
---|
[4193] | 95 | }
|
---|
| 96 |
|
---|
[1324] | 97 | // loads up phrases data structure with any phrases (that's the quoted bits)
|
---|
| 98 | // occuring in the querystring
|
---|
| 99 | void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
|
---|
| 100 | const termfreqclassarray &orgterms,
|
---|
| 101 | vector<termfreqclassarray> &phrases) {
|
---|
| 102 |
|
---|
| 103 | text_t::const_iterator here = querystring.begin();
|
---|
| 104 | text_t::const_iterator end = querystring.end();
|
---|
| 105 |
|
---|
| 106 | termfreqclassarray tmpterms;
|
---|
| 107 |
|
---|
| 108 | int termcount = 0;
|
---|
| 109 | bool foundquote = false;
|
---|
| 110 | bool foundbreak = false;
|
---|
| 111 | bool start = true;
|
---|
| 112 | while (here != end) {
|
---|
| 113 | if (*here == '\"') {
|
---|
| 114 | if (foundquote) {
|
---|
| 115 | if (!foundbreak && !start) {
|
---|
| 116 | tmpterms.push_back (orgterms[termcount]);
|
---|
[9620] | 117 | ++termcount;
|
---|
[1324] | 118 | }
|
---|
| 119 | if (tmpterms.size() > 1) {
|
---|
| 120 | phrases.push_back (tmpterms);
|
---|
| 121 | }
|
---|
[11002] | 122 | tmpterms.erase (tmpterms.begin(), tmpterms.end());
|
---|
| 123 |
|
---|
[1324] | 124 | foundquote = false;
|
---|
| 125 | foundbreak = true;
|
---|
| 126 | } else foundquote = true;
|
---|
| 127 | } else if (!is_unicode_letdig(*here)) {
|
---|
| 128 | // found a break between terms
|
---|
| 129 | if (!foundbreak && !start) {
|
---|
[11002] | 130 | if (foundquote) {
|
---|
[1324] | 131 | tmpterms.push_back (orgterms[termcount]);
|
---|
[11002] | 132 | }
|
---|
[9620] | 133 | ++termcount;
|
---|
[1324] | 134 | }
|
---|
| 135 | foundbreak = true;
|
---|
| 136 | } else {
|
---|
| 137 | start = false;
|
---|
| 138 | foundbreak = false;
|
---|
| 139 | }
|
---|
[9620] | 140 | ++here;
|
---|
[1324] | 141 | }
|
---|
| 142 | }
|
---|
| 143 |
|
---|
// do additional query processing: phrase matching.
//
// Extracts any quoted phrases from the query string and, for each
// phrase, runs OID_phrase_search over every matched document,
// incrementing that document's num_phrase_match count on a hit.
// Side effect: sets the num_phrases member to the number of phrases
// found (0 if post-processing was skipped or no phrases were present).
void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
				       queryresultsclass &queryresults) {

  // post-process the results if needed: only worthwhile when there is
  // more than one term (a phrase needs at least two) and some documents
  // actually matched
  if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {

    // get the terms between quotes (if any)
    vector<termfreqclassarray> phrases;
    get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);

    num_phrases = phrases.size();
    if (num_phrases > 0) {

      // get the long version of the index (e.g. "section:Title")
      text_t longindex;
      indexmap.to2from (queryparams.index, longindex);

      vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
      vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();

      while (this_phrase != end_phrase) {

	// process each of the matched documents, testing whether this
	// phrase occurs in the document's text
	docresultmap::iterator docs_here = queryresults.docs.docset.begin();
	docresultmap::iterator docs_end = queryresults.docs.docset.end();
	while (docs_here != docs_end) {
	  if (OID_phrase_search (*((mgsearchclass*)textsearchptr), *db_ptr, queryparams.index,
				 queryparams.subcollection, queryparams.language,
				 longindex, queryparams.collection, *this_phrase,
				 (*docs_here).second.docnum)) {
	    // this document contains the phrase
	    ++docs_here->second.num_phrase_match;
	  }

	  ++docs_here;
	}
	++this_phrase;
      }
    }
  }
}
|
---|
| 185 |
|
---|
| 186 |
|
---|
// do query that might involve multiple sub queries
// textsearchptr and db_ptr are assumed to be valid
//
// Runs each sub-query in query_params through mg, post-processes the
// results for phrase matching where needed, and folds the per-query
// results into multiresults.  With a single sub-query the results are
// copied straight across; with several they are combined with the
// and/or/not operator attached to each sub-query.  On a search failure
// err is set to systemProblem and the function returns early.
void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
					 const vector<queryparamclass> &query_params,
					 queryresultsclass &multiresults,
					 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search(*query_here, thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
	     << "system problem: could not do search with mg for index \""
	     << (*query_here).index << (*query_here).subcollection
	     << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {
      // post-process the results if needed (phrase matching requires at
      // least two terms and at least one matching document)
      if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
	  !thisqueryresults.docs.docset.empty()) {
	post_process (*query_here, thisqueryresults);
	thisqueryresults.postprocessed = true;
	multiresults.postprocessed = true;
      } else {
	// no phrase post-processing done for this sub-query
	// NOTE(review): this resets the member even if an earlier
	// sub-query found phrases -- presumably only the last
	// sub-query's phrases matter; confirm against filter()'s use
	num_phrases = 0;
      }

      if (query_params.size() == 1) {
	multiresults.docs = thisqueryresults.docs; // just one set of results
	multiresults.docs_matched = thisqueryresults.docs_matched;
	multiresults.is_approx = thisqueryresults.is_approx;

      } else {
	// several sub-queries: combine via the requested boolean operator
	if ((*query_here).combinequery == "and") {
	  multiresults.docs.combine_and (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "or") {
	  multiresults.docs.combine_or (thisqueryresults.docs);
	} else if ((*query_here).combinequery == "not") {
	  multiresults.docs.combine_not (thisqueryresults.docs);
	}
	multiresults.docs_matched = multiresults.docs.docset.size();
	multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
				   thisqueryresults.orgterms.begin(),
				   thisqueryresults.orgterms.end());

      // add the term variants (set insert keeps them unique)
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
	multiresults.termvariants.insert(*termvar_here);
	++termvar_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}
|
---|
| 266 |
|
---|
| 267 |
|
---|
| 268 | void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 269 | docresultsclass &docs) {
|
---|
| 270 | resultsorderer_t resultsorderer;
|
---|
| 271 | resultsorderer.compare_phrase_match = true;
|
---|
| 272 | resultsorderer.docset = &(docs.docset);
|
---|
| 273 |
|
---|
| 274 | // first get a list of document numbers
|
---|
| 275 | docs.docnum_order();
|
---|
| 276 |
|
---|
| 277 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 278 | }
|
---|
| 279 |
|
---|
| 280 |
|
---|
| 281 |
|
---|
| 282 | mgqueryfilterclass::mgqueryfilterclass ()
|
---|
| 283 | :queryfilterclass() {
|
---|
| 284 |
|
---|
| 285 | num_phrases = 0;
|
---|
| 286 | }
|
---|
| 287 |
|
---|
// Destructor: nothing beyond the base-class destructor is required here.
mgqueryfilterclass::~mgqueryfilterclass () {
}
|
---|
| 290 |
|
---|
// Main entry point: run the query described by request against mg and
// fill in response with matching documents and/or term information.
//
// Steps: sanity-check the configured db and search objects, open the
// database, parse the query parameters (with mg-specific adjustments),
// run the (possibly multi-part) query, then assemble document results
// (sorted, phrase-filtered, paged by StartResults/EndResults) and term
// results.  err reports configuration/system problems to the caller.
void mgqueryfilterclass::filter (const FilterRequest_t &request,
				 FilterResponse_t &response,
				 comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;
  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
	   << "configuration error: mgqueryfilter contains a null textsearchclass (mg)\n\n";
    err = configurationError;
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
	   << "system problem: open on database \"" << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults = filterOptions["StartResults"].defaultValue.getint();
  int endresults = filterOptions["EndResults"].defaultValue.getint();
  text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;

  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults,
		      endresults, phrasematch, logout);
  // do any mg specific diddling with query parameters that may be required
  mg_parse_query_params (request, queryfilterparams, startresults,
			 endresults, phrasematch, logout);


  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  // NOTE(review): this early return skips the closedatabase() call
  // below, leaving the database open on a failed query -- confirm
  // whether that is intended
  if (err != noError) return;

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // sort the query results
    // only want to sort the docs if we have done a ranked search or there were phrases
    if (num_phrases > 0 || (request.filterResultOptions & FRranking)) {
      sort_doc_results (request, queryresults.docs);
    }
    int resultnum = 1;
    ResultDocInfo_t resultdoc;
    text_t trans_OID;
    vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
    vector<int>::iterator docorder_end = queryresults.docs.docorder.end();

    // documents containing matching phrases will be sorted to the top so
    // we can break out once we're past those that match the PhraseMatch
    // option -- "all_phrases" = return only those documents containing all
    //                           phrases in query string
    //           "some_phrases" = return only those documents containing
    //                            at least 1 of the phrases in the document
    //           "all_docs" = return all documents regardless
    if (num_phrases > 0) {
      int numdocs = 0;
      while (docorder_here != docorder_end) {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// first document that fails the phrase criterion marks the cut-off
	if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
	    ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
	  queryresults.docs_matched = numdocs;
	  break;
	}
	++numdocs;
	++docorder_here;
      }
    }

    // page through the (possibly truncated) result list, translating mg
    // document numbers to OIDs and emitting results between
    // startresults and endresults
    if (endresults == -1) endresults = MAXNUMDOCS;
    docorder_here = queryresults.docs.docorder.begin();
    while (docorder_here != docorder_end) {
      if (resultnum > endresults || resultnum > queryresults.docs_matched) break;

      // translate the document number
      if (!translate(db_ptr, *docorder_here, trans_OID)) {
	logout << text_t2ascii
	       << "warning: could not translate mg document number \""
	       << *docorder_here << "\"to OID.\n\n";

      } else {
	docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);

	// see if there is a result for this number,
	// if it is in the request set (or the request set is empty)
	if (docset_here != queryresults.docs.docset.end() &&
	    (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
	  if (resultnum >= startresults) {
	    // add this document
	    resultdoc.OID = trans_OID;
	    resultdoc.result_num = resultnum;
	    // scale the float weight into an integer ranking
	    resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);

	    // these next two are not available on all versions of mg
	    resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
	    resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;

	    response.docInfo.push_back (resultdoc);
	  }

	  ++resultnum;
	}
      }

      ++docorder_here;
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued

    TermInfo_t terminfo;
    bool terms_first = true;
    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;
      // all term variants are attached to the first term only
      if (terms_first) {
	text_tset::iterator termvariants_here = queryresults.termvariants.begin();
	text_tset::iterator termvariants_end = queryresults.termvariants.end();
	while (termvariants_here != termvariants_end) {
	  terminfo.matchTerms.push_back (*termvariants_here);
	  ++termvariants_here;
	}
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }
  }

  db_ptr->closedatabase(); // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}
|
---|
| 448 |
|
---|
// mg-specific adjustment of the parsed query parameters.
//
// For each sub-query: if the query string contains genuine (multi-word)
// phrases, switch to a boolean search (search_type 0) on the
// finest-granularity index available and remove the maxdocs limit, so
// that phrase post-processing sees as many candidate documents as
// possible.
void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
						vector<queryparamclass> &query_params,
						int &/*startresults*/, int &/*endresults*/,
						text_t &/*phrasematch*/, ostream &/*logout*/) {

  // outconvertclass text_t2ascii;

  vector<queryparamclass>::iterator query_here = query_params.begin();
  vector<queryparamclass>::iterator query_end = query_params.end();
  while (query_here != query_end) {

    // if we're doing a phrase search we want to maximise hits by making it
    // a boolean search on the index with the finest granularity - we'll
    // also set maxdocs to "all" (realizing that this will cause searches
    // like "and the" on a large collection to take a very very long time).

    // we're deciding it's a phrase search based on if the querystring
    // contains at least 2 double quotes (not very scientific but
    // then neither is the rest of the mg phrase searching functionality :-)
    //if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {

    // [kjdon 12/2005] we don't want to do a phrase search if the only phrases are single words, so we'll parse out the phrases properly here
    text_tarray phrases;
    get_phrases((*query_here).querystring, phrases);

    if (phrases.size() > 0) {
      // boolean search
      (*query_here).search_type = 0;

      // set maxdocs to "all"
      (*query_here).maxdocs = -1;

      // Get the long version of the index and test to see if any indexes with
      // finer granularity exist. Indexes must be the same type (i.e. same metadata
      // or "text").
      text_t longindex; text_tarray splitindex;
      indexmap.to2from ((*query_here).index, longindex);
      splitchar (longindex.begin(), longindex.end(), ':', splitindex);
      text_t &granularity = splitindex[0];
      // NOTE(review): splitindex[1] is accessed without checking that
      // the long index actually contained a ':' -- confirm indexmap
      // always yields "granularity:type" here
      text_t &indextype = splitindex[1];
      bool found = false;
      // currently supported granularity options are "document", "section" and "paragraph"
      if (granularity == "document" || granularity == "section") {
	text_t shortindex;
	// prefer paragraph granularity if such an index exists ...
	if (indexmap.fromexists ("paragraph:" + indextype)) {
	  indexmap.from2to ("paragraph:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	  found = true;
	}
	// ... otherwise fall back from document to section granularity
	if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
	  indexmap.from2to ("section:" + indextype, shortindex);
	  (*query_here).index = shortindex;
	}
      }
    }

#ifdef GSDL_BBC_COLLECTION
    // This is a special hack for the BBC collection's ProgNumber and zzabn
    // indexes (they're built this way to prevent mg_perf_hash_build from
    // dying at build time)

    // if we're searching the ProgNumber index we want to
    // remove all non-alphanumeric characters from the query string
    text_t longindex; text_tarray splitindex;
    indexmap.to2from ((*query_here).index, longindex);
    splitchar (longindex.begin(), longindex.end(), ':', splitindex);
    text_t &indextype = splitindex[1];
    if (indextype == "ProgNumber") {
      // strip everything that is not ASCII alphanumeric
      text_t new_querystring;
      text_t::const_iterator here = (*query_here).querystring.begin();
      text_t::const_iterator end = (*query_here).querystring.end();
      while (here != end) {
	if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
	    (*here >= '0' && *here <= '9')) {
	  new_querystring.push_back (*here);
	}
	++here;
      }
      (*query_here).querystring = new_querystring;
    }
#endif
    ++query_here;
  }
}
|
---|
| 532 |
|
---|