[1324] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * mgqueryfilter.cpp -- implementation of queryfilter for old mg
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
| 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
| 9 | *
|
---|
| 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
| 24 | *********************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "mgqueryfilter.h"
|
---|
| 27 | #include "fileutil.h"
|
---|
| 28 | #include "phrasesearch.h"
|
---|
| 29 | #include <assert.h>
|
---|
| 30 | #include "mgsearch.h"
|
---|
| 31 |
|
---|
| 32 | ///////////////////////////////
|
---|
| 33 | // methods for resultsorderer_t
|
---|
| 34 | ///////////////////////////////
|
---|
| 35 |
|
---|
| 36 | resultsorderer_t::resultsorderer_t() {
|
---|
| 37 | clear ();
|
---|
| 38 | }
|
---|
| 39 |
|
---|
| 40 | void resultsorderer_t::clear() {
|
---|
| 41 | compare_phrase_match = false;
|
---|
| 42 | compare_terms_match = false;
|
---|
| 43 | compare_doc_weight = true;
|
---|
| 44 |
|
---|
| 45 | docset = NULL;
|
---|
| 46 | }
|
---|
| 47 |
|
---|
| 48 | bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
|
---|
| 49 | if (docset == NULL) return t1>t2;
|
---|
| 50 |
|
---|
| 51 | docresultmap::iterator t1_here = docset->find(t1);
|
---|
| 52 | docresultmap::iterator t2_here = docset->find(t2);
|
---|
| 53 | docresultmap::iterator end = docset->end();
|
---|
| 54 |
|
---|
| 55 | // sort all the document numbers not in the document set to
|
---|
| 56 | // the end of the list
|
---|
| 57 | if (t1_here == end) {
|
---|
| 58 | if (t2_here == end) return t1>t2;
|
---|
| 59 | else return true;
|
---|
| 60 | } else if (t2_here == end) return false;
|
---|
| 61 |
|
---|
| 62 | if (compare_phrase_match) {
|
---|
| 63 | if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
|
---|
| 64 | if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
|
---|
| 65 | }
|
---|
| 66 |
|
---|
| 67 | if (compare_terms_match) {
|
---|
| 68 | if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
|
---|
| 69 | if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
|
---|
| 70 | }
|
---|
| 71 |
|
---|
| 72 | if (compare_doc_weight) {
|
---|
| 73 | if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
|
---|
| 74 | if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
|
---|
| 75 | }
|
---|
| 76 |
|
---|
| 77 | return t1>t2;
|
---|
| 78 | }
|
---|
| 79 |
|
---|
| 80 |
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 | /////////////////////////////////
|
---|
| 84 | // functions for mgqueryfilterclass
|
---|
| 85 | /////////////////////////////////
|
---|
| 86 |
|
---|
| 87 | // loads up phrases data structure with any phrases (that's the quoted bits)
|
---|
| 88 | // occuring in the querystring
|
---|
| 89 | void mgqueryfilterclass::get_phrase_terms (const text_t &querystring,
|
---|
| 90 | const termfreqclassarray &orgterms,
|
---|
| 91 | vector<termfreqclassarray> &phrases) {
|
---|
| 92 |
|
---|
| 93 | text_t::const_iterator here = querystring.begin();
|
---|
| 94 | text_t::const_iterator end = querystring.end();
|
---|
| 95 |
|
---|
| 96 | termfreqclassarray tmpterms;
|
---|
| 97 |
|
---|
| 98 | int termcount = 0;
|
---|
| 99 | bool foundquote = false;
|
---|
| 100 | bool foundbreak = false;
|
---|
| 101 | bool start = true;
|
---|
| 102 | while (here != end) {
|
---|
| 103 | if (*here == '\"') {
|
---|
| 104 | if (foundquote) {
|
---|
| 105 | if (!foundbreak && !start) {
|
---|
| 106 | tmpterms.push_back (orgterms[termcount]);
|
---|
| 107 | termcount ++;
|
---|
| 108 | }
|
---|
| 109 | if (tmpterms.size() > 1) {
|
---|
| 110 | phrases.push_back (tmpterms);
|
---|
| 111 | tmpterms.erase (tmpterms.begin(), tmpterms.end());
|
---|
| 112 | }
|
---|
| 113 | foundquote = false;
|
---|
| 114 | foundbreak = true;
|
---|
| 115 | } else foundquote = true;
|
---|
| 116 | } else if (!is_unicode_letdig(*here)) {
|
---|
| 117 | // found a break between terms
|
---|
| 118 | if (!foundbreak && !start) {
|
---|
| 119 | if (foundquote)
|
---|
| 120 | tmpterms.push_back (orgterms[termcount]);
|
---|
| 121 | termcount ++;
|
---|
| 122 | }
|
---|
| 123 | foundbreak = true;
|
---|
| 124 | } else {
|
---|
| 125 | start = false;
|
---|
| 126 | foundbreak = false;
|
---|
| 127 | }
|
---|
| 128 | here++;
|
---|
| 129 | }
|
---|
| 130 | }
|
---|
| 131 |
|
---|
| 132 | // do aditional query processing
|
---|
| 133 | void mgqueryfilterclass::post_process (const queryparamclass &queryparams,
|
---|
| 134 | queryresultsclass &queryresults) {
|
---|
| 135 |
|
---|
| 136 | // post-process the results if needed
|
---|
| 137 | if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
|
---|
| 138 |
|
---|
| 139 | // get the terms between quotes (if any)
|
---|
| 140 | vector<termfreqclassarray> phrases;
|
---|
| 141 | get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
|
---|
| 142 |
|
---|
| 143 | num_phrases = phrases.size();
|
---|
| 144 | if (num_phrases > 0) {
|
---|
| 145 |
|
---|
| 146 | // get the long version of the index
|
---|
| 147 | text_t longindex;
|
---|
| 148 | indexmap.to2from (queryparams.index, longindex);
|
---|
| 149 |
|
---|
| 150 | vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
|
---|
| 151 | vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
|
---|
| 152 |
|
---|
| 153 | while (this_phrase != end_phrase) {
|
---|
| 154 |
|
---|
| 155 | // process each of the matched documents
|
---|
| 156 | docresultmap::iterator docs_here = queryresults.docs.docset.begin();
|
---|
| 157 | docresultmap::iterator docs_end = queryresults.docs.docset.end();
|
---|
| 158 | while (docs_here != docs_end) {
|
---|
| 159 | if (OID_phrase_search (*((mgsearchclass*)mgsearchptr), *gdbmptr, queryparams.index,
|
---|
| 160 | queryparams.subcollection, queryparams.language,
|
---|
| 161 | longindex, queryparams.collection, *this_phrase,
|
---|
| 162 | (*docs_here).second.docnum)) {
|
---|
| 163 | (*docs_here).second.num_phrase_match++;
|
---|
| 164 | }
|
---|
| 165 |
|
---|
| 166 | docs_here++;
|
---|
| 167 | }
|
---|
| 168 | this_phrase++;
|
---|
| 169 | }
|
---|
| 170 | }
|
---|
| 171 | }
|
---|
| 172 | }
|
---|
| 173 |
|
---|
| 174 |
|
---|
| 175 | // do query that might involve multiple sub queries
|
---|
| 176 | // mgsearchptr and gdbmptr are assumed to be valid
|
---|
| 177 | void mgqueryfilterclass::do_multi_query (const FilterRequest_t &request,
|
---|
| 178 | const vector<queryparamclass> &query_params,
|
---|
| 179 | queryresultsclass &multiresults,
|
---|
| 180 | comerror_t &err, ostream &logout) {
|
---|
| 181 | outconvertclass text_t2ascii;
|
---|
| 182 |
|
---|
| 183 | err = noError;
|
---|
| 184 | mgsearchptr->setcollectdir (collectdir);
|
---|
| 185 | multiresults.clear();
|
---|
| 186 |
|
---|
| 187 | vector<queryparamclass>::const_iterator query_here = query_params.begin();
|
---|
| 188 | vector<queryparamclass>::const_iterator query_end = query_params.end();
|
---|
| 189 | while (query_here != query_end) {
|
---|
| 190 | queryresultsclass thisqueryresults;
|
---|
[1662] | 191 |
|
---|
[1324] | 192 | if (!mgsearchptr->search(*query_here, thisqueryresults)) {
|
---|
| 193 | // most likely a system problem
|
---|
| 194 | logout << text_t2ascii
|
---|
| 195 | << "system problem: could not do search with mg for index \""
|
---|
| 196 | << (*query_here).index << (*query_here).subcollection
|
---|
| 197 | << (*query_here).language << "\".\n\n";
|
---|
| 198 | err = systemProblem;
|
---|
| 199 | return;
|
---|
| 200 | }
|
---|
| 201 |
|
---|
| 202 | // combine the results
|
---|
| 203 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 204 | // post-process the results if needed
|
---|
| 205 | if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
|
---|
| 206 | !thisqueryresults.docs.docset.empty()) {
|
---|
| 207 | post_process (*query_here, thisqueryresults);
|
---|
| 208 | thisqueryresults.postprocessed = true;
|
---|
| 209 | multiresults.postprocessed = true;
|
---|
[1721] | 210 | } else {
|
---|
| 211 | num_phrases = 0;
|
---|
[1324] | 212 | }
|
---|
| 213 |
|
---|
| 214 | if (query_params.size() == 1) {
|
---|
| 215 | multiresults.docs = thisqueryresults.docs; // just one set of results
|
---|
| 216 | multiresults.docs_matched = thisqueryresults.docs_matched;
|
---|
| 217 | multiresults.is_approx = thisqueryresults.is_approx;
|
---|
| 218 |
|
---|
| 219 | } else {
|
---|
| 220 | if ((*query_here).combinequery == "and") {
|
---|
| 221 | multiresults.docs.combine_and (thisqueryresults.docs);
|
---|
| 222 | } else if ((*query_here).combinequery == "or") {
|
---|
| 223 | multiresults.docs.combine_or (thisqueryresults.docs);
|
---|
| 224 | } else if ((*query_here).combinequery == "not") {
|
---|
| 225 | multiresults.docs.combine_not (thisqueryresults.docs);
|
---|
| 226 | }
|
---|
| 227 | multiresults.docs_matched = multiresults.docs.docset.size();
|
---|
| 228 | multiresults.is_approx = Exact;
|
---|
| 229 | }
|
---|
| 230 | }
|
---|
| 231 |
|
---|
| 232 | // combine the term information
|
---|
| 233 | if (need_term_info (request.filterResultOptions)) {
|
---|
| 234 | // append the terms
|
---|
| 235 | multiresults.orgterms.insert(multiresults.orgterms.end(),
|
---|
| 236 | thisqueryresults.orgterms.begin(),
|
---|
| 237 | thisqueryresults.orgterms.end());
|
---|
| 238 |
|
---|
| 239 | // add the term variants
|
---|
| 240 | text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
|
---|
| 241 | text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
|
---|
| 242 | while (termvar_here != termvar_end) {
|
---|
| 243 | multiresults.termvariants.insert(*termvar_here);
|
---|
| 244 | termvar_here++;
|
---|
| 245 | }
|
---|
| 246 | }
|
---|
| 247 |
|
---|
| 248 | query_here++;
|
---|
| 249 | }
|
---|
| 250 |
|
---|
| 251 | // sort and unique the query terms
|
---|
| 252 | multiresults.sortuniqqueryterms ();
|
---|
| 253 | }
|
---|
| 254 |
|
---|
| 255 |
|
---|
| 256 | void mgqueryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 257 | docresultsclass &docs) {
|
---|
| 258 | resultsorderer_t resultsorderer;
|
---|
| 259 | resultsorderer.compare_phrase_match = true;
|
---|
| 260 | resultsorderer.docset = &(docs.docset);
|
---|
| 261 |
|
---|
| 262 | // first get a list of document numbers
|
---|
| 263 | docs.docnum_order();
|
---|
| 264 |
|
---|
| 265 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 266 | }
|
---|
| 267 |
|
---|
| 268 |
|
---|
| 269 |
|
---|
| 270 | mgqueryfilterclass::mgqueryfilterclass ()
|
---|
| 271 | :queryfilterclass() {
|
---|
| 272 |
|
---|
| 273 | num_phrases = 0;
|
---|
| 274 |
|
---|
| 275 | }
|
---|
| 276 |
|
---|
| 277 | mgqueryfilterclass::~mgqueryfilterclass () {
|
---|
| 278 | }
|
---|
| 279 |
|
---|
| 280 | void mgqueryfilterclass::filter (const FilterRequest_t &request,
|
---|
| 281 | FilterResponse_t &response,
|
---|
| 282 | comerror_t &err, ostream &logout) {
|
---|
| 283 | outconvertclass text_t2ascii;
|
---|
| 284 |
|
---|
| 285 | response.clear ();
|
---|
| 286 | err = noError;
|
---|
| 287 | if (gdbmptr == NULL) {
|
---|
| 288 | // most likely a configuration problem
|
---|
| 289 | logout << text_t2ascii
|
---|
| 290 | << "configuration error: mgqueryfilter contains a null gdbmclass\n\n";
|
---|
| 291 | err = configurationError;
|
---|
| 292 | return;
|
---|
| 293 | }
|
---|
| 294 | if (mgsearchptr == NULL) {
|
---|
| 295 | // most likely a configuration problem
|
---|
| 296 | logout << text_t2ascii
|
---|
| 297 | << "configuration error: mgqueryfilter contains a null mgsearchclass\n\n";
|
---|
| 298 | err = configurationError;
|
---|
| 299 | return;
|
---|
| 300 | }
|
---|
| 301 |
|
---|
| 302 | // open the database
|
---|
| 303 | gdbmptr->setlogout(&logout);
|
---|
| 304 | if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
|
---|
| 305 | // most likely a system problem (we have already checked that the
|
---|
| 306 | // gdbm database exists)
|
---|
| 307 | logout << text_t2ascii
|
---|
| 308 | << "system problem: open on gdbm database \""
|
---|
| 309 | << gdbm_filename << "\" failed\n\n";
|
---|
| 310 | err = systemProblem;
|
---|
| 311 | return;
|
---|
| 312 | }
|
---|
| 313 |
|
---|
| 314 | // get the query parameters
|
---|
| 315 | int startresults = filterOptions["StartResults"].defaultValue.getint();
|
---|
| 316 | int endresults = filterOptions["EndResults"].defaultValue.getint();
|
---|
| 317 | text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
|
---|
| 318 |
|
---|
| 319 | vector<queryparamclass> queryfilterparams;
|
---|
| 320 | parse_query_params (request, queryfilterparams, startresults,
|
---|
[1662] | 321 | endresults, phrasematch, logout);
|
---|
| 322 | // do any mg specific diddling with query parameters that may be required
|
---|
| 323 | mg_parse_query_params (request, queryfilterparams, startresults,
|
---|
| 324 | endresults, phrasematch, logout);
|
---|
| 325 |
|
---|
| 326 |
|
---|
[1324] | 327 | // do query
|
---|
| 328 | queryresultsclass queryresults;
|
---|
| 329 | do_multi_query (request, queryfilterparams, queryresults, err, logout);
|
---|
| 330 | if (err != noError) return;
|
---|
| 331 |
|
---|
| 332 | // assemble document results
|
---|
| 333 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 334 | // sort the query results
|
---|
| 335 | sort_doc_results (request, queryresults.docs);
|
---|
| 336 |
|
---|
| 337 | int resultnum = 1;
|
---|
| 338 | ResultDocInfo_t resultdoc;
|
---|
| 339 | text_t trans_OID;
|
---|
| 340 | vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
|
---|
| 341 | vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
|
---|
| 342 |
|
---|
[1662] | 343 | // documents containing matching phrases will be sorted to the top so
|
---|
| 344 | // we can break out once we're past those that match the PhraseMatch
|
---|
| 345 | // option -- "all_phrases" = return only those documents containing all
|
---|
| 346 | // phrases in query string
|
---|
| 347 | // "some_phrases" = return only those documents containing
|
---|
| 348 | // at least 1 of the phrases in the document
|
---|
| 349 | // "all_docs" = return all documents regardless
|
---|
| 350 | if (num_phrases > 0) {
|
---|
| 351 | int numdocs = 0;
|
---|
| 352 | while (docorder_here != docorder_end) {
|
---|
| 353 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 354 |
|
---|
| 355 | if (((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) ||
|
---|
| 356 | ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1))) {
|
---|
| 357 | queryresults.docs_matched = numdocs;
|
---|
| 358 | break;
|
---|
| 359 | }
|
---|
| 360 | numdocs ++;
|
---|
| 361 | docorder_here ++;
|
---|
| 362 | }
|
---|
| 363 | }
|
---|
| 364 |
|
---|
[1324] | 365 | if (endresults == -1) endresults = MAXNUMDOCS;
|
---|
[1662] | 366 | docorder_here = queryresults.docs.docorder.begin();
|
---|
[1324] | 367 | while (docorder_here != docorder_end) {
|
---|
[1662] | 368 | if (resultnum > endresults || resultnum > queryresults.docs_matched) break;
|
---|
[1324] | 369 |
|
---|
| 370 | // translate the document number
|
---|
| 371 | if (!translate(gdbmptr, *docorder_here, trans_OID)) {
|
---|
| 372 | logout << text_t2ascii
|
---|
| 373 | << "warning: could not translate mg document number \""
|
---|
| 374 | << *docorder_here << "\"to OID.\n\n";
|
---|
| 375 |
|
---|
| 376 | } else {
|
---|
| 377 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 378 |
|
---|
| 379 | // see if there is a result for this number,
|
---|
| 380 | // if it is in the request set (or the request set is empty)
|
---|
| 381 | if (docset_here != queryresults.docs.docset.end() &&
|
---|
| 382 | (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
|
---|
| 383 | if (resultnum >= startresults) {
|
---|
| 384 | // add this document
|
---|
| 385 | resultdoc.OID = trans_OID;
|
---|
| 386 | resultdoc.result_num = resultnum;
|
---|
| 387 | resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
|
---|
| 388 |
|
---|
| 389 | // these next two are not available on all versions of mg
|
---|
| 390 | resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
|
---|
| 391 | resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
|
---|
| 392 |
|
---|
| 393 | response.docInfo.push_back (resultdoc);
|
---|
| 394 | }
|
---|
| 395 |
|
---|
| 396 | resultnum++;
|
---|
| 397 | }
|
---|
| 398 | }
|
---|
| 399 |
|
---|
| 400 | docorder_here++;
|
---|
| 401 | }
|
---|
| 402 | }
|
---|
| 403 |
|
---|
| 404 | // assemble the term results
|
---|
| 405 | if (need_term_info(request.filterResultOptions)) {
|
---|
| 406 | // note: the terms have already been sorted and uniqued
|
---|
| 407 |
|
---|
| 408 | TermInfo_t terminfo;
|
---|
| 409 | bool terms_first = true;
|
---|
| 410 | termfreqclassarray::iterator terms_here = queryresults.terms.begin();
|
---|
| 411 | termfreqclassarray::iterator terms_end = queryresults.terms.end();
|
---|
| 412 |
|
---|
| 413 | while (terms_here != terms_end) {
|
---|
| 414 | terminfo.clear();
|
---|
| 415 | terminfo.term = (*terms_here).termstr;
|
---|
| 416 | terminfo.freq = (*terms_here).termfreq;
|
---|
| 417 | if (terms_first) {
|
---|
| 418 | text_tset::iterator termvariants_here = queryresults.termvariants.begin();
|
---|
| 419 | text_tset::iterator termvariants_end = queryresults.termvariants.end();
|
---|
| 420 | while (termvariants_here != termvariants_end) {
|
---|
| 421 | terminfo.matchTerms.push_back (*termvariants_here);
|
---|
| 422 | termvariants_here++;
|
---|
| 423 | }
|
---|
| 424 | }
|
---|
| 425 | terms_first = false;
|
---|
| 426 |
|
---|
| 427 | response.termInfo.push_back (terminfo);
|
---|
| 428 |
|
---|
| 429 | terms_here++;
|
---|
| 430 | }
|
---|
| 431 | }
|
---|
| 432 |
|
---|
| 433 | response.numDocs = queryresults.docs_matched;
|
---|
| 434 | response.isApprox = queryresults.is_approx;
|
---|
| 435 | }
|
---|
| 436 |
|
---|
[1662] | 437 | void mgqueryfilterclass::mg_parse_query_params (const FilterRequest_t &/*request*/,
|
---|
| 438 | vector<queryparamclass> &query_params,
|
---|
| 439 | int &/*startresults*/, int &/*endresults*/,
|
---|
| 440 | text_t &/*phrasematch*/, ostream &/*logout*/) {
|
---|
| 441 |
|
---|
| 442 | // outconvertclass text_t2ascii;
|
---|
| 443 |
|
---|
| 444 | vector<queryparamclass>::iterator query_here = query_params.begin();
|
---|
| 445 | vector<queryparamclass>::iterator query_end = query_params.end();
|
---|
| 446 | while (query_here != query_end) {
|
---|
| 447 |
|
---|
[2134] | 448 | // if we're doing a phrase search we want to maximise hits by making it
|
---|
| 449 | // a boolean search on the index with the finest granularity - we'll
|
---|
| 450 | // also set maxdocs to "all" (realizing that this will cause searches
|
---|
| 451 | // like "and the" on a large collection to take a very very long time).
|
---|
| 452 |
|
---|
[1662] | 453 | // we're deciding it's a phrase search based on if the querystring
|
---|
| 454 | // contains at least 2 double quotes (not very scientific but
|
---|
| 455 | // then neither is the rest of the mg phrase searching functionality :-)
|
---|
| 456 | if (countchar ((*query_here).querystring.begin(), (*query_here).querystring.end(), '"') > 1) {
|
---|
| 457 | (*query_here).search_type = 0;
|
---|
| 458 |
|
---|
[2134] | 459 | // set maxdocs to "all"
|
---|
| 460 | (*query_here).maxdocs = -1;
|
---|
| 461 |
|
---|
[1662] | 462 | // Get the long version of the index and test to see if any indexes with
|
---|
| 463 | // finer granularity exist. Indexes must be the same type (i.e. same metadata
|
---|
| 464 | // or "text").
|
---|
| 465 | text_t longindex; text_tarray splitindex;
|
---|
| 466 | indexmap.to2from ((*query_here).index, longindex);
|
---|
| 467 | splitchar (longindex.begin(), longindex.end(), ':', splitindex);
|
---|
| 468 | text_t &granularity = splitindex[0];
|
---|
| 469 | text_t &indextype = splitindex[1];
|
---|
| 470 | bool found = false;
|
---|
| 471 | // currently supported granularity options are "document", "section" and "paragraph"
|
---|
| 472 | if (granularity == "document" || granularity == "section") {
|
---|
| 473 | text_t shortindex;
|
---|
| 474 | if (indexmap.fromexists ("paragraph:" + indextype)) {
|
---|
| 475 | indexmap.from2to ("paragraph:" + indextype, shortindex);
|
---|
| 476 | (*query_here).index = shortindex;
|
---|
| 477 | found = true;
|
---|
| 478 | }
|
---|
| 479 | if (!found && granularity == "document" && indexmap.fromexists ("section:" + indextype)) {
|
---|
| 480 | indexmap.from2to ("section:" + indextype, shortindex);
|
---|
| 481 | (*query_here).index = shortindex;
|
---|
| 482 | }
|
---|
| 483 | }
|
---|
| 484 | }
|
---|
| 485 |
|
---|
[2134] | 486 | #ifdef GSDL_BBC_COLLECTION
|
---|
| 487 | // This is a special hack for the BBC collection's ProgNumber index
|
---|
| 488 |
|
---|
| 489 | // if we're searching a ProgNumber index we want to:
|
---|
| 490 | // 1. Remove all non-alphanumeric characters from the query string
|
---|
| 491 | // 2. Make it a boolean search
|
---|
| 492 | // 3. Turn off case-folding
|
---|
| 493 | text_t longindex; text_tarray splitindex;
|
---|
| 494 | indexmap.to2from ((*query_here).index, longindex);
|
---|
| 495 | splitchar (longindex.begin(), longindex.end(), ':', splitindex);
|
---|
| 496 | text_t &indextype = splitindex[1];
|
---|
| 497 | if (indextype == "ProgNumber") {
|
---|
| 498 | (*query_here).search_type = 0;
|
---|
| 499 | (*query_here).casefolding = 0;
|
---|
| 500 | text_t new_querystring;
|
---|
| 501 | text_t::const_iterator here = (*query_here).querystring.begin();
|
---|
| 502 | text_t::const_iterator end = (*query_here).querystring.end();
|
---|
| 503 | while (here != end) {
|
---|
| 504 | if ((*here >= 'a' && *here <= 'z') || (*here >= 'A' && *here <= 'Z') ||
|
---|
| 505 | (*here >= '0' && *here <= '9')) {
|
---|
| 506 | new_querystring.push_back (*here);
|
---|
| 507 | }
|
---|
| 508 | here ++;
|
---|
| 509 | }
|
---|
| 510 | (*query_here).querystring = new_querystring;
|
---|
| 511 | }
|
---|
| 512 | #endif
|
---|
| 513 |
|
---|
[1662] | 514 | query_here ++;
|
---|
| 515 | }
|
---|
| 516 | }
|
---|
| 517 |
|
---|