[227] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * queryfilter.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
[534] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[227] | 9 | *
|
---|
[534] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[227] | 24 | * $Id: queryfilter.cpp 990 2000-02-29 01:35:56Z sjboddie $
|
---|
| 25 | *
|
---|
| 26 | *********************************************************************/
|
---|
| 27 |
|
---|
| 28 | /*
|
---|
| 29 | $Log$
|
---|
[990] | 30 | Revision 1.22 2000/02/29 01:35:56 sjboddie
|
---|
| 31 | tidied up endianness and fastcgi
|
---|
| 32 |
|
---|
[787] | 33 | Revision 1.21 1999/11/25 02:21:13 sjboddie
|
---|
| 34 | fixed bug in phrasematch stuff
|
---|
| 35 |
|
---|
[766] | 36 | Revision 1.20 1999/11/01 22:06:06 sjboddie
|
---|
| 37 | Added filter option to remove documents not matching a phrase match.
|
---|
| 38 | This used to be done in the receptionist.
|
---|
| 39 |
|
---|
[722] | 40 | Revision 1.19 1999/10/19 03:23:40 davidb
|
---|
| 41 | Collection building support through web pages
|
---|
| 42 | and internal and external link handling for collection documents
|
---|
| 43 |
|
---|
[621] | 44 | Revision 1.18 1999/09/22 03:43:18 sjboddie
|
---|
| 45 | Endresults queryfilter option may now take '-1' for 'all'
|
---|
| 46 |
|
---|
[613] | 47 | Revision 1.17 1999/09/21 12:01:07 sjboddie
|
---|
| 48 | added Maxdocs queryfilter option (which may be -1 for 'all')
|
---|
| 49 |
|
---|
[534] | 50 | Revision 1.16 1999/09/07 04:57:24 sjboddie
|
---|
| 51 | added gpl notice
|
---|
| 52 |
|
---|
[501] | 53 | Revision 1.15 1999/08/31 22:47:09 rjmcnab
|
---|
| 54 | Added matchmode option for some and all.
|
---|
| 55 |
|
---|
[398] | 56 | Revision 1.14 1999/07/16 03:42:21 sjboddie
|
---|
| 57 | changed isApprox
|
---|
| 58 |
|
---|
[396] | 59 | Revision 1.13 1999/07/16 00:17:06 sjboddie
|
---|
| 60 | got using phrasesearch for post-processing
|
---|
| 61 |
|
---|
[358] | 62 | Revision 1.12 1999/07/09 02:19:43 rjmcnab
|
---|
| 63 | Fixed a couple of compiler conflicts
|
---|
| 64 |
|
---|
[355] | 65 | Revision 1.11 1999/07/08 20:49:44 rjmcnab
|
---|
| 66 | Added result_num to the ResultDocInto_t structure.
|
---|
| 67 |
|
---|
[351] | 68 | Revision 1.10 1999/07/07 06:19:46 rjmcnab
|
---|
| 69 | Added ability to combine two or more independant queries.
|
---|
| 70 |
|
---|
[334] | 71 | Revision 1.9 1999/07/01 09:29:20 rjmcnab
|
---|
| 72 | Changes for better reporting of number documents which match a query. Changes
|
---|
| 73 | should still work as before with older versions of mg.
|
---|
| 74 |
|
---|
[327] | 75 | Revision 1.8 1999/07/01 03:59:54 rjmcnab
|
---|
| 76 | reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
|
---|
| 77 | method for post-processing the query.
|
---|
| 78 |
|
---|
[319] | 79 | Revision 1.7 1999/06/30 04:04:13 rjmcnab
|
---|
| 80 | made stemming functions available from mgsearch and made the stems
|
---|
| 81 | for the query terms available in queryinfo
|
---|
| 82 |
|
---|
[311] | 83 | Revision 1.6 1999/06/29 22:06:23 rjmcnab
|
---|
| 84 | Added a couple of fields to queryinfo to handle a special version
|
---|
| 85 | of mg.
|
---|
| 86 |
|
---|
[302] | 87 | Revision 1.5 1999/06/27 22:08:48 sjboddie
|
---|
| 88 | now check for defaultindex, defaultsubcollection, and defaultlanguage
|
---|
| 89 | entries in config files
|
---|
| 90 |
|
---|
[273] | 91 | Revision 1.4 1999/06/16 02:03:25 sjboddie
|
---|
| 92 | fixed bug in isApprox and set MAXDOCS to always be 500
|
---|
| 93 |
|
---|
[238] | 94 | Revision 1.3 1999/04/19 23:56:09 rjmcnab
|
---|
| 95 | Finished the gdbm metadata stuff
|
---|
| 96 |
|
---|
[235] | 97 | Revision 1.2 1999/04/12 03:45:03 rjmcnab
|
---|
| 98 | Finished the query filter.
|
---|
| 99 |
|
---|
[227] | 100 | Revision 1.1 1999/04/06 22:22:09 rjmcnab
|
---|
| 101 | Initial revision.
|
---|
| 102 |
|
---|
| 103 | */
|
---|
| 104 |
|
---|
| 105 |
|
---|
| 106 | #include "queryfilter.h"
|
---|
| 107 | #include "fileutil.h"
|
---|
[235] | 108 | #include "queryinfo.h"
|
---|
[396] | 109 | #include "phrasesearch.h"
|
---|
[990] | 110 | #include "gsdltools.h"
|
---|
[396] | 111 | #include <assert.h>
|
---|
[227] | 112 |
|
---|
| 113 |
|
---|
[235] | 114 | // some useful functions
|
---|
| 115 |
|
---|
| 116 | // translate will return true if successful
|
---|
| 117 | static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
|
---|
| 118 | infodbclass info;
|
---|
| 119 |
|
---|
| 120 | trans_OID.clear();
|
---|
| 121 |
|
---|
| 122 | // get the info
|
---|
| 123 | if (gdbmptr == NULL) return false;
|
---|
| 124 | if (!gdbmptr->getinfo(docnum, info)) return false;
|
---|
| 125 |
|
---|
| 126 | // translate
|
---|
| 127 | if (info["section"].empty()) return false;
|
---|
| 128 |
|
---|
| 129 | trans_OID = info["section"];
|
---|
| 130 | return true;
|
---|
| 131 | }
|
---|
| 132 |
|
---|
| 133 |
|
---|
[351] | 134 | // whether document results are needed
|
---|
| 135 | static bool need_matching_docs (int filterResultOptions) {
|
---|
| 136 | return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
|
---|
| 137 | (filterResultOptions & FRmetadata));
|
---|
| 138 | }
|
---|
| 139 |
|
---|
| 140 | // whether term information is needed
|
---|
| 141 | static bool need_term_info (int filterResultOptions) {
|
---|
| 142 | return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
|
---|
| 143 | }
|
---|
| 144 |
|
---|
| 145 | ///////////////////////////////
|
---|
| 146 | // methods for resultsorderer_t
|
---|
| 147 | ///////////////////////////////
|
---|
| 148 |
|
---|
| 149 | resultsorderer_t::resultsorderer_t() {
|
---|
| 150 | clear ();
|
---|
| 151 | }
|
---|
| 152 |
|
---|
| 153 | void resultsorderer_t::clear() {
|
---|
| 154 | compare_phrase_match = false;
|
---|
| 155 | compare_terms_match = false;
|
---|
| 156 | compare_doc_weight = true;
|
---|
| 157 |
|
---|
| 158 | docset = NULL;
|
---|
| 159 | }
|
---|
| 160 |
|
---|
| 161 | bool resultsorderer_t::operator()(const int &t1, const int &t2) const {
|
---|
| 162 | if (docset == NULL) return t1>t2;
|
---|
| 163 |
|
---|
| 164 | docresultmap::iterator t1_here = docset->find(t1);
|
---|
| 165 | docresultmap::iterator t2_here = docset->find(t2);
|
---|
| 166 | docresultmap::iterator end = docset->end();
|
---|
| 167 |
|
---|
| 168 | // sort all the document numbers not in the document set to
|
---|
| 169 | // the end of the list
|
---|
| 170 | if (t1_here == end) {
|
---|
| 171 | if (t2_here == end) return t1>t2;
|
---|
| 172 | else return true;
|
---|
| 173 | } else if (t2_here == end) return false;
|
---|
| 174 |
|
---|
| 175 | if (compare_phrase_match) {
|
---|
| 176 | if ((*t1_here).second.num_phrase_match > (*t2_here).second.num_phrase_match) return true;
|
---|
| 177 | if ((*t1_here).second.num_phrase_match < (*t2_here).second.num_phrase_match) return false;
|
---|
| 178 | }
|
---|
| 179 |
|
---|
| 180 | if (compare_terms_match) {
|
---|
| 181 | if ((*t1_here).second.num_query_terms_matched > (*t2_here).second.num_query_terms_matched) return true;
|
---|
| 182 | if ((*t1_here).second.num_query_terms_matched < (*t2_here).second.num_query_terms_matched) return false;
|
---|
| 183 | }
|
---|
| 184 |
|
---|
| 185 | if (compare_doc_weight) {
|
---|
| 186 | if ((*t1_here).second.docweight > (*t2_here).second.docweight) return true;
|
---|
| 187 | if ((*t1_here).second.docweight < (*t2_here).second.docweight) return false;
|
---|
| 188 | }
|
---|
| 189 |
|
---|
| 190 | return t1>t2;
|
---|
| 191 | }
|
---|
| 192 |
|
---|
| 193 |
|
---|
| 194 |
|
---|
| 195 |
|
---|
| 196 | /////////////////////////////////
|
---|
| 197 | // functions for queryfilterclass
|
---|
| 198 | /////////////////////////////////
|
---|
| 199 |
|
---|
[396] | 200 | // loads up phrases data structure with any phrases (that's the quoted bits)
|
---|
| 201 | // occuring in the querystring
|
---|
| 202 | void queryfilterclass::get_phrase_terms (const text_t &querystring,
|
---|
| 203 | const termfreqclassarray &orgterms,
|
---|
| 204 | vector<termfreqclassarray> &phrases) {
|
---|
| 205 |
|
---|
| 206 | text_t::const_iterator here = querystring.begin();
|
---|
| 207 | text_t::const_iterator end = querystring.end();
|
---|
| 208 |
|
---|
| 209 | termfreqclassarray tmpterms;
|
---|
| 210 |
|
---|
| 211 | int termcount = 0;
|
---|
| 212 | bool foundquote = false;
|
---|
| 213 | bool foundbreak = false;
|
---|
| 214 | bool start = true;
|
---|
| 215 | while (here != end) {
|
---|
| 216 | if (*here == '\"') {
|
---|
| 217 | if (foundquote) {
|
---|
| 218 | if (!foundbreak && !start) {
|
---|
| 219 | tmpterms.push_back (orgterms[termcount]);
|
---|
| 220 | termcount ++;
|
---|
| 221 | }
|
---|
| 222 | if (tmpterms.size() > 1) {
|
---|
| 223 | phrases.push_back (tmpterms);
|
---|
| 224 | tmpterms.erase (tmpterms.begin(), tmpterms.end());
|
---|
| 225 | }
|
---|
| 226 | foundquote = false;
|
---|
| 227 | foundbreak = true;
|
---|
| 228 | } else foundquote = true;
|
---|
| 229 | } else if (!is_unicode_letdig(*here)) {
|
---|
| 230 | // found a break between terms
|
---|
| 231 | if (!foundbreak && !start) {
|
---|
| 232 | if (foundquote)
|
---|
| 233 | tmpterms.push_back (orgterms[termcount]);
|
---|
| 234 | termcount ++;
|
---|
| 235 | }
|
---|
| 236 | foundbreak = true;
|
---|
| 237 | } else {
|
---|
| 238 | start = false;
|
---|
| 239 | foundbreak = false;
|
---|
| 240 | }
|
---|
| 241 | here++;
|
---|
| 242 | }
|
---|
| 243 | }
|
---|
| 244 |
|
---|
[327] | 245 | // do aditional query processing
|
---|
[396] | 246 | void queryfilterclass::post_process (const queryparamclass &queryparams,
|
---|
| 247 | queryresultsclass &queryresults) {
|
---|
| 248 |
|
---|
| 249 | // post-process the results if needed
|
---|
| 250 | if (queryresults.orgterms.size() > 1 && !queryresults.docs.docset.empty()) {
|
---|
| 251 |
|
---|
| 252 | // get the terms between quotes (if any)
|
---|
| 253 | vector<termfreqclassarray> phrases;
|
---|
| 254 | get_phrase_terms (queryparams.querystring, queryresults.orgterms, phrases);
|
---|
| 255 |
|
---|
[766] | 256 | num_phrases = phrases.size();
|
---|
| 257 | if (num_phrases > 0) {
|
---|
[396] | 258 |
|
---|
| 259 | // get the long version of the index
|
---|
| 260 | text_t longindex;
|
---|
| 261 | indexmap.to2from (queryparams.index, longindex);
|
---|
| 262 |
|
---|
| 263 | vector<termfreqclassarray>::const_iterator this_phrase = phrases.begin();
|
---|
| 264 | vector<termfreqclassarray>::const_iterator end_phrase = phrases.end();
|
---|
| 265 |
|
---|
| 266 | while (this_phrase != end_phrase) {
|
---|
| 267 |
|
---|
| 268 | // process each of the matched documents
|
---|
| 269 | docresultmap::iterator docs_here = queryresults.docs.docset.begin();
|
---|
| 270 | docresultmap::iterator docs_end = queryresults.docs.docset.end();
|
---|
| 271 | while (docs_here != docs_end) {
|
---|
| 272 | if (OID_phrase_search (*mgsearchptr, *gdbmptr, queryparams.index,
|
---|
| 273 | queryparams.subcollection, queryparams.language,
|
---|
| 274 | longindex, queryparams.collection, *this_phrase,
|
---|
| 275 | (*docs_here).second.docnum)) {
|
---|
| 276 | (*docs_here).second.num_phrase_match++;
|
---|
| 277 | }
|
---|
| 278 |
|
---|
| 279 | docs_here++;
|
---|
| 280 | }
|
---|
| 281 | this_phrase++;
|
---|
| 282 | }
|
---|
| 283 | }
|
---|
| 284 | }
|
---|
[327] | 285 | }
|
---|
[235] | 286 |
|
---|
[351] | 287 | // get the query parameters
|
---|
| 288 | void queryfilterclass::parse_query_params (const FilterRequest_t &request,
|
---|
| 289 | vector<queryparamclass> &query_params,
|
---|
[766] | 290 | int &startresults, int &endresults,
|
---|
| 291 | text_t &phrasematch, ostream &logout) {
|
---|
[351] | 292 | outconvertclass text_t2ascii;
|
---|
[327] | 293 |
|
---|
[351] | 294 | // set defaults for the return parameters
|
---|
[358] | 295 | query_params.erase(query_params.begin(), query_params.end());
|
---|
[351] | 296 | startresults = filterOptions["StartResults"].defaultValue.getint();
|
---|
| 297 | endresults = filterOptions["EndResults"].defaultValue.getint();
|
---|
[766] | 298 | phrasematch = filterOptions["PhraseMatch"].defaultValue;
|
---|
[327] | 299 |
|
---|
[351] | 300 | // set defaults for query parameters
|
---|
| 301 | queryparamclass query;
|
---|
| 302 | query.combinequery = "or"; // first one must be "or"
|
---|
| 303 | query.collection = collection;
|
---|
| 304 | query.index = filterOptions["Index"].defaultValue;
|
---|
| 305 | query.subcollection = filterOptions["Subcollection"].defaultValue;
|
---|
| 306 | query.language = filterOptions["Language"].defaultValue;
|
---|
| 307 | query.querystring.clear();
|
---|
| 308 | query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
|
---|
[501] | 309 | query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
|
---|
[351] | 310 | query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
|
---|
| 311 | query.stemming = (filterOptions["Stem"].defaultValue == "true");
|
---|
[613] | 312 | query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
|
---|
[351] | 313 |
|
---|
| 314 | OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
|
---|
| 315 | OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
|
---|
| 316 | while (options_here != options_end) {
|
---|
| 317 | if ((*options_here).name == "CombineQuery") {
|
---|
| 318 | // add this query
|
---|
| 319 |
|
---|
| 320 | // "all", needed when combining queries where the document results are needed
|
---|
| 321 | if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
|
---|
| 322 | query_params.push_back (query);
|
---|
| 323 |
|
---|
| 324 | // start on next query
|
---|
| 325 | query.clear();
|
---|
| 326 | query.combinequery = (*options_here).value;
|
---|
| 327 |
|
---|
| 328 | // set defaults for query parameters
|
---|
| 329 | query.collection = collection;
|
---|
| 330 | query.index = filterOptions["Index"].defaultValue;
|
---|
| 331 | query.subcollection = filterOptions["Subcollection"].defaultValue;
|
---|
| 332 | query.language = filterOptions["Language"].defaultValue;
|
---|
| 333 | query.querystring.clear();
|
---|
| 334 | query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
|
---|
[501] | 335 | query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
|
---|
[351] | 336 | query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
|
---|
| 337 | query.stemming = (filterOptions["Stem"].defaultValue == "true");
|
---|
| 338 |
|
---|
| 339 | // "all", needed when combining queries where the document results are needed
|
---|
| 340 | if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
|
---|
[613] | 341 | else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
|
---|
[351] | 342 |
|
---|
| 343 | } else if ((*options_here).name == "StartResults") {
|
---|
| 344 | startresults = (*options_here).value.getint();
|
---|
| 345 | } else if ((*options_here).name == "EndResults") {
|
---|
| 346 | endresults = (*options_here).value.getint();
|
---|
| 347 | } else if ((*options_here).name == "QueryType") {
|
---|
| 348 | query.search_type = ((*options_here).value == "ranked");
|
---|
[501] | 349 | } else if ((*options_here).name == "MatchMode") {
|
---|
| 350 | query.match_mode = ((*options_here).value == "all");
|
---|
| 351 | if (query.match_mode == 1) query.maxdocs = -1;
|
---|
[351] | 352 | } else if ((*options_here).name == "Term") {
|
---|
| 353 | query.querystring = (*options_here).value;
|
---|
| 354 | } else if ((*options_here).name == "Casefold") {
|
---|
| 355 | query.casefolding = ((*options_here).value == "true");
|
---|
| 356 | } else if ((*options_here).name == "Stem") {
|
---|
| 357 | query.stemming = ((*options_here).value == "true");
|
---|
| 358 | } else if ((*options_here).name == "Index") {
|
---|
| 359 | query.index = (*options_here).value;
|
---|
| 360 | } else if ((*options_here).name == "Subcollection") {
|
---|
| 361 | query.subcollection = (*options_here).value;
|
---|
| 362 | } else if ((*options_here).name == "Language") {
|
---|
| 363 | query.language = (*options_here).value;
|
---|
[613] | 364 | } else if ((*options_here).name == "Maxdocs") {
|
---|
| 365 | query.maxdocs = (*options_here).value.getint();
|
---|
[766] | 366 | } else if ((*options_here).name == "PhraseMatch") {
|
---|
| 367 | phrasematch = (*options_here).value;
|
---|
[351] | 368 | } else {
|
---|
| 369 | logout << text_t2ascii
|
---|
| 370 | << "warning: unknown queryfilter option \""
|
---|
| 371 | << (*options_here).name
|
---|
| 372 | << "\" ignored.\n\n";
|
---|
| 373 | }
|
---|
| 374 |
|
---|
| 375 | options_here++;
|
---|
| 376 | }
|
---|
| 377 |
|
---|
| 378 | // add the last query
|
---|
| 379 | query_params.push_back (query);
|
---|
| 380 | }
|
---|
| 381 |
|
---|
| 382 |
|
---|
| 383 |
|
---|
| 384 | // do query that might involve multiple sub queries
|
---|
| 385 | // mgsearchptr and gdbmptr are assumed to be valid
|
---|
| 386 | void queryfilterclass::do_multi_query (const FilterRequest_t &request,
|
---|
| 387 | const vector<queryparamclass> &query_params,
|
---|
| 388 | queryresultsclass &multiresults,
|
---|
| 389 | comerror_t &err, ostream &logout) {
|
---|
| 390 | outconvertclass text_t2ascii;
|
---|
| 391 |
|
---|
| 392 | err = noError;
|
---|
| 393 | mgsearchptr->setcollectdir (collectdir);
|
---|
| 394 | multiresults.clear();
|
---|
| 395 |
|
---|
| 396 | vector<queryparamclass>::const_iterator query_here = query_params.begin();
|
---|
| 397 | vector<queryparamclass>::const_iterator query_end = query_params.end();
|
---|
| 398 | while (query_here != query_end) {
|
---|
| 399 | queryresultsclass thisqueryresults;
|
---|
| 400 |
|
---|
| 401 | if (!mgsearchptr->search(*query_here, thisqueryresults)) {
|
---|
| 402 | // most likely a system problem
|
---|
| 403 | logout << text_t2ascii
|
---|
| 404 | << "system problem: could not do search with mg for index \""
|
---|
| 405 | << (*query_here).index << (*query_here).subcollection
|
---|
| 406 | << (*query_here).language << "\".\n\n";
|
---|
| 407 | err = systemProblem;
|
---|
| 408 | return;
|
---|
| 409 | }
|
---|
| 410 |
|
---|
| 411 | // combine the results
|
---|
| 412 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 413 | // post-process the results if needed
|
---|
| 414 | if (!thisqueryresults.postprocessed && thisqueryresults.orgterms.size() > 1 &&
|
---|
| 415 | !thisqueryresults.docs.docset.empty()) {
|
---|
| 416 | post_process (*query_here, thisqueryresults);
|
---|
| 417 | thisqueryresults.postprocessed = true;
|
---|
| 418 | multiresults.postprocessed = true;
|
---|
| 419 | }
|
---|
| 420 |
|
---|
| 421 | if (query_params.size() == 1) {
|
---|
| 422 | multiresults.docs = thisqueryresults.docs; // just one set of results
|
---|
| 423 | multiresults.docs_matched = thisqueryresults.docs_matched;
|
---|
| 424 | multiresults.is_approx = thisqueryresults.is_approx;
|
---|
| 425 |
|
---|
| 426 | } else {
|
---|
| 427 | if ((*query_here).combinequery == "and") {
|
---|
| 428 | multiresults.docs.combine_and (thisqueryresults.docs);
|
---|
| 429 | } else if ((*query_here).combinequery == "or") {
|
---|
| 430 | multiresults.docs.combine_or (thisqueryresults.docs);
|
---|
| 431 | } else if ((*query_here).combinequery == "not") {
|
---|
| 432 | multiresults.docs.combine_not (thisqueryresults.docs);
|
---|
| 433 | }
|
---|
| 434 | multiresults.docs_matched = multiresults.docs.docset.size();
|
---|
[398] | 435 | multiresults.is_approx = Exact;
|
---|
[351] | 436 | }
|
---|
| 437 | }
|
---|
| 438 |
|
---|
| 439 | // combine the term information
|
---|
| 440 | if (need_term_info (request.filterResultOptions)) {
|
---|
| 441 | // append the terms
|
---|
| 442 | multiresults.orgterms.insert(multiresults.orgterms.end(),
|
---|
| 443 | thisqueryresults.orgterms.begin(),
|
---|
| 444 | thisqueryresults.orgterms.end());
|
---|
| 445 |
|
---|
| 446 | // add the term variants
|
---|
[358] | 447 | text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
|
---|
| 448 | text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
|
---|
| 449 | while (termvar_here != termvar_end) {
|
---|
| 450 | multiresults.termvariants.insert(*termvar_here);
|
---|
| 451 | termvar_here++;
|
---|
| 452 | }
|
---|
[351] | 453 | }
|
---|
| 454 |
|
---|
| 455 | query_here++;
|
---|
| 456 | }
|
---|
| 457 |
|
---|
| 458 | // sort and unique the query terms
|
---|
| 459 | multiresults.sortuniqqueryterms ();
|
---|
| 460 | }
|
---|
| 461 |
|
---|
| 462 |
|
---|
| 463 | void queryfilterclass::sort_doc_results (const FilterRequest_t &/*request*/,
|
---|
| 464 | docresultsclass &docs) {
|
---|
| 465 | resultsorderer_t resultsorderer;
|
---|
[396] | 466 | resultsorderer.compare_phrase_match = true;
|
---|
[351] | 467 | resultsorderer.docset = &(docs.docset);
|
---|
| 468 |
|
---|
| 469 | // first get a list of document numbers
|
---|
| 470 | docs.docnum_order();
|
---|
| 471 |
|
---|
| 472 | sort (docs.docorder.begin(), docs.docorder.end(), resultsorderer);
|
---|
| 473 | }
|
---|
| 474 |
|
---|
| 475 |
|
---|
| 476 |
|
---|
[227] | 477 | queryfilterclass::queryfilterclass () {
|
---|
| 478 | gdbmptr = NULL;
|
---|
| 479 | mgsearchptr = NULL;
|
---|
[766] | 480 | num_phrases = 0;
|
---|
[227] | 481 |
|
---|
[351] | 482 | FilterOption_t filtopt;
|
---|
| 483 | filtopt.name = "CombineQuery";
|
---|
| 484 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 485 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 486 | filtopt.defaultValue = "and";
|
---|
| 487 | filtopt.validValues.push_back("and");
|
---|
| 488 | filtopt.validValues.push_back("or");
|
---|
| 489 | filtopt.validValues.push_back("not");
|
---|
| 490 | filterOptions["CombineQuery"] = filtopt;
|
---|
| 491 |
|
---|
[227] | 492 | // -- onePerQuery StartResults integer
|
---|
[351] | 493 | filtopt.clear();
|
---|
[227] | 494 | filtopt.name = "StartResults";
|
---|
| 495 | filtopt.type = FilterOption_t::integert;
|
---|
| 496 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 497 | filtopt.defaultValue = "1";
|
---|
| 498 | filtopt.validValues.push_back("1");
|
---|
| 499 | filtopt.validValues.push_back("1000");
|
---|
| 500 | filterOptions["StartResults"] = filtopt;
|
---|
| 501 |
|
---|
| 502 | // -- onePerQuery EndResults integer
|
---|
| 503 | filtopt.clear();
|
---|
| 504 | filtopt.name = "EndResults";
|
---|
| 505 | filtopt.type = FilterOption_t::integert;
|
---|
| 506 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 507 | filtopt.defaultValue = "10";
|
---|
[621] | 508 | filtopt.validValues.push_back("-1");
|
---|
[227] | 509 | filtopt.validValues.push_back("1000");
|
---|
| 510 | filterOptions["EndResults"] = filtopt;
|
---|
| 511 |
|
---|
| 512 | // -- onePerQuery QueryType enumerated (boolean, ranked)
|
---|
| 513 | filtopt.clear();
|
---|
| 514 | filtopt.name = "QueryType";
|
---|
| 515 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 516 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 517 | filtopt.defaultValue = "ranked";
|
---|
| 518 | filtopt.validValues.push_back("boolean");
|
---|
| 519 | filtopt.validValues.push_back("ranked");
|
---|
| 520 | filterOptions["QueryType"] = filtopt;
|
---|
| 521 |
|
---|
[501] | 522 | // -- onePerQuery MatchMode enumerated (some, all)
|
---|
| 523 | filtopt.clear();
|
---|
| 524 | filtopt.name = "MatchMode";
|
---|
| 525 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 526 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 527 | filtopt.defaultValue = "some";
|
---|
| 528 | filtopt.validValues.push_back("some");
|
---|
| 529 | filtopt.validValues.push_back("all");
|
---|
[613] | 530 | filterOptions["MatchMode"] = filtopt;
|
---|
[501] | 531 |
|
---|
[227] | 532 | // -- onePerTerm Term string ???
|
---|
| 533 | filtopt.clear();
|
---|
| 534 | filtopt.name = "Term";
|
---|
| 535 | filtopt.type = FilterOption_t::stringt;
|
---|
| 536 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 537 | filtopt.defaultValue = "";
|
---|
| 538 | filterOptions["Term"] = filtopt;
|
---|
| 539 |
|
---|
| 540 | // -- onePerTerm Casefold boolean
|
---|
| 541 | filtopt.clear();
|
---|
| 542 | filtopt.name = "Casefold";
|
---|
| 543 | filtopt.type = FilterOption_t::booleant;
|
---|
| 544 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 545 | filtopt.defaultValue = "true";
|
---|
| 546 | filtopt.validValues.push_back("false");
|
---|
| 547 | filtopt.validValues.push_back("true");
|
---|
| 548 | filterOptions["Casefold"] = filtopt;
|
---|
| 549 |
|
---|
| 550 | // -- onePerTerm Stem boolean
|
---|
| 551 | filtopt.clear();
|
---|
| 552 | filtopt.name = "Stem";
|
---|
| 553 | filtopt.type = FilterOption_t::booleant;
|
---|
| 554 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 555 | filtopt.defaultValue = "false";
|
---|
| 556 | filtopt.validValues.push_back("false");
|
---|
| 557 | filtopt.validValues.push_back("true");
|
---|
| 558 | filterOptions["Stem"] = filtopt;
|
---|
| 559 |
|
---|
| 560 | // -- onePerTerm Index enumerated
|
---|
| 561 | filtopt.clear();
|
---|
| 562 | filtopt.name = "Index";
|
---|
| 563 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 564 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 565 | filtopt.defaultValue = "";
|
---|
| 566 | filterOptions["Index"] = filtopt;
|
---|
| 567 |
|
---|
| 568 | // -- onePerTerm Subcollection enumerated
|
---|
| 569 | filtopt.clear();
|
---|
| 570 | filtopt.name = "Subcollection";
|
---|
| 571 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 572 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 573 | filtopt.defaultValue = "";
|
---|
| 574 | filterOptions["Subcollection"] = filtopt;
|
---|
| 575 |
|
---|
| 576 | // -- onePerTerm Language enumerated
|
---|
| 577 | filtopt.clear();
|
---|
| 578 | filtopt.name = "Language";
|
---|
| 579 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 580 | filtopt.repeatable = FilterOption_t::onePerTerm;
|
---|
| 581 | filtopt.defaultValue = "";
|
---|
| 582 | filterOptions["Language"] = filtopt;
|
---|
[613] | 583 |
|
---|
| 584 | // -- onePerQuery Maxdocs integer
|
---|
| 585 | filtopt.clear();
|
---|
| 586 | filtopt.name = "Maxdocs";
|
---|
| 587 | filtopt.type = FilterOption_t::integert;
|
---|
| 588 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 589 | filtopt.defaultValue = "200";
|
---|
| 590 | filtopt.validValues.push_back("-1");
|
---|
| 591 | filtopt.validValues.push_back("1000");
|
---|
| 592 | filterOptions["Maxdocs"] = filtopt;
|
---|
[766] | 593 |
|
---|
| 594 | // -- onePerQuery PhraseMatch enumerated
|
---|
| 595 | filtopt.clear();
|
---|
| 596 | filtopt.name = "PhraseMatch";
|
---|
| 597 | filtopt.type = FilterOption_t::enumeratedt;
|
---|
| 598 | filtopt.repeatable = FilterOption_t::onePerQuery;
|
---|
| 599 | filtopt.defaultValue = "some_phrases";
|
---|
| 600 | filtopt.validValues.push_back ("all_phrases");
|
---|
| 601 | filtopt.validValues.push_back ("some_phrases");
|
---|
| 602 | filtopt.validValues.push_back ("all_docs");
|
---|
| 603 | filterOptions["PhraseMatch"] = filtopt;
|
---|
[227] | 604 | }
|
---|
| 605 |
|
---|
| 606 | queryfilterclass::~queryfilterclass () {
|
---|
| 607 | }
|
---|
| 608 |
|
---|
| 609 | void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
|
---|
| 610 | filterclass::configure (key, cfgline);
|
---|
| 611 |
|
---|
| 612 | if (key == "indexmap") {
|
---|
| 613 | indexmap.importmap (cfgline);
|
---|
| 614 |
|
---|
| 615 | // update the list of indexes in the filter information
|
---|
| 616 | text_tarray options;
|
---|
| 617 | indexmap.gettoarray (options);
|
---|
| 618 | filterOptions["Index"].validValues = options;
|
---|
| 619 |
|
---|
[302] | 620 | } else if (key == "defaultindex") {
|
---|
| 621 | indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
|
---|
| 622 |
|
---|
[227] | 623 | } else if (key == "subcollectionmap") {
|
---|
| 624 | subcollectionmap.importmap (cfgline);
|
---|
| 625 |
|
---|
| 626 | // update the list of subcollections in the filter information
|
---|
| 627 | text_tarray options;
|
---|
| 628 | subcollectionmap.gettoarray (options);
|
---|
| 629 | filterOptions["Subcollection"].validValues = options;
|
---|
| 630 |
|
---|
[302] | 631 | } else if (key == "defaultsubcollection") {
|
---|
| 632 | subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
|
---|
| 633 |
|
---|
[227] | 634 | } else if (key == "languagemap") {
|
---|
| 635 | languagemap.importmap (cfgline);
|
---|
| 636 |
|
---|
| 637 | // update the list of languages in the filter information
|
---|
| 638 | text_tarray options;
|
---|
| 639 | languagemap.gettoarray (options);
|
---|
| 640 | filterOptions["Language"].validValues = options;
|
---|
[302] | 641 |
|
---|
| 642 | } else if (key == "defaultlanguage")
|
---|
| 643 | languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
|
---|
[227] | 644 | }
|
---|
| 645 |
|
---|
| 646 | bool queryfilterclass::init (ostream &logout) {
|
---|
| 647 | outconvertclass text_t2ascii;
|
---|
| 648 |
|
---|
| 649 | if (!filterclass::init(logout)) return false;
|
---|
| 650 |
|
---|
| 651 | // get the filename for the database and make sure it exists
|
---|
[534] | 652 | gdbm_filename = filename_cat(collectdir,"index","text",collection);
|
---|
[396] | 653 |
|
---|
[990] | 654 | if (littleEndian()) gdbm_filename += ".ldb";
|
---|
| 655 | else gdbm_filename += ".bdb";
|
---|
| 656 |
|
---|
[227] | 657 | if (!file_exists(gdbm_filename)) {
|
---|
| 658 | logout << text_t2ascii
|
---|
[722] | 659 | << "warning: gdbm database \"" //****
|
---|
[227] | 660 | << gdbm_filename << "\" does not exist\n\n";
|
---|
[722] | 661 | //return false; //****
|
---|
[227] | 662 | }
|
---|
| 663 |
|
---|
| 664 | return true;
|
---|
| 665 | }
|
---|
| 666 |
|
---|
[235] | 667 | void queryfilterclass::filter (const FilterRequest_t &request,
|
---|
[273] | 668 | FilterResponse_t &response,
|
---|
| 669 | comerror_t &err, ostream &logout) {
|
---|
[235] | 670 | outconvertclass text_t2ascii;
|
---|
| 671 |
|
---|
| 672 | response.clear ();
|
---|
[227] | 673 | err = noError;
|
---|
[235] | 674 | if (gdbmptr == NULL) {
|
---|
| 675 | // most likely a configuration problem
|
---|
| 676 | logout << text_t2ascii
|
---|
| 677 | << "configuration error: queryfilter contains a null gdbmclass\n\n";
|
---|
| 678 | err = configurationError;
|
---|
| 679 | return;
|
---|
| 680 | }
|
---|
| 681 | if (mgsearchptr == NULL) {
|
---|
| 682 | // most likely a configuration problem
|
---|
| 683 | logout << text_t2ascii
|
---|
| 684 | << "configuration error: queryfilter contains a null mgsearchclass\n\n";
|
---|
| 685 | err = configurationError;
|
---|
| 686 | return;
|
---|
| 687 | }
|
---|
| 688 |
|
---|
| 689 | // open the database
|
---|
| 690 | gdbmptr->setlogout(&logout);
|
---|
[501] | 691 | if (!gdbmptr->opendatabase (gdbm_filename, GDBM_READER, 100, false)) {
|
---|
[235] | 692 | // most likely a system problem (we have already checked that the
|
---|
| 693 | // gdbm database exists)
|
---|
| 694 | logout << text_t2ascii
|
---|
| 695 | << "system problem: open on gdbm database \""
|
---|
| 696 | << gdbm_filename << "\" failed\n\n";
|
---|
| 697 | err = systemProblem;
|
---|
| 698 | return;
|
---|
| 699 | }
|
---|
| 700 |
|
---|
| 701 | // get the query parameters
|
---|
| 702 | int startresults = filterOptions["StartResults"].defaultValue.getint();
|
---|
| 703 | int endresults = filterOptions["EndResults"].defaultValue.getint();
|
---|
[766] | 704 | text_t phrasematch = filterOptions["PhraseMatch"].defaultValue;
|
---|
| 705 |
|
---|
[351] | 706 | vector<queryparamclass> queryfilterparams;
|
---|
[766] | 707 | parse_query_params (request, queryfilterparams, startresults,
|
---|
| 708 | endresults, phrasematch, logout);
|
---|
[351] | 709 |
|
---|
[235] | 710 | // do query
|
---|
| 711 | queryresultsclass queryresults;
|
---|
[351] | 712 | do_multi_query (request, queryfilterparams, queryresults, err, logout);
|
---|
| 713 | if (err != noError) return;
|
---|
| 714 |
|
---|
[235] | 715 | // assemble document results
|
---|
[351] | 716 | if (need_matching_docs (request.filterResultOptions)) {
|
---|
| 717 | // sort the query results
|
---|
| 718 | sort_doc_results (request, queryresults.docs);
|
---|
| 719 |
|
---|
[235] | 720 | int resultnum = 1;
|
---|
| 721 | ResultDocInfo_t resultdoc;
|
---|
| 722 | text_t trans_OID;
|
---|
[351] | 723 | vector<int>::iterator docorder_here = queryresults.docs.docorder.begin();
|
---|
| 724 | vector<int>::iterator docorder_end = queryresults.docs.docorder.end();
|
---|
| 725 |
|
---|
[621] | 726 | if (endresults == -1) endresults = MAXNUMDOCS;
|
---|
[351] | 727 | while (docorder_here != docorder_end) {
|
---|
[235] | 728 | if (resultnum > endresults) break;
|
---|
| 729 |
|
---|
| 730 | // translate the document number
|
---|
[351] | 731 | if (!translate(gdbmptr, *docorder_here, trans_OID)) {
|
---|
[235] | 732 | logout << text_t2ascii
|
---|
| 733 | << "warning: could not translate mg document number \""
|
---|
[351] | 734 | << *docorder_here << "\"to OID.\n\n";
|
---|
[235] | 735 |
|
---|
| 736 | } else {
|
---|
[351] | 737 | docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
|
---|
| 738 |
|
---|
[766] | 739 | // documents containing matching phrases will be sorted to the top so
|
---|
| 740 | // we can break out once we're past those that match the PhraseMatch
|
---|
| 741 | // option -- "all_phrases" = return only those documents containing all
|
---|
| 742 | // phrases in query string
|
---|
| 743 | // "some_phrases" = return only those documents containing
|
---|
| 744 | // at least 1 of the phrases in the document
|
---|
| 745 | // "all_docs" = return all documents regardless
|
---|
| 746 | if (num_phrases > 0) {
|
---|
[787] | 747 | if ((phrasematch == "all_phrases") && ((*docset_here).second.num_phrase_match < num_phrases)) {
|
---|
| 748 | queryresults.docs_matched = response.docInfo.size();
|
---|
[766] | 749 | break;
|
---|
[787] | 750 | }
|
---|
| 751 | if ((phrasematch == "some_phrases") && ((*docset_here).second.num_phrase_match < 1)) {
|
---|
| 752 | queryresults.docs_matched = response.docInfo.size();
|
---|
[766] | 753 | break;
|
---|
[787] | 754 | }
|
---|
[766] | 755 | }
|
---|
| 756 |
|
---|
[351] | 757 | // see if there is a result for this number,
|
---|
| 758 | // if it is in the request set (or the request set is empty)
|
---|
| 759 | if (docset_here != queryresults.docs.docset.end() &&
|
---|
| 760 | (request.docSet.empty() || in_set(request.docSet, trans_OID))) {
|
---|
[235] | 761 | if (resultnum >= startresults) {
|
---|
| 762 | // add this document
|
---|
| 763 | resultdoc.OID = trans_OID;
|
---|
[355] | 764 | resultdoc.result_num = resultnum;
|
---|
[351] | 765 | resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
|
---|
[311] | 766 |
|
---|
| 767 | // these next two are not available on all versions of mg
|
---|
[351] | 768 | resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
|
---|
| 769 | resultdoc.num_phrase_match = (*docset_here).second.num_phrase_match;
|
---|
[311] | 770 |
|
---|
[235] | 771 | response.docInfo.push_back (resultdoc);
|
---|
| 772 | }
|
---|
| 773 |
|
---|
| 774 | resultnum++;
|
---|
| 775 | }
|
---|
| 776 | }
|
---|
| 777 |
|
---|
[351] | 778 | docorder_here++;
|
---|
[235] | 779 | }
|
---|
| 780 | }
|
---|
| 781 |
|
---|
| 782 | // assemble the term results
|
---|
[351] | 783 | if (need_term_info(request.filterResultOptions)) {
|
---|
[319] | 784 | // note: the terms have already been sorted and uniqued
|
---|
[235] | 785 |
|
---|
| 786 | TermInfo_t terminfo;
|
---|
| 787 | bool terms_first = true;
|
---|
[396] | 788 | termfreqclassarray::iterator terms_here = queryresults.terms.begin();
|
---|
| 789 | termfreqclassarray::iterator terms_end = queryresults.terms.end();
|
---|
[235] | 790 |
|
---|
| 791 | while (terms_here != terms_end) {
|
---|
| 792 | terminfo.clear();
|
---|
| 793 | terminfo.term = (*terms_here).termstr;
|
---|
| 794 | terminfo.freq = (*terms_here).termfreq;
|
---|
[351] | 795 | if (terms_first) {
|
---|
| 796 | text_tset::iterator termvariants_here = queryresults.termvariants.begin();
|
---|
| 797 | text_tset::iterator termvariants_end = queryresults.termvariants.end();
|
---|
| 798 | while (termvariants_here != termvariants_end) {
|
---|
| 799 | terminfo.matchTerms.push_back (*termvariants_here);
|
---|
| 800 | termvariants_here++;
|
---|
| 801 | }
|
---|
| 802 | }
|
---|
[235] | 803 | terms_first = false;
|
---|
| 804 |
|
---|
| 805 | response.termInfo.push_back (terminfo);
|
---|
| 806 |
|
---|
| 807 | terms_here++;
|
---|
| 808 | }
|
---|
| 809 | }
|
---|
| 810 |
|
---|
[334] | 811 | response.numDocs = queryresults.docs_matched;
|
---|
| 812 | response.isApprox = queryresults.is_approx;
|
---|
[227] | 813 | }
|
---|