source: trunk/gsdl/src/colservr/queryfilter.cpp@ 334

Last change on this file since 334 was 334, checked in by rjmcnab, 25 years ago

Changes for better reporting of the number of documents which match a query.
These changes should still work as before with older versions of mg.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: queryfilter.cpp 334 1999-07-01 09:29:21Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.9 1999/07/01 09:29:20 rjmcnab
15 Changes for better reporting of number documents which match a query. Changes
16 should still work as before with older versions of mg.
17
18 Revision 1.8 1999/07/01 03:59:54 rjmcnab
19 reduced MAXDOCS to 200 (more reasonable ???). I also added a virtual
20 method for post-processing the query.
21
22 Revision 1.7 1999/06/30 04:04:13 rjmcnab
23 made stemming functions available from mgsearch and made the stems
24 for the query terms available in queryinfo
25
26 Revision 1.6 1999/06/29 22:06:23 rjmcnab
27 Added a couple of fields to queryinfo to handle a special version
28 of mg.
29
30 Revision 1.5 1999/06/27 22:08:48 sjboddie
31 now check for defaultindex, defaultsubcollection, and defaultlanguage
32 entries in config files
33
34 Revision 1.4 1999/06/16 02:03:25 sjboddie
35 fixed bug in isApprox and set MAXDOCS to always be 500
36
37 Revision 1.3 1999/04/19 23:56:09 rjmcnab
38 Finished the gdbm metadata stuff
39
40 Revision 1.2 1999/04/12 03:45:03 rjmcnab
41 Finished the query filter.
42
43 Revision 1.1 1999/04/06 22:22:09 rjmcnab
44 Initial revision.
45
46 */
47
48
49#include "queryfilter.h"
50#include "fileutil.h"
51#include "queryinfo.h"
52
53#define MAXDOCS 200 // note that maxdocs must be at least as large
54 // as the highest possible value of EndResults
55
56// some useful functions
57
58// translate will return true if successful
59static bool translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
60 infodbclass info;
61
62 trans_OID.clear();
63
64 // get the info
65 if (gdbmptr == NULL) return false;
66 if (!gdbmptr->getinfo(docnum, info)) return false;
67
68 // translate
69 if (info["section"].empty()) return false;
70
71 trans_OID = info["section"];
72 return true;
73}
74
75
76// do aditional query processing
77void queryfilterclass::post_process (const queryparamclass &/*queryparams*/, const text_t &/*index*/,
78 queryresultsclass &/*queryresults*/) {
79}
80
81
82
83queryfilterclass::queryfilterclass () {
84 gdbmptr = NULL;
85 mgsearchptr = NULL;
86
87 // -- onePerQuery StartResults integer
88 FilterOption_t filtopt;
89 filtopt.name = "StartResults";
90 filtopt.type = FilterOption_t::integert;
91 filtopt.repeatable = FilterOption_t::onePerQuery;
92 filtopt.defaultValue = "1";
93 filtopt.validValues.push_back("1");
94 filtopt.validValues.push_back("1000");
95 filterOptions["StartResults"] = filtopt;
96
97 // -- onePerQuery EndResults integer
98 filtopt.clear();
99 filtopt.name = "EndResults";
100 filtopt.type = FilterOption_t::integert;
101 filtopt.repeatable = FilterOption_t::onePerQuery;
102 filtopt.defaultValue = "10";
103 filtopt.validValues.push_back("1");
104 filtopt.validValues.push_back("1000");
105 filterOptions["EndResults"] = filtopt;
106
107 // -- onePerQuery QueryType enumerated (boolean, ranked)
108 filtopt.clear();
109 filtopt.name = "QueryType";
110 filtopt.type = FilterOption_t::enumeratedt;
111 filtopt.repeatable = FilterOption_t::onePerQuery;
112 filtopt.defaultValue = "ranked";
113 filtopt.validValues.push_back("boolean");
114 filtopt.validValues.push_back("ranked");
115 filterOptions["QueryType"] = filtopt;
116
117 // -- onePerTerm Term string ???
118 filtopt.clear();
119 filtopt.name = "Term";
120 filtopt.type = FilterOption_t::stringt;
121 filtopt.repeatable = FilterOption_t::onePerTerm;
122 filtopt.defaultValue = "";
123 filterOptions["Term"] = filtopt;
124
125 // -- onePerTerm Casefold boolean
126 filtopt.clear();
127 filtopt.name = "Casefold";
128 filtopt.type = FilterOption_t::booleant;
129 filtopt.repeatable = FilterOption_t::onePerTerm;
130 filtopt.defaultValue = "true";
131 filtopt.validValues.push_back("false");
132 filtopt.validValues.push_back("true");
133 filterOptions["Casefold"] = filtopt;
134
135 // -- onePerTerm Stem boolean
136 filtopt.clear();
137 filtopt.name = "Stem";
138 filtopt.type = FilterOption_t::booleant;
139 filtopt.repeatable = FilterOption_t::onePerTerm;
140 filtopt.defaultValue = "false";
141 filtopt.validValues.push_back("false");
142 filtopt.validValues.push_back("true");
143 filterOptions["Stem"] = filtopt;
144
145 // -- onePerTerm Index enumerated
146 filtopt.clear();
147 filtopt.name = "Index";
148 filtopt.type = FilterOption_t::enumeratedt;
149 filtopt.repeatable = FilterOption_t::onePerTerm;
150 filtopt.defaultValue = "";
151 filterOptions["Index"] = filtopt;
152
153 // -- onePerTerm Subcollection enumerated
154 filtopt.clear();
155 filtopt.name = "Subcollection";
156 filtopt.type = FilterOption_t::enumeratedt;
157 filtopt.repeatable = FilterOption_t::onePerTerm;
158 filtopt.defaultValue = "";
159 filterOptions["Subcollection"] = filtopt;
160
161 // -- onePerTerm Language enumerated
162 filtopt.clear();
163 filtopt.name = "Language";
164 filtopt.type = FilterOption_t::enumeratedt;
165 filtopt.repeatable = FilterOption_t::onePerTerm;
166 filtopt.defaultValue = "";
167 filterOptions["Language"] = filtopt;
168}
169
170queryfilterclass::~queryfilterclass () {
171}
172
173void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
174 filterclass::configure (key, cfgline);
175
176 if (key == "indexmap") {
177 indexmap.importmap (cfgline);
178
179 // update the list of indexes in the filter information
180 text_tarray options;
181 indexmap.gettoarray (options);
182 filterOptions["Index"].validValues = options;
183
184 } else if (key == "defaultindex") {
185 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
186
187 } else if (key == "subcollectionmap") {
188 subcollectionmap.importmap (cfgline);
189
190 // update the list of subcollections in the filter information
191 text_tarray options;
192 subcollectionmap.gettoarray (options);
193 filterOptions["Subcollection"].validValues = options;
194
195 } else if (key == "defaultsubcollection") {
196 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
197
198 } else if (key == "languagemap") {
199 languagemap.importmap (cfgline);
200
201 // update the list of languages in the filter information
202 text_tarray options;
203 languagemap.gettoarray (options);
204 filterOptions["Language"].validValues = options;
205
206 } else if (key == "defaultlanguage")
207 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
208}
209
210bool queryfilterclass::init (ostream &logout) {
211 outconvertclass text_t2ascii;
212
213 if (!filterclass::init(logout)) return false;
214
215 // get the filename for the database and make sure it exists
216 gdbm_filename = filename_cat(collectdir,"index","text",collection);
217#ifdef _LITTLE_ENDIAN
218 gdbm_filename += ".ldb";
219#else
220 gdbm_filename += ".bdb";
221#endif
222 if (!file_exists(gdbm_filename)) {
223 logout << text_t2ascii
224 << "error: gdbm database \""
225 << gdbm_filename << "\" does not exist\n\n";
226 return false;
227 }
228
229 return true;
230}
231
232void queryfilterclass::filter (const FilterRequest_t &request,
233 FilterResponse_t &response,
234 comerror_t &err, ostream &logout) {
235 outconvertclass text_t2ascii;
236
237 response.clear ();
238 err = noError;
239 if (gdbmptr == NULL) {
240 // most likely a configuration problem
241 logout << text_t2ascii
242 << "configuration error: queryfilter contains a null gdbmclass\n\n";
243 err = configurationError;
244 return;
245 }
246 if (mgsearchptr == NULL) {
247 // most likely a configuration problem
248 logout << text_t2ascii
249 << "configuration error: queryfilter contains a null mgsearchclass\n\n";
250 err = configurationError;
251 return;
252 }
253
254 // open the database
255 gdbmptr->setlogout(&logout);
256 if (!gdbmptr->opendatabase (gdbm_filename)) {
257 // most likely a system problem (we have already checked that the
258 // gdbm database exists)
259 logout << text_t2ascii
260 << "system problem: open on gdbm database \""
261 << gdbm_filename << "\" failed\n\n";
262 err = systemProblem;
263 return;
264 }
265
266 // get the query parameters
267 int startresults = filterOptions["StartResults"].defaultValue.getint();
268 int endresults = filterOptions["EndResults"].defaultValue.getint();
269 text_t index = filterOptions["Index"].defaultValue;
270 text_t subcollection = filterOptions["Subcollection"].defaultValue;
271 text_t language = filterOptions["Language"].defaultValue;
272 queryparamclass queryparams;
273 queryparams.collection = collection;
274 queryparams.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
275 queryparams.casefolding = (filterOptions["Casefold"].defaultValue == "true");
276 queryparams.stemming = (filterOptions["Stem"].defaultValue == "true");
277
278 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
279 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
280 while (options_here != options_end) {
281 if ((*options_here).name == "StartResults") {
282 startresults = (*options_here).value.getint();
283 } else if ((*options_here).name == "EndResults") {
284 endresults = (*options_here).value.getint();
285 } else if ((*options_here).name == "QueryType") {
286 queryparams.search_type = ((*options_here).value == "ranked");
287 } else if ((*options_here).name == "Term") {
288 queryparams.querystring = (*options_here).value;
289 } else if ((*options_here).name == "Casefold") {
290 queryparams.casefolding = ((*options_here).value == "true");
291 } else if ((*options_here).name == "Stem") {
292 queryparams.stemming = ((*options_here).value == "true");
293 } else if ((*options_here).name == "Index") {
294 index = (*options_here).value;
295 } else if ((*options_here).name == "Subcollection") {
296 subcollection = (*options_here).value;
297 } else if ((*options_here).name == "Language") {
298 language = (*options_here).value;
299 } else {
300 logout << text_t2ascii
301 << "warning: unknown queryfilter option \""
302 << (*options_here).name
303 << "\" ignored.\n\n";
304 }
305
306 options_here++;
307 }
308
309 queryparams.search_index = index+subcollection+language;
310 queryparams.maxdocs = MAXDOCS;
311
312 // do query
313 queryresultsclass queryresults;
314 mgsearchptr->setcollectdir (collectdir);
315 if (!mgsearchptr->search(queryparams, queryresults)) {
316 // most likely a system problem
317 logout << text_t2ascii
318 << "system problem: could not do search with mg for index \""
319 << queryparams.search_index << "\".\n\n";
320 err = systemProblem;
321 return;
322 }
323
324 // assemble document results
325 if ((request.filterResultOptions & FROID) || (request.filterResultOptions & FRranking) ||
326 (request.filterResultOptions & FRmetadata)) {
327
328 // post-process the results if needed
329 if (!queryresults.postprocessed && queryresults.orgterms.size() > 1 &&
330 !queryresults.docs.empty()) {
331 post_process (queryparams, index, queryresults);
332 queryresults.postprocessed = true;
333 }
334
335 int resultnum = 1;
336 ResultDocInfo_t resultdoc;
337 text_t trans_OID;
338 vector<docresultclass>::iterator docs_here = queryresults.docs.begin();
339 vector<docresultclass>::iterator docs_end = queryresults.docs.end();
340
341 while (docs_here != docs_end) {
342 if (resultnum > endresults) break;
343
344 // translate the document number
345 if (!translate(gdbmptr, (*docs_here).docnum, trans_OID)) {
346 logout << text_t2ascii
347 << "warning: could not translate mg document number \""
348 << (*docs_here).docnum << "\"to OID.\n\n";
349
350 } else {
351 // see if it is in the set (or the set is empty)
352 if (request.docSet.empty() || in_set(request.docSet, trans_OID)) {
353 if (resultnum >= startresults) {
354 // add this document
355 resultdoc.OID = trans_OID;
356 resultdoc.ranking = (int)((*docs_here).docweight * 10000.0 + 0.5);
357
358 // these next two are not available on all versions of mg
359 resultdoc.num_terms_matched = (*docs_here).num_query_terms_matched;
360 resultdoc.query_phrase_match = (*docs_here).query_phrase_match;
361
362 response.docInfo.push_back (resultdoc);
363 }
364
365 resultnum++;
366 }
367 }
368
369 docs_here++;
370 }
371 }
372
373 // assemble the term results
374 if ((request.filterResultOptions & FRtermFreq) || (request.filterResultOptions & FRmatchTerms)) {
375 // note: the terms have already been sorted and uniqued
376
377 TermInfo_t terminfo;
378 bool terms_first = true;
379 vector<termfreqclass>::iterator terms_here = queryresults.terms.begin();
380 vector<termfreqclass>::iterator terms_end = queryresults.terms.end();
381
382 while (terms_here != terms_end) {
383 terminfo.clear();
384 terminfo.term = (*terms_here).termstr;
385 terminfo.freq = (*terms_here).termfreq;
386 if (terms_first) terminfo.matchTerms = queryresults.termvariants;
387 terms_first = false;
388
389 response.termInfo.push_back (terminfo);
390
391 terms_here++;
392 }
393 }
394
395 response.numDocs = queryresults.docs_matched;
396 response.isApprox = queryresults.is_approx;
397}
Note: See TracBrowser for help on using the repository browser.