source: trunk/gsdl/src/colservr/queryfilter.cpp@ 12770

Last change on this file since 12770 was 12770, checked in by mdewsnip, 18 years ago

Changed the Lucene "-fuzzy" argument to "-fuzziness <value>", for more accurate control.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp -- base class for queryfilters
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "queryfilter.h"
27#include "fileutil.h"
28#include "gsdltools.h"
29#include <assert.h>
30
31
32// translate will return true if successful
33bool queryfilterclass::translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
34 infodbclass info;
35
36 trans_OID.clear();
37
38 // get the info
39 if (gdbmptr == NULL) return false;
40 if (!gdbmptr->getinfo(docnum, info)) return false;
41
42 // translate
43 if (info["section"].empty()) return false;
44
45 trans_OID = info["section"];
46 return true;
47}
48
49
50// whether document results are needed
51bool queryfilterclass::need_matching_docs (int filterResultOptions) {
52 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
53 (filterResultOptions & FRmetadata));
54}
55
56// whether term information is needed
57bool queryfilterclass::need_term_info (int filterResultOptions) {
58 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
59}
60
61/////////////////////////////////
62// functions for queryfilterclass
63/////////////////////////////////
64
65// get the query parameters
66void queryfilterclass::parse_query_params (const FilterRequest_t &request,
67 vector<queryparamclass> &query_params,
68 int &startresults, int &endresults,
69 text_t &phrasematch, ostream &logout) {
70 outconvertclass text_t2ascii;
71
72 // set defaults for the return parameters
73 query_params.erase(query_params.begin(), query_params.end());
74 startresults = filterOptions["StartResults"].defaultValue.getint();
75 endresults = filterOptions["EndResults"].defaultValue.getint();
76 phrasematch = filterOptions["PhraseMatch"].defaultValue;
77
78 // set defaults for query parameters
79 queryparamclass query;
80 query.combinequery = "or"; // first one must be "or"
81 query.collection = collection;
82 query.index = filterOptions["Index"].defaultValue;
83 query.subcollection = filterOptions["Subcollection"].defaultValue;
84 query.language = filterOptions["Language"].defaultValue;
85 query.querystring.clear();
86 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
87 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
88 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
89 query.stemming = (filterOptions["Stem"].defaultValue == "true");
90 query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
91 query.level = filterOptions["Level"].defaultValue;
92 query.filterstring = filterOptions["FilterString"].defaultValue; // Lucene specific
93 query.sortfield = filterOptions["SortField"].defaultValue; // Lucene specific
94 query.fuzziness = filterOptions["Fuzziness"].defaultValue; // Lucene specific
95 query.maxnumeric = maxnumeric;
96 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
97 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
98 while (options_here != options_end) {
99 if ((*options_here).name == "CombineQuery") {
100 // add this query
101
102 // "all", needed when combining queries where the document results are needed
103 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
104 query_params.push_back (query);
105
106 // start on next query
107 query.clear();
108 query.combinequery = (*options_here).value;
109
110 // set defaults for query parameters
111 query.collection = collection;
112 query.index = filterOptions["Index"].defaultValue;
113 query.subcollection = filterOptions["Subcollection"].defaultValue;
114 query.language = filterOptions["Language"].defaultValue;
115 query.querystring.clear();
116 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
117 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
118 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
119 query.stemming = (filterOptions["Stem"].defaultValue == "true");
120 query.level = filterOptions["Level"].defaultValue;
121 query.filterstring = filterOptions["FilterString"].defaultValue; // Lucene specific
122 query.sortfield = filterOptions["SortField"].defaultValue; // Lucene specific
123 query.fuzziness = filterOptions["Fuzziness"].defaultValue; // Lucene specific
124 query.maxnumeric = maxnumeric;
125 // "all", needed when combining queries where the document results are needed
126 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
127 else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
128
129 } else if ((*options_here).name == "StartResults") {
130 startresults = (*options_here).value.getint();
131 } else if ((*options_here).name == "EndResults") {
132 endresults = (*options_here).value.getint();
133 } else if ((*options_here).name == "QueryType") {
134 query.search_type = ((*options_here).value == "ranked");
135 } else if ((*options_here).name == "MatchMode") {
136 query.match_mode = ((*options_here).value == "all");
137 if (query.match_mode == 1) query.maxdocs = -1;
138 } else if ((*options_here).name == "Term") {
139 query.querystring = (*options_here).value;
140 } else if ((*options_here).name == "Casefold") {
141 query.casefolding = ((*options_here).value == "true");
142 } else if ((*options_here).name == "Stem") {
143 query.stemming = ((*options_here).value == "true");
144 } else if ((*options_here).name == "Index"&& (*options_here).value !="") {
145 query.index = (*options_here).value;
146 } else if ((*options_here).name == "Subcollection") {
147 query.subcollection = (*options_here).value;
148 } else if ((*options_here).name == "Language") {
149 query.language = (*options_here).value;
150 } else if ((*options_here).name == "Maxdocs") {
151 query.maxdocs = (*options_here).value.getint();
152 } else if ((*options_here).name == "PhraseMatch") {
153 phrasematch = (*options_here).value;
154 } else if ((*options_here).name == "Level") {
155 query.level = (*options_here).value;
156 } else if ((*options_here).name == "FilterString") {
157 query.filterstring = (*options_here).value;
158 } else if ((*options_here).name == "SortField") {
159 query.sortfield = (*options_here).value;
160 } else if ((*options_here).name == "Fuzziness") {
161 query.fuzziness = (*options_here).value;
162 } else {
163 logout << text_t2ascii
164 << "warning: unknown queryfilter option \""
165 << (*options_here).name
166 << "\" ignored.\n\n";
167 }
168
169 ++options_here;
170 }
171
172 // Store the start and end results in the query too, as lucene now needs to
173 // pass them through to the Java
174 query.startresults = startresults;
175 query.endresults = endresults;
176
177 // add the last query
178 query_params.push_back (query);
179}
180
181
182
183
184queryfilterclass::queryfilterclass () {
185 gdbmptr = NULL;
186 textsearchptr = NULL;
187 maxnumeric = 4;
188
189 FilterOption_t filtopt;
190 filtopt.name = "CombineQuery";
191 filtopt.type = FilterOption_t::enumeratedt;
192 filtopt.repeatable = FilterOption_t::onePerQuery;
193 filtopt.defaultValue = "and";
194 filtopt.validValues.push_back("and");
195 filtopt.validValues.push_back("or");
196 filtopt.validValues.push_back("not");
197 filterOptions["CombineQuery"] = filtopt;
198
199 // -- onePerQuery StartResults integer
200 filtopt.clear();
201 filtopt.name = "StartResults";
202 filtopt.type = FilterOption_t::integert;
203 filtopt.repeatable = FilterOption_t::onePerQuery;
204 filtopt.defaultValue = "1";
205 filtopt.validValues.push_back("1");
206 filtopt.validValues.push_back("1000");
207 filterOptions["StartResults"] = filtopt;
208
209 // -- onePerQuery EndResults integer
210 filtopt.clear();
211 filtopt.name = "EndResults";
212 filtopt.type = FilterOption_t::integert;
213 filtopt.repeatable = FilterOption_t::onePerQuery;
214 filtopt.defaultValue = "10";
215 filtopt.validValues.push_back("-1");
216 filtopt.validValues.push_back("1000");
217 filterOptions["EndResults"] = filtopt;
218
219 // -- onePerQuery QueryType enumerated (boolean, ranked)
220 filtopt.clear();
221 filtopt.name = "QueryType";
222 filtopt.type = FilterOption_t::enumeratedt;
223 filtopt.repeatable = FilterOption_t::onePerQuery;
224 filtopt.defaultValue = "ranked";
225 filtopt.validValues.push_back("boolean");
226 filtopt.validValues.push_back("ranked");
227 filterOptions["QueryType"] = filtopt;
228
229 // -- onePerQuery MatchMode enumerated (some, all)
230 filtopt.clear();
231 filtopt.name = "MatchMode";
232 filtopt.type = FilterOption_t::enumeratedt;
233 filtopt.repeatable = FilterOption_t::onePerQuery;
234 filtopt.defaultValue = "some";
235 filtopt.validValues.push_back("some");
236 filtopt.validValues.push_back("all");
237 filterOptions["MatchMode"] = filtopt;
238
239 // -- onePerTerm Term string ???
240 filtopt.clear();
241 filtopt.name = "Term";
242 filtopt.type = FilterOption_t::stringt;
243 filtopt.repeatable = FilterOption_t::onePerTerm;
244 filtopt.defaultValue = "";
245 filterOptions["Term"] = filtopt;
246
247 // -- onePerTerm Casefold boolean
248 filtopt.clear();
249 filtopt.name = "Casefold";
250 filtopt.type = FilterOption_t::booleant;
251 filtopt.repeatable = FilterOption_t::onePerTerm;
252 filtopt.defaultValue = "true";
253 filtopt.validValues.push_back("false");
254 filtopt.validValues.push_back("true");
255 filterOptions["Casefold"] = filtopt;
256
257 // -- onePerTerm Stem boolean
258 filtopt.clear();
259 filtopt.name = "Stem";
260 filtopt.type = FilterOption_t::booleant;
261 filtopt.repeatable = FilterOption_t::onePerTerm;
262 filtopt.defaultValue = "false";
263 filtopt.validValues.push_back("false");
264 filtopt.validValues.push_back("true");
265 filterOptions["Stem"] = filtopt;
266
267 // -- onePerTerm Index enumerated
268 filtopt.clear();
269 filtopt.name = "Index";
270 filtopt.type = FilterOption_t::enumeratedt;
271 filtopt.repeatable = FilterOption_t::onePerTerm;
272 filtopt.defaultValue = "";
273 filterOptions["Index"] = filtopt;
274
275 // -- onePerTerm Subcollection enumerated
276 filtopt.clear();
277 filtopt.name = "Subcollection";
278 filtopt.type = FilterOption_t::enumeratedt;
279 filtopt.repeatable = FilterOption_t::onePerTerm;
280 filtopt.defaultValue = "";
281 filterOptions["Subcollection"] = filtopt;
282
283 // -- onePerTerm Language enumerated
284 filtopt.clear();
285 filtopt.name = "Language";
286 filtopt.type = FilterOption_t::enumeratedt;
287 filtopt.repeatable = FilterOption_t::onePerTerm;
288 filtopt.defaultValue = "";
289 filterOptions["Language"] = filtopt;
290
291 // -- onePerQuery Maxdocs integer
292 filtopt.clear();
293 filtopt.name = "Maxdocs";
294 filtopt.type = FilterOption_t::integert;
295 filtopt.repeatable = FilterOption_t::onePerQuery;
296 filtopt.defaultValue = "200";
297 filtopt.validValues.push_back("-1");
298 filtopt.validValues.push_back("1000");
299 filterOptions["Maxdocs"] = filtopt;
300
301 // -- onePerQuery PhraseMatch enumerated
302 filtopt.clear();
303 filtopt.name = "PhraseMatch";
304 filtopt.type = FilterOption_t::enumeratedt;
305 filtopt.repeatable = FilterOption_t::onePerQuery;
306 filtopt.defaultValue = "some_phrases";
307 filtopt.validValues.push_back ("all_phrases");
308 filtopt.validValues.push_back ("some_phrases");
309 filtopt.validValues.push_back ("all_docs");
310 filterOptions["PhraseMatch"] = filtopt;
311}
312
313queryfilterclass::~queryfilterclass () {
314 // don't delete gdbmptr or mgsearchptr here, they'll
315 // be cleaned up by mggdbmsource
316}
317
318void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
319 filterclass::configure (key, cfgline);
320
321 if (key == "indexmap") {
322 indexmap.importmap (cfgline);
323
324 // update the list of indexes in the filter information
325 text_tarray options;
326 indexmap.gettoarray (options);
327 filterOptions["Index"].validValues = options;
328
329 } else if (key == "defaultindex") {
330 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
331
332 } else if (key == "subcollectionmap") {
333 subcollectionmap.importmap (cfgline);
334
335 // update the list of subcollections in the filter information
336 text_tarray options;
337 subcollectionmap.gettoarray (options);
338 filterOptions["Subcollection"].validValues = options;
339
340 } else if (key == "defaultsubcollection") {
341 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
342
343 } else if (key == "languagemap") {
344 languagemap.importmap (cfgline);
345
346 // update the list of languages in the filter information
347 text_tarray options;
348 languagemap.gettoarray (options);
349 filterOptions["Language"].validValues = options;
350
351 } else if (key == "defaultlanguage") {
352 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
353 } else if (key == "indexstem") {
354 indexstem = cfgline[0];
355 } else if (key == "maxnumeric") {
356 maxnumeric = cfgline[0].getint();
357 }
358
359}
360
361bool queryfilterclass::init (ostream &logout) {
362 outconvertclass text_t2ascii;
363
364 if (!filterclass::init(logout)) return false;
365
366 if (filterOptions["Index"].defaultValue.empty()) {
367 // use first index in map as default if no default is set explicitly
368 text_tarray fromarray;
369 indexmap.getfromarray(fromarray);
370 if (fromarray.size()) {
371 filterOptions["Index"].defaultValue = fromarray[0];
372 }
373 }
374
375 if (filterOptions["Subcollection"].defaultValue.empty()) {
376 // use first subcollection in map as default if no default is set explicitly
377 text_tarray fromarray;
378 subcollectionmap.getfromarray(fromarray);
379 if (fromarray.size()) {
380 filterOptions["Subcollection"].defaultValue = fromarray[0];
381 }
382 }
383
384 if (filterOptions["Language"].defaultValue.empty()) {
385 // use first language in map as default if no default is set explicitly
386 text_tarray fromarray;
387 languagemap.getfromarray(fromarray);
388 if (fromarray.size()) {
389 filterOptions["Language"].defaultValue = fromarray[0];
390 }
391 }
392
393 // get the filename for the database and make sure it exists
394 if (indexstem.empty()) {
395 indexstem = collection;
396 }
397 gdbm_filename = filename_cat(gdbmhome, "collect", collection, "index", "text", indexstem);
398
399 if (littleEndian()) gdbm_filename += ".ldb";
400 else gdbm_filename += ".bdb";
401
402 if (!file_exists(gdbm_filename)) {
403 logout << text_t2ascii
404 << "warning: gdbm database \"" //****
405 << gdbm_filename << "\" does not exist\n\n";
406 //return false; //****
407 }
408
409 return true;
410}
411
Note: See TracBrowser for help on using the repository browser.