source: gsdl/trunk/src/colservr/queryfilter.cpp@ 14119

Last change on this file since 14119 was 12871, checked in by kjdon, 18 years ago

Accent Folding patch, thanks to Juan Grigera. Added AccentFold filteroption

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.4 KB
Line 
1/**********************************************************************
2 *
3 * queryfilter.cpp -- base class for queryfilters
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "queryfilter.h"
27#include "fileutil.h"
28#include "gsdltools.h"
29#include <assert.h>
30
31
32// translate will return true if successful
33bool queryfilterclass::translate (gdbmclass *gdbmptr, int docnum, text_t &trans_OID) {
34 infodbclass info;
35
36 trans_OID.clear();
37
38 // get the info
39 if (gdbmptr == NULL) return false;
40 if (!gdbmptr->getinfo(docnum, info)) return false;
41
42 // translate
43 if (info["section"].empty()) return false;
44
45 trans_OID = info["section"];
46 return true;
47}
48
49
50// whether document results are needed
51bool queryfilterclass::need_matching_docs (int filterResultOptions) {
52 return ((filterResultOptions & FROID) || (filterResultOptions & FRranking) ||
53 (filterResultOptions & FRmetadata));
54}
55
56// whether term information is needed
57bool queryfilterclass::need_term_info (int filterResultOptions) {
58 return ((filterResultOptions & FRtermFreq) || (filterResultOptions & FRmatchTerms));
59}
60
61/////////////////////////////////
62// functions for queryfilterclass
63/////////////////////////////////
64
65// get the query parameters
66void queryfilterclass::parse_query_params (const FilterRequest_t &request,
67 vector<queryparamclass> &query_params,
68 int &startresults, int &endresults,
69 text_t &phrasematch, ostream &logout) {
70 outconvertclass text_t2ascii;
71
72 // set defaults for the return parameters
73 query_params.erase(query_params.begin(), query_params.end());
74 startresults = filterOptions["StartResults"].defaultValue.getint();
75 endresults = filterOptions["EndResults"].defaultValue.getint();
76 phrasematch = filterOptions["PhraseMatch"].defaultValue;
77
78 // set defaults for query parameters
79 queryparamclass query;
80 query.combinequery = "or"; // first one must be "or"
81 query.collection = collection;
82 query.index = filterOptions["Index"].defaultValue;
83 query.subcollection = filterOptions["Subcollection"].defaultValue;
84 query.language = filterOptions["Language"].defaultValue;
85 query.querystring.clear();
86 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
87 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
88 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
89 query.stemming = (filterOptions["Stem"].defaultValue == "true");
90 query.accentfolding = (filterOptions["AccentFold"].defaultValue == "true");
91 query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
92 query.level = filterOptions["Level"].defaultValue;
93 query.filterstring = filterOptions["FilterString"].defaultValue; // Lucene specific
94 query.sortfield = filterOptions["SortField"].defaultValue; // Lucene specific
95 query.fuzziness = filterOptions["Fuzziness"].defaultValue; // Lucene specific
96 query.maxnumeric = maxnumeric;
97 OptionValue_tarray::const_iterator options_here = request.filterOptions.begin();
98 OptionValue_tarray::const_iterator options_end = request.filterOptions.end();
99 while (options_here != options_end) {
100 if ((*options_here).name == "CombineQuery") {
101 // add this query
102
103 // "all", needed when combining queries where the document results are needed
104 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
105 query_params.push_back (query);
106
107 // start on next query
108 query.clear();
109 query.combinequery = (*options_here).value;
110
111 // set defaults for query parameters
112 query.collection = collection;
113 query.index = filterOptions["Index"].defaultValue;
114 query.subcollection = filterOptions["Subcollection"].defaultValue;
115 query.language = filterOptions["Language"].defaultValue;
116 query.querystring.clear();
117 query.search_type = (filterOptions["QueryType"].defaultValue == "ranked");
118 query.match_mode = (filterOptions["MatchMode"].defaultValue == "all");
119 query.casefolding = (filterOptions["Casefold"].defaultValue == "true");
120 query.stemming = (filterOptions["Stem"].defaultValue == "true");
121 query.accentfolding = (filterOptions["AccentFold"].defaultValue == "true");
122 query.level = filterOptions["Level"].defaultValue;
123 query.filterstring = filterOptions["FilterString"].defaultValue; // Lucene specific
124 query.sortfield = filterOptions["SortField"].defaultValue; // Lucene specific
125 query.fuzziness = filterOptions["Fuzziness"].defaultValue; // Lucene specific
126 query.maxnumeric = maxnumeric;
127 // "all", needed when combining queries where the document results are needed
128 if (need_matching_docs (request.filterResultOptions)) query.maxdocs = -1;
129 else query.maxdocs = filterOptions["Maxdocs"].defaultValue.getint();
130
131 } else if ((*options_here).name == "StartResults") {
132 startresults = (*options_here).value.getint();
133 } else if ((*options_here).name == "EndResults") {
134 endresults = (*options_here).value.getint();
135 } else if ((*options_here).name == "QueryType") {
136 query.search_type = ((*options_here).value == "ranked");
137 } else if ((*options_here).name == "MatchMode") {
138 query.match_mode = ((*options_here).value == "all");
139 if (query.match_mode == 1) query.maxdocs = -1;
140 } else if ((*options_here).name == "Term") {
141 query.querystring = (*options_here).value;
142 } else if ((*options_here).name == "Casefold") {
143 query.casefolding = ((*options_here).value == "true");
144 } else if ((*options_here).name == "Stem") {
145 query.stemming = ((*options_here).value == "true");
146 } else if ((*options_here).name == "AccentFold") {
147 query.accentfolding = ((*options_here).value == "true");
148 } else if ((*options_here).name == "Index"&& (*options_here).value !="") {
149 query.index = (*options_here).value;
150 } else if ((*options_here).name == "Subcollection") {
151 query.subcollection = (*options_here).value;
152 } else if ((*options_here).name == "Language") {
153 query.language = (*options_here).value;
154 } else if ((*options_here).name == "Maxdocs") {
155 query.maxdocs = (*options_here).value.getint();
156 } else if ((*options_here).name == "PhraseMatch") {
157 phrasematch = (*options_here).value;
158 } else if ((*options_here).name == "Level") {
159 query.level = (*options_here).value;
160 } else if ((*options_here).name == "FilterString") {
161 query.filterstring = (*options_here).value;
162 } else if ((*options_here).name == "SortField") {
163 query.sortfield = (*options_here).value;
164 } else if ((*options_here).name == "Fuzziness") {
165 query.fuzziness = (*options_here).value;
166 } else {
167 logout << text_t2ascii
168 << "warning: unknown queryfilter option \""
169 << (*options_here).name
170 << "\" ignored.\n\n";
171 }
172
173 ++options_here;
174 }
175
176 // Store the start and end results in the query too, as lucene now needs to
177 // pass them through to the Java
178 query.startresults = startresults;
179 query.endresults = endresults;
180
181 // add the last query
182 query_params.push_back (query);
183}
184
185
186
187
188queryfilterclass::queryfilterclass () {
189 gdbmptr = NULL;
190 textsearchptr = NULL;
191 maxnumeric = 4;
192
193 FilterOption_t filtopt;
194 filtopt.name = "CombineQuery";
195 filtopt.type = FilterOption_t::enumeratedt;
196 filtopt.repeatable = FilterOption_t::onePerQuery;
197 filtopt.defaultValue = "and";
198 filtopt.validValues.push_back("and");
199 filtopt.validValues.push_back("or");
200 filtopt.validValues.push_back("not");
201 filterOptions["CombineQuery"] = filtopt;
202
203 // -- onePerQuery StartResults integer
204 filtopt.clear();
205 filtopt.name = "StartResults";
206 filtopt.type = FilterOption_t::integert;
207 filtopt.repeatable = FilterOption_t::onePerQuery;
208 filtopt.defaultValue = "1";
209 filtopt.validValues.push_back("1");
210 filtopt.validValues.push_back("1000");
211 filterOptions["StartResults"] = filtopt;
212
213 // -- onePerQuery EndResults integer
214 filtopt.clear();
215 filtopt.name = "EndResults";
216 filtopt.type = FilterOption_t::integert;
217 filtopt.repeatable = FilterOption_t::onePerQuery;
218 filtopt.defaultValue = "10";
219 filtopt.validValues.push_back("-1");
220 filtopt.validValues.push_back("1000");
221 filterOptions["EndResults"] = filtopt;
222
223 // -- onePerQuery QueryType enumerated (boolean, ranked)
224 filtopt.clear();
225 filtopt.name = "QueryType";
226 filtopt.type = FilterOption_t::enumeratedt;
227 filtopt.repeatable = FilterOption_t::onePerQuery;
228 filtopt.defaultValue = "ranked";
229 filtopt.validValues.push_back("boolean");
230 filtopt.validValues.push_back("ranked");
231 filterOptions["QueryType"] = filtopt;
232
233 // -- onePerQuery MatchMode enumerated (some, all)
234 filtopt.clear();
235 filtopt.name = "MatchMode";
236 filtopt.type = FilterOption_t::enumeratedt;
237 filtopt.repeatable = FilterOption_t::onePerQuery;
238 filtopt.defaultValue = "some";
239 filtopt.validValues.push_back("some");
240 filtopt.validValues.push_back("all");
241 filterOptions["MatchMode"] = filtopt;
242
243 // -- onePerTerm Term string ???
244 filtopt.clear();
245 filtopt.name = "Term";
246 filtopt.type = FilterOption_t::stringt;
247 filtopt.repeatable = FilterOption_t::onePerTerm;
248 filtopt.defaultValue = "";
249 filterOptions["Term"] = filtopt;
250
251 // -- onePerTerm Casefold boolean
252 filtopt.clear();
253 filtopt.name = "Casefold";
254 filtopt.type = FilterOption_t::booleant;
255 filtopt.repeatable = FilterOption_t::onePerTerm;
256 filtopt.defaultValue = "true";
257 filtopt.validValues.push_back("false");
258 filtopt.validValues.push_back("true");
259 filterOptions["Casefold"] = filtopt;
260
261 // -- onePerTerm Stem boolean
262 filtopt.clear();
263 filtopt.name = "Stem";
264 filtopt.type = FilterOption_t::booleant;
265 filtopt.repeatable = FilterOption_t::onePerTerm;
266 filtopt.defaultValue = "false";
267 filtopt.validValues.push_back("false");
268 filtopt.validValues.push_back("true");
269 filterOptions["Stem"] = filtopt;
270
271 // -- onePerTerm AccentFold boolean
272 filtopt.clear();
273 filtopt.name = "AccentFold";
274 filtopt.type = FilterOption_t::booleant;
275 filtopt.repeatable = FilterOption_t::onePerTerm;
276 filtopt.defaultValue = "false";
277 filtopt.validValues.push_back("false");
278 filtopt.validValues.push_back("true");
279 filterOptions["AccentFold"] = filtopt;
280
281 // -- onePerTerm Index enumerated
282 filtopt.clear();
283 filtopt.name = "Index";
284 filtopt.type = FilterOption_t::enumeratedt;
285 filtopt.repeatable = FilterOption_t::onePerTerm;
286 filtopt.defaultValue = "";
287 filterOptions["Index"] = filtopt;
288
289 // -- onePerTerm Subcollection enumerated
290 filtopt.clear();
291 filtopt.name = "Subcollection";
292 filtopt.type = FilterOption_t::enumeratedt;
293 filtopt.repeatable = FilterOption_t::onePerTerm;
294 filtopt.defaultValue = "";
295 filterOptions["Subcollection"] = filtopt;
296
297 // -- onePerTerm Language enumerated
298 filtopt.clear();
299 filtopt.name = "Language";
300 filtopt.type = FilterOption_t::enumeratedt;
301 filtopt.repeatable = FilterOption_t::onePerTerm;
302 filtopt.defaultValue = "";
303 filterOptions["Language"] = filtopt;
304
305 // -- onePerQuery Maxdocs integer
306 filtopt.clear();
307 filtopt.name = "Maxdocs";
308 filtopt.type = FilterOption_t::integert;
309 filtopt.repeatable = FilterOption_t::onePerQuery;
310 filtopt.defaultValue = "200";
311 filtopt.validValues.push_back("-1");
312 filtopt.validValues.push_back("1000");
313 filterOptions["Maxdocs"] = filtopt;
314
315 // -- onePerQuery PhraseMatch enumerated
316 filtopt.clear();
317 filtopt.name = "PhraseMatch";
318 filtopt.type = FilterOption_t::enumeratedt;
319 filtopt.repeatable = FilterOption_t::onePerQuery;
320 filtopt.defaultValue = "some_phrases";
321 filtopt.validValues.push_back ("all_phrases");
322 filtopt.validValues.push_back ("some_phrases");
323 filtopt.validValues.push_back ("all_docs");
324 filterOptions["PhraseMatch"] = filtopt;
325}
326
327queryfilterclass::~queryfilterclass () {
328 // don't delete gdbmptr or mgsearchptr here, they'll
329 // be cleaned up by mggdbmsource
330}
331
332void queryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
333 filterclass::configure (key, cfgline);
334
335 if (key == "indexmap") {
336 indexmap.importmap (cfgline);
337
338 // update the list of indexes in the filter information
339 text_tarray options;
340 indexmap.gettoarray (options);
341 filterOptions["Index"].validValues = options;
342
343 } else if (key == "defaultindex") {
344 indexmap.from2to (cfgline[0], filterOptions["Index"].defaultValue);
345
346 } else if (key == "subcollectionmap") {
347 subcollectionmap.importmap (cfgline);
348
349 // update the list of subcollections in the filter information
350 text_tarray options;
351 subcollectionmap.gettoarray (options);
352 filterOptions["Subcollection"].validValues = options;
353
354 } else if (key == "defaultsubcollection") {
355 subcollectionmap.from2to (cfgline[0], filterOptions["Subcollection"].defaultValue);
356
357 } else if (key == "languagemap") {
358 languagemap.importmap (cfgline);
359
360 // update the list of languages in the filter information
361 text_tarray options;
362 languagemap.gettoarray (options);
363 filterOptions["Language"].validValues = options;
364
365 } else if (key == "defaultlanguage") {
366 languagemap.from2to (cfgline[0], filterOptions["Language"].defaultValue);
367 } else if (key == "indexstem") {
368 indexstem = cfgline[0];
369 } else if (key == "maxnumeric") {
370 maxnumeric = cfgline[0].getint();
371 }
372
373}
374
375bool queryfilterclass::init (ostream &logout) {
376 outconvertclass text_t2ascii;
377
378 if (!filterclass::init(logout)) return false;
379
380 if (filterOptions["Index"].defaultValue.empty()) {
381 // use first index in map as default if no default is set explicitly
382 text_tarray fromarray;
383 indexmap.getfromarray(fromarray);
384 if (fromarray.size()) {
385 filterOptions["Index"].defaultValue = fromarray[0];
386 }
387 }
388
389 if (filterOptions["Subcollection"].defaultValue.empty()) {
390 // use first subcollection in map as default if no default is set explicitly
391 text_tarray fromarray;
392 subcollectionmap.getfromarray(fromarray);
393 if (fromarray.size()) {
394 filterOptions["Subcollection"].defaultValue = fromarray[0];
395 }
396 }
397
398 if (filterOptions["Language"].defaultValue.empty()) {
399 // use first language in map as default if no default is set explicitly
400 text_tarray fromarray;
401 languagemap.getfromarray(fromarray);
402 if (fromarray.size()) {
403 filterOptions["Language"].defaultValue = fromarray[0];
404 }
405 }
406
407 // get the filename for the database and make sure it exists
408 if (indexstem.empty()) {
409 indexstem = collection;
410 }
411 gdbm_filename = filename_cat(gdbmhome, "collect", collection, "index", "text", indexstem);
412
413 if (littleEndian()) gdbm_filename += ".ldb";
414 else gdbm_filename += ".bdb";
415
416 if (!file_exists(gdbm_filename)) {
417 logout << text_t2ascii
418 << "warning: gdbm database \"" //****
419 << gdbm_filename << "\" does not exist\n\n";
420 //return false; //****
421 }
422
423 return true;
424}
425
Note: See TracBrowser for help on using the repository browser.