source: trunk/gsdl/src/colservr/queryinfo.cpp@ 12770

Last change on this file since 12770 was 12770, checked in by mdewsnip, 18 years ago

Changed the Lucene "-fuzzy" argument to "-fuzziness <value>", for more accurate control.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1/**********************************************************************
2 *
3 * queryinfo.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "queryinfo.h"
27
28
29// query parameters
30
31queryparamclass::queryparamclass () {
32 clear ();
33}
34
35void queryparamclass::clear () {
36 combinequery.clear();
37 collection.clear();
38 index.clear();
39 subcollection.clear();
40 language.clear();
41 level.clear();
42 querystring.clear();
43 search_type = 0; // 0 = boolean, 1 = ranked
44 match_mode = 0; // 0 = some, 1 = all
45 casefolding = 0;
46 stemming = 0;
47 maxdocs = -1; // all
48 maxnumeric = 4; // must default to the same value as mg_passes
49 filterstring.clear();
50 sortfield.clear();
51 fuzziness.clear();
52 startresults = 1; // all
53 endresults = 10; // all
54}
55
56
57queryparamclass &queryparamclass::operator=(const queryparamclass &q) {
58 combinequery = q.combinequery;
59 collection = q.collection;
60 index = q.index;
61 subcollection = q.subcollection;
62 language = q.language;
63 level = q.level;
64 querystring = q.querystring;
65 search_type = q.search_type;
66 match_mode = q.match_mode;
67 casefolding = q.casefolding;
68 stemming = q.stemming;
69 maxdocs = q.maxdocs;
70 maxnumeric = q.maxnumeric;
71 filterstring = q.filterstring;
72 sortfield = q.sortfield;
73 fuzziness = q.fuzziness;
74 startresults = q.startresults;
75 endresults = q.endresults;
76 return *this;
77}
78
79
80bool operator==(const queryparamclass &x, const queryparamclass &y) {
81 return ((x.combinequery == y.combinequery) &&
82 (x.collection == y.collection) &&
83 (x.index == y.index) &&
84 (x.subcollection == y.subcollection) &&
85 (x.language == y.language) &&
86 (x.level == y.level) &&
87 (x.querystring == y.querystring) &&
88 (x.search_type == y.search_type) &&
89 (x.match_mode == y.match_mode) &&
90 (x.casefolding == y.casefolding) &&
91 (x.stemming == y.stemming) &&
92 (x.maxdocs == y.maxdocs) &&
93 (x.maxnumeric == y.maxnumeric) &&
94 (x.filterstring == y.filterstring) &&
95 (x.sortfield == y.sortfield) &&
96 (x.fuzziness == y.fuzziness) &&
97 (x.startresults == y.startresults) &&
98 (x.startresults == y.startresults));
99}
100
101bool operator!=(const queryparamclass &x, const queryparamclass &y) {
102 return !(x == y);
103}
104
105
106ostream &operator<< (ostream &outs, queryparamclass &q) {
107 outconvertclass text_t2ascii;
108
109 outs << "*** queryparamclass\n";
110 outs << text_t2ascii << " combinequery = \"" << q.combinequery << "\"\n";
111 outs << text_t2ascii << " collection = \"" << q.collection << "\"\n";
112 outs << text_t2ascii << " index = \"" << q.index << "\"\n";
113 outs << text_t2ascii << " level = \"" << q.level << "\"\n";
114 outs << text_t2ascii << " subcollection = \"" << q.subcollection << "\"\n";
115 outs << text_t2ascii << " language = \"" << q.language << "\"\n";
116 outs << text_t2ascii << " querystring = \"" << q.querystring << "\"\n";
117 outs << " search_type = \"" << q.search_type << "\"\n";
118 outs << " match_mode = \"" << q.match_mode << "\"\n";
119 outs << " casefolding = \"" << q.casefolding << "\"\n";
120 outs << " stemming = \"" << q.stemming << "\"\n";
121 outs << " maxdocs = \"" << q.maxdocs << "\"\n";
122 outs << " maxnumeric = \"" << q.maxnumeric << "\"\n";
123 outs << " filterstring = \"" << q.filterstring << "\"\n";
124 outs << " sortfield = \"" << q.sortfield << "\"\n";
125 outs << " fuzziness = \"" << q.fuzziness << "\"\n";
126 outs << " startresults = \"" << q.startresults << "\"\n";
127 outs << " endresults = \"" << q.endresults << "\"\n";
128 outs << "\n";
129
130 return outs;
131}
132
133
134
135
136// term frequencies
137
138termfreqclass::termfreqclass () {
139 clear();
140}
141
142void termfreqclass::clear() {
143 termstr.clear();
144 termstemstr.clear();
145 utf8equivterms.erase(utf8equivterms.begin(), utf8equivterms.end());
146 termfreq = 0;
147}
148
149termfreqclass &termfreqclass::operator=(const termfreqclass &t) {
150 termstr = t.termstr;
151 termstemstr = t.termstemstr;
152 utf8equivterms = t.utf8equivterms;
153 termfreq = t.termfreq;
154
155 return *this;
156}
157
158bool operator==(const termfreqclass &x, const termfreqclass &y) {
159 return ((x.termstr == y.termstr) &&
160 (x.termstemstr == y.termstemstr) &&
161 (x.termfreq == y.termfreq));
162}
163
164bool operator!=(const termfreqclass &x, const termfreqclass &y) {
165 return !(x == y);
166}
167
// ordered by termfreq, then by termstemstr, and then by termstr
169bool operator<(const termfreqclass &x, const termfreqclass &y) {
170 return ((x.termfreq < y.termfreq) ||
171 ((x.termfreq == y.termfreq) && (x.termstemstr < y.termstemstr)) ||
172 ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr < y.termstr)));
173}
174
175bool operator>(const termfreqclass &x, const termfreqclass &y) {
176 return ((x.termfreq > y.termfreq) ||
177 ((x.termfreq == y.termfreq) && (x.termstemstr > y.termstemstr)) ||
178 ((x.termfreq == y.termfreq) && (x.termstemstr == y.termstemstr) && (x.termstr > y.termstr)));
179}
180
181// stream output for debugging purposes
182ostream &operator<< (ostream &outs, termfreqclass &t) {
183 outconvertclass text_t2ascii;
184
185 outs << text_t2ascii << " t:\"" << t.termstr << "\"";
186 outs << text_t2ascii << " s:\"" << t.termstemstr << "\"";
187 outs << " f:" << t.termfreq << "\n";
188
189 return outs;
190}
191
192
193
194// one query result
195
196docresultclass::docresultclass() {
197 clear ();
198}
199
200void docresultclass::clear () {
201 docnum=-1;
202 docweight=0.0;
203 num_query_terms_matched=0;
204 num_phrase_match=0;
205}
206
207// merges two result classes relating to a single docnum
208docresultclass &docresultclass::combine(const docresultclass &d) {
209 docweight += d.docweight; // budget!
210 num_query_terms_matched += d.num_query_terms_matched;
211 num_phrase_match += d.num_phrase_match;
212
213 return *this;
214}
215
216docresultclass &docresultclass::operator=(const docresultclass &d) {
217 docnum = d.docnum;
218 docweight = d.docweight;
219 num_query_terms_matched = d.num_query_terms_matched;
220 num_phrase_match = d.num_phrase_match;
221
222 return *this;
223}
224
225
226bool operator==(const docresultclass &x, const docresultclass &y) {
227 return ((x.docnum == y.docnum) && (x.docweight == y.docweight) &&
228 (x.num_query_terms_matched == y.num_query_terms_matched) &&
229 (x.num_phrase_match == y.num_phrase_match));
230}
231
232bool operator<(const docresultclass &x, const docresultclass &y) {
233 return ((x.docnum < y.docnum) ||
234 ((x.docnum == y.docnum) &&
235 ((x.docweight < y.docweight) ||
236 ((x.docweight == y.docweight) &&
237 ((x.num_query_terms_matched < y.num_query_terms_matched) ||
238 ((x.num_query_terms_matched == y.num_query_terms_matched) &&
239 ((x.num_phrase_match < y.num_phrase_match))))))));
240}
241
242
243// stream output for debugging purposes
244ostream &operator<< (ostream &outs, docresultclass &a) {
245 outs << " d:" << a.docnum << " w:" << a.docweight << "\n";
246 return outs;
247}
248
249
250
251// many document results
252
253docresultsclass::docresultsclass () {
254 clear ();
255}
256
257void docresultsclass::clear () {
258 docset.erase(docset.begin(), docset.end());
259 docorder.erase(docorder.begin(), docorder.end());
260}
261
262void docresultsclass::docnum_order() {
263 docorder.erase(docorder.begin(), docorder.end());
264
265 docresultmap::iterator here = docset.begin();
266 docresultmap::iterator end = docset.end();
267 while (here != end) {
268 docorder.push_back ((*here).first);
269 ++here;
270 }
271}
272
273void docresultsclass::combine_and (const docresultsclass &d) {
274 docorder.erase(docorder.begin(), docorder.end());
275
276 // put the resulting set in tempresults
277 docresultmap tempresults;
278
279 docresultmap::const_iterator d_here = d.docset.begin();
280 docresultmap::const_iterator d_end = d.docset.end();
281 docresultmap::iterator found = docset.end();
282 while (d_here != d_end) {
283 found = docset.find((*d_here).first);
284 if (found != docset.end()) {
285 (*found).second.combine ((*d_here).second);
286 tempresults[(*found).first] = (*found).second;
287 }
288 ++d_here;
289 }
290
291 // then copy it back to docset
292 docset = tempresults;
293}
294
295void docresultsclass::combine_or (const docresultsclass &d) {
296 docorder.erase(docorder.begin(), docorder.end());
297
298 docresultmap::const_iterator d_here = d.docset.begin();
299 docresultmap::const_iterator d_end = d.docset.end();
300 docresultmap::iterator found = docset.end();
301 while (d_here != d_end) {
302 found = docset.find((*d_here).first);
303 if (found != docset.end()) {
304 (*found).second.combine ((*d_here).second);
305 } else {
306 docset[(*d_here).first] = (*d_here).second;
307 }
308 ++d_here;
309 }
310}
311
312void docresultsclass::combine_not (const docresultsclass &d) {
313 docorder.erase(docorder.begin(), docorder.end());
314
315 docresultmap::const_iterator d_here = d.docset.begin();
316 docresultmap::const_iterator d_end = d.docset.end();
317 docresultmap::iterator found = docset.end();
318 while (d_here != d_end) {
319 found = docset.find((*d_here).first);
320 if (found != docset.end()) docset.erase (found);
321 ++d_here;
322 }
323}
324
325docresultsclass &docresultsclass::operator=(const docresultsclass &d) {
326 docset = d.docset;
327 docorder = d.docorder;
328
329 return *this;
330}
331
332
333
334
335// query results
336
337void queryresultsclass::clear () {
338 error_message = g_EmptyText;
339 docs_matched = 0;
340 is_approx = Exact;
341 syntax_error = false;
342 postprocessed = false;
343
344 docs.clear();
345 orgterms.erase(orgterms.begin(),orgterms.end());
346 terms.erase(terms.begin(),terms.end());
347}
348
349queryresultsclass &queryresultsclass::operator=(const queryresultsclass &q) {
350 error_message = q.error_message;
351 docs_matched = q.docs_matched;
352 is_approx = q.is_approx;
353 syntax_error = q.syntax_error;
354 postprocessed = q.postprocessed;
355
356 docs = q.docs;
357 terms = q.terms;
358 termvariants = q.termvariants;
359
360 return *this;
361}
362
363void queryresultsclass::sortuniqqueryterms() {
364 termfreqclassarray tempterms = orgterms;
365 text_tset seenterms;
366 terms.erase(terms.begin(), terms.end());
367
368 // sort the terms to get the frequencies in ascending order
369 sort (tempterms.begin(), tempterms.end());
370
371 // insert first occurance of each term (maximum)
372 termfreqclassarray::reverse_iterator here = tempterms.rbegin();
373 termfreqclassarray::reverse_iterator end = tempterms.rend();
374 while (here != end) {
375 if (seenterms.find((*here).termstr) == seenterms.end()) {
376 // the termstemstr and utf8equivterms might be different for
377 // different occurances of the term
378 (*here).termstemstr.clear();
379 (*here).utf8equivterms.erase((*here).utf8equivterms.begin(),
380 (*here).utf8equivterms.end());
381 terms.push_back(*here);
382 seenterms.insert((*here).termstr);
383 }
384 ++here;
385 }
386
387 // now re-sort in ascending order
388 sort (terms.begin(), terms.end());
389}
390
391
392// stream output for debugging purposes
393ostream &operator<< (ostream &outs, queryresultsclass &q) {
394 outs << "*** queryresultsclass\n";
395 outs << "docs\n";
396
397 docresultmap::iterator docshere = q.docs.docset.begin();
398 docresultmap::iterator docsend = q.docs.docset.end();
399 while (docshere != docsend) {
400 outs << (*docshere).second;
401 ++docshere;
402 }
403
404 outs << "orgterms\n";
405 termfreqclassarray::iterator orgtermshere = q.orgterms.begin();
406 termfreqclassarray::iterator orgtermsend = q.orgterms.end();
407 while (orgtermshere != orgtermsend) {
408 outs << (*orgtermshere);
409 ++orgtermshere;
410 }
411
412 outs << "terms\n";
413 termfreqclassarray::iterator termshere = q.terms.begin();
414 termfreqclassarray::iterator termsend = q.terms.end();
415 while (termshere != termsend) {
416 outs << (*termshere);
417 ++termshere;
418 }
419
420 outs << "\n";
421
422 return outs;
423}
Note: See TracBrowser for help on using the repository browser.