source: trunk/gsdl/src/recpt/querytools.cpp@ 8357

Last change on this file since 8357 was 8357, checked in by kjdon, 20 years ago

Added a partial fix to plain searching with mgpp: when searching in a field, it used to put [ ]:TI around the whole query string. Now it only does this for a simple 'and' search; otherwise it tags words individually, like [q1]:TI [q2]:TI.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// request.filterResultOptions and request.fields (if required) should
31// be set from the calling code
32void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring,
33 cgiargsclass &args) {
34
35 request.filterName = "QueryFilter";
36
37 OptionValue_t option;
38
39 option.name = "Term";
40 option.value = querystring;
41 request.filterOptions.push_back (option);
42
43 option.name = "QueryType";
44 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
45 request.filterOptions.push_back (option);
46
47 option.name = "MatchMode";
48 option.value = (args.getintarg("t")) ? "some" : "all";
49 request.filterOptions.push_back (option);
50
51 option.name = "Casefold";
52 option.value = (args.getintarg("k")) ? "true" : "false";
53 request.filterOptions.push_back (option);
54
55 option.name = "Stem";
56 option.value = (args.getintarg("s")) ? "true" : "false";
57 request.filterOptions.push_back (option);
58
59 if (!args["h"].empty()) {
60 option.name = "Index";
61 option.value = args["h"];
62 request.filterOptions.push_back (option);
63 }
64
65 if (!args["j"].empty()) {
66 option.name = "Subcollection";
67 option.value = args["j"];
68 request.filterOptions.push_back (option);
69 }
70
71 if (!args["n"].empty()) {
72 option.name = "Language";
73 option.value = args["n"];
74 request.filterOptions.push_back (option);
75 }
76
77 if (!args["g"].empty()) { // granularity for mgpp
78 option.name = "Level";
79 option.value = args["g"];
80 request.filterOptions.push_back (option);
81 }
82
83 set_more_queryfilter_options (request, args);
84}
85
86void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring1,
87 const text_t &querystring2, cgiargsclass &args) {
88
89 set_queryfilter_options (request, querystring1, args);
90
91 // fill in the second query if needed
92 if (!args["cq2"].empty()) {
93 OptionValue_t option;
94
95 option.name = "CombineQuery";
96 option.value = args["cq2"];
97 request.filterOptions.push_back (option);
98
99 option.name = "Term";
100 option.value = querystring2;
101 request.filterOptions.push_back (option);
102
103 option.name = "QueryType";
104 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
105 request.filterOptions.push_back (option);
106
107 option.name = "Casefold";
108 option.value = (args.getintarg("k")) ? "true" : "false";
109 request.filterOptions.push_back (option);
110
111 option.name = "Stem";
112 option.value = (args.getintarg("s")) ? "true" : "false";
113 request.filterOptions.push_back (option);
114
115 if (!args["h2"].empty()) {
116 option.name = "Index";
117 option.value = args["h2"];
118 request.filterOptions.push_back (option);
119 }
120
121 if (!args["j2"].empty()) {
122 option.name = "Subcollection";
123 option.value = args["j2"];
124 request.filterOptions.push_back (option);
125 }
126
127 if (!args["n2"].empty()) {
128 option.name = "Language";
129 option.value = args["n2"];
130 request.filterOptions.push_back (option);
131 }
132 }
133 set_more_queryfilter_options (request, args);
134}
135
136void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args) {
137
138 OptionValue_t option;
139 int arg_m = args.getintarg("m");
140
141 option.name = "Maxdocs";
142 option.value = arg_m;
143 request.filterOptions.push_back (option);
144
145 // option.name = "StartResults";
146 // option.value = args["r"];
147 // request.filterOptions.push_back (option);
148
149 // option.name = "EndResults";
150 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
151 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
152 // option.value = endresults;
153 // request.filterOptions.push_back (option);
154}
155
156void format_querystring (text_t &querystring, int querymode, bool segment) {
157 text_t formattedstring;
158
159 if (querymode == 1 && !segment) return;
160
161 text_t::const_iterator here = querystring.begin();
162 text_t::const_iterator end = querystring.end();
163
164 // space is used to insert spaces between Chinese
165 // characters. No space is needed before the first
166 // Chinese character.
167 bool space = false;
168
169 // want to remove ()|!& from querystring so boolean queries are just
170 // "all the words" queries (unless querymode is advanced)
171 while (here != end) {
172 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
173 *here == '!' || *here == '&')) {
174 formattedstring.push_back(' ');
175 } else if (segment) {
176 if ((*here >= 0x4e00 && *here <= 0x9fa5) ||
177 (*here >= 0xf900 && *here <= 0xfa2d)) {
178 // Chinese character
179 if (space) formattedstring.push_back (0x200b);
180 formattedstring.push_back (*here);
181 formattedstring.push_back (0x200b);
182 space = true;
183 } else {
184 // non-Chinese character
185 formattedstring.push_back (*here);
186 space = false;
187 }
188
189 } else {
190 formattedstring.push_back (*here);
191 }
192 here ++;
193 }
194 querystring = formattedstring;
195}
196
197
198
199void add_dates(text_t &querystring, int startdate, int enddate,
200 int startbc, int endbc, int ct)
201{
202 if(startdate)
203 {
204 int querystringis = 0;
205 text_t::const_iterator here = querystring.begin();
206 text_t::const_iterator end = querystring.end();
207 while(here!=end)
208 {
209 if(!(isspace((*here)))){
210 here = end;
211 querystringis = 1;
212 }
213 else
214 here++;
215 }
216 //converting BCE dates
217 if(startbc && startdate > 0)
218 {
219 startdate *= -1;
220 }
221 if(endbc && enddate > 0)
222 {
223 enddate *= -1;
224 }
225 if(enddate != 0 && enddate<startdate)
226 {
227 cout<<"enddate too small"<<endl;
228 return;
229 }
230 if(querystringis)
231 querystring.appendcstr(" AND");
232 if(!enddate)
233 {
234 if (ct==1) {
235 mgpp_adddateelem(querystring,startdate);
236 }
237 else { // lucene
238 lucene_adddateelem(querystring,startdate);
239 }
240 }
241 else{
242 int nextdate = startdate;
243 querystring.appendcstr(" (");
244 while(nextdate<=enddate)
245 {
246 if(nextdate!=0) {
247 if (ct==1) {
248 mgpp_adddateelem(querystring,nextdate);
249 }
250 else { // lucene
251 lucene_adddateelem(querystring,nextdate);
252 }
253 }
254 nextdate++;
255 }
256 querystring.appendcstr(" )");
257 }
258 }
259
260}
261
262void get_phrases (const text_t &querystring, text_tarray &phrases) {
263
264 phrases.erase (phrases.begin(), phrases.end());
265 if (!querystring.empty()) {
266
267 text_t::const_iterator end = querystring.end();
268 text_t::const_iterator here = findchar (querystring.begin(), end, '"');
269 if (here != end) {
270 text_t tmptext;
271 bool foundquote = false;
272 while (here != end) {
273 if (*here == '"') {
274 if (foundquote) {
275 if (!tmptext.empty()) {
276 phrases.push_back(tmptext);
277 tmptext.clear();
278 }
279 foundquote = false;
280 } else foundquote = true;
281 } else {
282 if (foundquote) tmptext.push_back (*here);
283 }
284 here ++;
285 }
286 }
287 }
288}
289
290// search history tool
291// also used for form query macros
292text_t escape_quotes(const text_t &querystring) {
293
294 text_t::const_iterator here = querystring.begin();
295 text_t::const_iterator end = querystring.end();
296
297 text_t escquery = "";
298 while (here != end) {
299 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
300 else if (*here == '\n' || *here == '\r') {
301 escquery.push_back(' ');
302 } else {
303 escquery +="\\\\";
304 escquery.push_back(*here);
305 }
306
307 here++;
308 }
309 return escquery;
310
311}
312
313// some query form parsing functions for use with mgpp
314
315void parse_reg_query_form(text_t &querystring, cgiargsclass &args)
316{
317 querystring.clear();
318
319 const int ct = args.getintarg("ct");
320 int argt = args.getintarg("t");// t=0 -and, t=1 - or
321
322 text_t combine;
323 if (ct==1) {
324 if (argt == 0) combine = "&";
325 else combine = "|";
326 }
327 else { // lucene
328 if (argt == 0) combine = "AND";
329 else combine = "OR";
330 }
331
332 text_t field = args["fqf"];
333 if (field.empty()) return; // no query
334 text_tarray fields;
335 splitchar(field.begin(), field.end(), ',', fields);
336
337 text_t value = args["fqv"];
338 if (value.empty()) return; // somethings wrong
339 text_tarray values;
340 splitchar(value.begin(), value.end(), ',', values);
341
342
343 for (int i=0; i< values.size(); i++) {
344 if (!values[i].empty()) {
345 if (ct == 1) {
346 mgpp_addqueryelem(querystring, fields[i], values[i], combine);
347 }
348 else { // lucene
349 lucene_addqueryelem(querystring, fields[i], values[i], combine);
350 }
351 }
352 }
353
354}
355
356
357void parse_adv_query_form(text_t &querystring, cgiargsclass &args){
358
359 querystring.clear();
360
361 const int ct = args.getintarg("ct");
362 text_t combine;
363 if (ct==1) {
364 combine = "&";
365 }
366 else { // lucene
367 combine = "AND";
368 }
369
370 text_t field = args["fqf"];
371 if (field.empty()) return; // no query
372 text_tarray fields;
373 splitchar(field.begin(), field.end(), ',', fields);
374
375 text_t value = args["fqv"];
376 if (value.empty()) return; // somethings wrong
377 text_tarray values;
378 splitchar(value.begin(), value.end(), ',', values);
379
380 text_t stem = args["fqs"];
381 if (stem.empty()) return; // somethings wrong
382 text_tarray stems;
383 splitchar(stem.begin(), stem.end(), ',', stems);
384
385 text_t fold = args["fqk"];
386 if (fold.empty()) return; // somethings wrong
387 text_tarray folds;
388 splitchar(fold.begin(), fold.end(), ',', folds);
389
390 text_t comb = args["fqc"];
391 if (comb.empty()) return; //somethings wrong
392 text_tarray combs;
393 splitchar(comb.begin(), comb.end(), ',', combs);
394
395 for(int i=0; i< values.size(); i++) {
396 if (!values[i].empty()) {
397 if (i!=0) {
398 if (ct==1) {
399 if (combs[i-1]=="and") combine = "&";
400 else if (combs[i-1]=="or")combine = "|";
401 else if (combs[i-1]=="not")combine = "!";
402 }
403 else { // lucene
404 if (combs[i-1]=="and") combine = "AND";
405 else if (combs[i-1]=="or")combine = "OR";
406 else if (combs[i-1]=="not")combine = "NOT";
407 }
408 }
409 text_t term = addstemcase(values[i], stems[i], folds[i]);
410 mgpp_addqueryelem(querystring, fields[i], term, combine);
411 }
412
413 }
414}
415
416text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold) {
417
418 text_t outtext;
419 text_t word;
420 //unsigned short c;
421 text_t::const_iterator here = terms.begin();
422 text_t::const_iterator end = terms.end();
423
424 while (here !=end) {
425
426 if (is_unicode_letdig(*here)) {
427 // not word boundary
428 word.push_back(*here);
429 here++;
430 }
431 else {
432 // found word boundary
433 if (!word.empty() ) {
434 if (stem == "1" || fold =="1") {
435 word += "#";
436 if (stem == "1") word += "s";
437 //else word += "u";
438
439 if (fold == "1") word += "i";
440 //else word += "c";
441 }
442
443 word += " ";
444 outtext += word;
445 word.clear();
446 }
447 if (*here == '\"') {
448 outtext.push_back(*here);
449 }
450 here++;
451 }
452 }
453
454 // get last word
455 if (!word.empty()) {
456 if (stem == "1"|| fold == "1") {
457 word += "#";
458 if (stem == "1") word += "s";
459 //else word += "u";
460
461 if (fold == "1") word += "i";
462 //else word += "c";
463 }
464 word += " ";
465 outtext += word;
466 }
467 return outtext;
468}
469
470
471void mgpp_adddateelem(text_t& querystring, const int date)
472{
473 querystring.appendcstr(" [");
474 if(date<0) {
475 querystring.appendcstr("bc");
476 querystring.appendint((date*-1));
477 }
478 else {
479 querystring.appendint(date);
480 }
481 querystring.appendcstr("]:CV");
482}
483
484void lucene_adddateelem(text_t& querystring, const int date)
485{
486 querystring.appendcstr(" CV:(");
487 if(date<0) {
488 querystring.appendcstr("bc");
489 querystring.appendint((date*-1));
490 }
491 else {
492 querystring.appendint(date);
493 }
494 querystring.appendcstr(")");
495}
496
497
498void mgpp_addqueryelem(text_t &querystring, text_t &tag,
499 text_t &query, text_t &combine) {
500 if (!querystring.empty()) { // have to put and/or
501 querystring += " " + combine + " ";
502
503 }
504 if (tag=="ZZ" || tag=="") { // just add onto querystring
505 querystring += query;
506 }
507 else {
508 querystring += "["+query+"]:"+tag;
509 }
510
511}
512
513void lucene_addqueryelem(text_t &querystring, text_t &tag,
514 text_t &query, text_t &combine) {
515 if (!querystring.empty()) { // have to put and/or
516 querystring += " " + combine + " ";
517
518 }
519 if (tag=="ZZ" || tag=="") { // just add onto querystring
520 querystring += query;
521 }
522 else {
523 querystring += tag+":("+query+")";
524 }
525}
526
527
528void addqueryelem_ex(text_t &querystring, const text_t &tag,
529 const text_t &terms, const text_t &stem, const text_t &fold,
530 const text_t& combine, const text_t& word_combine) {
531 if (!querystring.empty()) { // have to put and/or
532 querystring += " " + combine + " ";
533 }
534 text_t outtext; outtext.reserve(512);
535 text_t word; word.reserve(100);
536 //unsigned short c;
537 text_t::const_iterator here = terms.begin();
538 text_t::const_iterator end = terms.end();
539 bool inquote = false, firstword = true;
540
541 text_t word2; word2.reserve(256);
542
543 while (here !=end) {
544 if (is_unicode_space(*here)) {
545 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
546 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
547 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
548 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
549 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
550 if (inquote) {
551 word2.push_back(*here);
552 }
553 word.append(word2); word2.clear();
554
555 if (!inquote && !word.empty() ) {
556 // found word boundary
557
558 if (stem == "1" || fold =="1") {
559 word += "#";
560 if (stem == "1") word += "s";
561 //else word += "u";
562
563 if (fold == "1") word += "i";
564 //else word += "c";
565 }
566 if (firstword) {
567 firstword = false;
568 } else {
569 outtext += " " + word_combine + " ";
570 }
571 outtext += "[" + word + "]:"+tag;
572 word.clear();
573 }
574 ++here;
575 } else if (*here == '\"') {
576 word2.push_back(*here);
577 inquote = !inquote;
578 ++here;
579 } else {
580 // not word boundary
581 word2.push_back(*here);
582 ++here;
583 }
584 }
585
586 // get last word
587 if (!word2.empty()) {
588 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
589 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
590 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
591 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
592 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
593 word.append(word2); word2.clear();
594
595 if (stem == "1"|| fold == "1") {
596 word += "#";
597 if (stem == "1") word += "s";
598 //else word += "u";
599
600 if (fold == "1") word += "i";
601 //else word += "c";
602 }
603 if (!outtext.empty()) outtext += " " + word_combine + " ";
604 outtext += "[" + word + "]:"+tag;
605 }
606 querystring += "(" + outtext + ")";
607}
608
609
610void add_field_info(text_t &querystring, const text_t &tag, int type) {
611
612 if (type == 1) { //mgpp
613 querystring = "["+querystring+"]:"+tag;
614 } else if (type == 2) { // lucene
615 querystring = tag+":("+querystring+")";
616 }
617
618}
619
620
621void format_field_info(text_t &querystring, cgiargsclass &args) {
622
623 text_t tag = args["fqf"];
624 if (tag == "ZZ" || tag == "") {
625 return; // do nothing
626 }
627
628 int argct = args.getintarg("ct");
629 int argt = args.getintarg("t");// t=0 -and, t=1 - or
630 int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
631
632 if (argb==0 && argt==0) {
633 // simple 'and' search - just put tag info round whole query string
634 add_field_info(querystring, tag, argct);
635 return;
636 }
637
638 // we need to individually tag words
639 text_t outtext;
640 text_t word;
641 //unsigned short c;
642 text_t::const_iterator here = querystring.begin();
643 text_t::const_iterator end = querystring.end();
644
645 while (here !=end) {
646
647 if (is_unicode_letdig(*here)|| *here == '#' || *here == '/' ) {
648 // include term modifiers in a word just in case
649 // not word boundary
650 word.push_back(*here);
651 here++;
652 }
653 else {
654 // found word boundary
655 if (!word.empty() ) {
656 add_field_info(word, tag, argct);
657 outtext += word;
658 word.clear();
659 }
660 // everything else, we add into the query string
661 outtext.push_back(*here);
662 here++;
663 }
664 }
665
666 // get last word
667 if (!word.empty()) {
668 add_field_info(word, tag, argct);
669 outtext += word;
670 }
671
672 querystring = outtext;
673}
674
Note: See TracBrowser for help on using the repository browser.