source: trunk/gsdl/src/recpt/querytools.cpp@ 10733

Last change on this file since 10733 was 10411, checked in by mdewsnip, 19 years ago

Added some code to fix bugs in Lucene "all" and phrase searching with a plain search form.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// request.filterResultOptions and request.fields (if required) should
31// be set from the calling code
32void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring,
33 cgiargsclass &args) {
34
35 request.filterName = "QueryFilter";
36
37 OptionValue_t option;
38
39 option.name = "Term";
40 option.value = querystring;
41 request.filterOptions.push_back (option);
42
43 option.name = "QueryType";
44 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
45 request.filterOptions.push_back (option);
46
47 option.name = "MatchMode";
48 option.value = (args.getintarg("t")) ? "some" : "all";
49 request.filterOptions.push_back (option);
50
51 option.name = "Casefold";
52 option.value = (args.getintarg("k")) ? "true" : "false";
53 request.filterOptions.push_back (option);
54
55 option.name = "Stem";
56 option.value = (args.getintarg("s")) ? "true" : "false";
57 request.filterOptions.push_back (option);
58
59 if (!args["h"].empty()) {
60 option.name = "Index";
61 option.value = args["h"];
62 request.filterOptions.push_back (option);
63 }
64
65 if (!args["j"].empty()) {
66 option.name = "Subcollection";
67 option.value = args["j"];
68 request.filterOptions.push_back (option);
69 }
70
71 if (!args["n"].empty()) {
72 option.name = "Language";
73 option.value = args["n"];
74 request.filterOptions.push_back (option);
75 }
76
77 if (!args["g"].empty()) { // granularity for mgpp
78 option.name = "Level";
79 option.value = args["g"];
80 request.filterOptions.push_back (option);
81 }
82
83 set_more_queryfilter_options (request, args);
84}
85
86void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring1,
87 const text_t &querystring2, cgiargsclass &args) {
88
89 set_queryfilter_options (request, querystring1, args);
90
91 // fill in the second query if needed
92 if (!args["cq2"].empty()) {
93 OptionValue_t option;
94
95 option.name = "CombineQuery";
96 option.value = args["cq2"];
97 request.filterOptions.push_back (option);
98
99 option.name = "Term";
100 option.value = querystring2;
101 request.filterOptions.push_back (option);
102
103 option.name = "QueryType";
104 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
105 request.filterOptions.push_back (option);
106
107 option.name = "Casefold";
108 option.value = (args.getintarg("k")) ? "true" : "false";
109 request.filterOptions.push_back (option);
110
111 option.name = "Stem";
112 option.value = (args.getintarg("s")) ? "true" : "false";
113 request.filterOptions.push_back (option);
114
115 if (!args["h2"].empty()) {
116 option.name = "Index";
117 option.value = args["h2"];
118 request.filterOptions.push_back (option);
119 }
120
121 if (!args["j2"].empty()) {
122 option.name = "Subcollection";
123 option.value = args["j2"];
124 request.filterOptions.push_back (option);
125 }
126
127 if (!args["n2"].empty()) {
128 option.name = "Language";
129 option.value = args["n2"];
130 request.filterOptions.push_back (option);
131 }
132 }
133 set_more_queryfilter_options (request, args);
134}
135
136void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args) {
137
138 OptionValue_t option;
139 int arg_m = args.getintarg("m");
140
141 option.name = "Maxdocs";
142 option.value = arg_m;
143 request.filterOptions.push_back (option);
144
145 // option.name = "StartResults";
146 // option.value = args["r"];
147 // request.filterOptions.push_back (option);
148
149 // option.name = "EndResults";
150 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
151 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
152 // option.value = endresults;
153 // request.filterOptions.push_back (option);
154}
155
156void format_querystring (text_t &querystring, int querymode, bool segment) {
157 text_t formattedstring;
158
159 if (querymode == 1 && !segment) return;
160
161 text_t::const_iterator here = querystring.begin();
162 text_t::const_iterator end = querystring.end();
163
164 // space is used to insert spaces between Chinese
165 // characters. No space is needed before the first
166 // Chinese character.
167 bool space = false;
168
169 // want to remove ()|!& from querystring so boolean queries are just
170 // "all the words" queries (unless querymode is advanced)
171 while (here != end) {
172 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
173 *here == '!' || *here == '&')) {
174 formattedstring.push_back(' ');
175 } else if (segment) {
176 if ((*here >= 0x4e00 && *here <= 0x9fa5) ||
177 (*here >= 0xf900 && *here <= 0xfa2d)) {
178 // Chinese character
179 if (!space) formattedstring.push_back (0x200b); // zero width space
180 formattedstring.push_back (*here);
181 formattedstring.push_back (0x200b);
182 space = true;
183 } else {
184
185 // non-Chinese character
186 formattedstring.push_back (*here);
187 space = false;
188
189 }
190
191 } else {
192 formattedstring.push_back (*here);
193 }
194 ++here;
195 }
196 querystring = formattedstring;
197}
198
199
200
201void add_dates(text_t &querystring, int startdate, int enddate,
202 int startbc, int endbc, int ct)
203{
204 if(startdate)
205 {
206 int querystringis = 0;
207 text_t::const_iterator here = querystring.begin();
208 text_t::const_iterator end = querystring.end();
209 while(here!=end)
210 {
211 if(!(isspace((*here)))){
212 here = end;
213 querystringis = 1;
214 }
215 else
216 ++here;
217 }
218 //converting BCE dates
219 if(startbc && startdate > 0)
220 {
221 startdate *= -1;
222 }
223 if(endbc && enddate > 0)
224 {
225 enddate *= -1;
226 }
227 if(enddate != 0 && enddate<startdate)
228 {
229 cout<<"enddate too small"<<endl;
230 return;
231 }
232 if(querystringis)
233 querystring.appendcstr(" AND");
234 if(!enddate)
235 {
236 if (ct==1) {
237 mgpp_adddateelem(querystring,startdate);
238 }
239 else { // lucene
240 lucene_adddateelem(querystring,startdate);
241 }
242 }
243 else{
244 int nextdate = startdate;
245 querystring.appendcstr(" (");
246 while(nextdate<=enddate)
247 {
248 if(nextdate!=0) {
249 if (ct==1) {
250 mgpp_adddateelem(querystring,nextdate);
251 }
252 else { // lucene
253 lucene_adddateelem(querystring,nextdate);
254 }
255 }
256 ++nextdate;
257 }
258 querystring.appendcstr(" )");
259 }
260 }
261
262}
263
264void get_phrases (const text_t &querystring, text_tarray &phrases) {
265
266 phrases.erase (phrases.begin(), phrases.end());
267 if (!querystring.empty()) {
268
269 text_t::const_iterator end = querystring.end();
270 text_t::const_iterator here = findchar (querystring.begin(), end, '"');
271 if (here != end) {
272 text_t tmptext;
273 bool foundquote = false;
274 while (here != end) {
275 if (*here == '"') {
276 if (foundquote) {
277 if (!tmptext.empty()) {
278 phrases.push_back(tmptext);
279 tmptext.clear();
280 }
281 foundquote = false;
282 } else foundquote = true;
283 } else {
284 if (foundquote) tmptext.push_back (*here);
285 }
286 ++here;
287 }
288 }
289 }
290}
291
292// search history tool
293// also used for form query macros
294text_t escape_quotes(const text_t &querystring) {
295
296 text_t::const_iterator here = querystring.begin();
297 text_t::const_iterator end = querystring.end();
298
299 text_t escquery = "";
300 while (here != end) {
301 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
302 else if (*here == '\n' || *here == '\r') {
303 escquery.push_back(' ');
304 } else {
305 escquery +="\\\\";
306 escquery.push_back(*here);
307 }
308
309 ++here;
310 }
311 return escquery;
312
313}
314
315// some query form parsing functions for use with mgpp
316
317void parse_reg_query_form(text_t &querystring, cgiargsclass &args)
318{
319 querystring.clear();
320
321 const int ct = args.getintarg("ct");
322 int argt = args.getintarg("t");// t=0 -and, t=1 - or
323
324 text_t combine;
325 if (ct==1) {
326 if (argt == 0) combine = "&";
327 else combine = "|";
328 }
329 else { // lucene
330 if (argt == 0) combine = "AND";
331 else combine = "OR";
332 }
333
334 text_t field = args["fqf"];
335 if (field.empty()) return; // no query
336 text_tarray fields;
337 splitchar(field.begin(), field.end(), ',', fields);
338
339 text_t value = args["fqv"];
340 if (value.empty()) return; // somethings wrong
341 text_tarray values;
342 splitchar(value.begin(), value.end(), ',', values);
343
344
345 for (int i=0; i< values.size(); ++i) {
346 if (!values[i].empty()) {
347 if (ct == 1) {
348 mgpp_addqueryelem(querystring, fields[i], values[i], combine);
349 }
350 else { // lucene
351 lucene_addqueryelem(querystring, fields[i], values[i], combine);
352 }
353 }
354 }
355
356}
357
358
359void parse_adv_query_form(text_t &querystring, cgiargsclass &args){
360
361 querystring.clear();
362
363 const int ct = args.getintarg("ct");
364 text_t combine;
365 if (ct==1) {
366 combine = "&";
367 }
368 else { // lucene
369 combine = "AND";
370 }
371
372 text_t field = args["fqf"];
373 if (field.empty()) return; // no query
374 text_tarray fields;
375 splitchar(field.begin(), field.end(), ',', fields);
376
377 text_t value = args["fqv"];
378 if (value.empty()) return; // somethings wrong
379 text_tarray values;
380 splitchar(value.begin(), value.end(), ',', values);
381
382 text_t stem = args["fqs"];
383 if (stem.empty()) return; // somethings wrong
384 text_tarray stems;
385 splitchar(stem.begin(), stem.end(), ',', stems);
386
387 text_t fold = args["fqk"];
388 if (fold.empty()) return; // somethings wrong
389 text_tarray folds;
390 splitchar(fold.begin(), fold.end(), ',', folds);
391
392 text_t comb = args["fqc"];
393 if (comb.empty()) return; //somethings wrong
394 text_tarray combs;
395 splitchar(comb.begin(), comb.end(), ',', combs);
396
397 for(int i=0; i< values.size(); ++i) {
398 if (!values[i].empty()) {
399 if (i!=0) {
400 if (ct==1) {
401 if (combs[i-1]=="and") combine = "&";
402 else if (combs[i-1]=="or")combine = "|";
403 else if (combs[i-1]=="not")combine = "!";
404 }
405 else { // lucene
406 if (combs[i-1]=="and") combine = "AND";
407 else if (combs[i-1]=="or")combine = "OR";
408 else if (combs[i-1]=="not")combine = "NOT";
409 }
410 }
411 text_t term = addstemcase(values[i], stems[i], folds[i]);
412 mgpp_addqueryelem(querystring, fields[i], term, combine);
413 }
414
415 }
416}
417
418text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold) {
419
420 text_t outtext;
421 text_t word;
422 //unsigned short c;
423 text_t::const_iterator here = terms.begin();
424 text_t::const_iterator end = terms.end();
425
426 while (here !=end) {
427
428 if (is_unicode_letdig(*here)) {
429 // not word boundary
430 word.push_back(*here);
431 ++here;
432 }
433 else {
434 // found word boundary
435 if (!word.empty() ) {
436 if (stem == "1" || fold =="1") {
437 word += "#";
438 if (stem == "1") word += "s";
439 //else word += "u";
440
441 if (fold == "1") word += "i";
442 //else word += "c";
443 }
444
445 word += " ";
446 outtext += word;
447 word.clear();
448 }
449 if (*here == '\"') {
450 outtext.push_back(*here);
451 }
452 ++here;
453 }
454 }
455
456 // get last word
457 if (!word.empty()) {
458 if (stem == "1"|| fold == "1") {
459 word += "#";
460 if (stem == "1") word += "s";
461 //else word += "u";
462
463 if (fold == "1") word += "i";
464 //else word += "c";
465 }
466 word += " ";
467 outtext += word;
468 }
469 return outtext;
470}
471
472
473void mgpp_adddateelem(text_t& querystring, const int date)
474{
475 querystring.appendcstr(" [");
476 if(date<0) {
477 querystring.appendcstr("bc");
478 querystring.appendint((date*-1));
479 }
480 else {
481 querystring.appendint(date);
482 }
483 querystring.appendcstr("]:CV");
484}
485
486void lucene_adddateelem(text_t& querystring, const int date)
487{
488 querystring.appendcstr(" CV:(");
489 if(date<0) {
490 querystring.appendcstr("bc");
491 querystring.appendint((date*-1));
492 }
493 else {
494 querystring.appendint(date);
495 }
496 querystring.appendcstr(")");
497}
498
499
500void mgpp_addqueryelem(text_t &querystring, text_t &tag,
501 text_t &query, text_t &combine) {
502 if (!querystring.empty()) { // have to put and/or
503 querystring += " " + combine + " ";
504
505 }
506 if (tag=="ZZ" || tag=="") { // just add onto querystring
507 querystring += query;
508 }
509 else {
510 querystring += "["+query+"]:"+tag;
511 }
512
513}
514
515void lucene_addqueryelem(text_t &querystring, text_t &tag,
516 text_t &query, text_t &combine) {
517 if (!querystring.empty()) { // have to put and/or
518 querystring += " " + combine + " ";
519
520 }
521 if (tag=="ZZ" || tag=="") { // just add onto querystring
522 querystring += query;
523 }
524 else {
525 querystring += tag+":("+query+")";
526 }
527}
528
529
530void addqueryelem_ex(text_t &querystring, const text_t &tag,
531 const text_t &terms, const text_t &stem, const text_t &fold,
532 const text_t& combine, const text_t& word_combine) {
533 if (!querystring.empty()) { // have to put and/or
534 querystring += " " + combine + " ";
535 }
536 text_t outtext; outtext.reserve(512);
537 text_t word; word.reserve(100);
538 //unsigned short c;
539 text_t::const_iterator here = terms.begin();
540 text_t::const_iterator end = terms.end();
541 bool inquote = false, firstword = true;
542
543 text_t word2; word2.reserve(256);
544
545 while (here !=end) {
546 if (is_unicode_space(*here)) {
547 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
548 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
549 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
550 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
551 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
552 if (inquote) {
553 word2.push_back(*here);
554 }
555 word.append(word2); word2.clear();
556
557 if (!inquote && !word.empty() ) {
558 // found word boundary
559
560 if (stem == "1" || fold =="1") {
561 word += "#";
562 if (stem == "1") word += "s";
563 //else word += "u";
564
565 if (fold == "1") word += "i";
566 //else word += "c";
567 }
568 if (firstword) {
569 firstword = false;
570 } else {
571 outtext += " " + word_combine + " ";
572 }
573 outtext += "[" + word + "]:"+tag;
574 word.clear();
575 }
576 ++here;
577 } else if (*here == '\"') {
578 word2.push_back(*here);
579 inquote = !inquote;
580 ++here;
581 } else {
582 // not word boundary
583 word2.push_back(*here);
584 ++here;
585 }
586 }
587
588 // get last word
589 if (!word2.empty()) {
590 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
591 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
592 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
593 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
594 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
595 word.append(word2); word2.clear();
596
597 if (stem == "1"|| fold == "1") {
598 word += "#";
599 if (stem == "1") word += "s";
600 //else word += "u";
601
602 if (fold == "1") word += "i";
603 //else word += "c";
604 }
605 if (!outtext.empty()) outtext += " " + word_combine + " ";
606 outtext += "[" + word + "]:"+tag;
607 }
608 querystring += "(" + outtext + ")";
609}
610
611
612void add_field_info(text_t &querystring, const text_t &tag, int type) {
613
614 if (type == 1) { //mgpp
615 querystring = "["+querystring+"]:"+tag;
616 } else if (type == 2) { // lucene
617 querystring = tag+":("+querystring+")";
618 }
619
620}
621
622
623void format_field_info(text_t &querystring, cgiargsclass &args) {
624
625 text_t tag = args["fqf"];
626 if (tag == "ZZ" || tag == "") {
627 return; // do nothing
628 }
629
630 int argct = args.getintarg("ct");
631 int argt = args.getintarg("t");// t=0 -and, t=1 - or
632 int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
633
634 // Special code for Lucene
635 // The default operator for Lucene is "or", so we need to add "+" symbols when t == 0
636 // Also, we need to be careful not to mess up phrase searches
637 if (argct == 2) {
638 text_t processed_querystring = "";
639 text_t queryelement = "";
640 text_t combine = ((argt == 0) ? "+" : "");
641 bool in_phrase = false;
642 text_t::const_iterator here = querystring.begin();
643 text_t::const_iterator end = querystring.end();
644 while (here != end) {
645 if (is_unicode_letdig(*here)) {
646 queryelement.push_back(*here);
647 }
648
649 // Detect phrase starts/finishes
650 else if (*here == '"') {
651 queryelement.push_back(*here);
652 if (in_phrase == false) in_phrase = true;
653 else if (in_phrase == true) {
654 add_field_info(queryelement, tag, argct);
655 processed_querystring += combine + queryelement;
656 queryelement.clear();
657 in_phrase = false;
658 }
659 }
660
661 // Found word boundary
662 else if (in_phrase) {
663 queryelement.push_back(*here);
664 }
665 else {
666 if (!queryelement.empty()) {
667 add_field_info(queryelement, tag, argct);
668 processed_querystring += combine + queryelement;
669 queryelement.clear();
670 }
671 processed_querystring.push_back(*here);
672 }
673
674 ++here;
675 }
676
677 // Get last element
678 if (!queryelement.empty()) {
679 add_field_info(queryelement, tag, argct);
680 processed_querystring += combine + queryelement;
681 }
682
683 querystring = processed_querystring;
684 return;
685 }
686
687 if (argb==0 && argt==0) {
688 // simple 'and' search - just put tag info round whole query string
689 add_field_info(querystring, tag, argct);
690 return;
691 }
692
693 // we need to individually tag words
694 text_t outtext;
695 text_t word;
696 //unsigned short c;
697 text_t::const_iterator here = querystring.begin();
698 text_t::const_iterator end = querystring.end();
699
700 while (here !=end) {
701
702 if (is_unicode_letdig(*here)|| *here == '#' || *here == '/' ) {
703 // include term modifiers in a word just in case
704 // not word boundary
705 word.push_back(*here);
706 ++here;
707 }
708 else {
709 // found word boundary
710 if (!word.empty() ) {
711 add_field_info(word, tag, argct);
712 outtext += word;
713 word.clear();
714 }
715 // everything else, we add into the query string
716 outtext.push_back(*here);
717 ++here;
718 }
719 }
720
721 // get last word
722 if (!word.empty()) {
723 add_field_info(word, tag, argct);
724 outtext += word;
725 }
726
727 querystring = outtext;
728}
729
Note: See TracBrowser for help on using the repository browser.