source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 22935

Last change on this file since 22935 was 22935, checked in by ak19, 11 years ago

For ticket no 712. Fixes to 2 related crashes that occurred when using a combination of advanced (server.exe and library.cgi depending on which web server was used): 1. When parsing cgi args, arrays stem and fold contained the URL encodings percent-2-C rather than commas for delimiters and weren't split properly resulting in arrays of unexpected lengths (and values). Need to decode the percent-2-C to commas by calling decode_commas() in cgiutils.cpp before splitting. 2. decode_commas in cgiutils.cpp was performing an illegal iterator operation by attempting to peek PAST the end of the iterator which doesn't seem to be allowed by the STL code. When the iteration really got past the end, the iteration operation causes a problem resulting in a server.exe crash of its own.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.3 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "cgiutils.h"
29#include "unitool.h" // for is_unicode_letdig
30
31// sets the ct, qt, qto arguments
32void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
33
34 if (args["ct"].empty()) {
35 text_t build_type = cinfo->buildType;
36 if (build_type == "mgpp") {
37 args["ct"] = "1";
38 } else if (build_type == "lucene") {
39 args["ct"] = "2";
40 } else {
41 args["ct"] = "0";
42 }
43 }
44 text_t arg_ct = args["ct"];
45 if (arg_ct == "0") {
46 // mg
47 args["qt"] = "0";
48 args["qto"] = "0";
49 return;
50 }
51
52 if (!args["qt"].empty() && !args["qto"].empty()) {
53 return;
54 }
55
56 text_tmap::iterator check = cinfo->format.find("SearchTypes");
57 text_t search_types;
58 if(check != cinfo->format.end() && !(*check).second.empty()){
59 search_types = (*check).second;
60 } else {
61 // assume plain,form
62 if (args["qto"].empty()) args["qto"] = "3";
63 if (args["qt"].empty()) {
64 int arg_qto = args.getintarg("qto");
65 if (arg_qto == 2) {
66 args["qt"] = "1";
67 } else {
68 args["qt"] = "0";
69 }
70 }
71 return;
72 }
73
74
75 if (args["qto"].empty()) {
76 unsigned int type = 0;
77 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
78 type |= 2;
79 }
80 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
81 type |= 1;
82 }
83 args.setintarg("qto", type);
84 }
85
86 if (args["qt"].empty()) {
87 int arg_qto = args.getintarg("qto");
88 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
89 args["qt"] = "1";
90 } else {
91 args["qt"] = "0";
92 }
93 }
94
95
96 // decide if sqlqto should be set or not
97 unsigned int sql_type = 0;
98 text_t infodb_type = cinfo->infodbType;
99 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
100 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
101 sql_type = 1;
102 }
103 }
104
105 if (sql_type) {
106 args["sqlqto"] = "1";
107 }
108 else {
109 args["sqlqto"] = "0";
110 }
111
112
113}
114
115// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
116void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
117 int stemIndexes = cinfo->stemIndexes;
118
119 if (stemIndexes & SIcasefold) {
120 args["ks"] = 1;
121 }
122 if (stemIndexes & SIstem) {
123 args["ss"] = 1;
124 }
125 if (stemIndexes & SIaccentfold) {
126 args["afs"] = 1;
127 }
128
129}
130
131
132
133void set_basequeryfilter_options (FilterRequest_t &request,
134 cgiargsclass &args)
135{
136
137 OptionValue_t option;
138 int arg_m = args.getintarg("m");
139
140 option.name = "Maxdocs";
141 option.value = arg_m;
142 request.filterOptions.push_back (option);
143
144 // option.name = "StartResults";
145 // option.value = args["r"];
146 // request.filterOptions.push_back (option);
147
148 // option.name = "EndResults";
149 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
150 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
151 // option.value = endresults;
152 // request.filterOptions.push_back (option);
153}
154
155
156// request.filterResultOptions and request.fields (if required) should
157// be set from the calling code
158void set_fulltext_queryfilter_options (FilterRequest_t &request,
159 const text_t &querystring,
160 cgiargsclass &args)
161{
162 // better if this function, and the two-query companion function
163 // was implemented in queryaction.cpp
164 // Has to be done here to documentaction.cpp can call it directly
165
166 request.filterName = "QueryFilter";
167
168 OptionValue_t option;
169
170 option.name = "Term";
171 option.value = querystring;
172 request.filterOptions.push_back (option);
173
174 option.name = "QueryType";
175 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
176 request.filterOptions.push_back (option);
177
178 option.name = "MatchMode";
179 // mgpp in advanced mode, always use some query
180 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
181 option.value = "some";
182 } else {
183 option.value = (args.getintarg("t")) ? "some" : "all";
184 }
185 request.filterOptions.push_back (option);
186
187 option.name = "Casefold";
188 option.value = (args.getintarg("k")) ? "true" : "false";
189 request.filterOptions.push_back (option);
190
191 option.name = "Stem";
192 option.value = (args.getintarg("s")) ? "true" : "false";
193 request.filterOptions.push_back (option);
194
195 option.name = "AccentFold";
196 option.value = (args.getintarg("af")) ? "true" : "false";
197 request.filterOptions.push_back (option);
198
199 if (!args["h"].empty()) {
200 option.name = "Index";
201 option.value = args["h"];
202 request.filterOptions.push_back (option);
203 }
204
205 if (!args["j"].empty()) {
206 option.name = "Subcollection";
207 option.value = args["j"];
208 request.filterOptions.push_back (option);
209 }
210
211 if (!args["n"].empty()) {
212 option.name = "Language";
213 option.value = args["n"];
214 request.filterOptions.push_back (option);
215 }
216
217 if (!args["g"].empty()) { // granularity for mgpp
218 option.name = "Level";
219 option.value = args["g"];
220 request.filterOptions.push_back (option);
221 }
222
223 if (!args["fs"].empty()) { // filter string for lucene
224 option.name = "FilterString";
225 option.value = args["fs"];
226 request.filterOptions.push_back (option);
227 }
228
229 if (!args["sf"].empty()) { // sort field for lucene
230 option.name = "SortField";
231 option.value = args["sf"];
232 request.filterOptions.push_back (option);
233 }
234
235 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
236 option.name = "Fuzziness";
237 option.value = (text_t) "0." + args["fuzziness"];
238 request.filterOptions.push_back (option);
239 }
240
241 set_basequeryfilter_options(request, args);
242}
243
244
245
246void set_fulltext_queryfilter_options (FilterRequest_t &request,
247 const text_t &querystring1,
248 const text_t &querystring2,
249 cgiargsclass &args)
250{
251
252 set_fulltext_queryfilter_options (request, querystring1, args);
253
254 // fill in the second query if needed
255 if (!args["cq2"].empty()) {
256 OptionValue_t option;
257
258 option.name = "CombineQuery";
259 option.value = args["cq2"];
260 request.filterOptions.push_back (option);
261
262 option.name = "Term";
263 option.value = querystring2;
264 request.filterOptions.push_back (option);
265
266 option.name = "QueryType";
267 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
268 request.filterOptions.push_back (option);
269
270 option.name = "Casefold";
271 option.value = (args.getintarg("k")) ? "true" : "false";
272 request.filterOptions.push_back (option);
273
274 option.name = "Stem";
275 option.value = (args.getintarg("s")) ? "true" : "false";
276 request.filterOptions.push_back (option);
277
278 option.name = "AccentFold";
279 option.value = (args.getintarg("af")) ? "true" : "false";
280 request.filterOptions.push_back (option);
281
282 if (!args["h2"].empty()) {
283 option.name = "Index";
284 option.value = args["h2"];
285 request.filterOptions.push_back (option);
286 }
287
288 if (!args["j2"].empty()) {
289 option.name = "Subcollection";
290 option.value = args["j2"];
291 request.filterOptions.push_back (option);
292 }
293
294 if (!args["n2"].empty()) {
295 option.name = "Language";
296 option.value = args["n2"];
297 request.filterOptions.push_back (option);
298 }
299 }
300
301 // this is probably redundant, as first line to this method will have
302 // already caused it to invoke set_basequeryfilter_options
303
304 set_basequeryfilter_options(request, args);
305}
306
307
308
309// request.filterResultOptions and request.fields (if required) should
310// be set from the calling code
311void set_sql_queryfilter_options (FilterRequest_t &request,
312 cgiargsclass &args)
313{
314 if (!args["sqlsf"].empty()) { // sort field for lucene
315 OptionValue_t option;
316
317 option.name = "SortField";
318 option.value = args["sqlsf"];
319 request.filterOptions.push_back (option);
320 }
321
322 set_basequeryfilter_options(request, args);
323}
324
325
// Returns true when `character` is treated as a special query character by
// the given indexer: '#', '/' and '*' for mgpp (indexer_type 1);
// '?', '*', '~' and '^' for lucene (indexer_type 2). Every other indexer
// type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
338
339// This function removes boolean operators from simple searches, and segments
340// chinese characters if segment=true
341void format_querystring (text_t &querystring, int querymode, bool segment) {
342 text_t formattedstring;
343
344 // advanced search, no segmenting, don't need to do anything
345 if (querymode == 1 && !segment) return;
346
347 text_t::const_iterator here = querystring.begin();
348 text_t::const_iterator end = querystring.end();
349
350 // space is used to insert spaces between Chinese
351 // characters. No space is needed before the first
352 // Chinese character.
353 bool space = false;
354
355 // want to remove ()|!& from querystring so boolean queries are just
356 // "all the words" queries (unless querymode is advanced)
357 while (here != end) {
358 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
359 *here == '!' || *here == '&')) {
360 formattedstring.push_back(' ');
361 } else if (segment) {
362 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
363 ( *here >= 0xf900 && *here <= 0xfa6a)) {
364 /* text_t not big enough to handle these. */
365 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
366 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
367
368 // CJK character
369 if (!space) formattedstring.push_back (0x200b); // zero width space
370 formattedstring.push_back (*here);
371 formattedstring.push_back (0x200b);
372 space = true;
373 } else {
374
375 // non-Chinese character
376 formattedstring.push_back (*here);
377 space = false;
378
379 }
380
381 } else {
382 formattedstring.push_back (*here);
383 }
384 ++here;
385 }
386 querystring = formattedstring;
387}
388
389// turn query string into terms separated by spaces.
390// still working on this...
391text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
392 text_t::const_iterator here = querystring.begin();
393 text_t::const_iterator end = querystring.end();
394
395 // lets look for [] and () first - these are a pain.
396 text_t::const_iterator bracket;
397 text_t query_no_brackets = "";
398
399 // mgpp brackets: [xxx]:TI
400 if (findchar(here, end, '[') != end) {
401 while ((bracket = findchar(here, end, '[')) != end) {
402 // get the first bit
403 query_no_brackets += substr(here, bracket);
404 bracket++;
405 here = bracket;
406 // get the end bracket
407 bracket = findchar(here, end, ']');
408 query_no_brackets += substr(here, bracket);
409 // skip the :TI bits
410 while (*bracket != ' ' && bracket != end) { bracket++;}
411 here = bracket;
412 }
413 if (here != end) {
414 query_no_brackets += substr(here,end);
415 }
416 } else if (findchar(here, end, '(') != end) {
417 // lucene brackets TI:(xxx)
418 while ((bracket = findchar(here, end, '(')) != end) {
419 // back up the field name
420 text_t::const_iterator old_bracket = bracket;
421 while (*bracket != ' ' && bracket != here) {
422 --bracket;
423 }
424 if (bracket != here) {
425 // get the first bit
426 query_no_brackets += substr(here, bracket+1);
427 }
428 here = old_bracket +1;
429 // get the end bracket
430 bracket = findchar(here, end, ')');
431 query_no_brackets += substr(here, bracket);
432 if (bracket != end) {
433 here = bracket+1;
434 }
435 }
436 if (here != end) {
437 query_no_brackets += substr(here,end);
438 }
439 } else {
440 // was no brackets
441 query_no_brackets = querystring;
442 }
443
444
445 if (arg_ct == "2") { // lucene
446 // look for AND OR NOT and remove
447 here = query_no_brackets.begin();
448 end = query_no_brackets.end();
449 text_tlist terms;
450 splitword(here, end, "AND", terms);
451 joinchar(terms, ' ', query_no_brackets);
452 here = query_no_brackets.begin();
453 end = query_no_brackets.end();
454 splitword(here, end, "OR", terms);
455 joinchar(terms, ' ', query_no_brackets);
456 here = query_no_brackets.begin();
457 end = query_no_brackets.end();
458 splitword(here, end, "NOT", terms);
459 joinchar(terms, ' ', query_no_brackets);
460
461 }
462 text_t terms = "";
463 bool space = false;
464 here = query_no_brackets.begin();
465 end = query_no_brackets.end();
466
467 while (here != end) {
468 if (*here == '#' || *here == '/') {
469 // skip over #is /10 etc
470 ++here;
471 while (here != end && *here != ' ') {
472 ++here;
473 }
474 if (here == end) break;
475 }
476 if (is_unicode_letdig(*here)) {
477 terms.push_back(*here);
478 space = false;
479 } else {
480 if (!space) {
481 terms.push_back(' ');
482 space = true;
483 }
484 }
485 ++here;
486 }
487 return terms;
488
489}
490
491// search history tool
492// also used for form query macros
493text_t escape_quotes(const text_t &querystring) {
494
495 text_t::const_iterator here = querystring.begin();
496 text_t::const_iterator end = querystring.end();
497
498 text_t escquery = "";
499 while (here != end) {
500 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
501 else if (*here == '\n' || *here == '\r') {
502 escquery.push_back(' ');
503 } else {
504 escquery +="\\\\";
505 escquery.push_back(*here);
506 }
507
508 ++here;
509 }
510 return escquery;
511
512}
513
514// Parses the terms into words, and adds #si if necessary
515text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
516 const int indexer_type) {
517
518 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
519 if (stem == "0" && fold == "0") {
520 return terms;
521 }
522 // this is only for mgpp collections, shouldn't be called for anything else
523 if (indexer_type != 1) {
524 return terms;
525 }
526
527 text_t outtext;
528 text_t word;
529
530 text_t::const_iterator here = terms.begin();
531 text_t::const_iterator end = terms.end();
532
533 while (here !=end) {
534
535 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
536 // not word boundary
537 word.push_back(*here);
538 ++here;
539 }
540 else {
541 // found word boundary
542 if (!word.empty() ) {
543 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
544 outtext += word;
545 word.clear();
546 }
547 else {
548 word += "#";
549 if (stem == "1") word += "s";
550 if (fold == "1") word += "i";
551 outtext += word;
552 word.clear();
553 }
554 }
555 // this only used in advanced form, so we leave in boolean operators
556 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
557 *here == '(' || *here == ')' || is_unicode_space(*here)) {
558 outtext.push_back(*here);
559 }
560 ++here;
561 }
562 }
563
564 // get last word
565 if (!word.empty()) {
566 word += "#";
567 if (stem == "1") word += "s";
568 if (fold == "1") word += "i";
569 word += " ";
570 outtext += word;
571 }
572 return outtext;
573}
574
575
576// some query form parsing functions for use with mgpp & lucene
577
578void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
579{
580 querystring.clear();
581
582 int argct = args.getintarg("ct");
583 int argt = args.getintarg("t");// t=0 -and, t=1 - or
584 int argb = args.getintarg("b");
585
586 text_t combine;
587
588 // lucene uses global combine, so only need this for mgpp
589 if (argct==1) {
590 if (argt == 0) combine = "&";
591 else combine = "|";
592 }
593
594 text_t field = args["fqf"];
595 if (field.empty()) return; // no query
596 text_tarray fields;
597 splitchar(field.begin(), field.end(), ',', fields);
598
599 text_t value = args["fqv"];
600 if (value.empty()) return; // somethings wrong
601 text_tarray values;
602 splitchar(value.begin(), value.end(), ',', values);
603
604
605 for (int i=0; i< values.size(); ++i) {
606 if (!values[i].empty()) {
607 text_t this_value = values[i];
608
609 // remove operators for simple search, segments text if necessary
610 format_querystring(this_value, argb, segment);
611
612 // add tag info for this field (and other processing)
613 format_field_info(this_value, fields[i], argct, argt, argb);
614
615 // add into query string
616 if (argct == 2) {
617 // lucene
618 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
619 querystring += this_value+" ";
620 } else {
621 // mgpp
622 if (!querystring.empty()) {
623 querystring += " "+ combine+ " ";
624 }
625 querystring += this_value;
626 }
627 }
628 }
629}
630
631
632void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
633 querystring.clear();
634
635 const int argct = args.getintarg("ct");
636 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
637 int argb = args.getintarg("b");
638 text_t combine;
639 if (argct==1) {
640 combine = "&";
641 }
642 else { // lucene
643 combine = "AND";
644 }
645
646 text_t field = args["fqf"];
647 if (field.empty()) return; // no query
648 text_tarray fields;
649 splitchar(field.begin(), field.end(), ',', fields);
650
651 text_t value = args["fqv"];
652 if (value.empty()) return; // somethings wrong
653 text_tarray values;
654 splitchar(value.begin(), value.end(), ',', values);
655
656 text_t comb = args["fqc"];
657 if (comb.empty()) return; //somethings wrong
658 text_tarray combs;
659 splitchar(comb.begin(), comb.end(), ',', combs);
660
661 text_tarray stems;
662 text_tarray folds;
663 if (argct == 1) {// mgpp - lucene doesn't do stem/case
664 text_t stem = args["fqs"];
665 if (stem.empty()) return; // somethings wrong
666 stem = decode_commas(stem); // %2C -> ,
667 splitchar(stem.begin(), stem.end(), ',', stems);
668
669 text_t fold = args["fqk"];
670 if (fold.empty()) return; // somethings wrong
671 fold = decode_commas(fold); // %2C -> ,
672 splitchar(fold.begin(), fold.end(), ',', folds);
673 }
674
675 for(int i=0; i< values.size(); ++i) {
676 if (!values[i].empty()) {
677 if (i!=0) {
678 if (argct==1) {
679 if (combs[i-1]=="and") combine = "&";
680 else if (combs[i-1]=="or")combine = "|";
681 else if (combs[i-1]=="not")combine = "!";
682 }
683 else { // lucene
684 if (combs[i-1]=="and") combine = "AND";
685 else if (combs[i-1]=="or")combine = "OR";
686 else if (combs[i-1]=="not")combine = "NOT";
687 }
688 }
689 text_t this_value = values[i];
690 // remove operators for simple search, segments text if necessary
691 format_querystring(this_value, argb, segment);
692 if (argct == 1) { // mgpp only
693 this_value = addstemcase(this_value, stems[i], folds[i], argct);
694 }
695 // add tag info for this field (and other processing)
696 format_field_info(this_value, fields[i], argct, argt, argb);
697 // add into query string
698 if (!querystring.empty()) {
699 querystring += " "+ combine+ " ";
700 }
701 querystring += this_value;
702
703 }
704 }
705}
706
707
708// SQL versions for parsing query form
709
710void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
711{
712 querystring.clear();
713
714 int argt = args.getintarg("t");// t=0 -and, t=1 - or
715 int argb = args.getintarg("b");
716
717 text_t combine;
718
719 if (argt == 0) combine = "AND";
720 else combine = "OR";
721
722 text_t field = args["sqlfqf"];
723 if (field.empty()) return; // no query
724 text_tarray fields;
725 splitchar(field.begin(), field.end(), ',', fields);
726
727 text_t sqlcomb = args["sqlfqc"];
728 if (sqlcomb.empty()) return; //somethings wrong
729 text_tarray sqlcombs;
730 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
731
732 text_t value = args["fqv"];
733 if (value.empty()) return; // somethings wrong
734 text_tarray values;
735 splitchar(value.begin(), value.end(), ',', values);
736
737
738 for (int i=0; i< values.size(); ++i) {
739 if (!values[i].empty()) {
740 text_t this_value = values[i];
741
742 // remove operators for simple search, segments text if necessary
743 format_querystring(this_value, argb, segment);
744
745 // add tag info for this field (and other processing)
746 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
747
748 const text_t DISTINCT_SELECT_WHERE
749 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
750
751 if (querystring.empty()) {
752 // first query term
753 querystring = DISTINCT_SELECT_WHERE + this_value;
754 }
755 else {
756 this_value = DISTINCT_SELECT_WHERE + this_value;
757
758 if (combine=="AND") {
759 // INNER JOIN to restrict to only matching docOIDs
760 querystring = "SELECT docOID FROM (" + querystring + ")"
761 + " INNER JOIN (" + this_value +") USING (docOID)";
762 }
763 else if (combine=="OR") {
764 // Union to allow union of the two
765 querystring = querystring + " UNION " + this_value;
766 }
767 }
768 }
769 }
770}
771
772
773void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
774 bool segment)
775{
776 querystring.clear();
777
778 int argt = 0; // set it to 0 = AND, by default
779 int argb = args.getintarg("b");
780 text_t combine = "AND";
781
782 text_t field = args["sqlfqf"];
783
784 if (field.empty()) return; // no query
785 text_tarray fields;
786 splitchar(field.begin(), field.end(), ',', fields);
787
788 text_t sqlcomb = args["sqlfqc"];
789 if (sqlcomb.empty()) return; //somethings wrong
790 text_tarray sqlcombs;
791 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
792
793 text_t value = args["fqv"];
794 if (value.empty()) return; // somethings wrong
795 text_tarray values;
796 splitchar(value.begin(), value.end(), ',', values);
797
798 text_t comb = args["fqc"];
799 if (comb.empty()) return; //somethings wrong
800 text_tarray combs;
801 splitchar(comb.begin(), comb.end(), ',', combs);
802
803 for(int i=0; i< values.size(); ++i) {
804 if (!values[i].empty()) {
805 if (i>0) {
806 if (combs[i-1]=="and") { combine = "AND"; }
807 else if (combs[i-1]=="or") { combine = "OR"; }
808 else if (combs[i-1]=="not") { combine = "NOT"; }
809 }
810 text_t this_value = values[i];
811
812 // remove operators for simple search, segments text if necessary
813 format_querystring(this_value, argb, segment);
814
815 // add tag info for this field (and other processing)
816 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
817
818 // add into query string
819
820 const text_t DISTINCT_SELECT_WHERE
821 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
822
823 if (querystring.empty()) {
824 // first query term
825 querystring = DISTINCT_SELECT_WHERE + this_value;
826 }
827 else {
828 this_value = DISTINCT_SELECT_WHERE + this_value;
829
830 if (combine=="AND") {
831 // INNER JOIN to restrict to only matching docOIDs
832 querystring = "SELECT docOID FROM (" + querystring + ")"
833 + " INNER JOIN (" + this_value +") USING (docOID)";
834 }
835 else if (combine=="OR") {
836 // Union to allow union of the two
837 querystring = querystring + " UNION " + this_value;
838 }
839 else {
840 cerr << "Unsupported combination operation: " << combine << endl;
841 }
842 }
843
844 }
845 }
846}
847
848
849
850
851// Extended addqueryelem for Human Info project
852void addqueryelem_ex(text_t &querystring, const text_t &tag,
853 const text_t &terms, const text_t &stem,
854 const text_t &fold,
855 const text_t& combine, const text_t& word_combine) {
856
857 if (!querystring.empty()) { // have to put and/or
858 querystring += " " + combine + " ";
859 }
860 text_t outtext; outtext.reserve(512);
861 text_t word; word.reserve(100);
862 //unsigned short c;
863 text_t::const_iterator here = terms.begin();
864 text_t::const_iterator end = terms.end();
865 bool inquote = false, firstword = true;
866
867 text_t word2; word2.reserve(256);
868
869 while (here !=end) {
870 if (is_unicode_space(*here)) {
871 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
872 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
873 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
874 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
875 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
876 if (inquote) {
877 word2.push_back(*here);
878 }
879 word.append(word2); word2.clear();
880
881 if (!inquote && !word.empty() ) {
882 // found word boundary
883
884 if (stem == "1" || fold =="1") {
885 word += "#";
886 if (stem == "1") word += "s";
887 //else word += "u";
888
889 if (fold == "1") word += "i";
890 //else word += "c";
891 }
892 if (firstword) {
893 firstword = false;
894 } else {
895 outtext += " " + word_combine + " ";
896 }
897 outtext += "[" + word + "]:"+tag;
898 word.clear();
899 }
900 ++here;
901 } else if (*here == '\"') {
902 word2.push_back(*here);
903 inquote = !inquote;
904 ++here;
905 } else {
906 // not word boundary
907 word2.push_back(*here);
908 ++here;
909 }
910 }
911
912 // get last word
913 if (!word2.empty()) {
914 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
915 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
916 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
917 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
918 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
919 word.append(word2); word2.clear();
920
921 if (stem == "1"|| fold == "1") {
922 word += "#";
923 if (stem == "1") word += "s";
924 //else word += "u";
925
926 if (fold == "1") word += "i";
927 //else word += "c";
928 }
929 if (!outtext.empty()) outtext += " " + word_combine + " ";
930 outtext += "[" + word + "]:"+tag;
931 }
932 querystring += "(" + outtext + ")";
933}
934
935void add_field_info(text_t &querystring, const text_t &tag, int type) {
936
937 if (tag == "") return; // do nothing
938 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
939 if (type == 1) { //mgpp
940 querystring = "["+querystring+"]:"+tag;
941 } else if (type == 2) { // lucene
942 querystring = tag+":("+querystring+")";
943 }
944
945}
946
947
948void add_field_info_sql(text_t &querystring, const text_t &tagseq,
949 const text_t& sqlcomb)
950{
951
952 if (tagseq == "") return; // do nothing
953
954 text_t element_in = "(element IN (";
955
956 text_tlist mdterms;
957
958 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
959
960 text_t tags_in = "";
961
962 while (!mdterms.empty()) {
963 text_t tag = mdterms.front();
964 mdterms.pop_front();
965
966 if (!tag.empty()) {
967
968 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
969 tag = substr (tag.begin()+3, tag.end());
970 }
971
972 if (!tags_in.empty()) {
973 tags_in += ",";
974 }
975
976 tags_in += "'" + tag + "'";
977 }
978 }
979
980 element_in += tags_in + ") AND (";
981
982 if (sqlcomb == "=") {
983 // override what it means to do equality, to make it more like full text
984 // searching
985
986 text_t orterms = "";
987 text_t term = "";
988 bool in_phrase = false;
989
990 text_t::const_iterator here = querystring.begin();
991 text_t::const_iterator end = querystring.end();
992 while (here != end) {
993 if (is_unicode_letdig(*here)) {
994 term.push_back(*here);
995 }
996 else if (*here == '"') {
997 term.push_back(*here);
998 if (!in_phrase) {
999 in_phrase = true;
1000 } else {
1001 in_phrase = false;
1002 }
1003 }
1004 else if (in_phrase) {
1005 // Found word boundary, but in a phrase, so does not complete term
1006 term.push_back(*here);
1007 }
1008 else {
1009 // Found a word boundary
1010 if (!orterms.empty()) {
1011 orterms += " OR ";
1012 }
1013 orterms += "value LIKE '%" + term + "%'";
1014 term.clear();
1015 }
1016 ++here;
1017 }
1018
1019 if (!term.empty()) {
1020 if (!orterms.empty()) {
1021 orterms += " OR ";
1022 }
1023 orterms += "value LIKE '%" + term + "%'";
1024 }
1025
1026 element_in += orterms;
1027 }
1028 else {
1029 // search on value is "as is" querystring
1030 element_in += "value " + sqlcomb + " '" + querystring+"'";
1031 }
1032
1033
1034 querystring = element_in + "))";
1035
1036}
1037
1038
1039void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1040
1041 int type = 2; //lucene
1042
1043 if (argb==0) { // simple
1044 // there will be no & or | as they should have already been removed
1045 // just tag the entire thing
1046 if (tag != "") {
1047 add_field_info(querystring, tag, type);
1048 }
1049 return;
1050 }
1051
1052 // need to replace & with &&, | with ||
1053 text_t::const_iterator here = querystring.begin();
1054 text_t::const_iterator end = querystring.end();
1055
1056 text_t finalquery = "";
1057 while (here != end) {
1058 if (*here == '&') {
1059 finalquery.push_back('&');
1060 finalquery.push_back('&');
1061 while (*(here+1) == '&') {
1062 ++here;
1063 }
1064 }
1065 else if (*here == '|') {
1066 finalquery.push_back('|');
1067 finalquery.push_back('|');
1068 while (*(here+1) == '|') {
1069 ++here;
1070 }
1071 }
1072 else {
1073 finalquery.push_back(*here);
1074 }
1075 ++here;
1076 }
1077 querystring = finalquery;
1078 add_field_info(querystring, tag, type);
1079}
1080
1081
1082void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1083
1084 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
1085 if (tag == "" && argb == 1) {
1086 return; // no field specifier, advanced mode, the query stays as written
1087 }
1088
1089 int type = 1; // mgpp
1090
1091 bool simple_and = (argb==0 && argt==0);
1092 text_t finalquery = "";
1093 text_t fieldpart ="";
1094 text_t queryelem = "";
1095 bool in_phrase = false;
1096 bool in_field = false;
1097
1098 text_t::const_iterator here = querystring.begin();
1099 text_t::const_iterator end = querystring.end();
1100 while (here != end) {
1101 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1102 queryelem.push_back(*here);
1103 }
1104 else if (*here == '|') {
1105 in_field = false;
1106 }
1107 else if (*here == '!' || *here == '(' || *here == ')') {
1108 if (!in_phrase) { // ignore these if in_phrase
1109 // output field, then output operator
1110 in_field = false;
1111 if (!queryelem.empty()) {
1112 if (!simple_and && !fieldpart.empty()) {
1113 add_field_info(fieldpart, tag, type);
1114 finalquery += fieldpart;
1115 finalquery.push_back(' ');
1116 fieldpart.clear();
1117 }
1118 fieldpart += queryelem;
1119 }
1120 if (!fieldpart.empty()) {
1121 add_field_info(fieldpart, tag, type);
1122 finalquery += fieldpart;
1123 finalquery.push_back(' ');
1124 }
1125 fieldpart.clear();
1126 queryelem.clear();
1127 finalquery.push_back(*here);
1128 finalquery.push_back(' ');
1129 }
1130 }
1131 else if (*here == '"') {
1132 queryelem.push_back(*here);
1133 if (in_phrase == false) in_phrase = true;
1134 else {
1135 in_phrase = false;
1136 }
1137 }
1138
1139 // Found word boundary, in a phrase
1140 else if (in_phrase) {
1141 queryelem.push_back(*here);
1142 }
1143 // Found a word boundary
1144 else {
1145 if (!queryelem.empty()) {
1146 if (queryelem == "&") {
1147 in_field = true;
1148 queryelem.clear();
1149 }
1150 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1151
1152 if (argb==1) {
1153 // simple search, these not allowed
1154 in_field = true;
1155 fieldpart += queryelem;
1156 fieldpart.push_back(' ');
1157 }
1158 queryelem.clear();
1159
1160 }
1161 else {
1162 if (!simple_and && !in_field) {
1163 if (!fieldpart.empty()) {
1164 add_field_info(fieldpart, tag, type);
1165 finalquery += fieldpart;
1166 finalquery.push_back(' ');
1167 fieldpart.clear();
1168 }
1169 }
1170
1171 fieldpart += queryelem;
1172 fieldpart.push_back(' ');
1173 queryelem.clear();
1174 }
1175 }
1176 }
1177 ++here;
1178 }
1179 // at the end
1180 if (!queryelem.empty()) {
1181 if (!simple_and && !in_field && !fieldpart.empty()) {
1182 add_field_info(fieldpart, tag, type);
1183 finalquery += fieldpart;
1184 finalquery.push_back(' ');
1185 fieldpart.clear();
1186 }
1187 fieldpart += queryelem;
1188 }
1189 if (!fieldpart.empty()) {
1190 add_field_info(fieldpart, tag, type);
1191 finalquery += fieldpart;
1192 fieldpart.clear();
1193
1194 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1195 // consider cutting this line
1196 finalquery.push_back(' ');
1197 }
1198
1199 querystring = finalquery;
1200}
1201
1202
1203void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1204 const text_t &sqlcomb,
1205 int argt, int argb)
1206{
1207 add_field_info_sql(querystring, tagseq, sqlcomb);
1208}
1209
1210
1211void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
1212 if (argct == 1) {
1213 format_field_info_mgpp(querystring, tag, argt, argb);
1214 } else if (argct == 2) {
1215 format_field_info_lucene(querystring, tag, argt, argb);
1216 }
1217}
1218
1219void mgpp_adddateelem(text_t& querystring, const int date)
1220{
1221 querystring.appendcstr(" [");
1222 if(date<0) {
1223 querystring.appendcstr("bc");
1224 querystring.appendint((date*-1));
1225 }
1226 else {
1227 querystring.appendint(date);
1228 }
1229 querystring.appendcstr("]:CV");
1230}
1231
1232void lucene_adddateelem(text_t& querystring, const int date)
1233{
1234 querystring.appendcstr(" CV:(");
1235 if(date<0) {
1236 querystring.appendcstr("bc");
1237 querystring.appendint((date*-1));
1238 }
1239 else {
1240 querystring.appendint(date);
1241 }
1242 querystring.appendcstr(")");
1243}
1244
1245
1246void add_dates(text_t &querystring, int startdate, int enddate,
1247 int startbc, int endbc, int ct)
1248{
1249 if(startdate)
1250 {
1251 int querystringis = 0;
1252 text_t::const_iterator here = querystring.begin();
1253 text_t::const_iterator end = querystring.end();
1254 while(here!=end)
1255 {
1256 if(!(isspace((*here)))){
1257 here = end;
1258 querystringis = 1;
1259 }
1260 else
1261 ++here;
1262 }
1263 //converting BCE dates
1264 if(startbc && startdate > 0)
1265 {
1266 startdate *= -1;
1267 }
1268 if(endbc && enddate > 0)
1269 {
1270 enddate *= -1;
1271 }
1272 if(enddate != 0 && enddate<startdate)
1273 {
1274 cout<<"enddate too small"<<endl;
1275 return;
1276 }
1277 if(querystringis)
1278 querystring.appendcstr(" AND");
1279 if(!enddate)
1280 {
1281 if (ct==1) {
1282 mgpp_adddateelem(querystring,startdate);
1283 }
1284 else { // lucene
1285 lucene_adddateelem(querystring,startdate);
1286 }
1287 }
1288 else{
1289 int nextdate = startdate;
1290 querystring.appendcstr(" (");
1291 while(nextdate<=enddate)
1292 {
1293 if(nextdate!=0) {
1294 if (ct==1) {
1295 mgpp_adddateelem(querystring,nextdate);
1296 }
1297 else { // lucene
1298 lucene_adddateelem(querystring,nextdate);
1299 }
1300 }
1301 ++nextdate;
1302 }
1303 querystring.appendcstr(" )");
1304 }
1305 }
1306
1307}
Note: See TracBrowser for help on using the repository browser.