source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 22984

Last change on this file since 22984 was 22984, checked in by ak19, 14 years ago
  1. Undoing commit of 22934 where decode_commas was called on stem and fold comma separated list: previously separated due to url-encoding of commas. Now that the problem has been fixed at the source, the decode_commas hack is no longer necessary. 2. Commas in stem and fold are no longer url-encoded because the multiple_value field of the continuously-reused struct arg_ainfo is always set back to the default false after ever being set to true. So it no longer subtly stays at true to affect Greenstone functioning in unforeseen ways (such as suddenly and unnecessarily URL-encoding commas where this is not wanted).
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.2 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// sets the ct, qt, qto arguments
31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
58 search_types = (*check).second;
59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
64 if (arg_qto == 2) {
65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
69 }
70 return;
71 }
72
73
74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
84
85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
93
94
95 // decide if sqlqto should be set or not
96 unsigned int sql_type = 0;
97 text_t infodb_type = cinfo->infodbType;
98 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100 sql_type = 1;
101 }
102 }
103
104 if (sql_type) {
105 args["sqlqto"] = "1";
106 }
107 else {
108 args["sqlqto"] = "0";
109 }
110
111
112}
113
114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116 int stemIndexes = cinfo->stemIndexes;
117
118 if (stemIndexes & SIcasefold) {
119 args["ks"] = 1;
120 }
121 if (stemIndexes & SIstem) {
122 args["ss"] = 1;
123 }
124 if (stemIndexes & SIaccentfold) {
125 args["afs"] = 1;
126 }
127
128}
129
130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133 cgiargsclass &args)
134{
135
136 OptionValue_t option;
137 int arg_m = args.getintarg("m");
138
139 option.name = "Maxdocs";
140 option.value = arg_m;
141 request.filterOptions.push_back (option);
142
143 // option.name = "StartResults";
144 // option.value = args["r"];
145 // request.filterOptions.push_back (option);
146
147 // option.name = "EndResults";
148 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150 // option.value = endresults;
151 // request.filterOptions.push_back (option);
152}
153
154
155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158 const text_t &querystring,
159 cgiargsclass &args)
160{
161 // better if this function, and the two-query companion function
162 // was implemented in queryaction.cpp
163 // Has to be done here to documentaction.cpp can call it directly
164
165 request.filterName = "QueryFilter";
166
167 OptionValue_t option;
168
169 option.name = "Term";
170 option.value = querystring;
171 request.filterOptions.push_back (option);
172
173 option.name = "QueryType";
174 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175 request.filterOptions.push_back (option);
176
177 option.name = "MatchMode";
178 // mgpp in advanced mode, always use some query
179 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
180 option.value = "some";
181 } else {
182 option.value = (args.getintarg("t")) ? "some" : "all";
183 }
184 request.filterOptions.push_back (option);
185
186 option.name = "Casefold";
187 option.value = (args.getintarg("k")) ? "true" : "false";
188 request.filterOptions.push_back (option);
189
190 option.name = "Stem";
191 option.value = (args.getintarg("s")) ? "true" : "false";
192 request.filterOptions.push_back (option);
193
194 option.name = "AccentFold";
195 option.value = (args.getintarg("af")) ? "true" : "false";
196 request.filterOptions.push_back (option);
197
198 if (!args["h"].empty()) {
199 option.name = "Index";
200 option.value = args["h"];
201 request.filterOptions.push_back (option);
202 }
203
204 if (!args["j"].empty()) {
205 option.name = "Subcollection";
206 option.value = args["j"];
207 request.filterOptions.push_back (option);
208 }
209
210 if (!args["n"].empty()) {
211 option.name = "Language";
212 option.value = args["n"];
213 request.filterOptions.push_back (option);
214 }
215
216 if (!args["g"].empty()) { // granularity for mgpp
217 option.name = "Level";
218 option.value = args["g"];
219 request.filterOptions.push_back (option);
220 }
221
222 if (!args["fs"].empty()) { // filter string for lucene
223 option.name = "FilterString";
224 option.value = args["fs"];
225 request.filterOptions.push_back (option);
226 }
227
228 if (!args["sf"].empty()) { // sort field for lucene
229 option.name = "SortField";
230 option.value = args["sf"];
231 request.filterOptions.push_back (option);
232 }
233
234 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
235 option.name = "Fuzziness";
236 option.value = (text_t) "0." + args["fuzziness"];
237 request.filterOptions.push_back (option);
238 }
239
240 set_basequeryfilter_options(request, args);
241}
242
243
244
245void set_fulltext_queryfilter_options (FilterRequest_t &request,
246 const text_t &querystring1,
247 const text_t &querystring2,
248 cgiargsclass &args)
249{
250
251 set_fulltext_queryfilter_options (request, querystring1, args);
252
253 // fill in the second query if needed
254 if (!args["cq2"].empty()) {
255 OptionValue_t option;
256
257 option.name = "CombineQuery";
258 option.value = args["cq2"];
259 request.filterOptions.push_back (option);
260
261 option.name = "Term";
262 option.value = querystring2;
263 request.filterOptions.push_back (option);
264
265 option.name = "QueryType";
266 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
267 request.filterOptions.push_back (option);
268
269 option.name = "Casefold";
270 option.value = (args.getintarg("k")) ? "true" : "false";
271 request.filterOptions.push_back (option);
272
273 option.name = "Stem";
274 option.value = (args.getintarg("s")) ? "true" : "false";
275 request.filterOptions.push_back (option);
276
277 option.name = "AccentFold";
278 option.value = (args.getintarg("af")) ? "true" : "false";
279 request.filterOptions.push_back (option);
280
281 if (!args["h2"].empty()) {
282 option.name = "Index";
283 option.value = args["h2"];
284 request.filterOptions.push_back (option);
285 }
286
287 if (!args["j2"].empty()) {
288 option.name = "Subcollection";
289 option.value = args["j2"];
290 request.filterOptions.push_back (option);
291 }
292
293 if (!args["n2"].empty()) {
294 option.name = "Language";
295 option.value = args["n2"];
296 request.filterOptions.push_back (option);
297 }
298 }
299
300 // this is probably redundant, as first line to this method will have
301 // already caused it to invoke set_basequeryfilter_options
302
303 set_basequeryfilter_options(request, args);
304}
305
306
307
308// request.filterResultOptions and request.fields (if required) should
309// be set from the calling code
310void set_sql_queryfilter_options (FilterRequest_t &request,
311 cgiargsclass &args)
312{
313 if (!args["sqlsf"].empty()) { // sort field for lucene
314 OptionValue_t option;
315
316 option.name = "SortField";
317 option.value = args["sqlsf"];
318 request.filterOptions.push_back (option);
319 }
320
321 set_basequeryfilter_options(request, args);
322}
323
324
// Returns true if `character` has a special meaning in query syntax for the
// given indexer type:
//   1 (mgpp):   '#', '/', '*'
//   2 (lucene): '?', '*', '~', '^'
// Any other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
337
338// This function removes boolean operators from simple searches, and segments
339// chinese characters if segment=true
340void format_querystring (text_t &querystring, int querymode, bool segment) {
341 text_t formattedstring;
342
343 // advanced search, no segmenting, don't need to do anything
344 if (querymode == 1 && !segment) return;
345
346 text_t::const_iterator here = querystring.begin();
347 text_t::const_iterator end = querystring.end();
348
349 // space is used to insert spaces between Chinese
350 // characters. No space is needed before the first
351 // Chinese character.
352 bool space = false;
353
354 // want to remove ()|!& from querystring so boolean queries are just
355 // "all the words" queries (unless querymode is advanced)
356 while (here != end) {
357 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
358 *here == '!' || *here == '&')) {
359 formattedstring.push_back(' ');
360 } else if (segment) {
361 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
362 ( *here >= 0xf900 && *here <= 0xfa6a)) {
363 /* text_t not big enough to handle these. */
364 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
365 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
366
367 // CJK character
368 if (!space) formattedstring.push_back (0x200b); // zero width space
369 formattedstring.push_back (*here);
370 formattedstring.push_back (0x200b);
371 space = true;
372 } else {
373
374 // non-Chinese character
375 formattedstring.push_back (*here);
376 space = false;
377
378 }
379
380 } else {
381 formattedstring.push_back (*here);
382 }
383 ++here;
384 }
385 querystring = formattedstring;
386}
387
388// turn query string into terms separated by spaces.
389// still working on this...
390text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
391 text_t::const_iterator here = querystring.begin();
392 text_t::const_iterator end = querystring.end();
393
394 // lets look for [] and () first - these are a pain.
395 text_t::const_iterator bracket;
396 text_t query_no_brackets = "";
397
398 // mgpp brackets: [xxx]:TI
399 if (findchar(here, end, '[') != end) {
400 while ((bracket = findchar(here, end, '[')) != end) {
401 // get the first bit
402 query_no_brackets += substr(here, bracket);
403 bracket++;
404 here = bracket;
405 // get the end bracket
406 bracket = findchar(here, end, ']');
407 query_no_brackets += substr(here, bracket);
408 // skip the :TI bits
409 while (*bracket != ' ' && bracket != end) { bracket++;}
410 here = bracket;
411 }
412 if (here != end) {
413 query_no_brackets += substr(here,end);
414 }
415 } else if (findchar(here, end, '(') != end) {
416 // lucene brackets TI:(xxx)
417 while ((bracket = findchar(here, end, '(')) != end) {
418 // back up the field name
419 text_t::const_iterator old_bracket = bracket;
420 while (*bracket != ' ' && bracket != here) {
421 --bracket;
422 }
423 if (bracket != here) {
424 // get the first bit
425 query_no_brackets += substr(here, bracket+1);
426 }
427 here = old_bracket +1;
428 // get the end bracket
429 bracket = findchar(here, end, ')');
430 query_no_brackets += substr(here, bracket);
431 if (bracket != end) {
432 here = bracket+1;
433 }
434 }
435 if (here != end) {
436 query_no_brackets += substr(here,end);
437 }
438 } else {
439 // was no brackets
440 query_no_brackets = querystring;
441 }
442
443
444 if (arg_ct == "2") { // lucene
445 // look for AND OR NOT and remove
446 here = query_no_brackets.begin();
447 end = query_no_brackets.end();
448 text_tlist terms;
449 splitword(here, end, "AND", terms);
450 joinchar(terms, ' ', query_no_brackets);
451 here = query_no_brackets.begin();
452 end = query_no_brackets.end();
453 splitword(here, end, "OR", terms);
454 joinchar(terms, ' ', query_no_brackets);
455 here = query_no_brackets.begin();
456 end = query_no_brackets.end();
457 splitword(here, end, "NOT", terms);
458 joinchar(terms, ' ', query_no_brackets);
459
460 }
461 text_t terms = "";
462 bool space = false;
463 here = query_no_brackets.begin();
464 end = query_no_brackets.end();
465
466 while (here != end) {
467 if (*here == '#' || *here == '/') {
468 // skip over #is /10 etc
469 ++here;
470 while (here != end && *here != ' ') {
471 ++here;
472 }
473 if (here == end) break;
474 }
475 if (is_unicode_letdig(*here)) {
476 terms.push_back(*here);
477 space = false;
478 } else {
479 if (!space) {
480 terms.push_back(' ');
481 space = true;
482 }
483 }
484 ++here;
485 }
486 return terms;
487
488}
489
490// search history tool
491// also used for form query macros
492text_t escape_quotes(const text_t &querystring) {
493
494 text_t::const_iterator here = querystring.begin();
495 text_t::const_iterator end = querystring.end();
496
497 text_t escquery = "";
498 while (here != end) {
499 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
500 else if (*here == '\n' || *here == '\r') {
501 escquery.push_back(' ');
502 } else {
503 escquery +="\\\\";
504 escquery.push_back(*here);
505 }
506
507 ++here;
508 }
509 return escquery;
510
511}
512
513// Parses the terms into words, and adds #si if necessary
514text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
515 const int indexer_type) {
516
517 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
518 if (stem == "0" && fold == "0") {
519 return terms;
520 }
521 // this is only for mgpp collections, shouldn't be called for anything else
522 if (indexer_type != 1) {
523 return terms;
524 }
525
526 text_t outtext;
527 text_t word;
528
529 text_t::const_iterator here = terms.begin();
530 text_t::const_iterator end = terms.end();
531
532 while (here !=end) {
533
534 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
535 // not word boundary
536 word.push_back(*here);
537 ++here;
538 }
539 else {
540 // found word boundary
541 if (!word.empty() ) {
542 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
543 outtext += word;
544 word.clear();
545 }
546 else {
547 word += "#";
548 if (stem == "1") word += "s";
549 if (fold == "1") word += "i";
550 outtext += word;
551 word.clear();
552 }
553 }
554 // this only used in advanced form, so we leave in boolean operators
555 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
556 *here == '(' || *here == ')' || is_unicode_space(*here)) {
557 outtext.push_back(*here);
558 }
559 ++here;
560 }
561 }
562
563 // get last word
564 if (!word.empty()) {
565 word += "#";
566 if (stem == "1") word += "s";
567 if (fold == "1") word += "i";
568 word += " ";
569 outtext += word;
570 }
571 return outtext;
572}
573
574
575// some query form parsing functions for use with mgpp & lucene
576
577void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
578{
579 querystring.clear();
580
581 int argct = args.getintarg("ct");
582 int argt = args.getintarg("t");// t=0 -and, t=1 - or
583 int argb = args.getintarg("b");
584
585 text_t combine;
586
587 // lucene uses global combine, so only need this for mgpp
588 if (argct==1) {
589 if (argt == 0) combine = "&";
590 else combine = "|";
591 }
592
593 text_t field = args["fqf"];
594 if (field.empty()) return; // no query
595 text_tarray fields;
596 splitchar(field.begin(), field.end(), ',', fields);
597
598 text_t value = args["fqv"];
599 if (value.empty()) return; // somethings wrong
600 text_tarray values;
601 splitchar(value.begin(), value.end(), ',', values);
602
603
604 for (int i=0; i< values.size(); ++i) {
605 if (!values[i].empty()) {
606 text_t this_value = values[i];
607
608 // remove operators for simple search, segments text if necessary
609 format_querystring(this_value, argb, segment);
610
611 // add tag info for this field (and other processing)
612 format_field_info(this_value, fields[i], argct, argt, argb);
613
614 // add into query string
615 if (argct == 2) {
616 // lucene
617 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
618 querystring += this_value+" ";
619 } else {
620 // mgpp
621 if (!querystring.empty()) {
622 querystring += " "+ combine+ " ";
623 }
624 querystring += this_value;
625 }
626 }
627 }
628}
629
630
631void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
632 querystring.clear();
633
634 const int argct = args.getintarg("ct");
635 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
636 int argb = args.getintarg("b");
637 text_t combine;
638 if (argct==1) {
639 combine = "&";
640 }
641 else { // lucene
642 combine = "AND";
643 }
644
645 text_t field = args["fqf"];
646 if (field.empty()) return; // no query
647 text_tarray fields;
648 splitchar(field.begin(), field.end(), ',', fields);
649
650 text_t value = args["fqv"];
651 if (value.empty()) return; // somethings wrong
652 text_tarray values;
653 splitchar(value.begin(), value.end(), ',', values);
654
655 text_t comb = args["fqc"];
656 if (comb.empty()) return; //somethings wrong
657 text_tarray combs;
658 splitchar(comb.begin(), comb.end(), ',', combs);
659
660 text_tarray stems;
661 text_tarray folds;
662 if (argct == 1) {// mgpp - lucene doesn't do stem/case
663 text_t stem = args["fqs"];
664 if (stem.empty()) return; // somethings wrong
665 splitchar(stem.begin(), stem.end(), ',', stems);
666
667 text_t fold = args["fqk"];
668 if (fold.empty()) return; // somethings wrong
669 splitchar(fold.begin(), fold.end(), ',', folds);
670 }
671
672 for(int i=0; i< values.size(); ++i) {
673 if (!values[i].empty()) {
674 if (i!=0) {
675 if (argct==1) {
676 if (combs[i-1]=="and") combine = "&";
677 else if (combs[i-1]=="or")combine = "|";
678 else if (combs[i-1]=="not")combine = "!";
679 }
680 else { // lucene
681 if (combs[i-1]=="and") combine = "AND";
682 else if (combs[i-1]=="or")combine = "OR";
683 else if (combs[i-1]=="not")combine = "NOT";
684 }
685 }
686 text_t this_value = values[i];
687 // remove operators for simple search, segments text if necessary
688 format_querystring(this_value, argb, segment);
689 if (argct == 1) { // mgpp only
690 this_value = addstemcase(this_value, stems[i], folds[i], argct);
691 }
692 // add tag info for this field (and other processing)
693 format_field_info(this_value, fields[i], argct, argt, argb);
694 // add into query string
695 if (!querystring.empty()) {
696 querystring += " "+ combine+ " ";
697 }
698 querystring += this_value;
699
700 }
701 }
702}
703
704
705// SQL versions for parsing query form
706
707void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
708{
709 querystring.clear();
710
711 int argt = args.getintarg("t");// t=0 -and, t=1 - or
712 int argb = args.getintarg("b");
713
714 text_t combine;
715
716 if (argt == 0) combine = "AND";
717 else combine = "OR";
718
719 text_t field = args["sqlfqf"];
720 if (field.empty()) return; // no query
721 text_tarray fields;
722 splitchar(field.begin(), field.end(), ',', fields);
723
724 text_t sqlcomb = args["sqlfqc"];
725 if (sqlcomb.empty()) return; //somethings wrong
726 text_tarray sqlcombs;
727 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
728
729 text_t value = args["fqv"];
730 if (value.empty()) return; // somethings wrong
731 text_tarray values;
732 splitchar(value.begin(), value.end(), ',', values);
733
734
735 for (int i=0; i< values.size(); ++i) {
736 if (!values[i].empty()) {
737 text_t this_value = values[i];
738
739 // remove operators for simple search, segments text if necessary
740 format_querystring(this_value, argb, segment);
741
742 // add tag info for this field (and other processing)
743 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
744
745 const text_t DISTINCT_SELECT_WHERE
746 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
747
748 if (querystring.empty()) {
749 // first query term
750 querystring = DISTINCT_SELECT_WHERE + this_value;
751 }
752 else {
753 this_value = DISTINCT_SELECT_WHERE + this_value;
754
755 if (combine=="AND") {
756 // INNER JOIN to restrict to only matching docOIDs
757 querystring = "SELECT docOID FROM (" + querystring + ")"
758 + " INNER JOIN (" + this_value +") USING (docOID)";
759 }
760 else if (combine=="OR") {
761 // Union to allow union of the two
762 querystring = querystring + " UNION " + this_value;
763 }
764 }
765 }
766 }
767}
768
769
770void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
771 bool segment)
772{
773 querystring.clear();
774
775 int argt = 0; // set it to 0 = AND, by default
776 int argb = args.getintarg("b");
777 text_t combine = "AND";
778
779 text_t field = args["sqlfqf"];
780
781 if (field.empty()) return; // no query
782 text_tarray fields;
783 splitchar(field.begin(), field.end(), ',', fields);
784
785 text_t sqlcomb = args["sqlfqc"];
786 if (sqlcomb.empty()) return; //somethings wrong
787 text_tarray sqlcombs;
788 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
789
790 text_t value = args["fqv"];
791 if (value.empty()) return; // somethings wrong
792 text_tarray values;
793 splitchar(value.begin(), value.end(), ',', values);
794
795 text_t comb = args["fqc"];
796 if (comb.empty()) return; //somethings wrong
797 text_tarray combs;
798 splitchar(comb.begin(), comb.end(), ',', combs);
799
800 for(int i=0; i< values.size(); ++i) {
801 if (!values[i].empty()) {
802 if (i>0) {
803 if (combs[i-1]=="and") { combine = "AND"; }
804 else if (combs[i-1]=="or") { combine = "OR"; }
805 else if (combs[i-1]=="not") { combine = "NOT"; }
806 }
807 text_t this_value = values[i];
808
809 // remove operators for simple search, segments text if necessary
810 format_querystring(this_value, argb, segment);
811
812 // add tag info for this field (and other processing)
813 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
814
815 // add into query string
816
817 const text_t DISTINCT_SELECT_WHERE
818 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
819
820 if (querystring.empty()) {
821 // first query term
822 querystring = DISTINCT_SELECT_WHERE + this_value;
823 }
824 else {
825 this_value = DISTINCT_SELECT_WHERE + this_value;
826
827 if (combine=="AND") {
828 // INNER JOIN to restrict to only matching docOIDs
829 querystring = "SELECT docOID FROM (" + querystring + ")"
830 + " INNER JOIN (" + this_value +") USING (docOID)";
831 }
832 else if (combine=="OR") {
833 // Union to allow union of the two
834 querystring = querystring + " UNION " + this_value;
835 }
836 else {
837 cerr << "Unsupported combination operation: " << combine << endl;
838 }
839 }
840
841 }
842 }
843}
844
845
846
847
848// Extended addqueryelem for Human Info project
849void addqueryelem_ex(text_t &querystring, const text_t &tag,
850 const text_t &terms, const text_t &stem,
851 const text_t &fold,
852 const text_t& combine, const text_t& word_combine) {
853
854 if (!querystring.empty()) { // have to put and/or
855 querystring += " " + combine + " ";
856 }
857 text_t outtext; outtext.reserve(512);
858 text_t word; word.reserve(100);
859 //unsigned short c;
860 text_t::const_iterator here = terms.begin();
861 text_t::const_iterator end = terms.end();
862 bool inquote = false, firstword = true;
863
864 text_t word2; word2.reserve(256);
865
866 while (here !=end) {
867 if (is_unicode_space(*here)) {
868 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
869 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
870 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
871 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
872 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
873 if (inquote) {
874 word2.push_back(*here);
875 }
876 word.append(word2); word2.clear();
877
878 if (!inquote && !word.empty() ) {
879 // found word boundary
880
881 if (stem == "1" || fold =="1") {
882 word += "#";
883 if (stem == "1") word += "s";
884 //else word += "u";
885
886 if (fold == "1") word += "i";
887 //else word += "c";
888 }
889 if (firstword) {
890 firstword = false;
891 } else {
892 outtext += " " + word_combine + " ";
893 }
894 outtext += "[" + word + "]:"+tag;
895 word.clear();
896 }
897 ++here;
898 } else if (*here == '\"') {
899 word2.push_back(*here);
900 inquote = !inquote;
901 ++here;
902 } else {
903 // not word boundary
904 word2.push_back(*here);
905 ++here;
906 }
907 }
908
909 // get last word
910 if (!word2.empty()) {
911 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
912 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
913 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
914 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
915 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
916 word.append(word2); word2.clear();
917
918 if (stem == "1"|| fold == "1") {
919 word += "#";
920 if (stem == "1") word += "s";
921 //else word += "u";
922
923 if (fold == "1") word += "i";
924 //else word += "c";
925 }
926 if (!outtext.empty()) outtext += " " + word_combine + " ";
927 outtext += "[" + word + "]:"+tag;
928 }
929 querystring += "(" + outtext + ")";
930}
931
932void add_field_info(text_t &querystring, const text_t &tag, int type) {
933
934 if (tag == "") return; // do nothing
935 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
936 if (type == 1) { //mgpp
937 querystring = "["+querystring+"]:"+tag;
938 } else if (type == 2) { // lucene
939 querystring = tag+":("+querystring+")";
940 }
941
942}
943
944
945void add_field_info_sql(text_t &querystring, const text_t &tagseq,
946 const text_t& sqlcomb)
947{
948
949 if (tagseq == "") return; // do nothing
950
951 text_t element_in = "(element IN (";
952
953 text_tlist mdterms;
954
955 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
956
957 text_t tags_in = "";
958
959 while (!mdterms.empty()) {
960 text_t tag = mdterms.front();
961 mdterms.pop_front();
962
963 if (!tag.empty()) {
964
965 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
966 tag = substr (tag.begin()+3, tag.end());
967 }
968
969 if (!tags_in.empty()) {
970 tags_in += ",";
971 }
972
973 tags_in += "'" + tag + "'";
974 }
975 }
976
977 element_in += tags_in + ") AND (";
978
979 if (sqlcomb == "=") {
980 // override what it means to do equality, to make it more like full text
981 // searching
982
983 text_t orterms = "";
984 text_t term = "";
985 bool in_phrase = false;
986
987 text_t::const_iterator here = querystring.begin();
988 text_t::const_iterator end = querystring.end();
989 while (here != end) {
990 if (is_unicode_letdig(*here)) {
991 term.push_back(*here);
992 }
993 else if (*here == '"') {
994 term.push_back(*here);
995 if (!in_phrase) {
996 in_phrase = true;
997 } else {
998 in_phrase = false;
999 }
1000 }
1001 else if (in_phrase) {
1002 // Found word boundary, but in a phrase, so does not complete term
1003 term.push_back(*here);
1004 }
1005 else {
1006 // Found a word boundary
1007 if (!orterms.empty()) {
1008 orterms += " OR ";
1009 }
1010 orterms += "value LIKE '%" + term + "%'";
1011 term.clear();
1012 }
1013 ++here;
1014 }
1015
1016 if (!term.empty()) {
1017 if (!orterms.empty()) {
1018 orterms += " OR ";
1019 }
1020 orterms += "value LIKE '%" + term + "%'";
1021 }
1022
1023 element_in += orterms;
1024 }
1025 else {
1026 // search on value is "as is" querystring
1027 element_in += "value " + sqlcomb + " '" + querystring+"'";
1028 }
1029
1030
1031 querystring = element_in + "))";
1032
1033}
1034
1035
1036void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1037
1038 int type = 2; //lucene
1039
1040 if (argb==0) { // simple
1041 // there will be no & or | as they should have already been removed
1042 // just tag the entire thing
1043 if (tag != "") {
1044 add_field_info(querystring, tag, type);
1045 }
1046 return;
1047 }
1048
1049 // need to replace & with &&, | with ||
1050 text_t::const_iterator here = querystring.begin();
1051 text_t::const_iterator end = querystring.end();
1052
1053 text_t finalquery = "";
1054 while (here != end) {
1055 if (*here == '&') {
1056 finalquery.push_back('&');
1057 finalquery.push_back('&');
1058 while (*(here+1) == '&') {
1059 ++here;
1060 }
1061 }
1062 else if (*here == '|') {
1063 finalquery.push_back('|');
1064 finalquery.push_back('|');
1065 while (*(here+1) == '|') {
1066 ++here;
1067 }
1068 }
1069 else {
1070 finalquery.push_back(*here);
1071 }
1072 ++here;
1073 }
1074 querystring = finalquery;
1075 add_field_info(querystring, tag, type);
1076}
1077
1078
1079void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1080
1081 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
1082 if (tag == "" && argb == 1) {
1083 return; // no field specifier, advanced mode, the query stays as written
1084 }
1085
1086 int type = 1; // mgpp
1087
1088 bool simple_and = (argb==0 && argt==0);
1089 text_t finalquery = "";
1090 text_t fieldpart ="";
1091 text_t queryelem = "";
1092 bool in_phrase = false;
1093 bool in_field = false;
1094
1095 text_t::const_iterator here = querystring.begin();
1096 text_t::const_iterator end = querystring.end();
1097 while (here != end) {
1098 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1099 queryelem.push_back(*here);
1100 }
1101 else if (*here == '|') {
1102 in_field = false;
1103 }
1104 else if (*here == '!' || *here == '(' || *here == ')') {
1105 if (!in_phrase) { // ignore these if in_phrase
1106 // output field, then output operator
1107 in_field = false;
1108 if (!queryelem.empty()) {
1109 if (!simple_and && !fieldpart.empty()) {
1110 add_field_info(fieldpart, tag, type);
1111 finalquery += fieldpart;
1112 finalquery.push_back(' ');
1113 fieldpart.clear();
1114 }
1115 fieldpart += queryelem;
1116 }
1117 if (!fieldpart.empty()) {
1118 add_field_info(fieldpart, tag, type);
1119 finalquery += fieldpart;
1120 finalquery.push_back(' ');
1121 }
1122 fieldpart.clear();
1123 queryelem.clear();
1124 finalquery.push_back(*here);
1125 finalquery.push_back(' ');
1126 }
1127 }
1128 else if (*here == '"') {
1129 queryelem.push_back(*here);
1130 if (in_phrase == false) in_phrase = true;
1131 else {
1132 in_phrase = false;
1133 }
1134 }
1135
1136 // Found word boundary, in a phrase
1137 else if (in_phrase) {
1138 queryelem.push_back(*here);
1139 }
1140 // Found a word boundary
1141 else {
1142 if (!queryelem.empty()) {
1143 if (queryelem == "&") {
1144 in_field = true;
1145 queryelem.clear();
1146 }
1147 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1148
1149 if (argb==1) {
1150 // simple search, these not allowed
1151 in_field = true;
1152 fieldpart += queryelem;
1153 fieldpart.push_back(' ');
1154 }
1155 queryelem.clear();
1156
1157 }
1158 else {
1159 if (!simple_and && !in_field) {
1160 if (!fieldpart.empty()) {
1161 add_field_info(fieldpart, tag, type);
1162 finalquery += fieldpart;
1163 finalquery.push_back(' ');
1164 fieldpart.clear();
1165 }
1166 }
1167
1168 fieldpart += queryelem;
1169 fieldpart.push_back(' ');
1170 queryelem.clear();
1171 }
1172 }
1173 }
1174 ++here;
1175 }
1176 // at the end
1177 if (!queryelem.empty()) {
1178 if (!simple_and && !in_field && !fieldpart.empty()) {
1179 add_field_info(fieldpart, tag, type);
1180 finalquery += fieldpart;
1181 finalquery.push_back(' ');
1182 fieldpart.clear();
1183 }
1184 fieldpart += queryelem;
1185 }
1186 if (!fieldpart.empty()) {
1187 add_field_info(fieldpart, tag, type);
1188 finalquery += fieldpart;
1189 fieldpart.clear();
1190
1191 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1192 // consider cutting this line
1193 finalquery.push_back(' ');
1194 }
1195
1196 querystring = finalquery;
1197}
1198
1199
1200void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1201 const text_t &sqlcomb,
1202 int argt, int argb)
1203{
1204 add_field_info_sql(querystring, tagseq, sqlcomb);
1205}
1206
1207
1208void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
1209 if (argct == 1) {
1210 format_field_info_mgpp(querystring, tag, argt, argb);
1211 } else if (argct == 2) {
1212 format_field_info_lucene(querystring, tag, argt, argb);
1213 }
1214}
1215
1216void mgpp_adddateelem(text_t& querystring, const int date)
1217{
1218 querystring.appendcstr(" [");
1219 if(date<0) {
1220 querystring.appendcstr("bc");
1221 querystring.appendint((date*-1));
1222 }
1223 else {
1224 querystring.appendint(date);
1225 }
1226 querystring.appendcstr("]:CV");
1227}
1228
1229void lucene_adddateelem(text_t& querystring, const int date)
1230{
1231 querystring.appendcstr(" CV:(");
1232 if(date<0) {
1233 querystring.appendcstr("bc");
1234 querystring.appendint((date*-1));
1235 }
1236 else {
1237 querystring.appendint(date);
1238 }
1239 querystring.appendcstr(")");
1240}
1241
1242
1243void add_dates(text_t &querystring, int startdate, int enddate,
1244 int startbc, int endbc, int ct)
1245{
1246 if(startdate)
1247 {
1248 int querystringis = 0;
1249 text_t::const_iterator here = querystring.begin();
1250 text_t::const_iterator end = querystring.end();
1251 while(here!=end)
1252 {
1253 if(!(isspace((*here)))){
1254 here = end;
1255 querystringis = 1;
1256 }
1257 else
1258 ++here;
1259 }
1260 //converting BCE dates
1261 if(startbc && startdate > 0)
1262 {
1263 startdate *= -1;
1264 }
1265 if(endbc && enddate > 0)
1266 {
1267 enddate *= -1;
1268 }
1269 if(enddate != 0 && enddate<startdate)
1270 {
1271 cout<<"enddate too small"<<endl;
1272 return;
1273 }
1274 if(querystringis)
1275 querystring.appendcstr(" AND");
1276 if(!enddate)
1277 {
1278 if (ct==1) {
1279 mgpp_adddateelem(querystring,startdate);
1280 }
1281 else { // lucene
1282 lucene_adddateelem(querystring,startdate);
1283 }
1284 }
1285 else{
1286 int nextdate = startdate;
1287 querystring.appendcstr(" (");
1288 while(nextdate<=enddate)
1289 {
1290 if(nextdate!=0) {
1291 if (ct==1) {
1292 mgpp_adddateelem(querystring,nextdate);
1293 }
1294 else { // lucene
1295 lucene_adddateelem(querystring,nextdate);
1296 }
1297 }
1298 ++nextdate;
1299 }
1300 querystring.appendcstr(" )");
1301 }
1302 }
1303
1304}
Note: See TracBrowser for help on using the repository browser.