source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 24874

Last change on this file since 24874 was 24306, checked in by ak19, 13 years ago

More changes to do with the ex. prefixed to embedded metadata (that may have an additional metadata set as namespace qualifier). The C code now removes the ex. prefix only if there are no other metadataset qualifiers in the metadata name.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.6 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// sets the ct, qt, qto arguments
31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
58 search_types = (*check).second;
59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
64 if (arg_qto == 2) {
65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
69 }
70 return;
71 }
72
73
74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
84
85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
93
94
95 // decide if sqlqto should be set or not
96 unsigned int sql_type = 0;
97 text_t infodb_type = cinfo->infodbType;
98 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100 sql_type = 1;
101 }
102 }
103
104 if (sql_type) {
105 args["sqlqto"] = "1";
106 }
107 else {
108 args["sqlqto"] = "0";
109 }
110
111
112}
113
114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116 int stemIndexes = cinfo->stemIndexes;
117
118 if (stemIndexes & SIcasefold) {
119 args["ks"] = 1;
120 }
121 if (stemIndexes & SIstem) {
122 args["ss"] = 1;
123 }
124 if (stemIndexes & SIaccentfold) {
125 args["afs"] = 1;
126 }
127
128}
129
130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133 cgiargsclass &args)
134{
135
136 OptionValue_t option;
137 int arg_m = args.getintarg("m");
138
139 option.name = "Maxdocs";
140 option.value = arg_m;
141 request.filterOptions.push_back (option);
142
143 // option.name = "StartResults";
144 // option.value = args["r"];
145 // request.filterOptions.push_back (option);
146
147 // option.name = "EndResults";
148 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150 // option.value = endresults;
151 // request.filterOptions.push_back (option);
152}
153
154
155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158 const text_t &querystring,
159 cgiargsclass &args)
160{
161 // better if this function, and the two-query companion function
162 // was implemented in queryaction.cpp
163 // Has to be done here to documentaction.cpp can call it directly
164
165 request.filterName = "QueryFilter";
166
167 OptionValue_t option;
168
169 option.name = "Term";
170 option.value = querystring;
171 request.filterOptions.push_back (option);
172
173 option.name = "QueryType";
174 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175 request.filterOptions.push_back (option);
176
177 option.name = "MatchMode";
178 // mgpp in advanced mode, always use some query
179 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
180 option.value = "some";
181 } else {
182 option.value = (args.getintarg("t")) ? "some" : "all";
183 }
184 request.filterOptions.push_back (option);
185
186 option.name = "Casefold";
187 option.value = (args.getintarg("k")) ? "true" : "false";
188 request.filterOptions.push_back (option);
189
190 option.name = "Stem";
191 option.value = (args.getintarg("s")) ? "true" : "false";
192 request.filterOptions.push_back (option);
193
194 option.name = "AccentFold";
195 option.value = (args.getintarg("af")) ? "true" : "false";
196 request.filterOptions.push_back (option);
197
198 if (!args["h"].empty()) {
199 option.name = "Index";
200 option.value = args["h"];
201 request.filterOptions.push_back (option);
202 }
203
204 if (!args["j"].empty()) {
205 option.name = "Subcollection";
206 option.value = args["j"];
207 request.filterOptions.push_back (option);
208 }
209
210 if (!args["n"].empty()) {
211 option.name = "Language";
212 option.value = args["n"];
213 request.filterOptions.push_back (option);
214 }
215
216 if (!args["g"].empty()) { // granularity for mgpp
217 option.name = "Level";
218 option.value = args["g"];
219 request.filterOptions.push_back (option);
220 }
221
222 if (!args["fs"].empty()) { // filter string for lucene
223 option.name = "FilterString";
224 option.value = args["fs"];
225 request.filterOptions.push_back (option);
226 }
227
228 if (!args["sf"].empty()) { // sort field for lucene
229 option.name = "SortField";
230 option.value = args["sf"];
231 request.filterOptions.push_back (option);
232 }
233
234 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
235 option.name = "Fuzziness";
236 option.value = (text_t) "0." + args["fuzziness"];
237 request.filterOptions.push_back (option);
238 }
239
240 set_basequeryfilter_options(request, args);
241}
242
243
244
245void set_fulltext_queryfilter_options (FilterRequest_t &request,
246 const text_t &querystring1,
247 const text_t &querystring2,
248 cgiargsclass &args)
249{
250
251 set_fulltext_queryfilter_options (request, querystring1, args);
252
253 // fill in the second query if needed
254 if (!args["cq2"].empty()) {
255 OptionValue_t option;
256
257 option.name = "CombineQuery";
258 option.value = args["cq2"];
259 request.filterOptions.push_back (option);
260
261 option.name = "Term";
262 option.value = querystring2;
263 request.filterOptions.push_back (option);
264
265 option.name = "QueryType";
266 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
267 request.filterOptions.push_back (option);
268
269 option.name = "Casefold";
270 option.value = (args.getintarg("k")) ? "true" : "false";
271 request.filterOptions.push_back (option);
272
273 option.name = "Stem";
274 option.value = (args.getintarg("s")) ? "true" : "false";
275 request.filterOptions.push_back (option);
276
277 option.name = "AccentFold";
278 option.value = (args.getintarg("af")) ? "true" : "false";
279 request.filterOptions.push_back (option);
280
281 if (!args["h2"].empty()) {
282 option.name = "Index";
283 option.value = args["h2"];
284 request.filterOptions.push_back (option);
285 }
286
287 if (!args["j2"].empty()) {
288 option.name = "Subcollection";
289 option.value = args["j2"];
290 request.filterOptions.push_back (option);
291 }
292
293 if (!args["n2"].empty()) {
294 option.name = "Language";
295 option.value = args["n2"];
296 request.filterOptions.push_back (option);
297 }
298 }
299
300 // this is probably redundant, as first line to this method will have
301 // already caused it to invoke set_basequeryfilter_options
302
303 set_basequeryfilter_options(request, args);
304}
305
306
307
308// request.filterResultOptions and request.fields (if required) should
309// be set from the calling code
310void set_sql_queryfilter_options (FilterRequest_t &request,
311 cgiargsclass &args)
312{
313 if (!args["sqlsf"].empty()) { // sort field for lucene
314 OptionValue_t option;
315
316 option.name = "SortField";
317 option.value = args["sqlsf"];
318 request.filterOptions.push_back (option);
319 }
320
321 set_basequeryfilter_options(request, args);
322}
323
324
// Returns true when character is one of the query-syntax characters of the
// given indexer: '#', '/' and '*' for mgpp (indexer_type 1); '?', '*', '~'
// and '^' for lucene (indexer_type 2). Any other indexer type has none.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
337
338// This function removes boolean operators from simple searches, and segments
339// chinese characters if segment=true
340void format_querystring (text_t &querystring, int querymode, bool segment) {
341 text_t formattedstring;
342
343 // advanced search, no segmenting, don't need to do anything
344 if (querymode == 1 && !segment) return;
345
346 text_t::const_iterator here = querystring.begin();
347 text_t::const_iterator end = querystring.end();
348
349 // space is used to insert spaces between Chinese
350 // characters. No space is needed before the first
351 // Chinese character.
352 bool space = false;
353
354 // want to remove ()|!& from querystring so boolean queries are just
355 // "all the words" queries (unless querymode is advanced)
356 while (here != end) {
357 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
358 *here == '!' || *here == '&')) {
359 formattedstring.push_back(' ');
360 } else if (segment) {
361 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
362 ( *here >= 0xf900 && *here <= 0xfa6a)) {
363 /* text_t not big enough to handle these. */
364 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
365 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
366
367 // CJK character
368 if (!space) formattedstring.push_back (0x200b); // zero width space
369 formattedstring.push_back (*here);
370 formattedstring.push_back (0x200b);
371 space = true;
372 } else {
373
374 // non-Chinese character
375 formattedstring.push_back (*here);
376 space = false;
377
378 }
379
380 } else {
381 formattedstring.push_back (*here);
382 }
383 ++here;
384 }
385 querystring = formattedstring;
386}
387
388// turn query string into terms separated by spaces.
389// still working on this...
390text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
391 text_t::const_iterator here = querystring.begin();
392 text_t::const_iterator end = querystring.end();
393
394 // lets look for [] and () first - these are a pain.
395 text_t::const_iterator bracket;
396 text_t query_no_brackets = "";
397
398 // mgpp brackets: [xxx]:TI
399 if (findchar(here, end, '[') != end) {
400 while ((bracket = findchar(here, end, '[')) != end) {
401 // get the first bit
402 query_no_brackets += substr(here, bracket);
403 bracket++;
404 here = bracket;
405 // get the end bracket
406 bracket = findchar(here, end, ']');
407 query_no_brackets += substr(here, bracket);
408 // skip the :TI bits
409 while (bracket != end // do bracket != end test first, ELSE when bracket = end, we're past the string, in
410 && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
411 bracket++;
412 }
413 here = bracket;
414 }
415 if (here != end) {
416 query_no_brackets += substr(here,end);
417 }
418 } else if (findchar(here, end, '(') != end) {
419 // lucene brackets TI:(xxx)
420 while ((bracket = findchar(here, end, '(')) != end) {
421 // back up the field name
422 text_t::const_iterator old_bracket = bracket;
423 while (bracket != here && *bracket != ' ') { // order of tests in condition matters (see long comment above)
424 --bracket;
425 }
426 if (bracket != here) {
427 // get the first bit
428 query_no_brackets += substr(here, bracket+1);
429 }
430 here = old_bracket +1;
431 // get the end bracket
432 bracket = findchar(here, end, ')');
433 query_no_brackets += substr(here, bracket);
434 if (bracket != end) {
435 here = bracket+1;
436 }
437 }
438 if (here != end) {
439 query_no_brackets += substr(here,end);
440 }
441 } else {
442 // was no brackets
443 query_no_brackets = querystring;
444 }
445
446
447 if (arg_ct == "2") { // lucene
448 // look for AND OR NOT and remove
449 here = query_no_brackets.begin();
450 end = query_no_brackets.end();
451 text_tlist terms;
452 splitword(here, end, "AND", terms);
453 joinchar(terms, ' ', query_no_brackets);
454 here = query_no_brackets.begin();
455 end = query_no_brackets.end();
456 splitword(here, end, "OR", terms);
457 joinchar(terms, ' ', query_no_brackets);
458 here = query_no_brackets.begin();
459 end = query_no_brackets.end();
460 splitword(here, end, "NOT", terms);
461 joinchar(terms, ' ', query_no_brackets);
462
463 }
464 text_t terms = "";
465 bool space = false;
466 here = query_no_brackets.begin();
467 end = query_no_brackets.end();
468
469 while (here != end) {
470 if (*here == '#' || *here == '/') {
471 // skip over #is /10 etc
472 ++here;
473 while (here != end && *here != ' ') {
474 ++here;
475 }
476 if (here == end) break;
477 }
478 if (is_unicode_letdig(*here)) {
479 terms.push_back(*here);
480 space = false;
481 } else {
482 if (!space) {
483 terms.push_back(' ');
484 space = true;
485 }
486 }
487 ++here;
488 }
489 return trim(terms);
490
491}
492
493// search history tool
494// also used for form query macros
495text_t escape_quotes(const text_t &querystring) {
496
497 text_t::const_iterator here = querystring.begin();
498 text_t::const_iterator end = querystring.end();
499
500 text_t escquery = "";
501 while (here != end) {
502 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
503 else if (*here == '\n' || *here == '\r') {
504 escquery.push_back(' ');
505 } else {
506 escquery +="\\\\";
507 escquery.push_back(*here);
508 }
509
510 ++here;
511 }
512 return escquery;
513
514}
515
516// Parses the terms into words, and adds #si if necessary
517text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
518 const int indexer_type) {
519
520 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
521 if (stem == "0" && fold == "0") {
522 return terms;
523 }
524 // this is only for mgpp collections, shouldn't be called for anything else
525 if (indexer_type != 1) {
526 return terms;
527 }
528
529 text_t outtext;
530 text_t word;
531
532 text_t::const_iterator here = terms.begin();
533 text_t::const_iterator end = terms.end();
534
535 while (here !=end) {
536
537 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
538 // not word boundary
539 word.push_back(*here);
540 ++here;
541 }
542 else {
543 // found word boundary
544 if (!word.empty() ) {
545 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
546 outtext += word;
547 word.clear();
548 }
549 else {
550 word += "#";
551 if (stem == "1") word += "s";
552 if (fold == "1") word += "i";
553 outtext += word;
554 word.clear();
555 }
556 }
557 // this only used in advanced form, so we leave in boolean operators
558 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
559 *here == '(' || *here == ')' || is_unicode_space(*here)) {
560 outtext.push_back(*here);
561 }
562 ++here;
563 }
564 }
565
566 // get last word
567 if (!word.empty()) {
568 word += "#";
569 if (stem == "1") word += "s";
570 if (fold == "1") word += "i";
571 word += " ";
572 outtext += word;
573 }
574 return outtext;
575}
576
577
578// some query form parsing functions for use with mgpp & lucene
579
580void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
581{
582 querystring.clear();
583
584 int argct = args.getintarg("ct");
585 int argt = args.getintarg("t");// t=0 -and, t=1 - or
586 int argb = args.getintarg("b");
587
588 text_t combine;
589
590 // lucene uses global combine, so only need this for mgpp
591 if (argct==1) {
592 if (argt == 0) combine = "&";
593 else combine = "|";
594 }
595
596 text_t field = args["fqf"];
597 if (field.empty()) return; // no query
598 text_tarray fields;
599 splitchar(field.begin(), field.end(), ',', fields);
600
601 text_t value = args["fqv"];
602 if (value.empty()) return; // somethings wrong
603 text_tarray values;
604 splitchar(value.begin(), value.end(), ',', values);
605
606
607 for (int i=0; i< values.size(); ++i) {
608 if (!values[i].empty()) {
609 text_t this_value = values[i];
610
611 // remove operators for simple search, segments text if necessary
612 format_querystring(this_value, argb, segment);
613
614 // add tag info for this field (and other processing)
615 format_field_info(this_value, fields[i], argct, argt, argb);
616
617 // add into query string
618 if (argct == 2) {
619 // lucene
620 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
621 querystring += this_value+" ";
622 } else {
623 // mgpp
624 if (!querystring.empty()) {
625 querystring += " "+ combine+ " ";
626 }
627 querystring += this_value;
628 }
629 }
630 }
631}
632
633
634void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
635 querystring.clear();
636
637 const int argct = args.getintarg("ct");
638 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
639 int argb = args.getintarg("b");
640 text_t combine;
641 if (argct==1) {
642 combine = "&";
643 }
644 else { // lucene
645 combine = "AND";
646 }
647
648 text_t field = args["fqf"];
649 if (field.empty()) return; // no query
650 text_tarray fields;
651 splitchar(field.begin(), field.end(), ',', fields);
652
653 text_t value = args["fqv"];
654 if (value.empty()) return; // somethings wrong
655 text_tarray values;
656 splitchar(value.begin(), value.end(), ',', values);
657
658 text_t comb = args["fqc"];
659 if (comb.empty()) return; //somethings wrong
660 text_tarray combs;
661 splitchar(comb.begin(), comb.end(), ',', combs);
662
663 text_tarray stems;
664 text_tarray folds;
665 if (argct == 1) {// mgpp - lucene doesn't do stem/case
666 text_t stem = args["fqs"];
667 if (stem.empty()) return; // somethings wrong
668 splitchar(stem.begin(), stem.end(), ',', stems);
669
670 text_t fold = args["fqk"];
671 if (fold.empty()) return; // somethings wrong
672 splitchar(fold.begin(), fold.end(), ',', folds);
673 }
674
675 for(int i=0; i< values.size(); ++i) {
676 if (!values[i].empty()) {
677 if (i!=0) {
678 if (argct==1) {
679 if (combs[i-1]=="and") combine = "&";
680 else if (combs[i-1]=="or")combine = "|";
681 else if (combs[i-1]=="not")combine = "!";
682 }
683 else { // lucene
684 if (combs[i-1]=="and") combine = "AND";
685 else if (combs[i-1]=="or")combine = "OR";
686 else if (combs[i-1]=="not")combine = "NOT";
687 }
688 }
689 text_t this_value = values[i];
690 // remove operators for simple search, segments text if necessary
691 format_querystring(this_value, argb, segment);
692 if (argct == 1) { // mgpp only
693 this_value = addstemcase(this_value, stems[i], folds[i], argct);
694 }
695 // add tag info for this field (and other processing)
696 format_field_info(this_value, fields[i], argct, argt, argb);
697 // add into query string
698 if (!querystring.empty()) {
699 querystring += " "+ combine+ " ";
700 }
701 querystring += this_value;
702
703 }
704 }
705}
706
707
708// SQL versions for parsing query form
709
710void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
711{
712 querystring.clear();
713
714 int argt = args.getintarg("t");// t=0 -and, t=1 - or
715 int argb = args.getintarg("b");
716
717 text_t combine;
718
719 if (argt == 0) combine = "AND";
720 else combine = "OR";
721
722 text_t field = args["sqlfqf"];
723 if (field.empty()) return; // no query
724 text_tarray fields;
725 splitchar(field.begin(), field.end(), ',', fields);
726
727 text_t sqlcomb = args["sqlfqc"];
728 if (sqlcomb.empty()) return; //somethings wrong
729 text_tarray sqlcombs;
730 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
731
732 text_t value = args["fqv"];
733 if (value.empty()) return; // somethings wrong
734 text_tarray values;
735 splitchar(value.begin(), value.end(), ',', values);
736
737
738 for (int i=0; i< values.size(); ++i) {
739 if (!values[i].empty()) {
740 text_t this_value;
741 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
742 const text_t LIKE_CONDITION = "LIKE";
743
744 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
745 //in order to search a field starting with certain words.
746 if (sqlcombs[i] == STARTINGWITH_CONDITION)
747 {this_value = values[i];
748 this_value += "%";
749 // remove operators for simple search, segments text if necessary
750 format_querystring(this_value, argb, segment);
751 // add tag info for this field (and other processing)
752 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
753
754 else
755 {this_value = values[i];
756 // remove operators for simple search, segments text if necessary
757 format_querystring(this_value, argb, segment);
758 // add tag info for this field (and other processing)
759 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
760
761
762 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
763
764 if (querystring.empty()) {
765 // first query term
766 querystring = DISTINCT_SELECT_WHERE + this_value;
767 }
768 else {
769 this_value = DISTINCT_SELECT_WHERE + this_value;
770
771 if (combine=="AND") {
772 // INNER JOIN to restrict to only matching docOIDs
773 querystring = "SELECT docOID FROM (" + querystring + ")"
774 + " INNER JOIN (" + this_value +") USING (docOID)";
775 }
776 else if (combine=="OR") {
777 // Union to allow union of the two
778 querystring = querystring + " UNION " + this_value;
779 }
780 }
781 }
782 }
783}
784
785
786void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
787 bool segment)
788{
789 querystring.clear();
790
791 int argt = 0; // set it to 0 = AND, by default
792 int argb = args.getintarg("b");
793 text_t combine = "AND";
794
795 text_t field = args["sqlfqf"];
796
797 if (field.empty()) return; // no query
798 text_tarray fields;
799 splitchar(field.begin(), field.end(), ',', fields);
800
801 text_t sqlcomb = args["sqlfqc"];
802 if (sqlcomb.empty()) return; //somethings wrong
803 text_tarray sqlcombs;
804 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
805
806 text_t value = args["fqv"];
807 if (value.empty()) return; // somethings wrong
808 text_tarray values;
809 splitchar(value.begin(), value.end(), ',', values);
810
811 text_t comb = args["fqc"];
812 if (comb.empty()) return; //somethings wrong
813 text_tarray combs;
814 splitchar(comb.begin(), comb.end(), ',', combs);
815
816 for(int i=0; i< values.size(); ++i) {
817 if (!values[i].empty()) {
818 if (i>0) {
819 if (combs[i-1]=="and") { combine = "AND"; }
820 else if (combs[i-1]=="or") { combine = "OR"; }
821 else if (combs[i-1]=="not") { combine = "NOT"; }
822 }
823 text_t this_value;
824 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
825 const text_t LIKE_CONDITION = "LIKE";
826
827 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
828 //in order to search a field starting with certain words.
829 if (sqlcombs[i] == STARTINGWITH_CONDITION)
830 {this_value = values[i];
831 this_value += "%";
832 // remove operators for simple search, segments text if necessary
833 format_querystring(this_value, argb, segment);
834 // add tag info for this field (and other processing)
835 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
836
837 else
838 {this_value = values[i];
839 // remove operators for simple search, segments text if necessary
840 format_querystring(this_value, argb, segment);
841 // add tag info for this field (and other processing)
842 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
843
844 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
845
846 if (querystring.empty()) {
847 // first query term
848 querystring = DISTINCT_SELECT_WHERE + this_value;
849 }
850 else {
851 this_value = DISTINCT_SELECT_WHERE + this_value;
852
853 if (combine=="AND") {
854 // INNER JOIN to restrict to only matching docOIDs
855 querystring = "SELECT docOID FROM (" + querystring + ")"
856 + " INNER JOIN (" + this_value +") USING (docOID)";
857 }
858 else if (combine=="OR") {
859 // Union to allow union of the two
860 querystring = querystring + " UNION " + this_value;
861 }
862 else {
863 cerr << "Unsupported combination operation: " << combine << endl;
864 }
865 }
866
867 }
868 }
869}
870
871
872
873
874// Extended addqueryelem for Human Info project
875void addqueryelem_ex(text_t &querystring, const text_t &tag,
876 const text_t &terms, const text_t &stem,
877 const text_t &fold,
878 const text_t& combine, const text_t& word_combine) {
879
880 if (!querystring.empty()) { // have to put and/or
881 querystring += " " + combine + " ";
882 }
883 text_t outtext; outtext.reserve(512);
884 text_t word; word.reserve(100);
885 //unsigned short c;
886 text_t::const_iterator here = terms.begin();
887 text_t::const_iterator end = terms.end();
888 bool inquote = false, firstword = true;
889
890 text_t word2; word2.reserve(256);
891
892 while (here !=end) {
893 if (is_unicode_space(*here)) {
894 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
895 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
896 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
897 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
898 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
899 if (inquote) {
900 word2.push_back(*here);
901 }
902 word.append(word2); word2.clear();
903
904 if (!inquote && !word.empty() ) {
905 // found word boundary
906
907 if (stem == "1" || fold =="1") {
908 word += "#";
909 if (stem == "1") word += "s";
910 //else word += "u";
911
912 if (fold == "1") word += "i";
913 //else word += "c";
914 }
915 if (firstword) {
916 firstword = false;
917 } else {
918 outtext += " " + word_combine + " ";
919 }
920 outtext += "[" + word + "]:"+tag;
921 word.clear();
922 }
923 ++here;
924 } else if (*here == '\"') {
925 word2.push_back(*here);
926 inquote = !inquote;
927 ++here;
928 } else {
929 // not word boundary
930 word2.push_back(*here);
931 ++here;
932 }
933 }
934
935 // get last word
936 if (!word2.empty()) {
937 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
938 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
939 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
940 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
941 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
942 word.append(word2); word2.clear();
943
944 if (stem == "1"|| fold == "1") {
945 word += "#";
946 if (stem == "1") word += "s";
947 //else word += "u";
948
949 if (fold == "1") word += "i";
950 //else word += "c";
951 }
952 if (!outtext.empty()) outtext += " " + word_combine + " ";
953 outtext += "[" + word + "]:"+tag;
954 }
955 querystring += "(" + outtext + ")";
956}
957
958void add_field_info(text_t &querystring, const text_t &tag, int type) {
959
960 if (tag == "") return; // do nothing
961 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
962 if (type == 1) { //mgpp
963 querystring = "["+querystring+"]:"+tag;
964 } else if (type == 2) { // lucene
965 querystring = tag+":("+querystring+")";
966 }
967
968}
969
970
971void add_field_info_sql(text_t &querystring, const text_t &tagseq,
972 const text_t& sqlcomb)
973{
974
975 if (tagseq == "") return; // do nothing
976
977 text_t element_in = "(element IN (";
978
979 text_tlist mdterms;
980
981 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
982
983 text_t tags_in = "";
984
985 while (!mdterms.empty()) {
986 text_t tag = mdterms.front();
987 mdterms.pop_front();
988
989 if (!tag.empty()) {
990
991 // remove "ex." prefix, but only if there are no other metadata set qualifiers
992 // in the metaname, since we want to retain prefixes like "ex.dc." as-is
993 text_t::iterator period = findchar(tag.begin(), tag.end(), '.');
994 text_t::iterator lastperiod = findlastchar(tag.begin(), tag.end(), '.');
995
996 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.") && period == lastperiod) {
997 tag = substr (tag.begin()+3, tag.end());
998 }
999
1000 if (!tags_in.empty()) {
1001 tags_in += ",";
1002 }
1003
1004 tags_in += "'" + tag + "'";
1005 }
1006 }
1007
1008 element_in += tags_in + ") AND (";
1009
1010
1011 if (sqlcomb == "=") {
1012 // override what it means to do equality, to make it more like full text
1013 // searching
1014
1015 text_t orterms = "";
1016 text_t term = "";
1017 bool in_phrase = false;
1018
1019 text_t::const_iterator here = querystring.begin();
1020 text_t::const_iterator end = querystring.end();
1021 while (here != end) {
1022 if (is_unicode_letdig(*here)) {
1023 term.push_back(*here);
1024 }
1025 else if (*here == '"') {
1026 term.push_back(*here);
1027 if (!in_phrase) {
1028 in_phrase = true;
1029 } else {
1030 in_phrase = false;
1031 }
1032 }
1033 else if (in_phrase) {
1034 // Found word boundary, but in a phrase, so does not complete term
1035 term.push_back(*here);
1036 }
1037 else {
1038 // Found a word boundary
1039 if (!orterms.empty()) {
1040 orterms += " OR ";
1041 }
1042 orterms += "value LIKE '%" + term + "%'";
1043 term.clear();
1044 }
1045 ++here;
1046 }
1047
1048 if (!term.empty()) {
1049 if (!orterms.empty()) {
1050 orterms += " OR ";
1051 }
1052 orterms += "value LIKE '%" + term + "%'";
1053 }
1054
1055 element_in += orterms;
1056 }
1057 //We cast the value from STRING to REAL to allow numeric sorting
1058 else if (sqlcomb == "<num") {
1059 element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
1060 }
1061 else if (sqlcomb == ">num") {
1062 element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
1063 }
1064 else if (sqlcomb == "<=num") {
1065 element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
1066 }
1067 else if (sqlcomb == ">=num") {
1068 element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
1069 }
1070 else if (sqlcomb == "=num") {
1071 element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
1072 }
1073 else {
1074 // search on value is "as is" querystring
1075 element_in += "value " + sqlcomb + " '" + querystring+"'";
1076 }
1077
1078
1079 querystring = element_in + "))";
1080
1081}
1082
1083
1084void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1085
1086 int type = 2; //lucene
1087
1088 if (argb==0) { // simple
1089 // there will be no & or | as they should have already been removed
1090 // just tag the entire thing
1091 if (tag != "") {
1092 add_field_info(querystring, tag, type);
1093 }
1094 return;
1095 }
1096
1097 // need to replace & with &&, | with ||
1098 text_t::const_iterator here = querystring.begin();
1099 text_t::const_iterator end = querystring.end();
1100
1101 text_t finalquery = "";
1102 while (here != end) {
1103 if (*here == '&') {
1104 finalquery.push_back('&');
1105 finalquery.push_back('&');
1106 while (*(here+1) == '&') {
1107 ++here;
1108 }
1109 }
1110 else if (*here == '|') {
1111 finalquery.push_back('|');
1112 finalquery.push_back('|');
1113 while (*(here+1) == '|') {
1114 ++here;
1115 }
1116 }
1117 else {
1118 finalquery.push_back(*here);
1119 }
1120 ++here;
1121 }
1122 querystring = finalquery;
1123 add_field_info(querystring, tag, type);
1124}
1125
1126
1127void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1128
1129 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
1130 if (tag == "" && argb == 1) {
1131 return; // no field specifier, advanced mode, the query stays as written
1132 }
1133
1134 int type = 1; // mgpp
1135
1136 bool simple_and = (argb==0 && argt==0);
1137 text_t finalquery = "";
1138 text_t fieldpart ="";
1139 text_t queryelem = "";
1140 bool in_phrase = false;
1141 bool in_field = false;
1142
1143 text_t::const_iterator here = querystring.begin();
1144 text_t::const_iterator end = querystring.end();
1145 while (here != end) {
1146 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1147 queryelem.push_back(*here);
1148 }
1149 else if (*here == '|') {
1150 in_field = false;
1151 }
1152 else if (*here == '!' || *here == '(' || *here == ')') {
1153 if (!in_phrase) { // ignore these if in_phrase
1154 // output field, then output operator
1155 in_field = false;
1156 if (!queryelem.empty()) {
1157 if (!simple_and && !fieldpart.empty()) {
1158 add_field_info(fieldpart, tag, type);
1159 finalquery += fieldpart;
1160 finalquery.push_back(' ');
1161 fieldpart.clear();
1162 }
1163 fieldpart += queryelem;
1164 }
1165 if (!fieldpart.empty()) {
1166 add_field_info(fieldpart, tag, type);
1167 finalquery += fieldpart;
1168 finalquery.push_back(' ');
1169 }
1170 fieldpart.clear();
1171 queryelem.clear();
1172 finalquery.push_back(*here);
1173 finalquery.push_back(' ');
1174 }
1175 }
1176 else if (*here == '"') {
1177 queryelem.push_back(*here);
1178 if (in_phrase == false) in_phrase = true;
1179 else {
1180 in_phrase = false;
1181 }
1182 }
1183
1184 // Found word boundary, in a phrase
1185 else if (in_phrase) {
1186 queryelem.push_back(*here);
1187 }
1188 // Found a word boundary
1189 else {
1190 if (!queryelem.empty()) {
1191 if (queryelem == "&") {
1192 in_field = true;
1193 queryelem.clear();
1194 }
1195 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1196
1197 if (argb==1) {
1198 // simple search, these not allowed
1199 in_field = true;
1200 fieldpart += queryelem;
1201 fieldpart.push_back(' ');
1202 }
1203 queryelem.clear();
1204
1205 }
1206 else {
1207 if (!simple_and && !in_field) {
1208 if (!fieldpart.empty()) {
1209 add_field_info(fieldpart, tag, type);
1210 finalquery += fieldpart;
1211 finalquery.push_back(' ');
1212 fieldpart.clear();
1213 }
1214 }
1215
1216 fieldpart += queryelem;
1217 fieldpart.push_back(' ');
1218 queryelem.clear();
1219 }
1220 }
1221 }
1222 ++here;
1223 }
1224 // at the end
1225 if (!queryelem.empty()) {
1226 if (!simple_and && !in_field && !fieldpart.empty()) {
1227 add_field_info(fieldpart, tag, type);
1228 finalquery += fieldpart;
1229 finalquery.push_back(' ');
1230 fieldpart.clear();
1231 }
1232 fieldpart += queryelem;
1233 }
1234 if (!fieldpart.empty()) {
1235 add_field_info(fieldpart, tag, type);
1236 finalquery += fieldpart;
1237 fieldpart.clear();
1238
1239 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1240 // consider cutting this line
1241 finalquery.push_back(' ');
1242 }
1243
1244 querystring = finalquery;
1245}
1246
1247
1248void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1249 const text_t &sqlcomb,
1250 int argt, int argb)
1251{
1252 add_field_info_sql(querystring, tagseq, sqlcomb);
1253}
1254
1255
1256void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
1257 if (argct == 1) {
1258 format_field_info_mgpp(querystring, tag, argt, argb);
1259 } else if (argct == 2) {
1260 format_field_info_lucene(querystring, tag, argt, argb);
1261 }
1262}
1263
1264void mgpp_adddateelem(text_t& querystring, const int date)
1265{
1266 querystring.appendcstr(" [");
1267 if(date<0) {
1268 querystring.appendcstr("bc");
1269 querystring.appendint((date*-1));
1270 }
1271 else {
1272 querystring.appendint(date);
1273 }
1274 querystring.appendcstr("]:CV");
1275}
1276
1277void lucene_adddateelem(text_t& querystring, const int date)
1278{
1279 querystring.appendcstr(" CV:(");
1280 if(date<0) {
1281 querystring.appendcstr("bc");
1282 querystring.appendint((date*-1));
1283 }
1284 else {
1285 querystring.appendint(date);
1286 }
1287 querystring.appendcstr(")");
1288}
1289
1290
1291void add_dates(text_t &querystring, int startdate, int enddate,
1292 int startbc, int endbc, int ct)
1293{
1294 if(startdate)
1295 {
1296 int querystringis = 0;
1297 text_t::const_iterator here = querystring.begin();
1298 text_t::const_iterator end = querystring.end();
1299 while(here!=end)
1300 {
1301 if(!(isspace((*here)))){
1302 here = end;
1303 querystringis = 1;
1304 }
1305 else
1306 ++here;
1307 }
1308 //converting BCE dates
1309 if(startbc && startdate > 0)
1310 {
1311 startdate *= -1;
1312 }
1313 if(endbc && enddate > 0)
1314 {
1315 enddate *= -1;
1316 }
1317 if(enddate != 0 && enddate<startdate)
1318 {
1319 cout<<"enddate too small"<<endl;
1320 return;
1321 }
1322 if(querystringis)
1323 querystring.appendcstr(" AND");
1324 if(!enddate)
1325 {
1326 if (ct==1) {
1327 mgpp_adddateelem(querystring,startdate);
1328 }
1329 else { // lucene
1330 lucene_adddateelem(querystring,startdate);
1331 }
1332 }
1333 else{
1334 int nextdate = startdate;
1335 querystring.appendcstr(" (");
1336 while(nextdate<=enddate)
1337 {
1338 if(nextdate!=0) {
1339 if (ct==1) {
1340 mgpp_adddateelem(querystring,nextdate);
1341 }
1342 else { // lucene
1343 lucene_adddateelem(querystring,nextdate);
1344 }
1345 }
1346 ++nextdate;
1347 }
1348 querystring.appendcstr(" )");
1349 }
1350 }
1351
1352}
Note: See TracBrowser for help on using the repository browser.