source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 22177

Last change on this file since 22177 was 22046, checked in by davidb, 14 years ago

Changes necessary to support new sql-query action

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.2 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// sets the ct, qt, qto arguments
31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
58 search_types = (*check).second;
59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
64 if (arg_qto == 2) {
65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
69 }
70 return;
71 }
72
73
74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
84
85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
93
94
95 // decide if sqlqto should be set or not
96 unsigned int sql_type = 0;
97 text_t infodb_type = cinfo->infodbType;
98 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100 sql_type = 1;
101 }
102 }
103
104 if (sql_type) {
105 args["sqlqto"] = "1";
106 }
107 else {
108 args["sqlqto"] = "0";
109 }
110
111
112}
113
114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116 int stemIndexes = cinfo->stemIndexes;
117
118 if (stemIndexes & SIcasefold) {
119 args["ks"] = 1;
120 }
121 if (stemIndexes & SIstem) {
122 args["ss"] = 1;
123 }
124 if (stemIndexes & SIaccentfold) {
125 args["afs"] = 1;
126 }
127
128}
129
130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133 cgiargsclass &args)
134{
135
136 OptionValue_t option;
137 int arg_m = args.getintarg("m");
138
139 option.name = "Maxdocs";
140 option.value = arg_m;
141 request.filterOptions.push_back (option);
142
143 // option.name = "StartResults";
144 // option.value = args["r"];
145 // request.filterOptions.push_back (option);
146
147 // option.name = "EndResults";
148 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150 // option.value = endresults;
151 // request.filterOptions.push_back (option);
152}
153
154
155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158 const text_t &querystring,
159 cgiargsclass &args)
160{
161 // better if this function, and the two-query companion function
162 // was implemented in queryaction.cpp
163 // Has to be done here to documentaction.cpp can call it directly
164
165 request.filterName = "QueryFilter";
166
167 OptionValue_t option;
168
169 option.name = "Term";
170 option.value = querystring;
171 request.filterOptions.push_back (option);
172
173 option.name = "QueryType";
174 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175 request.filterOptions.push_back (option);
176
177 option.name = "MatchMode";
178 // mgpp in advanced mode, always use some query
179 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
180 option.value = "some";
181 } else {
182 option.value = (args.getintarg("t")) ? "some" : "all";
183 }
184 request.filterOptions.push_back (option);
185
186 option.name = "Casefold";
187 option.value = (args.getintarg("k")) ? "true" : "false";
188 request.filterOptions.push_back (option);
189
190 option.name = "Stem";
191 option.value = (args.getintarg("s")) ? "true" : "false";
192 request.filterOptions.push_back (option);
193
194 option.name = "AccentFold";
195 option.value = (args.getintarg("af")) ? "true" : "false";
196 request.filterOptions.push_back (option);
197
198 if (!args["h"].empty()) {
199 option.name = "Index";
200 option.value = args["h"];
201 request.filterOptions.push_back (option);
202 }
203
204 if (!args["j"].empty()) {
205 option.name = "Subcollection";
206 option.value = args["j"];
207 request.filterOptions.push_back (option);
208 }
209
210 if (!args["n"].empty()) {
211 option.name = "Language";
212 option.value = args["n"];
213 request.filterOptions.push_back (option);
214 }
215
216 if (!args["g"].empty()) { // granularity for mgpp
217 option.name = "Level";
218 option.value = args["g"];
219 request.filterOptions.push_back (option);
220 }
221
222 if (!args["fs"].empty()) { // filter string for lucene
223 option.name = "FilterString";
224 option.value = args["fs"];
225 request.filterOptions.push_back (option);
226 }
227
228 if (!args["sf"].empty()) { // sort field for lucene
229 option.name = "SortField";
230 option.value = args["sf"];
231 request.filterOptions.push_back (option);
232 }
233
234 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
235 option.name = "Fuzziness";
236 option.value = (text_t) "0." + args["fuzziness"];
237 request.filterOptions.push_back (option);
238 }
239
240 set_basequeryfilter_options(request, args);
241}
242
243
244
245void set_fulltext_queryfilter_options (FilterRequest_t &request,
246 const text_t &querystring1,
247 const text_t &querystring2,
248 cgiargsclass &args)
249{
250
251 set_fulltext_queryfilter_options (request, querystring1, args);
252
253 // fill in the second query if needed
254 if (!args["cq2"].empty()) {
255 OptionValue_t option;
256
257 option.name = "CombineQuery";
258 option.value = args["cq2"];
259 request.filterOptions.push_back (option);
260
261 option.name = "Term";
262 option.value = querystring2;
263 request.filterOptions.push_back (option);
264
265 option.name = "QueryType";
266 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
267 request.filterOptions.push_back (option);
268
269 option.name = "Casefold";
270 option.value = (args.getintarg("k")) ? "true" : "false";
271 request.filterOptions.push_back (option);
272
273 option.name = "Stem";
274 option.value = (args.getintarg("s")) ? "true" : "false";
275 request.filterOptions.push_back (option);
276
277 option.name = "AccentFold";
278 option.value = (args.getintarg("af")) ? "true" : "false";
279 request.filterOptions.push_back (option);
280
281 if (!args["h2"].empty()) {
282 option.name = "Index";
283 option.value = args["h2"];
284 request.filterOptions.push_back (option);
285 }
286
287 if (!args["j2"].empty()) {
288 option.name = "Subcollection";
289 option.value = args["j2"];
290 request.filterOptions.push_back (option);
291 }
292
293 if (!args["n2"].empty()) {
294 option.name = "Language";
295 option.value = args["n2"];
296 request.filterOptions.push_back (option);
297 }
298 }
299
300 // this is probably redundant, as first line to this method will have
301 // already caused it to invoke set_basequeryfilter_options
302
303 set_basequeryfilter_options(request, args);
304}
305
306
307
308// request.filterResultOptions and request.fields (if required) should
309// be set from the calling code
310void set_sql_queryfilter_options (FilterRequest_t &request,
311 cgiargsclass &args)
312{
313 if (!args["sqlsf"].empty()) { // sort field for lucene
314 OptionValue_t option;
315
316 option.name = "SortField";
317 option.value = args["sqlsf"];
318 request.filterOptions.push_back (option);
319 }
320
321 set_basequeryfilter_options(request, args);
322}
323
324
// Returns true if `character` has a special meaning inside a query term for
// the given indexer: '#', '/' and '*' for mgpp (indexer_type 1); '?', '*',
// '~' and '^' for lucene (indexer_type 2). Any other indexer type has no
// special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
337
338// This function removes boolean operators from simple searches, and segments
339// chinese characters if segment=true
340void format_querystring (text_t &querystring, int querymode, bool segment) {
341 text_t formattedstring;
342
343 // advanced search, no segmenting, don't need to do anything
344 if (querymode == 1 && !segment) return;
345
346 text_t::const_iterator here = querystring.begin();
347 text_t::const_iterator end = querystring.end();
348
349 // space is used to insert spaces between Chinese
350 // characters. No space is needed before the first
351 // Chinese character.
352 bool space = false;
353
354 // want to remove ()|!& from querystring so boolean queries are just
355 // "all the words" queries (unless querymode is advanced)
356 while (here != end) {
357 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
358 *here == '!' || *here == '&')) {
359 formattedstring.push_back(' ');
360 } else if (segment) {
361 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
362 ( *here >= 0xf900 && *here <= 0xfa6a)) {
363 /* text_t not big enough to handle these. */
364 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
365 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
366
367 // CJK character
368 if (!space) formattedstring.push_back (0x200b); // zero width space
369 formattedstring.push_back (*here);
370 formattedstring.push_back (0x200b);
371 space = true;
372 } else {
373
374 // non-Chinese character
375 formattedstring.push_back (*here);
376 space = false;
377
378 }
379
380 } else {
381 formattedstring.push_back (*here);
382 }
383 ++here;
384 }
385 querystring = formattedstring;
386}
387
388// turn query string into terms separated by spaces.
389// still working on this...
390text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
391 text_t::const_iterator here = querystring.begin();
392 text_t::const_iterator end = querystring.end();
393
394 // lets look for [] and () first - these are a pain.
395 text_t::const_iterator bracket;
396 text_t query_no_brackets = "";
397
398 // mgpp brackets: [xxx]:TI
399 if (findchar(here, end, '[') != end) {
400 while ((bracket = findchar(here, end, '[')) != end) {
401 // get the first bit
402 query_no_brackets += substr(here, bracket);
403 bracket++;
404 here = bracket;
405 // get the end bracket
406 bracket = findchar(here, end, ']');
407 query_no_brackets += substr(here, bracket);
408 // skip the :TI bits
409 while (*bracket != ' ' && bracket != end) { bracket++;}
410 here = bracket;
411 }
412 if (here != end) {
413 query_no_brackets += substr(here,end);
414 }
415 } else if (findchar(here, end, '(') != end) {
416 // lucene brackets TI:(xxx)
417 while ((bracket = findchar(here, end, '(')) != end) {
418 // back up the field name
419 text_t::const_iterator old_bracket = bracket;
420 while (*bracket != ' ' && bracket != here) {
421 --bracket;
422 }
423 if (bracket != here) {
424 // get the first bit
425 query_no_brackets += substr(here, bracket+1);
426 }
427 here = old_bracket +1;
428 // get the end bracket
429 bracket = findchar(here, end, ')');
430 query_no_brackets += substr(here, bracket);
431 if (bracket != end) {
432 here = bracket+1;
433 }
434 }
435 if (here != end) {
436 query_no_brackets += substr(here,end);
437 }
438 } else {
439 // was no brackets
440 query_no_brackets = querystring;
441 }
442
443
444 if (arg_ct == "2") { // lucene
445 // look for AND OR NOT and remove
446 here = query_no_brackets.begin();
447 end = query_no_brackets.end();
448 text_tlist terms;
449 splitword(here, end, "AND", terms);
450 joinchar(terms, ' ', query_no_brackets);
451 here = query_no_brackets.begin();
452 end = query_no_brackets.end();
453 splitword(here, end, "OR", terms);
454 joinchar(terms, ' ', query_no_brackets);
455 here = query_no_brackets.begin();
456 end = query_no_brackets.end();
457 splitword(here, end, "NOT", terms);
458 joinchar(terms, ' ', query_no_brackets);
459
460 }
461 text_t terms = "";
462 bool space = false;
463 here = query_no_brackets.begin();
464 end = query_no_brackets.end();
465
466 while (here != end) {
467 if (*here == '#' || *here == '/') {
468 // skip over #is /10 etc
469 ++here;
470 while (here != end && *here != ' ') {
471 ++here;
472 }
473 if (here == end) break;
474 }
475 if (is_unicode_letdig(*here)) {
476 terms.push_back(*here);
477 space = false;
478 } else {
479 if (!space) {
480 terms.push_back(' ');
481 space = true;
482 }
483 }
484 ++here;
485 }
486 return terms;
487
488}
489
490// search history tool
491// also used for form query macros
492text_t escape_quotes(const text_t &querystring) {
493
494 text_t::const_iterator here = querystring.begin();
495 text_t::const_iterator end = querystring.end();
496
497 text_t escquery = "";
498 while (here != end) {
499 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
500 else if (*here == '\n' || *here == '\r') {
501 escquery.push_back(' ');
502 } else {
503 escquery +="\\\\";
504 escquery.push_back(*here);
505 }
506
507 ++here;
508 }
509 return escquery;
510
511}
512
513// Parses the terms into words, and adds #si if necessary
514text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
515 const int indexer_type) {
516
517 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
518 if (stem == "0" && fold == "0") {
519 return terms;
520 }
521 // this is only for mgpp collections, shouldn't be called for anything else
522 if (indexer_type != 1) {
523 return terms;
524 }
525
526 text_t outtext;
527 text_t word;
528
529 text_t::const_iterator here = terms.begin();
530 text_t::const_iterator end = terms.end();
531
532 while (here !=end) {
533
534 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
535 // not word boundary
536 word.push_back(*here);
537 ++here;
538 }
539 else {
540 // found word boundary
541 if (!word.empty() ) {
542 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
543 outtext += word;
544 word.clear();
545 }
546 else {
547 word += "#";
548 if (stem == "1") word += "s";
549 if (fold == "1") word += "i";
550 outtext += word;
551 word.clear();
552 }
553 }
554 // this only used in advanced form, so we leave in boolean operators
555 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
556 *here == '(' || *here == ')' || is_unicode_space(*here)) {
557 outtext.push_back(*here);
558 }
559 ++here;
560 }
561 }
562
563 // get last word
564 if (!word.empty()) {
565 word += "#";
566 if (stem == "1") word += "s";
567 if (fold == "1") word += "i";
568 word += " ";
569 outtext += word;
570 }
571 return outtext;
572}
573
574
575// some query form parsing functions for use with mgpp & lucene
576
577void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
578{
579 querystring.clear();
580
581 int argct = args.getintarg("ct");
582 int argt = args.getintarg("t");// t=0 -and, t=1 - or
583 int argb = args.getintarg("b");
584
585 text_t combine;
586
587 // lucene uses global combine, so only need this for mgpp
588 if (argct==1) {
589 if (argt == 0) combine = "&";
590 else combine = "|";
591 }
592
593 text_t field = args["fqf"];
594 if (field.empty()) return; // no query
595 text_tarray fields;
596 splitchar(field.begin(), field.end(), ',', fields);
597
598 text_t value = args["fqv"];
599 if (value.empty()) return; // somethings wrong
600 text_tarray values;
601 splitchar(value.begin(), value.end(), ',', values);
602
603
604 for (int i=0; i< values.size(); ++i) {
605 if (!values[i].empty()) {
606 text_t this_value = values[i];
607
608 // remove operators for simple search, segments text if necessary
609 format_querystring(this_value, argb, segment);
610
611 // add tag info for this field (and other processing)
612 format_field_info(this_value, fields[i], argct, argt, argb);
613
614 // add into query string
615 if (argct == 2) {
616 // lucene
617 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
618 querystring += this_value+" ";
619 } else {
620 // mgpp
621 if (!querystring.empty()) {
622 querystring += " "+ combine+ " ";
623 }
624 querystring += this_value;
625 }
626 }
627 }
628}
629
630
631void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
632 querystring.clear();
633
634 const int argct = args.getintarg("ct");
635 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
636 int argb = args.getintarg("b");
637 text_t combine;
638 if (argct==1) {
639 combine = "&";
640 }
641 else { // lucene
642 combine = "AND";
643 }
644
645 text_t field = args["fqf"];
646 if (field.empty()) return; // no query
647 text_tarray fields;
648 splitchar(field.begin(), field.end(), ',', fields);
649
650 text_t value = args["fqv"];
651 if (value.empty()) return; // somethings wrong
652 text_tarray values;
653 splitchar(value.begin(), value.end(), ',', values);
654
655 text_t comb = args["fqc"];
656 if (comb.empty()) return; //somethings wrong
657 text_tarray combs;
658 splitchar(comb.begin(), comb.end(), ',', combs);
659
660 text_tarray stems;
661 text_tarray folds;
662 if (argct == 1) {// mgpp - lucene doesn't do stem/case
663 text_t stem = args["fqs"];
664 if (stem.empty()) return; // somethings wrong
665 splitchar(stem.begin(), stem.end(), ',', stems);
666
667 text_t fold = args["fqk"];
668 if (fold.empty()) return; // somethings wrong
669 splitchar(fold.begin(), fold.end(), ',', folds);
670 }
671
672 for(int i=0; i< values.size(); ++i) {
673 if (!values[i].empty()) {
674 if (i!=0) {
675 if (argct==1) {
676 if (combs[i-1]=="and") combine = "&";
677 else if (combs[i-1]=="or")combine = "|";
678 else if (combs[i-1]=="not")combine = "!";
679 }
680 else { // lucene
681 if (combs[i-1]=="and") combine = "AND";
682 else if (combs[i-1]=="or")combine = "OR";
683 else if (combs[i-1]=="not")combine = "NOT";
684 }
685 }
686 text_t this_value = values[i];
687 // remove operators for simple search, segments text if necessary
688 format_querystring(this_value, argb, segment);
689 if (argct == 1) { // mgpp only
690 this_value = addstemcase(this_value, stems[i], folds[i], argct);
691 }
692 // add tag info for this field (and other processing)
693 format_field_info(this_value, fields[i], argct, argt, argb);
694 // add into query string
695 if (!querystring.empty()) {
696 querystring += " "+ combine+ " ";
697 }
698 querystring += this_value;
699
700 }
701 }
702}
703
704
705// SQL versions for parsing query form
706
707void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
708{
709 querystring.clear();
710
711 int argt = args.getintarg("t");// t=0 -and, t=1 - or
712 int argb = args.getintarg("b");
713
714 text_t combine;
715
716 if (argt == 0) combine = "AND";
717 else combine = "OR";
718
719 text_t field = args["sqlfqf"];
720 if (field.empty()) return; // no query
721 text_tarray fields;
722 splitchar(field.begin(), field.end(), ',', fields);
723
724 text_t sqlcomb = args["sqlfqc"];
725 if (sqlcomb.empty()) return; //somethings wrong
726 text_tarray sqlcombs;
727 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
728
729 text_t value = args["fqv"];
730 if (value.empty()) return; // somethings wrong
731 text_tarray values;
732 splitchar(value.begin(), value.end(), ',', values);
733
734
735 for (int i=0; i< values.size(); ++i) {
736 if (!values[i].empty()) {
737 text_t this_value = values[i];
738
739 // remove operators for simple search, segments text if necessary
740 format_querystring(this_value, argb, segment);
741
742 // add tag info for this field (and other processing)
743 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
744
745 const text_t DISTINCT_SELECT_WHERE
746 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
747
748 if (querystring.empty()) {
749 // first query term
750 querystring = DISTINCT_SELECT_WHERE + this_value;
751 }
752 else {
753 this_value = DISTINCT_SELECT_WHERE + this_value;
754
755 if (combine=="AND") {
756 // INNER JOIN to restrict to only matching docOIDs
757 querystring = "SELECT docOID FROM (" + querystring + ")"
758 + " INNER JOIN (" + this_value +") USING (docOID)";
759 }
760 else if (combine=="OR") {
761 // Union to allow union of the two
762 querystring = querystring + " UNION " + this_value;
763 }
764 }
765 }
766 }
767}
768
769
770void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
771 bool segment)
772{
773 querystring.clear();
774
775 int argt = 0; // set it to 0 = AND, by default
776 int argb = args.getintarg("b");
777 text_t combine = "AND";
778
779 text_t field = args["sqlfqf"];
780
781 if (field.empty()) return; // no query
782 text_tarray fields;
783 splitchar(field.begin(), field.end(), ',', fields);
784
785 text_t sqlcomb = args["sqlfqc"];
786 if (sqlcomb.empty()) return; //somethings wrong
787 text_tarray sqlcombs;
788 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
789
790 text_t value = args["fqv"];
791 if (value.empty()) return; // somethings wrong
792 text_tarray values;
793 splitchar(value.begin(), value.end(), ',', values);
794
795 text_t comb = args["fqc"];
796 if (comb.empty()) return; //somethings wrong
797 text_tarray combs;
798 splitchar(comb.begin(), comb.end(), ',', combs);
799
800 for(int i=0; i< values.size(); ++i) {
801 if (!values[i].empty()) {
802 if (i>0) {
803 if (combs[i-1]=="and") { combine = "AND"; }
804 else if (combs[i-1]=="or") { combine = "OR"; }
805 else if (combs[i-1]=="not") { combine = "NOT"; }
806 }
807 text_t this_value = values[i];
808
809 // remove operators for simple search, segments text if necessary
810 format_querystring(this_value, argb, segment);
811
812 // add tag info for this field (and other processing)
813 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);
814
815 // add into query string
816
817 const text_t DISTINCT_SELECT_WHERE
818 = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
819
820 if (querystring.empty()) {
821 // first query term
822 querystring = DISTINCT_SELECT_WHERE + this_value;
823 }
824 else {
825 this_value = DISTINCT_SELECT_WHERE + this_value;
826
827 if (combine=="AND") {
828 // INNER JOIN to restrict to only matching docOIDs
829 querystring = "SELECT docOID FROM (" + querystring + ")"
830 + " INNER JOIN (" + this_value +") USING (docOID)";
831 }
832 else if (combine=="OR") {
833 // Union to allow union of the two
834 querystring = querystring + " UNION " + this_value;
835 }
836 else {
837 cerr << "Unsupported combination operation: " << combine << endl;
838 }
839 }
840
841 }
842 }
843}
844
845
846
847
848// Extended addqueryelem for Human Info project
849void addqueryelem_ex(text_t &querystring, const text_t &tag,
850 const text_t &terms, const text_t &stem,
851 const text_t &fold,
852 const text_t& combine, const text_t& word_combine) {
853
854 if (!querystring.empty()) { // have to put and/or
855 querystring += " " + combine + " ";
856 }
857 text_t outtext; outtext.reserve(512);
858 text_t word; word.reserve(100);
859 //unsigned short c;
860 text_t::const_iterator here = terms.begin();
861 text_t::const_iterator end = terms.end();
862 bool inquote = false, firstword = true;
863
864 text_t word2; word2.reserve(256);
865
866 while (here !=end) {
867 if (is_unicode_space(*here)) {
868 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
869 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
870 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
871 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
872 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
873 if (inquote) {
874 word2.push_back(*here);
875 }
876 word.append(word2); word2.clear();
877
878 if (!inquote && !word.empty() ) {
879 // found word boundary
880
881 if (stem == "1" || fold =="1") {
882 word += "#";
883 if (stem == "1") word += "s";
884 //else word += "u";
885
886 if (fold == "1") word += "i";
887 //else word += "c";
888 }
889 if (firstword) {
890 firstword = false;
891 } else {
892 outtext += " " + word_combine + " ";
893 }
894 outtext += "[" + word + "]:"+tag;
895 word.clear();
896 }
897 ++here;
898 } else if (*here == '\"') {
899 word2.push_back(*here);
900 inquote = !inquote;
901 ++here;
902 } else {
903 // not word boundary
904 word2.push_back(*here);
905 ++here;
906 }
907 }
908
909 // get last word
910 if (!word2.empty()) {
911 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
912 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
913 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
914 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
915 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
916 word.append(word2); word2.clear();
917
918 if (stem == "1"|| fold == "1") {
919 word += "#";
920 if (stem == "1") word += "s";
921 //else word += "u";
922
923 if (fold == "1") word += "i";
924 //else word += "c";
925 }
926 if (!outtext.empty()) outtext += " " + word_combine + " ";
927 outtext += "[" + word + "]:"+tag;
928 }
929 querystring += "(" + outtext + ")";
930}
931
932void add_field_info(text_t &querystring, const text_t &tag, int type) {
933
934 if (tag == "") return; // do nothing
935 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
936 if (type == 1) { //mgpp
937 querystring = "["+querystring+"]:"+tag;
938 } else if (type == 2) { // lucene
939 querystring = tag+":("+querystring+")";
940 }
941
942}
943
944
945void add_field_info_sql(text_t &querystring, const text_t &tagseq,
946 const text_t& sqlcomb)
947{
948
949 if (tagseq == "") return; // do nothing
950
951 text_t element_in = "(element IN (";
952
953 text_tlist mdterms;
954
955 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
956
957 text_t tags_in = "";
958
959 while (!mdterms.empty()) {
960 text_t tag = mdterms.front();
961 mdterms.pop_front();
962
963 if (!tag.empty()) {
964
965 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
966 tag = substr (tag.begin()+3, tag.end());
967 }
968
969 if (!tags_in.empty()) {
970 tags_in += ",";
971 }
972
973 tags_in += "'" + tag + "'";
974 }
975 }
976
977 element_in += tags_in + ") AND (";
978
979 if (sqlcomb == "=") {
980 // override what it means to do equality, to make it more like full text
981 // searching
982
983 text_t orterms = "";
984 text_t term = "";
985 bool in_phrase = false;
986
987 text_t::const_iterator here = querystring.begin();
988 text_t::const_iterator end = querystring.end();
989 while (here != end) {
990 if (is_unicode_letdig(*here)) {
991 term.push_back(*here);
992 }
993 else if (*here == '"') {
994 term.push_back(*here);
995 if (!in_phrase) {
996 in_phrase = true;
997 } else {
998 in_phrase = false;
999 }
1000 }
1001 else if (in_phrase) {
1002 // Found word boundary, but in a phrase, so does not complete term
1003 term.push_back(*here);
1004 }
1005 else {
1006 // Found a word boundary
1007 if (!orterms.empty()) {
1008 orterms += " OR ";
1009 }
1010 orterms += "value LIKE '%" + term + "%'";
1011 term.clear();
1012 }
1013 ++here;
1014 }
1015
1016 if (!term.empty()) {
1017 if (!orterms.empty()) {
1018 orterms += " OR ";
1019 }
1020 orterms += "value LIKE '%" + term + "%'";
1021 }
1022
1023 element_in += orterms;
1024 }
1025 else {
1026 // search on value is "as is" querystring
1027 element_in += "value " + sqlcomb + " '" + querystring+"'";
1028 }
1029
1030
1031 querystring = element_in + "))";
1032
1033}
1034
1035
1036void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1037
1038 int type = 2; //lucene
1039
1040 if (argb==0) { // simple
1041 // there will be no & or | as they should have already been removed
1042 // just tag the entire thing
1043 if (tag != "") {
1044 add_field_info(querystring, tag, type);
1045 }
1046 return;
1047 }
1048
1049 // need to replace & with &&, | with ||
1050 text_t::const_iterator here = querystring.begin();
1051 text_t::const_iterator end = querystring.end();
1052
1053 text_t finalquery = "";
1054 while (here != end) {
1055 if (*here == '&') {
1056 finalquery.push_back('&');
1057 finalquery.push_back('&');
1058 while (*(here+1) == '&') {
1059 ++here;
1060 }
1061 }
1062 else if (*here == '|') {
1063 finalquery.push_back('|');
1064 finalquery.push_back('|');
1065 while (*(here+1) == '|') {
1066 ++here;
1067 }
1068 }
1069 else {
1070 finalquery.push_back(*here);
1071 }
1072 ++here;
1073 }
1074 querystring = finalquery;
1075 add_field_info(querystring, tag, type);
1076}
1077
1078
// Rewrites querystring for an mgpp collection. The query is cut into chunks
// ("field parts"); each chunk is passed to add_field_info() together with the
// tag and then appended to the final query, chunks being separated by spaces.
//
//   querystring - the query text; replaced by the rewritten query
//   tag         - field tag; "ZZ" is treated as "no tag"
//   argt        - only consulted to compute simple_and (argt==0)
//                 NOTE(review): presumably 0 means "all words" - confirm with caller
//   argb        - 0 is simple mode, 1 is advanced mode
//
// With no tag in advanced mode the query is returned untouched.
void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {

  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
  if (tag == "" && argb == 1) {
    return; // no field specifier, advanced mode, the query stays as written
  }

  int type = 1; // mgpp

  // when set, words are never flushed one at a time: they pile up in
  // fieldpart and are handed to add_field_info() together
  bool simple_and = (argb==0 && argt==0);
  text_t finalquery = "";  // the rewritten query being built
  text_t fieldpart ="";    // words collected for the next add_field_info() call
  text_t queryelem = "";   // the token currently being read
  bool in_phrase = false;  // true between an opening and a closing '"'
  // true once a lone "&" (or, in advanced mode, NEAR.../WITHIN...) has been
  // seen: following words join the current fieldpart instead of starting a
  // new one. Stays set until a '|', '!', '(' or ')' is met.
  bool in_field = false;

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();
  while (here != end) {
    // characters that belong to a token ('&' is collected too, so only a
    // token that is exactly "&" is later treated as an operator)
    if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
      queryelem.push_back(*here);
    }
    // '|' is not copied to the output; it only ends the current field run.
    // Note this branch is reached inside a phrase as well.
    else if (*here == '|') {
      in_field = false;
    }
    else if (*here == '!' || *here == '(' || *here == ')') {
      if (!in_phrase) { // ignore these if in_phrase
	// output field, then output operator
	in_field = false;
	if (!queryelem.empty()) {
	  // unless everything is being tagged as one unit, flush what was
	  // collected so far before taking the pending token
	  if (!simple_and && !fieldpart.empty()) {
	    add_field_info(fieldpart, tag, type);
	    finalquery += fieldpart;
	    finalquery.push_back(' ');
	    fieldpart.clear();
	  }
	  fieldpart += queryelem;
	}
	if (!fieldpart.empty()) {
	  add_field_info(fieldpart, tag, type);
	  finalquery += fieldpart;
	  finalquery.push_back(' ');
	}
	fieldpart.clear();
	queryelem.clear();
	// the operator itself goes out untagged, followed by a space
	finalquery.push_back(*here);
	finalquery.push_back(' ');
      }
    }
    else if (*here == '"') {
      // the quote is kept as part of the token and toggles phrase mode
      queryelem.push_back(*here);
      if (in_phrase == false) in_phrase = true;
      else {
	in_phrase = false;
      }
    }

    // Found word boundary, in a phrase
    // (kept verbatim so the whole phrase stays one token)
    else if (in_phrase) {
      queryelem.push_back(*here);
    }
    // Found a word boundary
    else {
      if (!queryelem.empty()) {
	if (queryelem == "&") {
	  // a stand-alone "&" is dropped from the output; it only marks
	  // that the next word continues the current fieldpart
	  in_field = true;
	  queryelem.clear();
	}
	else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {

	  if (argb==1) {
	    // advanced search: the operator is kept in the current fieldpart.
	    // In simple search (argb != 1) these are not allowed, so the
	    // token is simply discarded below.
	    in_field = true;
	    fieldpart += queryelem;
	    fieldpart.push_back(' ');
	  }
	  queryelem.clear();

	}
	else {
	  // an ordinary word: start a new fieldpart first, unless words are
	  // being grouped (simple_and) or we are continuing a field run
	  if (!simple_and && !in_field) {
	    if (!fieldpart.empty()) {
	      add_field_info(fieldpart, tag, type);
	      finalquery += fieldpart;
	      finalquery.push_back(' ');
	      fieldpart.clear();
	    }
	  }

	  fieldpart += queryelem;
	  fieldpart.push_back(' ');
	  queryelem.clear();
	}
      }
    }
    ++here;
  }
  // at the end
  // a token may still be pending if the query did not end on a word boundary
  if (!queryelem.empty()) {
    if (!simple_and && !in_field && !fieldpart.empty()) {
      add_field_info(fieldpart, tag, type);
      finalquery += fieldpart;
      finalquery.push_back(' ');
      fieldpart.clear();
    }
    fieldpart += queryelem;
  }
  if (!fieldpart.empty()) {
    add_field_info(fieldpart, tag, type);
    finalquery += fieldpart;
    fieldpart.clear();

    // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
    // consider cutting this line
    finalquery.push_back(' ');
  }

  querystring = finalquery;
}
1198
1199
// SQL counterpart of the format_field_info_* functions: forwards
// querystring, tagseq and sqlcomb unchanged to add_field_info_sql().
// argt and argb are accepted but not used by this function.
void format_field_info_sql(text_t &querystring, const text_t &tagseq,
			   const text_t &sqlcomb,
			   int argt, int argb)
{
  add_field_info_sql(querystring, tagseq, sqlcomb);
}
1206
1207
1208void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
1209 if (argct == 1) {
1210 format_field_info_mgpp(querystring, tag, argt, argb);
1211 } else if (argct == 2) {
1212 format_field_info_lucene(querystring, tag, argt, argb);
1213 }
1214}
1215
1216void mgpp_adddateelem(text_t& querystring, const int date)
1217{
1218 querystring.appendcstr(" [");
1219 if(date<0) {
1220 querystring.appendcstr("bc");
1221 querystring.appendint((date*-1));
1222 }
1223 else {
1224 querystring.appendint(date);
1225 }
1226 querystring.appendcstr("]:CV");
1227}
1228
1229void lucene_adddateelem(text_t& querystring, const int date)
1230{
1231 querystring.appendcstr(" CV:(");
1232 if(date<0) {
1233 querystring.appendcstr("bc");
1234 querystring.appendint((date*-1));
1235 }
1236 else {
1237 querystring.appendint(date);
1238 }
1239 querystring.appendcstr(")");
1240}
1241
1242
1243void add_dates(text_t &querystring, int startdate, int enddate,
1244 int startbc, int endbc, int ct)
1245{
1246 if(startdate)
1247 {
1248 int querystringis = 0;
1249 text_t::const_iterator here = querystring.begin();
1250 text_t::const_iterator end = querystring.end();
1251 while(here!=end)
1252 {
1253 if(!(isspace((*here)))){
1254 here = end;
1255 querystringis = 1;
1256 }
1257 else
1258 ++here;
1259 }
1260 //converting BCE dates
1261 if(startbc && startdate > 0)
1262 {
1263 startdate *= -1;
1264 }
1265 if(endbc && enddate > 0)
1266 {
1267 enddate *= -1;
1268 }
1269 if(enddate != 0 && enddate<startdate)
1270 {
1271 cout<<"enddate too small"<<endl;
1272 return;
1273 }
1274 if(querystringis)
1275 querystring.appendcstr(" AND");
1276 if(!enddate)
1277 {
1278 if (ct==1) {
1279 mgpp_adddateelem(querystring,startdate);
1280 }
1281 else { // lucene
1282 lucene_adddateelem(querystring,startdate);
1283 }
1284 }
1285 else{
1286 int nextdate = startdate;
1287 querystring.appendcstr(" (");
1288 while(nextdate<=enddate)
1289 {
1290 if(nextdate!=0) {
1291 if (ct==1) {
1292 mgpp_adddateelem(querystring,nextdate);
1293 }
1294 else { // lucene
1295 lucene_adddateelem(querystring,nextdate);
1296 }
1297 }
1298 ++nextdate;
1299 }
1300 querystring.appendcstr(" )");
1301 }
1302 }
1303
1304}
Note: See TracBrowser for help on using the repository browser.