source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 24073

Last change on this file since 24073 was 24073, checked in by davidb, 13 years ago

Two features added:

1) allow for SQL matching 'startingwith'
2) numeric based searching '<num' and similar, which is in addition to string searching with '<' etc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.2 KB
RevLine 
[270]1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
[533]6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
[270]9 *
[533]10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
[270]24 *********************************************************************/
25
26#include "querytools.h"
[1373]27#include <ctype.h>
[1914]28#include "unitool.h" // for is_unicode_letdig
[270]29
[12784]30// sets the ct, qt, qto arguments
[11987]31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
[12784]56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
[11987]58 search_types = (*check).second;
[12784]59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
[12930]64 if (arg_qto == 2) {
[12784]65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
[11987]69 }
[12784]70 return;
[11987]71 }
72
[12784]73
[11987]74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
[22046]84
[11987]85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
[22046]93
94
95 // decide if sqlqto should be set or not
96 unsigned int sql_type = 0;
97 text_t infodb_type = cinfo->infodbType;
98 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100 sql_type = 1;
101 }
102 }
103
104 if (sql_type) {
105 args["sqlqto"] = "1";
106 }
107 else {
108 args["sqlqto"] = "0";
109 }
110
111
[11987]112}
113
[12864]114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116 int stemIndexes = cinfo->stemIndexes;
117
118 if (stemIndexes & SIcasefold) {
119 args["ks"] = 1;
120 }
121 if (stemIndexes & SIstem) {
122 args["ss"] = 1;
123 }
124 if (stemIndexes & SIaccentfold) {
125 args["afs"] = 1;
126 }
127
128}
129
[22046]130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133 cgiargsclass &args)
134{
135
136 OptionValue_t option;
137 int arg_m = args.getintarg("m");
138
139 option.name = "Maxdocs";
140 option.value = arg_m;
141 request.filterOptions.push_back (option);
142
143 // option.name = "StartResults";
144 // option.value = args["r"];
145 // request.filterOptions.push_back (option);
146
147 // option.name = "EndResults";
148 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150 // option.value = endresults;
151 // request.filterOptions.push_back (option);
152}
153
154
[759]155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
[22046]157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158 const text_t &querystring,
159 cgiargsclass &args)
160{
161 // better if this function, and the two-query companion function
162 // was implemented in queryaction.cpp
163 // Has to be done here to documentaction.cpp can call it directly
[270]164
165 request.filterName = "QueryFilter";
166
167 OptionValue_t option;
[470]168
[270]169 option.name = "Term";
[759]170 option.value = querystring;
[270]171 request.filterOptions.push_back (option);
172
173 option.name = "QueryType";
174 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175 request.filterOptions.push_back (option);
176
[1774]177 option.name = "MatchMode";
[11765]178 // mgpp in advanced mode, always use some query
[12428]179 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
[11765]180 option.value = "some";
181 } else {
182 option.value = (args.getintarg("t")) ? "some" : "all";
183 }
[1774]184 request.filterOptions.push_back (option);
185
[270]186 option.name = "Casefold";
187 option.value = (args.getintarg("k")) ? "true" : "false";
188 request.filterOptions.push_back (option);
189
190 option.name = "Stem";
191 option.value = (args.getintarg("s")) ? "true" : "false";
192 request.filterOptions.push_back (option);
193
[12864]194 option.name = "AccentFold";
195 option.value = (args.getintarg("af")) ? "true" : "false";
196 request.filterOptions.push_back (option);
197
[270]198 if (!args["h"].empty()) {
199 option.name = "Index";
200 option.value = args["h"];
201 request.filterOptions.push_back (option);
202 }
203
204 if (!args["j"].empty()) {
205 option.name = "Subcollection";
206 option.value = args["j"];
207 request.filterOptions.push_back (option);
208 }
209
210 if (!args["n"].empty()) {
211 option.name = "Language";
212 option.value = args["n"];
213 request.filterOptions.push_back (option);
214 }
[1329]215
216 if (!args["g"].empty()) { // granularity for mgpp
217 option.name = "Level";
218 option.value = args["g"];
219 request.filterOptions.push_back (option);
220 }
[270]221
[12410]222 if (!args["fs"].empty()) { // filter string for lucene
223 option.name = "FilterString";
224 option.value = args["fs"];
225 request.filterOptions.push_back (option);
226 }
227
[12276]228 if (!args["sf"].empty()) { // sort field for lucene
229 option.name = "SortField";
230 option.value = args["sf"];
231 request.filterOptions.push_back (option);
232 }
233
[12771]234 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
[12770]235 option.name = "Fuzziness";
[12771]236 option.value = (text_t) "0." + args["fuzziness"];
[12770]237 request.filterOptions.push_back (option);
238 }
[12388]239
[22046]240 set_basequeryfilter_options(request, args);
[759]241}
242
243
244
[22046]245void set_fulltext_queryfilter_options (FilterRequest_t &request,
246 const text_t &querystring1,
247 const text_t &querystring2,
248 cgiargsclass &args)
249{
250
251 set_fulltext_queryfilter_options (request, querystring1, args);
252
[349]253 // fill in the second query if needed
254 if (!args["cq2"].empty()) {
[759]255 OptionValue_t option;
256
[349]257 option.name = "CombineQuery";
258 option.value = args["cq2"];
259 request.filterOptions.push_back (option);
260
261 option.name = "Term";
[759]262 option.value = querystring2;
[349]263 request.filterOptions.push_back (option);
[759]264
[349]265 option.name = "QueryType";
266 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
267 request.filterOptions.push_back (option);
268
269 option.name = "Casefold";
270 option.value = (args.getintarg("k")) ? "true" : "false";
271 request.filterOptions.push_back (option);
272
273 option.name = "Stem";
274 option.value = (args.getintarg("s")) ? "true" : "false";
275 request.filterOptions.push_back (option);
276
[12864]277 option.name = "AccentFold";
278 option.value = (args.getintarg("af")) ? "true" : "false";
279 request.filterOptions.push_back (option);
280
[349]281 if (!args["h2"].empty()) {
282 option.name = "Index";
283 option.value = args["h2"];
284 request.filterOptions.push_back (option);
285 }
286
287 if (!args["j2"].empty()) {
288 option.name = "Subcollection";
289 option.value = args["j2"];
290 request.filterOptions.push_back (option);
291 }
292
293 if (!args["n2"].empty()) {
294 option.name = "Language";
295 option.value = args["n2"];
296 request.filterOptions.push_back (option);
297 }
298 }
[22046]299
300 // this is probably redundant, as first line to this method will have
301 // already caused it to invoke set_basequeryfilter_options
302
303 set_basequeryfilter_options(request, args);
[759]304}
[608]305
[759]306
[1329]307
[22046]308// request.filterResultOptions and request.fields (if required) should
309// be set from the calling code
310void set_sql_queryfilter_options (FilterRequest_t &request,
311 cgiargsclass &args)
312{
313 if (!args["sqlsf"].empty()) { // sort field for lucene
314 OptionValue_t option;
[270]315
[22046]316 option.name = "SortField";
317 option.value = args["sqlsf"];
318 request.filterOptions.push_back (option);
319 }
320
321 set_basequeryfilter_options(request, args);
[270]322}
323
[22046]324
// Returns true when `character` is one of the characters this file treats
// as part of a query word for the given indexer, even though it is not a
// letter or digit:
//   indexer_type 1 (mgpp):   '#'  '/'  '*'
//   indexer_type 2 (lucene): '?'  '*'  '~'  '^'
// Every other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
337
[12784]338// This function removes boolean operators from simple searches, and segments
339// chinese characters if segment=true
[6584]340void format_querystring (text_t &querystring, int querymode, bool segment) {
[270]341 text_t formattedstring;
342
[12784]343 // advanced search, no segmenting, don't need to do anything
[6584]344 if (querymode == 1 && !segment) return;
345
[270]346 text_t::const_iterator here = querystring.begin();
347 text_t::const_iterator end = querystring.end();
348
349 // space is used to insert spaces between Chinese
350 // characters. No space is needed before the first
351 // Chinese character.
352 bool space = false;
353
354 // want to remove ()|!& from querystring so boolean queries are just
[470]355 // "all the words" queries (unless querymode is advanced)
[270]356 while (here != end) {
[470]357 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
358 *here == '!' || *here == '&')) {
[270]359 formattedstring.push_back(' ');
[6584]360 } else if (segment) {
[16980]361 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
362 ( *here >= 0xf900 && *here <= 0xfa6a)) {
363 /* text_t not big enough to handle these. */
364 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
365 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
[16645]366
367 // CJK character
[8715]368 if (!space) formattedstring.push_back (0x200b); // zero width space
[397]369 formattedstring.push_back (*here);
370 formattedstring.push_back (0x200b);
371 space = true;
[270]372 } else {
[8715]373
[397]374 // non-Chinese character
375 formattedstring.push_back (*here);
376 space = false;
[8715]377
[270]378 }
[6584]379
380 } else {
381 formattedstring.push_back (*here);
[270]382 }
[9620]383 ++here;
[270]384 }
[397]385 querystring = formattedstring;
[270]386}
387
[20481]388// turn query string into terms separated by spaces.
389// still working on this...
[20602]390text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
[20481]391 text_t::const_iterator here = querystring.begin();
392 text_t::const_iterator end = querystring.end();
[20602]393
394 // lets look for [] and () first - these are a pain.
395 text_t::const_iterator bracket;
396 text_t query_no_brackets = "";
397
398 // mgpp brackets: [xxx]:TI
399 if (findchar(here, end, '[') != end) {
400 while ((bracket = findchar(here, end, '[')) != end) {
401 // get the first bit
402 query_no_brackets += substr(here, bracket);
403 bracket++;
404 here = bracket;
405 // get the end bracket
406 bracket = findchar(here, end, ']');
407 query_no_brackets += substr(here, bracket);
408 // skip the :TI bits
[23635]409 while (bracket != end // do bracket != end test first, ELSE when bracket = end, we're past the string, in
410 && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
411 bracket++;
412 }
[20602]413 here = bracket;
414 }
415 if (here != end) {
416 query_no_brackets += substr(here,end);
417 }
418 } else if (findchar(here, end, '(') != end) {
419 // lucene brackets TI:(xxx)
420 while ((bracket = findchar(here, end, '(')) != end) {
421 // back up the field name
422 text_t::const_iterator old_bracket = bracket;
[23635]423 while (bracket != here && *bracket != ' ') { // order of tests in condition matters (see long comment above)
424 --bracket;
[20602]425 }
426 if (bracket != here) {
427 // get the first bit
428 query_no_brackets += substr(here, bracket+1);
429 }
430 here = old_bracket +1;
431 // get the end bracket
432 bracket = findchar(here, end, ')');
433 query_no_brackets += substr(here, bracket);
434 if (bracket != end) {
435 here = bracket+1;
436 }
437 }
438 if (here != end) {
439 query_no_brackets += substr(here,end);
440 }
441 } else {
442 // was no brackets
443 query_no_brackets = querystring;
444 }
445
446
447 if (arg_ct == "2") { // lucene
448 // look for AND OR NOT and remove
449 here = query_no_brackets.begin();
450 end = query_no_brackets.end();
451 text_tlist terms;
452 splitword(here, end, "AND", terms);
453 joinchar(terms, ' ', query_no_brackets);
454 here = query_no_brackets.begin();
455 end = query_no_brackets.end();
456 splitword(here, end, "OR", terms);
457 joinchar(terms, ' ', query_no_brackets);
458 here = query_no_brackets.begin();
459 end = query_no_brackets.end();
460 splitword(here, end, "NOT", terms);
461 joinchar(terms, ' ', query_no_brackets);
462
463 }
[20481]464 text_t terms = "";
465 bool space = false;
[20602]466 here = query_no_brackets.begin();
467 end = query_no_brackets.end();
468
[20481]469 while (here != end) {
470 if (*here == '#' || *here == '/') {
471 // skip over #is /10 etc
472 ++here;
473 while (here != end && *here != ' ') {
474 ++here;
475 }
476 if (here == end) break;
477 }
478 if (is_unicode_letdig(*here)) {
479 terms.push_back(*here);
480 space = false;
481 } else {
482 if (!space) {
483 terms.push_back(' ');
484 space = true;
485 }
486 }
487 ++here;
488 }
489 return terms;
490
491}
[1467]492
[3160]493// search history tool
494// also used for form query macros
[1914]495text_t escape_quotes(const text_t &querystring) {
496
497 text_t::const_iterator here = querystring.begin();
498 text_t::const_iterator end = querystring.end();
499
500 text_t escquery = "";
501 while (here != end) {
[1988]502 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
503 else if (*here == '\n' || *here == '\r') {
504 escquery.push_back(' ');
505 } else {
[1914]506 escquery +="\\\\";
507 escquery.push_back(*here);
508 }
509
[9620]510 ++here;
[1914]511 }
512 return escquery;
513
514}
515
[12784]516// Parses the terms into words, and adds #si if necessary
517text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
518 const int indexer_type) {
519
520 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
521 if (stem == "0" && fold == "0") {
[12791]522 return terms;
[12784]523 }
524 // this is only for mgpp collections, shouldn't be called for anything else
525 if (indexer_type != 1) {
[12791]526 return terms;
[12784]527 }
528
529 text_t outtext;
530 text_t word;
531
532 text_t::const_iterator here = terms.begin();
533 text_t::const_iterator end = terms.end();
534
535 while (here !=end) {
536
537 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
538 // not word boundary
539 word.push_back(*here);
540 ++here;
541 }
542 else {
543 // found word boundary
544 if (!word.empty() ) {
545 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
546 outtext += word;
547 word.clear();
548 }
549 else {
550 word += "#";
551 if (stem == "1") word += "s";
552 if (fold == "1") word += "i";
553 outtext += word;
554 word.clear();
555 }
556 }
557 // this only used in advanced form, so we leave in boolean operators
[12792]558 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
559 *here == '(' || *here == ')' || is_unicode_space(*here)) {
[12784]560 outtext.push_back(*here);
561 }
562 ++here;
563 }
564 }
565
566 // get last word
567 if (!word.empty()) {
568 word += "#";
569 if (stem == "1") word += "s";
570 if (fold == "1") word += "i";
571 word += " ";
572 outtext += word;
573 }
574 return outtext;
575}
576
577
[11765]578// some query form parsing functions for use with mgpp & lucene
[1914]579
[12784]580void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
[8029]581{
582 querystring.clear();
[1914]583
[12784]584 int argct = args.getintarg("ct");
[8029]585 int argt = args.getintarg("t");// t=0 -and, t=1 - or
[12784]586 int argb = args.getintarg("b");
587
588 text_t combine;
[8029]589
[12784]590 // lucene uses global combine, so only need this for mgpp
591 if (argct==1) {
[8029]592 if (argt == 0) combine = "&";
593 else combine = "|";
594 }
[1914]595
596 text_t field = args["fqf"];
597 if (field.empty()) return; // no query
598 text_tarray fields;
599 splitchar(field.begin(), field.end(), ',', fields);
600
601 text_t value = args["fqv"];
602 if (value.empty()) return; // somethings wrong
603 text_tarray values;
604 splitchar(value.begin(), value.end(), ',', values);
605
[8029]606
[9620]607 for (int i=0; i< values.size(); ++i) {
[1914]608 if (!values[i].empty()) {
[12784]609 text_t this_value = values[i];
[22046]610
[12784]611 // remove operators for simple search, segments text if necessary
612 format_querystring(this_value, argb, segment);
[22046]613
[12784]614 // add tag info for this field (and other processing)
615 format_field_info(this_value, fields[i], argct, argt, argb);
[22046]616
[12784]617 // add into query string
618 if (argct == 2) {
619 // lucene
620 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
621 querystring += this_value+" ";
622 } else {
623 // mgpp
624 if (!querystring.empty()) {
625 querystring += " "+ combine+ " ";
626 }
627 querystring += this_value;
[8029]628 }
[1914]629 }
630 }
631}
632
633
[12784]634void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
[1914]635 querystring.clear();
636
[12784]637 const int argct = args.getintarg("ct");
638 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
639 int argb = args.getintarg("b");
[8029]640 text_t combine;
[12784]641 if (argct==1) {
[8029]642 combine = "&";
643 }
644 else { // lucene
645 combine = "AND";
646 }
647
[1914]648 text_t field = args["fqf"];
649 if (field.empty()) return; // no query
650 text_tarray fields;
651 splitchar(field.begin(), field.end(), ',', fields);
652
653 text_t value = args["fqv"];
654 if (value.empty()) return; // somethings wrong
655 text_tarray values;
656 splitchar(value.begin(), value.end(), ',', values);
657
658 text_t comb = args["fqc"];
659 if (comb.empty()) return; //somethings wrong
660 text_tarray combs;
661 splitchar(comb.begin(), comb.end(), ',', combs);
[12784]662
663 text_tarray stems;
664 text_tarray folds;
665 if (argct == 1) {// mgpp - lucene doesn't do stem/case
666 text_t stem = args["fqs"];
667 if (stem.empty()) return; // somethings wrong
668 splitchar(stem.begin(), stem.end(), ',', stems);
669
670 text_t fold = args["fqk"];
671 if (fold.empty()) return; // somethings wrong
672 splitchar(fold.begin(), fold.end(), ',', folds);
673 }
[1914]674
[9620]675 for(int i=0; i< values.size(); ++i) {
[1914]676 if (!values[i].empty()) {
677 if (i!=0) {
[12784]678 if (argct==1) {
[8029]679 if (combs[i-1]=="and") combine = "&";
680 else if (combs[i-1]=="or")combine = "|";
681 else if (combs[i-1]=="not")combine = "!";
682 }
683 else { // lucene
684 if (combs[i-1]=="and") combine = "AND";
685 else if (combs[i-1]=="or")combine = "OR";
686 else if (combs[i-1]=="not")combine = "NOT";
687 }
[1914]688 }
[12784]689 text_t this_value = values[i];
690 // remove operators for simple search, segments text if necessary
691 format_querystring(this_value, argb, segment);
692 if (argct == 1) { // mgpp only
693 this_value = addstemcase(this_value, stems[i], folds[i], argct);
[1914]694 }
[12784]695 // add tag info for this field (and other processing)
696 format_field_info(this_value, fields[i], argct, argt, argb);
697 // add into query string
698 if (!querystring.empty()) {
699 querystring += " "+ combine+ " ";
[2745]700 }
[12784]701 querystring += this_value;
[1914]702
703 }
704 }
705}
706
[22046]707
708// SQL versions for parsing query form
709
710void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
711{
712 querystring.clear();
713
714 int argt = args.getintarg("t");// t=0 -and, t=1 - or
715 int argb = args.getintarg("b");
716
717 text_t combine;
718
719 if (argt == 0) combine = "AND";
720 else combine = "OR";
721
722 text_t field = args["sqlfqf"];
723 if (field.empty()) return; // no query
724 text_tarray fields;
725 splitchar(field.begin(), field.end(), ',', fields);
726
727 text_t sqlcomb = args["sqlfqc"];
728 if (sqlcomb.empty()) return; //somethings wrong
729 text_tarray sqlcombs;
730 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
731
732 text_t value = args["fqv"];
733 if (value.empty()) return; // somethings wrong
734 text_tarray values;
735 splitchar(value.begin(), value.end(), ',', values);
736
737
738 for (int i=0; i< values.size(); ++i) {
739 if (!values[i].empty()) {
[24073]740 text_t this_value;
741 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
742 const text_t LIKE_CONDITION = "LIKE";
743
744 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
745 //in order to search a field starting with certain words.
746 if (sqlcombs[i] == STARTINGWITH_CONDITION)
747 {this_value = values[i];
748 this_value += "%";
749 // remove operators for simple search, segments text if necessary
750 format_querystring(this_value, argb, segment);
751 // add tag info for this field (and other processing)
752 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
[22046]753
[24073]754 else
755 {this_value = values[i];
756 // remove operators for simple search, segments text if necessary
757 format_querystring(this_value, argb, segment);
758 // add tag info for this field (and other processing)
759 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
[22046]760
[24073]761
762 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
[22046]763
764 if (querystring.empty()) {
765 // first query term
766 querystring = DISTINCT_SELECT_WHERE + this_value;
767 }
768 else {
769 this_value = DISTINCT_SELECT_WHERE + this_value;
770
771 if (combine=="AND") {
772 // INNER JOIN to restrict to only matching docOIDs
773 querystring = "SELECT docOID FROM (" + querystring + ")"
774 + " INNER JOIN (" + this_value +") USING (docOID)";
775 }
776 else if (combine=="OR") {
777 // Union to allow union of the two
778 querystring = querystring + " UNION " + this_value;
779 }
780 }
781 }
782 }
783}
784
785
786void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
787 bool segment)
788{
789 querystring.clear();
790
791 int argt = 0; // set it to 0 = AND, by default
792 int argb = args.getintarg("b");
793 text_t combine = "AND";
794
795 text_t field = args["sqlfqf"];
796
797 if (field.empty()) return; // no query
798 text_tarray fields;
799 splitchar(field.begin(), field.end(), ',', fields);
800
801 text_t sqlcomb = args["sqlfqc"];
802 if (sqlcomb.empty()) return; //somethings wrong
803 text_tarray sqlcombs;
804 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
805
806 text_t value = args["fqv"];
807 if (value.empty()) return; // somethings wrong
808 text_tarray values;
809 splitchar(value.begin(), value.end(), ',', values);
810
811 text_t comb = args["fqc"];
812 if (comb.empty()) return; //somethings wrong
813 text_tarray combs;
814 splitchar(comb.begin(), comb.end(), ',', combs);
815
816 for(int i=0; i< values.size(); ++i) {
817 if (!values[i].empty()) {
818 if (i>0) {
819 if (combs[i-1]=="and") { combine = "AND"; }
820 else if (combs[i-1]=="or") { combine = "OR"; }
821 else if (combs[i-1]=="not") { combine = "NOT"; }
822 }
[24073]823 text_t this_value;
824 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
825 const text_t LIKE_CONDITION = "LIKE";
826
827 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
828 //in order to search a field starting with certain words.
829 if (sqlcombs[i] == STARTINGWITH_CONDITION)
830 {this_value = values[i];
831 this_value += "%";
832 // remove operators for simple search, segments text if necessary
833 format_querystring(this_value, argb, segment);
834 // add tag info for this field (and other processing)
835 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
[22046]836
[24073]837 else
838 {this_value = values[i];
839 // remove operators for simple search, segments text if necessary
840 format_querystring(this_value, argb, segment);
841 // add tag info for this field (and other processing)
842 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
843
844 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
[22046]845
846 if (querystring.empty()) {
847 // first query term
848 querystring = DISTINCT_SELECT_WHERE + this_value;
849 }
850 else {
851 this_value = DISTINCT_SELECT_WHERE + this_value;
852
853 if (combine=="AND") {
854 // INNER JOIN to restrict to only matching docOIDs
855 querystring = "SELECT docOID FROM (" + querystring + ")"
856 + " INNER JOIN (" + this_value +") USING (docOID)";
857 }
858 else if (combine=="OR") {
859 // Union to allow union of the two
860 querystring = querystring + " UNION " + this_value;
861 }
862 else {
863 cerr << "Unsupported combination operation: " << combine << endl;
864 }
865 }
866
867 }
868 }
869}
870
871
872
873
[12784]874// Extended addqueryelem for Human Info project
[7380]875void addqueryelem_ex(text_t &querystring, const text_t &tag,
[12784]876 const text_t &terms, const text_t &stem,
877 const text_t &fold,
[7380]878 const text_t& combine, const text_t& word_combine) {
[12784]879
[7380]880 if (!querystring.empty()) { // have to put and/or
881 querystring += " " + combine + " ";
882 }
883 text_t outtext; outtext.reserve(512);
884 text_t word; word.reserve(100);
885 //unsigned short c;
886 text_t::const_iterator here = terms.begin();
887 text_t::const_iterator end = terms.end();
888 bool inquote = false, firstword = true;
[1914]889
[7380]890 text_t word2; word2.reserve(256);
891
892 while (here !=end) {
893 if (is_unicode_space(*here)) {
894 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
895 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
896 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
897 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
898 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
899 if (inquote) {
900 word2.push_back(*here);
901 }
902 word.append(word2); word2.clear();
903
904 if (!inquote && !word.empty() ) {
[12784]905 // found word boundary
[7380]906
907 if (stem == "1" || fold =="1") {
908 word += "#";
909 if (stem == "1") word += "s";
910 //else word += "u";
911
912 if (fold == "1") word += "i";
913 //else word += "c";
914 }
915 if (firstword) {
916 firstword = false;
917 } else {
918 outtext += " " + word_combine + " ";
919 }
920 outtext += "[" + word + "]:"+tag;
921 word.clear();
922 }
923 ++here;
924 } else if (*here == '\"') {
925 word2.push_back(*here);
926 inquote = !inquote;
927 ++here;
928 } else {
929 // not word boundary
930 word2.push_back(*here);
931 ++here;
932 }
933 }
934
935 // get last word
936 if (!word2.empty()) {
937 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
938 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
939 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
940 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
941 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
942 word.append(word2); word2.clear();
943
944 if (stem == "1"|| fold == "1") {
945 word += "#";
946 if (stem == "1") word += "s";
947 //else word += "u";
948
949 if (fold == "1") word += "i";
950 //else word += "c";
951 }
952 if (!outtext.empty()) outtext += " " + word_combine + " ";
953 outtext += "[" + word + "]:"+tag;
954 }
955 querystring += "(" + outtext + ")";
956}
957
[8357]958void add_field_info(text_t &querystring, const text_t &tag, int type) {
[7380]959
[17796]960 if (tag == "") return; // do nothing
961 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
[8357]962 if (type == 1) { //mgpp
963 querystring = "["+querystring+"]:"+tag;
964 } else if (type == 2) { // lucene
965 querystring = tag+":("+querystring+")";
[4757]966 }
[8357]967
[4757]968}
[8029]969
970
[22046]971void add_field_info_sql(text_t &querystring, const text_t &tagseq,
972 const text_t& sqlcomb)
973{
974
975 if (tagseq == "") return; // do nothing
976
977 text_t element_in = "(element IN (";
978
979 text_tlist mdterms;
980
981 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
982
983 text_t tags_in = "";
984
985 while (!mdterms.empty()) {
986 text_t tag = mdterms.front();
987 mdterms.pop_front();
988
989 if (!tag.empty()) {
990
991 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
992 tag = substr (tag.begin()+3, tag.end());
993 }
994
995 if (!tags_in.empty()) {
996 tags_in += ",";
997 }
998
999 tags_in += "'" + tag + "'";
1000 }
1001 }
1002
1003 element_in += tags_in + ") AND (";
1004
[24073]1005
[22046]1006 if (sqlcomb == "=") {
1007 // override what it means to do equality, to make it more like full text
1008 // searching
1009
1010 text_t orterms = "";
1011 text_t term = "";
1012 bool in_phrase = false;
1013
1014 text_t::const_iterator here = querystring.begin();
1015 text_t::const_iterator end = querystring.end();
1016 while (here != end) {
1017 if (is_unicode_letdig(*here)) {
1018 term.push_back(*here);
1019 }
1020 else if (*here == '"') {
1021 term.push_back(*here);
1022 if (!in_phrase) {
1023 in_phrase = true;
1024 } else {
1025 in_phrase = false;
1026 }
1027 }
1028 else if (in_phrase) {
1029 // Found word boundary, but in a phrase, so does not complete term
1030 term.push_back(*here);
1031 }
1032 else {
1033 // Found a word boundary
1034 if (!orterms.empty()) {
1035 orterms += " OR ";
1036 }
1037 orterms += "value LIKE '%" + term + "%'";
1038 term.clear();
1039 }
1040 ++here;
1041 }
1042
1043 if (!term.empty()) {
1044 if (!orterms.empty()) {
1045 orterms += " OR ";
1046 }
1047 orterms += "value LIKE '%" + term + "%'";
1048 }
1049
1050 element_in += orterms;
1051 }
[24073]1052 //We cast the value from STRING to REAL to allow numeric sorting
1053 else if (sqlcomb == "<num") {
1054 element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
1055 }
1056 else if (sqlcomb == ">num") {
1057 element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
1058 }
1059 else if (sqlcomb == "<=num") {
1060 element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
1061 }
1062 else if (sqlcomb == ">=num") {
1063 element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
1064 }
1065 else if (sqlcomb == "=num") {
1066 element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
1067 }
[22046]1068 else {
1069 // search on value is "as is" querystring
1070 element_in += "value " + sqlcomb + " '" + querystring+"'";
1071 }
1072
1073
1074 querystring = element_in + "))";
1075
1076}
1077
1078
[17796]1079void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1080
[11765]1081 int type = 2; //lucene
[8029]1082
[12784]1083 if (argb==0) { // simple
1084 // there will be no & or | as they should have already been removed
[11765]1085 // just tag the entire thing
[10995]1086 if (tag != "") {
[11765]1087 add_field_info(querystring, tag, type);
[10995]1088 }
[8357]1089 return;
1090 }
[10995]1091
[12784]1092 // need to replace & with &&, | with ||
[8357]1093 text_t::const_iterator here = querystring.begin();
1094 text_t::const_iterator end = querystring.end();
[12784]1095
1096 text_t finalquery = "";
[10995]1097 while (here != end) {
[12784]1098 if (*here == '&') {
1099 finalquery.push_back('&');
1100 finalquery.push_back('&');
1101 while (*(here+1) == '&') {
1102 ++here;
[10995]1103 }
[12784]1104 }
1105 else if (*here == '|') {
1106 finalquery.push_back('|');
1107 finalquery.push_back('|');
1108 while (*(here+1) == '|') {
1109 ++here;
1110 }
1111 }
[8357]1112 else {
[12784]1113 finalquery.push_back(*here);
[8357]1114 }
[10995]1115 ++here;
[8357]1116 }
[11765]1117 querystring = finalquery;
[12784]1118 add_field_info(querystring, tag, type);
[11765]1119}
1120
[12784]1121
1122void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1123
[11765]1124 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
[12784]1125 if (tag == "" && argb == 1) {
[11765]1126 return; // no field specifier, advanced mode, the query stays as written
[10995]1127 }
[11765]1128
1129 int type = 1; // mgpp
1130
1131 bool simple_and = (argb==0 && argt==0);
1132 text_t finalquery = "";
1133 text_t fieldpart ="";
1134 text_t queryelem = "";
1135 bool in_phrase = false;
1136 bool in_field = false;
1137
1138 text_t::const_iterator here = querystring.begin();
1139 text_t::const_iterator end = querystring.end();
1140 while (here != end) {
1141 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1142 queryelem.push_back(*here);
1143 }
1144 else if (*here == '|') {
1145 in_field = false;
1146 }
1147 else if (*here == '!' || *here == '(' || *here == ')') {
1148 if (!in_phrase) { // ignore these if in_phrase
1149 // output field, then output operator
1150 in_field = false;
1151 if (!queryelem.empty()) {
1152 if (!simple_and && !fieldpart.empty()) {
1153 add_field_info(fieldpart, tag, type);
1154 finalquery += fieldpart;
1155 finalquery.push_back(' ');
1156 fieldpart.clear();
1157 }
1158 fieldpart += queryelem;
1159 }
1160 if (!fieldpart.empty()) {
1161 add_field_info(fieldpart, tag, type);
1162 finalquery += fieldpart;
1163 finalquery.push_back(' ');
1164 }
1165 fieldpart.clear();
1166 queryelem.clear();
1167 finalquery.push_back(*here);
1168 finalquery.push_back(' ');
1169 }
1170 }
1171 else if (*here == '"') {
1172 queryelem.push_back(*here);
1173 if (in_phrase == false) in_phrase = true;
1174 else {
1175 in_phrase = false;
1176 }
1177 }
1178
1179 // Found word boundary, in a phrase
1180 else if (in_phrase) {
1181 queryelem.push_back(*here);
1182 }
1183 // Found a word boundary
1184 else {
1185 if (!queryelem.empty()) {
1186 if (queryelem == "&") {
1187 in_field = true;
1188 queryelem.clear();
1189 }
1190 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1191
1192 if (argb==1) {
1193 // simple search, these not allowed
1194 in_field = true;
1195 fieldpart += queryelem;
1196 fieldpart.push_back(' ');
1197 }
1198 queryelem.clear();
1199
1200 }
1201 else {
1202 if (!simple_and && !in_field) {
1203 if (!fieldpart.empty()) {
1204 add_field_info(fieldpart, tag, type);
1205 finalquery += fieldpart;
1206 finalquery.push_back(' ');
1207 fieldpart.clear();
1208 }
1209 }
1210
1211 fieldpart += queryelem;
1212 fieldpart.push_back(' ');
1213 queryelem.clear();
1214 }
1215 }
1216 }
1217 ++here;
1218 }
1219 // at the end
1220 if (!queryelem.empty()) {
1221 if (!simple_and && !in_field && !fieldpart.empty()) {
1222 add_field_info(fieldpart, tag, type);
1223 finalquery += fieldpart;
[18459]1224 finalquery.push_back(' ');
[11765]1225 fieldpart.clear();
1226 }
1227 fieldpart += queryelem;
1228 }
1229 if (!fieldpart.empty()) {
1230 add_field_info(fieldpart, tag, type);
1231 finalquery += fieldpart;
1232 fieldpart.clear();
[18459]1233
1234 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1235 // consider cutting this line
1236 finalquery.push_back(' ');
[11765]1237 }
[22046]1238
[11765]1239 querystring = finalquery;
[8029]1240}
[8357]1241
[12784]1242
[22046]1243void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1244 const text_t &sqlcomb,
1245 int argt, int argb)
1246{
1247 add_field_info_sql(querystring, tagseq, sqlcomb);
1248}
1249
1250
[12784]1251void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
[11765]1252 if (argct == 1) {
[12784]1253 format_field_info_mgpp(querystring, tag, argt, argb);
[11765]1254 } else if (argct == 2) {
[12784]1255 format_field_info_lucene(querystring, tag, argt, argb);
[11765]1256 }
1257}
[10995]1258
[12784]1259void mgpp_adddateelem(text_t& querystring, const int date)
1260{
1261 querystring.appendcstr(" [");
1262 if(date<0) {
1263 querystring.appendcstr("bc");
1264 querystring.appendint((date*-1));
1265 }
1266 else {
1267 querystring.appendint(date);
1268 }
1269 querystring.appendcstr("]:CV");
1270}
1271
1272void lucene_adddateelem(text_t& querystring, const int date)
1273{
1274 querystring.appendcstr(" CV:(");
1275 if(date<0) {
1276 querystring.appendcstr("bc");
1277 querystring.appendint((date*-1));
1278 }
1279 else {
1280 querystring.appendint(date);
1281 }
1282 querystring.appendcstr(")");
1283}
1284
1285
1286void add_dates(text_t &querystring, int startdate, int enddate,
1287 int startbc, int endbc, int ct)
1288{
1289 if(startdate)
1290 {
1291 int querystringis = 0;
1292 text_t::const_iterator here = querystring.begin();
1293 text_t::const_iterator end = querystring.end();
1294 while(here!=end)
1295 {
1296 if(!(isspace((*here)))){
1297 here = end;
1298 querystringis = 1;
1299 }
1300 else
1301 ++here;
1302 }
1303 //converting BCE dates
1304 if(startbc && startdate > 0)
1305 {
1306 startdate *= -1;
1307 }
1308 if(endbc && enddate > 0)
1309 {
1310 enddate *= -1;
1311 }
1312 if(enddate != 0 && enddate<startdate)
1313 {
1314 cout<<"enddate too small"<<endl;
1315 return;
1316 }
1317 if(querystringis)
1318 querystring.appendcstr(" AND");
1319 if(!enddate)
1320 {
1321 if (ct==1) {
1322 mgpp_adddateelem(querystring,startdate);
1323 }
1324 else { // lucene
1325 lucene_adddateelem(querystring,startdate);
1326 }
1327 }
1328 else{
1329 int nextdate = startdate;
1330 querystring.appendcstr(" (");
1331 while(nextdate<=enddate)
1332 {
1333 if(nextdate!=0) {
1334 if (ct==1) {
1335 mgpp_adddateelem(querystring,nextdate);
1336 }
1337 else { // lucene
1338 lucene_adddateelem(querystring,nextdate);
1339 }
1340 }
1341 ++nextdate;
1342 }
1343 querystring.appendcstr(" )");
1344 }
1345 }
1346
1347}
Note: See TracBrowser for help on using the repository browser.