source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp@ 27066

Last change on this file since 27066 was 27066, checked in by kjdon, 11 years ago

Added handling for the "so" (Lucene sort order) CGI argument.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.8 KB
RevLine 
[270]1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
[533]6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
[270]9 *
[533]10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
[270]24 *********************************************************************/
25
26#include "querytools.h"
[1373]27#include <ctype.h>
[1914]28#include "unitool.h" // for is_unicode_letdig
[270]29
[12784]30// sets the ct, qt, qto arguments
[11987]31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
[12784]56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
[11987]58 search_types = (*check).second;
[12784]59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
[12930]64 if (arg_qto == 2) {
[12784]65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
[11987]69 }
[12784]70 return;
[11987]71 }
72
[12784]73
[11987]74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
[22046]84
[11987]85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
[22046]93
94
95 // decide if sqlqto should be set or not
96 unsigned int sql_type = 0;
97 text_t infodb_type = cinfo->infodbType;
98 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100 sql_type = 1;
101 }
102 }
103
104 if (sql_type) {
105 args["sqlqto"] = "1";
106 }
107 else {
108 args["sqlqto"] = "0";
109 }
110
111
[11987]112}
113
[12864]114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116 int stemIndexes = cinfo->stemIndexes;
117
118 if (stemIndexes & SIcasefold) {
119 args["ks"] = 1;
120 }
121 if (stemIndexes & SIstem) {
122 args["ss"] = 1;
123 }
124 if (stemIndexes & SIaccentfold) {
125 args["afs"] = 1;
126 }
127
128}
129
[22046]130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133 cgiargsclass &args)
134{
135
136 OptionValue_t option;
137 int arg_m = args.getintarg("m");
138
139 option.name = "Maxdocs";
140 option.value = arg_m;
141 request.filterOptions.push_back (option);
142
143 // option.name = "StartResults";
144 // option.value = args["r"];
145 // request.filterOptions.push_back (option);
146
147 // option.name = "EndResults";
148 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150 // option.value = endresults;
151 // request.filterOptions.push_back (option);
152}
153
154
[759]155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
[22046]157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158 const text_t &querystring,
159 cgiargsclass &args)
160{
161 // better if this function, and the two-query companion function
162 // was implemented in queryaction.cpp
163 // Has to be done here to documentaction.cpp can call it directly
[270]164
165 request.filterName = "QueryFilter";
166
167 OptionValue_t option;
[470]168
[270]169 option.name = "Term";
[759]170 option.value = querystring;
[270]171 request.filterOptions.push_back (option);
172
173 option.name = "QueryType";
174 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175 request.filterOptions.push_back (option);
176
[1774]177 option.name = "MatchMode";
[11765]178 // mgpp in advanced mode, always use some query
[12428]179 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
[11765]180 option.value = "some";
181 } else {
182 option.value = (args.getintarg("t")) ? "some" : "all";
183 }
[1774]184 request.filterOptions.push_back (option);
185
[270]186 option.name = "Casefold";
187 option.value = (args.getintarg("k")) ? "true" : "false";
188 request.filterOptions.push_back (option);
189
190 option.name = "Stem";
191 option.value = (args.getintarg("s")) ? "true" : "false";
192 request.filterOptions.push_back (option);
193
[12864]194 option.name = "AccentFold";
195 option.value = (args.getintarg("af")) ? "true" : "false";
196 request.filterOptions.push_back (option);
197
[270]198 if (!args["h"].empty()) {
199 option.name = "Index";
200 option.value = args["h"];
201 request.filterOptions.push_back (option);
202 }
203
204 if (!args["j"].empty()) {
205 option.name = "Subcollection";
206 option.value = args["j"];
207 request.filterOptions.push_back (option);
208 }
209
210 if (!args["n"].empty()) {
211 option.name = "Language";
212 option.value = args["n"];
213 request.filterOptions.push_back (option);
214 }
[1329]215
216 if (!args["g"].empty()) { // granularity for mgpp
217 option.name = "Level";
218 option.value = args["g"];
219 request.filterOptions.push_back (option);
220 }
[270]221
[12410]222 if (!args["fs"].empty()) { // filter string for lucene
223 option.name = "FilterString";
224 option.value = args["fs"];
225 request.filterOptions.push_back (option);
226 }
227
[12276]228 if (!args["sf"].empty()) { // sort field for lucene
229 option.name = "SortField";
230 option.value = args["sf"];
231 request.filterOptions.push_back (option);
232 }
[27066]233 if (!args["so"].empty()) { // sort order for lucene
234 option.name = "SortOrder";
235 option.value = (args.getintarg("so")? "descending" : "ascending");
236 request.filterOptions.push_back (option);
237 }
[12276]238
[12771]239 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
[12770]240 option.name = "Fuzziness";
[12771]241 option.value = (text_t) "0." + args["fuzziness"];
[12770]242 request.filterOptions.push_back (option);
243 }
[12388]244
[22046]245 set_basequeryfilter_options(request, args);
[759]246}
247
248
249
[22046]250void set_fulltext_queryfilter_options (FilterRequest_t &request,
251 const text_t &querystring1,
252 const text_t &querystring2,
253 cgiargsclass &args)
254{
255
256 set_fulltext_queryfilter_options (request, querystring1, args);
257
[349]258 // fill in the second query if needed
259 if (!args["cq2"].empty()) {
[759]260 OptionValue_t option;
261
[349]262 option.name = "CombineQuery";
263 option.value = args["cq2"];
264 request.filterOptions.push_back (option);
265
266 option.name = "Term";
[759]267 option.value = querystring2;
[349]268 request.filterOptions.push_back (option);
[759]269
[349]270 option.name = "QueryType";
271 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
272 request.filterOptions.push_back (option);
273
274 option.name = "Casefold";
275 option.value = (args.getintarg("k")) ? "true" : "false";
276 request.filterOptions.push_back (option);
277
278 option.name = "Stem";
279 option.value = (args.getintarg("s")) ? "true" : "false";
280 request.filterOptions.push_back (option);
281
[12864]282 option.name = "AccentFold";
283 option.value = (args.getintarg("af")) ? "true" : "false";
284 request.filterOptions.push_back (option);
285
[349]286 if (!args["h2"].empty()) {
287 option.name = "Index";
288 option.value = args["h2"];
289 request.filterOptions.push_back (option);
290 }
291
292 if (!args["j2"].empty()) {
293 option.name = "Subcollection";
294 option.value = args["j2"];
295 request.filterOptions.push_back (option);
296 }
297
298 if (!args["n2"].empty()) {
299 option.name = "Language";
300 option.value = args["n2"];
301 request.filterOptions.push_back (option);
302 }
303 }
[22046]304
305 // this is probably redundant, as first line to this method will have
306 // already caused it to invoke set_basequeryfilter_options
307
308 set_basequeryfilter_options(request, args);
[759]309}
[608]310
[759]311
[1329]312
[22046]313// request.filterResultOptions and request.fields (if required) should
314// be set from the calling code
315void set_sql_queryfilter_options (FilterRequest_t &request,
316 cgiargsclass &args)
317{
318 if (!args["sqlsf"].empty()) { // sort field for lucene
319 OptionValue_t option;
[270]320
[22046]321 option.name = "SortField";
322 option.value = args["sqlsf"];
323 request.filterOptions.push_back (option);
324 }
325
326 set_basequeryfilter_options(request, args);
[270]327}
328
[22046]329
// Returns true if character has a special meaning in the query syntax of the
// given indexer (1 = mgpp, 2 = lucene). Any other indexer type has no
// special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  const char *specials = "";
  if (indexer_type == 1) {
    specials = "#/*";     // mgpp
  } else if (indexer_type == 2) {
    specials = "?*~^";    // lucene
  }

  for (const char *candidate = specials; *candidate != '\0'; ++candidate) {
    if (character == static_cast<unsigned short>(*candidate)) return true;
  }
  return false;
}
342
[12784]343// This function removes boolean operators from simple searches, and segments
344// chinese characters if segment=true
[6584]345void format_querystring (text_t &querystring, int querymode, bool segment) {
[270]346 text_t formattedstring;
347
[12784]348 // advanced search, no segmenting, don't need to do anything
[6584]349 if (querymode == 1 && !segment) return;
350
[270]351 text_t::const_iterator here = querystring.begin();
352 text_t::const_iterator end = querystring.end();
353
354 // space is used to insert spaces between Chinese
355 // characters. No space is needed before the first
356 // Chinese character.
357 bool space = false;
358
359 // want to remove ()|!& from querystring so boolean queries are just
[470]360 // "all the words" queries (unless querymode is advanced)
[270]361 while (here != end) {
[470]362 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
363 *here == '!' || *here == '&')) {
[270]364 formattedstring.push_back(' ');
[6584]365 } else if (segment) {
[16980]366 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
367 ( *here >= 0xf900 && *here <= 0xfa6a)) {
368 /* text_t not big enough to handle these. */
369 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
370 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
[16645]371
372 // CJK character
[8715]373 if (!space) formattedstring.push_back (0x200b); // zero width space
[397]374 formattedstring.push_back (*here);
375 formattedstring.push_back (0x200b);
376 space = true;
[270]377 } else {
[8715]378
[397]379 // non-Chinese character
380 formattedstring.push_back (*here);
381 space = false;
[8715]382
[270]383 }
[6584]384
385 } else {
386 formattedstring.push_back (*here);
[270]387 }
[9620]388 ++here;
[270]389 }
[397]390 querystring = formattedstring;
[270]391}
392
[20481]393// turn query string into terms separated by spaces.
394// still working on this...
[20602]395text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
[20481]396 text_t::const_iterator here = querystring.begin();
397 text_t::const_iterator end = querystring.end();
[20602]398
399 // lets look for [] and () first - these are a pain.
400 text_t::const_iterator bracket;
401 text_t query_no_brackets = "";
402
403 // mgpp brackets: [xxx]:TI
404 if (findchar(here, end, '[') != end) {
405 while ((bracket = findchar(here, end, '[')) != end) {
406 // get the first bit
407 query_no_brackets += substr(here, bracket);
408 bracket++;
409 here = bracket;
410 // get the end bracket
411 bracket = findchar(here, end, ']');
412 query_no_brackets += substr(here, bracket);
413 // skip the :TI bits
[23635]414 while (bracket != end // do bracket != end test first, ELSE when bracket = end, we're past the string, in
415 && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
416 bracket++;
417 }
[20602]418 here = bracket;
419 }
420 if (here != end) {
421 query_no_brackets += substr(here,end);
422 }
423 } else if (findchar(here, end, '(') != end) {
424 // lucene brackets TI:(xxx)
425 while ((bracket = findchar(here, end, '(')) != end) {
426 // back up the field name
427 text_t::const_iterator old_bracket = bracket;
[23635]428 while (bracket != here && *bracket != ' ') { // order of tests in condition matters (see long comment above)
429 --bracket;
[20602]430 }
431 if (bracket != here) {
432 // get the first bit
433 query_no_brackets += substr(here, bracket+1);
434 }
435 here = old_bracket +1;
436 // get the end bracket
437 bracket = findchar(here, end, ')');
438 query_no_brackets += substr(here, bracket);
439 if (bracket != end) {
440 here = bracket+1;
441 }
442 }
443 if (here != end) {
444 query_no_brackets += substr(here,end);
445 }
446 } else {
447 // was no brackets
448 query_no_brackets = querystring;
449 }
450
451
452 if (arg_ct == "2") { // lucene
453 // look for AND OR NOT and remove
454 here = query_no_brackets.begin();
455 end = query_no_brackets.end();
456 text_tlist terms;
457 splitword(here, end, "AND", terms);
458 joinchar(terms, ' ', query_no_brackets);
459 here = query_no_brackets.begin();
460 end = query_no_brackets.end();
461 splitword(here, end, "OR", terms);
462 joinchar(terms, ' ', query_no_brackets);
463 here = query_no_brackets.begin();
464 end = query_no_brackets.end();
465 splitword(here, end, "NOT", terms);
466 joinchar(terms, ' ', query_no_brackets);
467
468 }
[20481]469 text_t terms = "";
470 bool space = false;
[20602]471 here = query_no_brackets.begin();
472 end = query_no_brackets.end();
473
[20481]474 while (here != end) {
475 if (*here == '#' || *here == '/') {
476 // skip over #is /10 etc
477 ++here;
478 while (here != end && *here != ' ') {
479 ++here;
480 }
481 if (here == end) break;
482 }
483 if (is_unicode_letdig(*here)) {
484 terms.push_back(*here);
485 space = false;
486 } else {
487 if (!space) {
488 terms.push_back(' ');
489 space = true;
490 }
491 }
492 ++here;
493 }
[24111]494 return trim(terms);
[20481]495
496}
[1467]497
[3160]498// search history tool
499// also used for form query macros
[1914]500text_t escape_quotes(const text_t &querystring) {
501
502 text_t::const_iterator here = querystring.begin();
503 text_t::const_iterator end = querystring.end();
504
505 text_t escquery = "";
506 while (here != end) {
[1988]507 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
508 else if (*here == '\n' || *here == '\r') {
509 escquery.push_back(' ');
510 } else {
[1914]511 escquery +="\\\\";
512 escquery.push_back(*here);
513 }
514
[9620]515 ++here;
[1914]516 }
517 return escquery;
518
519}
520
[12784]521// Parses the terms into words, and adds #si if necessary
522text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
523 const int indexer_type) {
524
525 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
526 if (stem == "0" && fold == "0") {
[12791]527 return terms;
[12784]528 }
529 // this is only for mgpp collections, shouldn't be called for anything else
530 if (indexer_type != 1) {
[12791]531 return terms;
[12784]532 }
533
534 text_t outtext;
535 text_t word;
536
537 text_t::const_iterator here = terms.begin();
538 text_t::const_iterator end = terms.end();
539
540 while (here !=end) {
541
542 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
543 // not word boundary
544 word.push_back(*here);
545 ++here;
546 }
547 else {
548 // found word boundary
549 if (!word.empty() ) {
550 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
551 outtext += word;
552 word.clear();
553 }
554 else {
555 word += "#";
556 if (stem == "1") word += "s";
557 if (fold == "1") word += "i";
558 outtext += word;
559 word.clear();
560 }
561 }
562 // this only used in advanced form, so we leave in boolean operators
[12792]563 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
564 *here == '(' || *here == ')' || is_unicode_space(*here)) {
[12784]565 outtext.push_back(*here);
566 }
567 ++here;
568 }
569 }
570
571 // get last word
572 if (!word.empty()) {
573 word += "#";
574 if (stem == "1") word += "s";
575 if (fold == "1") word += "i";
576 word += " ";
577 outtext += word;
578 }
579 return outtext;
580}
581
582
[11765]583// some query form parsing functions for use with mgpp & lucene
[1914]584
[12784]585void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
[8029]586{
587 querystring.clear();
[1914]588
[12784]589 int argct = args.getintarg("ct");
[8029]590 int argt = args.getintarg("t");// t=0 -and, t=1 - or
[12784]591 int argb = args.getintarg("b");
592
593 text_t combine;
[8029]594
[12784]595 // lucene uses global combine, so only need this for mgpp
596 if (argct==1) {
[8029]597 if (argt == 0) combine = "&";
598 else combine = "|";
599 }
[1914]600
601 text_t field = args["fqf"];
602 if (field.empty()) return; // no query
603 text_tarray fields;
604 splitchar(field.begin(), field.end(), ',', fields);
605
606 text_t value = args["fqv"];
607 if (value.empty()) return; // somethings wrong
608 text_tarray values;
609 splitchar(value.begin(), value.end(), ',', values);
610
[8029]611
[9620]612 for (int i=0; i< values.size(); ++i) {
[1914]613 if (!values[i].empty()) {
[12784]614 text_t this_value = values[i];
[22046]615
[12784]616 // remove operators for simple search, segments text if necessary
617 format_querystring(this_value, argb, segment);
[22046]618
[12784]619 // add tag info for this field (and other processing)
620 format_field_info(this_value, fields[i], argct, argt, argb);
[22046]621
[12784]622 // add into query string
623 if (argct == 2) {
624 // lucene
625 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
626 querystring += this_value+" ";
627 } else {
628 // mgpp
629 if (!querystring.empty()) {
630 querystring += " "+ combine+ " ";
631 }
632 querystring += this_value;
[8029]633 }
[1914]634 }
635 }
636}
637
638
[12784]639void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
[1914]640 querystring.clear();
641
[12784]642 const int argct = args.getintarg("ct");
643 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
644 int argb = args.getintarg("b");
[8029]645 text_t combine;
[12784]646 if (argct==1) {
[8029]647 combine = "&";
648 }
649 else { // lucene
650 combine = "AND";
651 }
652
[1914]653 text_t field = args["fqf"];
654 if (field.empty()) return; // no query
655 text_tarray fields;
656 splitchar(field.begin(), field.end(), ',', fields);
657
658 text_t value = args["fqv"];
659 if (value.empty()) return; // somethings wrong
660 text_tarray values;
661 splitchar(value.begin(), value.end(), ',', values);
662
663 text_t comb = args["fqc"];
664 if (comb.empty()) return; //somethings wrong
665 text_tarray combs;
666 splitchar(comb.begin(), comb.end(), ',', combs);
[12784]667
668 text_tarray stems;
669 text_tarray folds;
670 if (argct == 1) {// mgpp - lucene doesn't do stem/case
671 text_t stem = args["fqs"];
672 if (stem.empty()) return; // somethings wrong
673 splitchar(stem.begin(), stem.end(), ',', stems);
674
675 text_t fold = args["fqk"];
676 if (fold.empty()) return; // somethings wrong
677 splitchar(fold.begin(), fold.end(), ',', folds);
678 }
[1914]679
[9620]680 for(int i=0; i< values.size(); ++i) {
[1914]681 if (!values[i].empty()) {
682 if (i!=0) {
[12784]683 if (argct==1) {
[8029]684 if (combs[i-1]=="and") combine = "&";
685 else if (combs[i-1]=="or")combine = "|";
686 else if (combs[i-1]=="not")combine = "!";
687 }
688 else { // lucene
689 if (combs[i-1]=="and") combine = "AND";
690 else if (combs[i-1]=="or")combine = "OR";
691 else if (combs[i-1]=="not")combine = "NOT";
692 }
[1914]693 }
[12784]694 text_t this_value = values[i];
695 // remove operators for simple search, segments text if necessary
696 format_querystring(this_value, argb, segment);
697 if (argct == 1) { // mgpp only
698 this_value = addstemcase(this_value, stems[i], folds[i], argct);
[1914]699 }
[12784]700 // add tag info for this field (and other processing)
701 format_field_info(this_value, fields[i], argct, argt, argb);
702 // add into query string
703 if (!querystring.empty()) {
704 querystring += " "+ combine+ " ";
[2745]705 }
[12784]706 querystring += this_value;
[1914]707
708 }
709 }
710}
711
[22046]712
713// SQL versions for parsing query form
714
715void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
716{
717 querystring.clear();
718
719 int argt = args.getintarg("t");// t=0 -and, t=1 - or
720 int argb = args.getintarg("b");
721
722 text_t combine;
723
724 if (argt == 0) combine = "AND";
725 else combine = "OR";
726
727 text_t field = args["sqlfqf"];
728 if (field.empty()) return; // no query
729 text_tarray fields;
730 splitchar(field.begin(), field.end(), ',', fields);
731
732 text_t sqlcomb = args["sqlfqc"];
733 if (sqlcomb.empty()) return; //somethings wrong
734 text_tarray sqlcombs;
735 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
736
737 text_t value = args["fqv"];
738 if (value.empty()) return; // somethings wrong
739 text_tarray values;
740 splitchar(value.begin(), value.end(), ',', values);
741
742
743 for (int i=0; i< values.size(); ++i) {
744 if (!values[i].empty()) {
[24073]745 text_t this_value;
746 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
747 const text_t LIKE_CONDITION = "LIKE";
748
749 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
750 //in order to search a field starting with certain words.
751 if (sqlcombs[i] == STARTINGWITH_CONDITION)
752 {this_value = values[i];
753 this_value += "%";
754 // remove operators for simple search, segments text if necessary
755 format_querystring(this_value, argb, segment);
756 // add tag info for this field (and other processing)
757 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
[22046]758
[24073]759 else
760 {this_value = values[i];
761 // remove operators for simple search, segments text if necessary
762 format_querystring(this_value, argb, segment);
763 // add tag info for this field (and other processing)
764 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
[22046]765
[24073]766
767 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
[22046]768
769 if (querystring.empty()) {
770 // first query term
771 querystring = DISTINCT_SELECT_WHERE + this_value;
772 }
773 else {
774 this_value = DISTINCT_SELECT_WHERE + this_value;
775
776 if (combine=="AND") {
777 // INNER JOIN to restrict to only matching docOIDs
778 querystring = "SELECT docOID FROM (" + querystring + ")"
779 + " INNER JOIN (" + this_value +") USING (docOID)";
780 }
781 else if (combine=="OR") {
782 // Union to allow union of the two
783 querystring = querystring + " UNION " + this_value;
784 }
785 }
786 }
787 }
788}
789
790
791void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
792 bool segment)
793{
794 querystring.clear();
795
796 int argt = 0; // set it to 0 = AND, by default
797 int argb = args.getintarg("b");
798 text_t combine = "AND";
799
800 text_t field = args["sqlfqf"];
801
802 if (field.empty()) return; // no query
803 text_tarray fields;
804 splitchar(field.begin(), field.end(), ',', fields);
805
806 text_t sqlcomb = args["sqlfqc"];
807 if (sqlcomb.empty()) return; //somethings wrong
808 text_tarray sqlcombs;
809 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
810
811 text_t value = args["fqv"];
812 if (value.empty()) return; // somethings wrong
813 text_tarray values;
814 splitchar(value.begin(), value.end(), ',', values);
815
816 text_t comb = args["fqc"];
817 if (comb.empty()) return; //somethings wrong
818 text_tarray combs;
819 splitchar(comb.begin(), comb.end(), ',', combs);
820
821 for(int i=0; i< values.size(); ++i) {
822 if (!values[i].empty()) {
823 if (i>0) {
824 if (combs[i-1]=="and") { combine = "AND"; }
825 else if (combs[i-1]=="or") { combine = "OR"; }
826 else if (combs[i-1]=="not") { combine = "NOT"; }
827 }
[24073]828 text_t this_value;
829 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
830 const text_t LIKE_CONDITION = "LIKE";
831
832 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
833 //in order to search a field starting with certain words.
834 if (sqlcombs[i] == STARTINGWITH_CONDITION)
835 {this_value = values[i];
836 this_value += "%";
837 // remove operators for simple search, segments text if necessary
838 format_querystring(this_value, argb, segment);
839 // add tag info for this field (and other processing)
840 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
[22046]841
[24073]842 else
843 {this_value = values[i];
844 // remove operators for simple search, segments text if necessary
845 format_querystring(this_value, argb, segment);
846 // add tag info for this field (and other processing)
847 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
848
849 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
[22046]850
851 if (querystring.empty()) {
852 // first query term
853 querystring = DISTINCT_SELECT_WHERE + this_value;
854 }
855 else {
856 this_value = DISTINCT_SELECT_WHERE + this_value;
857
858 if (combine=="AND") {
859 // INNER JOIN to restrict to only matching docOIDs
860 querystring = "SELECT docOID FROM (" + querystring + ")"
861 + " INNER JOIN (" + this_value +") USING (docOID)";
862 }
863 else if (combine=="OR") {
864 // Union to allow union of the two
865 querystring = querystring + " UNION " + this_value;
866 }
867 else {
868 cerr << "Unsupported combination operation: " << combine << endl;
869 }
870 }
871
872 }
873 }
874}
875
876
877
878
[12784]879// Extended addqueryelem for Human Info project
[7380]880void addqueryelem_ex(text_t &querystring, const text_t &tag,
[12784]881 const text_t &terms, const text_t &stem,
882 const text_t &fold,
[7380]883 const text_t& combine, const text_t& word_combine) {
[12784]884
[7380]885 if (!querystring.empty()) { // have to put and/or
886 querystring += " " + combine + " ";
887 }
888 text_t outtext; outtext.reserve(512);
889 text_t word; word.reserve(100);
890 //unsigned short c;
891 text_t::const_iterator here = terms.begin();
892 text_t::const_iterator end = terms.end();
893 bool inquote = false, firstword = true;
[1914]894
[7380]895 text_t word2; word2.reserve(256);
896
897 while (here !=end) {
898 if (is_unicode_space(*here)) {
899 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
900 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
901 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
902 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
903 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
904 if (inquote) {
905 word2.push_back(*here);
906 }
907 word.append(word2); word2.clear();
908
909 if (!inquote && !word.empty() ) {
[12784]910 // found word boundary
[7380]911
912 if (stem == "1" || fold =="1") {
913 word += "#";
914 if (stem == "1") word += "s";
915 //else word += "u";
916
917 if (fold == "1") word += "i";
918 //else word += "c";
919 }
920 if (firstword) {
921 firstword = false;
922 } else {
923 outtext += " " + word_combine + " ";
924 }
925 outtext += "[" + word + "]:"+tag;
926 word.clear();
927 }
928 ++here;
929 } else if (*here == '\"') {
930 word2.push_back(*here);
931 inquote = !inquote;
932 ++here;
933 } else {
934 // not word boundary
935 word2.push_back(*here);
936 ++here;
937 }
938 }
939
940 // get last word
941 if (!word2.empty()) {
942 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
943 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
944 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
945 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
946 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
947 word.append(word2); word2.clear();
948
949 if (stem == "1"|| fold == "1") {
950 word += "#";
951 if (stem == "1") word += "s";
952 //else word += "u";
953
954 if (fold == "1") word += "i";
955 //else word += "c";
956 }
957 if (!outtext.empty()) outtext += " " + word_combine + " ";
958 outtext += "[" + word + "]:"+tag;
959 }
960 querystring += "(" + outtext + ")";
961}
962
[8357]963void add_field_info(text_t &querystring, const text_t &tag, int type) {
[7380]964
[17796]965 if (tag == "") return; // do nothing
966 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
[8357]967 if (type == 1) { //mgpp
968 querystring = "["+querystring+"]:"+tag;
969 } else if (type == 2) { // lucene
970 querystring = tag+":("+querystring+")";
[4757]971 }
[8357]972
[4757]973}
[8029]974
975
[22046]976void add_field_info_sql(text_t &querystring, const text_t &tagseq,
977 const text_t& sqlcomb)
978{
979
980 if (tagseq == "") return; // do nothing
981
982 text_t element_in = "(element IN (";
983
984 text_tlist mdterms;
985
986 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
987
988 text_t tags_in = "";
989
990 while (!mdterms.empty()) {
991 text_t tag = mdterms.front();
992 mdterms.pop_front();
993
994 if (!tag.empty()) {
995
[24306]996 // remove "ex." prefix, but only if there are no other metadata set qualifiers
997 // in the metaname, since we want to retain prefixes like "ex.dc." as-is
998 text_t::iterator period = findchar(tag.begin(), tag.end(), '.');
999 text_t::iterator lastperiod = findlastchar(tag.begin(), tag.end(), '.');
1000
1001 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.") && period == lastperiod) {
[22046]1002 tag = substr (tag.begin()+3, tag.end());
1003 }
1004
1005 if (!tags_in.empty()) {
1006 tags_in += ",";
1007 }
1008
1009 tags_in += "'" + tag + "'";
1010 }
1011 }
1012
1013 element_in += tags_in + ") AND (";
1014
[24073]1015
[22046]1016 if (sqlcomb == "=") {
1017 // override what it means to do equality, to make it more like full text
1018 // searching
1019
1020 text_t orterms = "";
1021 text_t term = "";
1022 bool in_phrase = false;
1023
1024 text_t::const_iterator here = querystring.begin();
1025 text_t::const_iterator end = querystring.end();
1026 while (here != end) {
1027 if (is_unicode_letdig(*here)) {
1028 term.push_back(*here);
1029 }
1030 else if (*here == '"') {
1031 term.push_back(*here);
1032 if (!in_phrase) {
1033 in_phrase = true;
1034 } else {
1035 in_phrase = false;
1036 }
1037 }
1038 else if (in_phrase) {
1039 // Found word boundary, but in a phrase, so does not complete term
1040 term.push_back(*here);
1041 }
1042 else {
1043 // Found a word boundary
1044 if (!orterms.empty()) {
1045 orterms += " OR ";
1046 }
1047 orterms += "value LIKE '%" + term + "%'";
1048 term.clear();
1049 }
1050 ++here;
1051 }
1052
1053 if (!term.empty()) {
1054 if (!orterms.empty()) {
1055 orterms += " OR ";
1056 }
1057 orterms += "value LIKE '%" + term + "%'";
1058 }
1059
1060 element_in += orterms;
1061 }
[24073]1062 //We cast the value from STRING to REAL to allow numeric sorting
1063 else if (sqlcomb == "<num") {
1064 element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
1065 }
1066 else if (sqlcomb == ">num") {
1067 element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
1068 }
1069 else if (sqlcomb == "<=num") {
1070 element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
1071 }
1072 else if (sqlcomb == ">=num") {
1073 element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
1074 }
1075 else if (sqlcomb == "=num") {
1076 element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
1077 }
[22046]1078 else {
1079 // search on value is "as is" querystring
1080 element_in += "value " + sqlcomb + " '" + querystring+"'";
1081 }
1082
1083
1084 querystring = element_in + "))";
1085
1086}
1087
1088
[17796]1089void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1090
[11765]1091 int type = 2; //lucene
[8029]1092
[12784]1093 if (argb==0) { // simple
1094 // there will be no & or | as they should have already been removed
[11765]1095 // just tag the entire thing
[10995]1096 if (tag != "") {
[11765]1097 add_field_info(querystring, tag, type);
[10995]1098 }
[8357]1099 return;
1100 }
[10995]1101
[12784]1102 // need to replace & with &&, | with ||
[8357]1103 text_t::const_iterator here = querystring.begin();
1104 text_t::const_iterator end = querystring.end();
[12784]1105
1106 text_t finalquery = "";
[10995]1107 while (here != end) {
[12784]1108 if (*here == '&') {
1109 finalquery.push_back('&');
1110 finalquery.push_back('&');
1111 while (*(here+1) == '&') {
1112 ++here;
[10995]1113 }
[12784]1114 }
1115 else if (*here == '|') {
1116 finalquery.push_back('|');
1117 finalquery.push_back('|');
1118 while (*(here+1) == '|') {
1119 ++here;
1120 }
1121 }
[8357]1122 else {
[12784]1123 finalquery.push_back(*here);
[8357]1124 }
[10995]1125 ++here;
[8357]1126 }
[11765]1127 querystring = finalquery;
[12784]1128 add_field_info(querystring, tag, type);
[11765]1129}
1130
[12784]1131
1132void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1133
[11765]1134 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
[12784]1135 if (tag == "" && argb == 1) {
[11765]1136 return; // no field specifier, advanced mode, the query stays as written
[10995]1137 }
[11765]1138
1139 int type = 1; // mgpp
1140
1141 bool simple_and = (argb==0 && argt==0);
1142 text_t finalquery = "";
1143 text_t fieldpart ="";
1144 text_t queryelem = "";
1145 bool in_phrase = false;
1146 bool in_field = false;
1147
1148 text_t::const_iterator here = querystring.begin();
1149 text_t::const_iterator end = querystring.end();
1150 while (here != end) {
1151 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1152 queryelem.push_back(*here);
1153 }
1154 else if (*here == '|') {
1155 in_field = false;
1156 }
1157 else if (*here == '!' || *here == '(' || *here == ')') {
1158 if (!in_phrase) { // ignore these if in_phrase
1159 // output field, then output operator
1160 in_field = false;
1161 if (!queryelem.empty()) {
1162 if (!simple_and && !fieldpart.empty()) {
1163 add_field_info(fieldpart, tag, type);
1164 finalquery += fieldpart;
1165 finalquery.push_back(' ');
1166 fieldpart.clear();
1167 }
1168 fieldpart += queryelem;
1169 }
1170 if (!fieldpart.empty()) {
1171 add_field_info(fieldpart, tag, type);
1172 finalquery += fieldpart;
1173 finalquery.push_back(' ');
1174 }
1175 fieldpart.clear();
1176 queryelem.clear();
1177 finalquery.push_back(*here);
1178 finalquery.push_back(' ');
1179 }
1180 }
1181 else if (*here == '"') {
1182 queryelem.push_back(*here);
1183 if (in_phrase == false) in_phrase = true;
1184 else {
1185 in_phrase = false;
1186 }
1187 }
1188
1189 // Found word boundary, in a phrase
1190 else if (in_phrase) {
1191 queryelem.push_back(*here);
1192 }
1193 // Found a word boundary
1194 else {
1195 if (!queryelem.empty()) {
1196 if (queryelem == "&") {
1197 in_field = true;
1198 queryelem.clear();
1199 }
1200 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1201
1202 if (argb==1) {
1203 // simple search, these not allowed
1204 in_field = true;
1205 fieldpart += queryelem;
1206 fieldpart.push_back(' ');
1207 }
1208 queryelem.clear();
1209
1210 }
1211 else {
1212 if (!simple_and && !in_field) {
1213 if (!fieldpart.empty()) {
1214 add_field_info(fieldpart, tag, type);
1215 finalquery += fieldpart;
1216 finalquery.push_back(' ');
1217 fieldpart.clear();
1218 }
1219 }
1220
1221 fieldpart += queryelem;
1222 fieldpart.push_back(' ');
1223 queryelem.clear();
1224 }
1225 }
1226 }
1227 ++here;
1228 }
1229 // at the end
1230 if (!queryelem.empty()) {
1231 if (!simple_and && !in_field && !fieldpart.empty()) {
1232 add_field_info(fieldpart, tag, type);
1233 finalquery += fieldpart;
[18459]1234 finalquery.push_back(' ');
[11765]1235 fieldpart.clear();
1236 }
1237 fieldpart += queryelem;
1238 }
1239 if (!fieldpart.empty()) {
1240 add_field_info(fieldpart, tag, type);
1241 finalquery += fieldpart;
1242 fieldpart.clear();
[18459]1243
1244 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1245 // consider cutting this line
1246 finalquery.push_back(' ');
[11765]1247 }
[22046]1248
[11765]1249 querystring = finalquery;
[8029]1250}
[8357]1251
[12784]1252
[22046]1253void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1254 const text_t &sqlcomb,
1255 int argt, int argb)
1256{
1257 add_field_info_sql(querystring, tagseq, sqlcomb);
1258}
1259
1260
[12784]1261void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
[11765]1262 if (argct == 1) {
[12784]1263 format_field_info_mgpp(querystring, tag, argt, argb);
[11765]1264 } else if (argct == 2) {
[12784]1265 format_field_info_lucene(querystring, tag, argt, argb);
[11765]1266 }
1267}
[10995]1268
[12784]1269void mgpp_adddateelem(text_t& querystring, const int date)
1270{
1271 querystring.appendcstr(" [");
1272 if(date<0) {
1273 querystring.appendcstr("bc");
1274 querystring.appendint((date*-1));
1275 }
1276 else {
1277 querystring.appendint(date);
1278 }
1279 querystring.appendcstr("]:CV");
1280}
1281
1282void lucene_adddateelem(text_t& querystring, const int date)
1283{
1284 querystring.appendcstr(" CV:(");
1285 if(date<0) {
1286 querystring.appendcstr("bc");
1287 querystring.appendint((date*-1));
1288 }
1289 else {
1290 querystring.appendint(date);
1291 }
1292 querystring.appendcstr(")");
1293}
1294
1295
1296void add_dates(text_t &querystring, int startdate, int enddate,
1297 int startbc, int endbc, int ct)
1298{
1299 if(startdate)
1300 {
1301 int querystringis = 0;
1302 text_t::const_iterator here = querystring.begin();
1303 text_t::const_iterator end = querystring.end();
1304 while(here!=end)
1305 {
1306 if(!(isspace((*here)))){
1307 here = end;
1308 querystringis = 1;
1309 }
1310 else
1311 ++here;
1312 }
1313 //converting BCE dates
1314 if(startbc && startdate > 0)
1315 {
1316 startdate *= -1;
1317 }
1318 if(endbc && enddate > 0)
1319 {
1320 enddate *= -1;
1321 }
1322 if(enddate != 0 && enddate<startdate)
1323 {
1324 cout<<"enddate too small"<<endl;
1325 return;
1326 }
1327 if(querystringis)
1328 querystring.appendcstr(" AND");
1329 if(!enddate)
1330 {
1331 if (ct==1) {
1332 mgpp_adddateelem(querystring,startdate);
1333 }
1334 else { // lucene
1335 lucene_adddateelem(querystring,startdate);
1336 }
1337 }
1338 else{
1339 int nextdate = startdate;
1340 querystring.appendcstr(" (");
1341 while(nextdate<=enddate)
1342 {
1343 if(nextdate!=0) {
1344 if (ct==1) {
1345 mgpp_adddateelem(querystring,nextdate);
1346 }
1347 else { // lucene
1348 lucene_adddateelem(querystring,nextdate);
1349 }
1350 }
1351 ++nextdate;
1352 }
1353 querystring.appendcstr(" )");
1354 }
1355 }
1356
1357}
Note: See TracBrowser for help on using the repository browser.