source: main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp

Last change on this file was 28841, checked in by ak19, 10 years ago

Fixing up URL encoding of cgi args so that phrase searching works again. Tested MGPP, Lucene and SQLite searching. Tested simple search, fielded search, advanced single field and multi-field as well as running a query.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.6 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include "cgiutils.h"
28#include <ctype.h>
29#include "unitool.h" // for is_unicode_letdig
30
31// sets the ct, qt, qto arguments
32void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
33
34 if (args["ct"].empty()) {
35 text_t build_type = cinfo->buildType;
36 if (build_type == "mgpp") {
37 args["ct"] = "1";
38 } else if (build_type == "lucene") {
39 args["ct"] = "2";
40 } else {
41 args["ct"] = "0";
42 }
43 }
44 text_t arg_ct = args["ct"];
45 if (arg_ct == "0") {
46 // mg
47 args["qt"] = "0";
48 args["qto"] = "0";
49 return;
50 }
51
52 if (!args["qt"].empty() && !args["qto"].empty()) {
53 return;
54 }
55
56 text_tmap::iterator check = cinfo->format.find("SearchTypes");
57 text_t search_types;
58 if(check != cinfo->format.end() && !(*check).second.empty()){
59 search_types = (*check).second;
60 } else {
61 // assume plain,form
62 if (args["qto"].empty()) args["qto"] = "3";
63 if (args["qt"].empty()) {
64 int arg_qto = args.getintarg("qto");
65 if (arg_qto == 2) {
66 args["qt"] = "1";
67 } else {
68 args["qt"] = "0";
69 }
70 }
71 return;
72 }
73
74
75 if (args["qto"].empty()) {
76 unsigned int type = 0;
77 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
78 type |= 2;
79 }
80 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
81 type |= 1;
82 }
83 args.setintarg("qto", type);
84 }
85
86 if (args["qt"].empty()) {
87 int arg_qto = args.getintarg("qto");
88 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
89 args["qt"] = "1";
90 } else {
91 args["qt"] = "0";
92 }
93 }
94
95
96 // decide if sqlqto should be set or not
97 unsigned int sql_type = 0;
98 text_t infodb_type = cinfo->infodbType;
99 if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
100 if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
101 sql_type = 1;
102 }
103 }
104
105 if (sql_type) {
106 args["sqlqto"] = "1";
107 }
108 else {
109 args["sqlqto"] = "0";
110 }
111
112
113}
114
115// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
116void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
117 int stemIndexes = cinfo->stemIndexes;
118
119 if (stemIndexes & SIcasefold) {
120 args["ks"] = 1;
121 }
122 if (stemIndexes & SIstem) {
123 args["ss"] = 1;
124 }
125 if (stemIndexes & SIaccentfold) {
126 args["afs"] = 1;
127 }
128
129}
130
131
132
133void set_basequeryfilter_options (FilterRequest_t &request,
134 cgiargsclass &args)
135{
136
137 OptionValue_t option;
138 int arg_m = args.getintarg("m");
139
140 option.name = "Maxdocs";
141 option.value = arg_m;
142 request.filterOptions.push_back (option);
143
144 // option.name = "StartResults";
145 // option.value = args["r"];
146 // request.filterOptions.push_back (option);
147
148 // option.name = "EndResults";
149 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
150 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
151 // option.value = endresults;
152 // request.filterOptions.push_back (option);
153}
154
155
156// request.filterResultOptions and request.fields (if required) should
157// be set from the calling code
158void set_fulltext_queryfilter_options (FilterRequest_t &request,
159 const text_t &querystring,
160 cgiargsclass &args)
161{
162 // better if this function, and the two-query companion function
163 // was implemented in queryaction.cpp
164 // Has to be done here to documentaction.cpp can call it directly
165
166 request.filterName = "QueryFilter";
167
168 OptionValue_t option;
169
170 option.name = "Term";
171 option.value = querystring;
172 request.filterOptions.push_back (option);
173
174 option.name = "QueryType";
175 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
176 request.filterOptions.push_back (option);
177
178 option.name = "MatchMode";
179 // mgpp in advanced mode, always use some query
180 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
181 option.value = "some";
182 } else {
183 option.value = (args.getintarg("t")) ? "some" : "all";
184 }
185 request.filterOptions.push_back (option);
186
187 option.name = "Casefold";
188 option.value = (args.getintarg("k")) ? "true" : "false";
189 request.filterOptions.push_back (option);
190
191 option.name = "Stem";
192 option.value = (args.getintarg("s")) ? "true" : "false";
193 request.filterOptions.push_back (option);
194
195 option.name = "AccentFold";
196 option.value = (args.getintarg("af")) ? "true" : "false";
197 request.filterOptions.push_back (option);
198
199 if (!args["h"].empty()) {
200 option.name = "Index";
201 option.value = args["h"];
202 request.filterOptions.push_back (option);
203 }
204
205 if (!args["j"].empty()) {
206 option.name = "Subcollection";
207 option.value = args["j"];
208 request.filterOptions.push_back (option);
209 }
210
211 if (!args["n"].empty()) {
212 option.name = "Language";
213 option.value = args["n"];
214 request.filterOptions.push_back (option);
215 }
216
217 if (!args["g"].empty()) { // granularity for mgpp
218 option.name = "Level";
219 option.value = args["g"];
220 request.filterOptions.push_back (option);
221 }
222
223 if (!args["fs"].empty()) { // filter string for lucene
224 option.name = "FilterString";
225 option.value = args["fs"];
226 request.filterOptions.push_back (option);
227 }
228
229 if (!args["sf"].empty()) { // sort field for lucene
230 option.name = "SortField";
231 option.value = args["sf"];
232 request.filterOptions.push_back (option);
233 }
234 if (!args["so"].empty()) { // sort order for lucene
235 option.name = "SortOrder";
236 option.value = (args.getintarg("so")? "descending" : "ascending");
237 request.filterOptions.push_back (option);
238 }
239
240 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
241 option.name = "Fuzziness";
242 option.value = (text_t) "0." + args["fuzziness"];
243 request.filterOptions.push_back (option);
244 }
245
246 set_basequeryfilter_options(request, args);
247}
248
249
250
251void set_fulltext_queryfilter_options (FilterRequest_t &request,
252 const text_t &querystring1,
253 const text_t &querystring2,
254 cgiargsclass &args)
255{
256
257 set_fulltext_queryfilter_options (request, querystring1, args);
258
259 // fill in the second query if needed
260 if (!args["cq2"].empty()) {
261 OptionValue_t option;
262
263 option.name = "CombineQuery";
264 option.value = args["cq2"];
265 request.filterOptions.push_back (option);
266
267 option.name = "Term";
268 option.value = querystring2;
269 request.filterOptions.push_back (option);
270
271 option.name = "QueryType";
272 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
273 request.filterOptions.push_back (option);
274
275 option.name = "Casefold";
276 option.value = (args.getintarg("k")) ? "true" : "false";
277 request.filterOptions.push_back (option);
278
279 option.name = "Stem";
280 option.value = (args.getintarg("s")) ? "true" : "false";
281 request.filterOptions.push_back (option);
282
283 option.name = "AccentFold";
284 option.value = (args.getintarg("af")) ? "true" : "false";
285 request.filterOptions.push_back (option);
286
287 if (!args["h2"].empty()) {
288 option.name = "Index";
289 option.value = args["h2"];
290 request.filterOptions.push_back (option);
291 }
292
293 if (!args["j2"].empty()) {
294 option.name = "Subcollection";
295 option.value = args["j2"];
296 request.filterOptions.push_back (option);
297 }
298
299 if (!args["n2"].empty()) {
300 option.name = "Language";
301 option.value = args["n2"];
302 request.filterOptions.push_back (option);
303 }
304 }
305
306 // this is probably redundant, as first line to this method will have
307 // already caused it to invoke set_basequeryfilter_options
308
309 set_basequeryfilter_options(request, args);
310}
311
312
313
314// request.filterResultOptions and request.fields (if required) should
315// be set from the calling code
316void set_sql_queryfilter_options (FilterRequest_t &request,
317 cgiargsclass &args)
318{
319 if (!args["sqlsf"].empty()) { // sort field for lucene
320 OptionValue_t option;
321
322 option.name = "SortField";
323 option.value = args["sqlsf"];
324 request.filterOptions.push_back (option);
325 }
326
327 set_basequeryfilter_options(request, args);
328}
329
330
// Returns true when `character` is a query-syntax character for the given
// indexer: '#', '/' and '*' for mgpp (indexer_type 1); '?', '*', '~' and '^'
// for lucene (indexer_type 2). Every other indexer type has none.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
343
344// This function removes boolean operators from simple searches, and segments
345// chinese characters if segment=true
346// Called by several parse_..._form methods here, this function decodes &
347// to undo the URL encoding done in cgiutils.cpp for security purposes
348void format_querystring (text_t &querystring, int querymode, bool segment) {
349 text_t formattedstring;
350
351 // & has meaning in boolean searches and can be %26 encoded at this point, need to decode them now.
352 // Also decode any " here, so that the entire search phrase is highlighted and not just the final word
353 unsafe_cgi_arg("ALL", querystring);
354
355 // advanced search, no segmenting, don't need to do anything
356 if (querymode == 1 && !segment) return;
357
358 text_t::const_iterator here = querystring.begin();
359 text_t::const_iterator end = querystring.end();
360
361 // space is used to insert spaces between Chinese
362 // characters. No space is needed before the first
363 // Chinese character.
364 bool space = false;
365
366 // want to remove ()|!& from querystring so boolean queries are just
367 // "all the words" queries (unless querymode is advanced)
368 while (here != end) {
369 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
370 *here == '!' || *here == '&')) {
371 formattedstring.push_back(' ');
372 } else if (segment) {
373 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
374 ( *here >= 0xf900 && *here <= 0xfa6a)) {
375 /* text_t not big enough to handle these. */
376 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
377 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
378
379 // CJK character
380 if (!space) formattedstring.push_back (0x200b); // zero width space
381 formattedstring.push_back (*here);
382 formattedstring.push_back (0x200b);
383 space = true;
384 } else {
385
386 // non-Chinese character
387 formattedstring.push_back (*here);
388 space = false;
389
390 }
391
392 } else {
393 formattedstring.push_back (*here);
394 }
395 ++here;
396 }
397 querystring = formattedstring;
398}
399
400// turn query string into terms separated by spaces.
401// still working on this...
402text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
403 text_t::const_iterator here = querystring.begin();
404 text_t::const_iterator end = querystring.end();
405
406 // lets look for [] and () first - these are a pain.
407 text_t::const_iterator bracket;
408 text_t query_no_brackets = "";
409
410 // mgpp brackets: [xxx]:TI
411 if (findchar(here, end, '[') != end) {
412 while ((bracket = findchar(here, end, '[')) != end) {
413 // get the first bit
414 query_no_brackets += substr(here, bracket);
415 bracket++;
416 here = bracket;
417 // get the end bracket
418 bracket = findchar(here, end, ']');
419 query_no_brackets += substr(here, bracket);
420 // skip the :TI bits
421 while (bracket != end // do bracket != end test first, ELSE when bracket = end, we're past the string, in
422 && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
423 bracket++;
424 }
425 here = bracket;
426 }
427 if (here != end) {
428 query_no_brackets += substr(here,end);
429 }
430 } else if (findchar(here, end, '(') != end) {
431 // lucene brackets TI:(xxx)
432 while ((bracket = findchar(here, end, '(')) != end) {
433 // back up the field name
434 text_t::const_iterator old_bracket = bracket;
435 while (bracket != here && *bracket != ' ') { // order of tests in condition matters (see long comment above)
436 --bracket;
437 }
438 if (bracket != here) {
439 // get the first bit
440 query_no_brackets += substr(here, bracket+1);
441 }
442 here = old_bracket +1;
443 // get the end bracket
444 bracket = findchar(here, end, ')');
445 query_no_brackets += substr(here, bracket);
446 if (bracket != end) {
447 here = bracket+1;
448 }
449 }
450 if (here != end) {
451 query_no_brackets += substr(here,end);
452 }
453 } else {
454 // was no brackets
455 query_no_brackets = querystring;
456 }
457
458 if (arg_ct == "2") { // lucene
459 // look for AND OR NOT and remove
460 here = query_no_brackets.begin();
461 end = query_no_brackets.end();
462 text_tlist terms;
463 splitword(here, end, "AND", terms);
464 joinchar(terms, ' ', query_no_brackets);
465 here = query_no_brackets.begin();
466 end = query_no_brackets.end();
467 splitword(here, end, "OR", terms);
468 joinchar(terms, ' ', query_no_brackets);
469 here = query_no_brackets.begin();
470 end = query_no_brackets.end();
471 splitword(here, end, "NOT", terms);
472 joinchar(terms, ' ', query_no_brackets);
473
474 }
475 text_t terms = "";
476 bool space = false;
477 here = query_no_brackets.begin();
478 end = query_no_brackets.end();
479
480 while (here != end) {
481 if (*here == '#' || *here == '/') {
482 // skip over #is /10 etc
483 ++here;
484 while (here != end && *here != ' ') {
485 ++here;
486 }
487 if (here == end) break;
488 }
489 if (is_unicode_letdig(*here)) {
490 terms.push_back(*here);
491 space = false;
492 } else {
493 if (!space) {
494 terms.push_back(' ');
495 space = true;
496 }
497 }
498 ++here;
499 }
500 return trim(terms);
501
502}
503
504// search history tool
505// also used for form query macros
506text_t escape_quotes(const text_t &querystring) {
507
508 text_t::const_iterator here = querystring.begin();
509 text_t::const_iterator end = querystring.end();
510
511 text_t escquery = "";
512 while (here != end) {
513 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
514 else if (*here == '\n' || *here == '\r') {
515 escquery.push_back(' ');
516 } else {
517 escquery +="\\\\";
518 escquery.push_back(*here);
519 }
520
521 ++here;
522 }
523 return escquery;
524
525}
526
527// Parses the terms into words, and adds #sif if necessary
528text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &casefold, const text_t &accentfold,
529 const int indexer_type) {
530
531 // the default stem, case and accentfold are set to 0 if this is being used, so we are only adding on qualifiers if stem,case,accent is 1.
532 if (stem == "0" && casefold == "0" && accentfold =="0") {
533 return terms;
534 }
535 // this is only for mgpp collections, shouldn't be called for anything else
536 if (indexer_type != 1) {
537 return terms;
538 }
539
540 text_t outtext;
541 text_t word;
542
543 text_t::const_iterator here = terms.begin();
544 text_t::const_iterator end = terms.end();
545
546 text_t word_modifier = "#";
547 if (casefold == "1") word_modifier += "i";
548 if (accentfold == "1") word_modifier += "f";
549 if (stem == "1") word_modifier += "s";
550
551 while (here !=end) {
552
553 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
554 // not word boundary
555 word.push_back(*here);
556 ++here;
557 }
558 else {
559 // found word boundary
560 if (!word.empty() ) {
561 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
562 outtext += word;
563 word.clear();
564 }
565 else {
566 outtext += word+word_modifier;
567 word.clear();
568 }
569 }
570 // this only used in advanced form, so we leave in boolean operators
571 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
572 *here == '(' || *here == ')' || is_unicode_space(*here)) {
573 outtext.push_back(*here);
574 }
575 ++here;
576 }
577 }
578
579 // get last word
580 if (!word.empty()) {
581 outtext += word+word_modifier+" ";
582 }
583 return outtext;
584}
585
586
587// The following parse_..._form functions first decode various fields for
588// both simple and advanced searches to undo the URL encoding.
589// E.g. quotes have meaning in phrase searches and these have to be decoded
590// before sending the search off to the index.
591
592// some query form parsing functions for use with mgpp & lucene
593
594void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
595{
596 querystring.clear();
597
598 int argct = args.getintarg("ct");
599 int argt = args.getintarg("t");// t=0 -and, t=1 - or
600 int argb = args.getintarg("b");
601
602 text_t combine;
603
604 // lucene uses global combine, so only need this for mgpp
605 if (argct==1) {
606 if (argt == 0) combine = "&";
607 else combine = "|";
608 }
609
610 text_t field = args["fqf"];
611 if (field.empty()) return; // no query
612 unsafe_cgi_arg("ALL", field);
613 text_tarray fields;
614 splitchar(field.begin(), field.end(), ',', fields);
615
616 text_t value = args["fqv"];
617 if (value.empty()) return; // somethings wrong
618 unsafe_cgi_arg("ALL", value);
619 text_tarray values;
620 splitchar(value.begin(), value.end(), ',', values);
621
622
623 for (int i=0; i< values.size(); ++i) {
624 if (!values[i].empty()) {
625 text_t this_value = values[i];
626
627 // remove operators for simple search, segments text if necessary
628 format_querystring(this_value, argb, segment);
629
630 // add tag info for this field (and other processing)
631 format_field_info(this_value, fields[i], argct, argt, argb);
632
633 // add into query string
634 if (argct == 2) {
635 // lucene
636 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
637 querystring += this_value+" ";
638 } else {
639 // mgpp
640 if (!querystring.empty()) {
641 querystring += " "+ combine+ " ";
642 }
643 querystring += this_value;
644 }
645 }
646 }
647}
648
649
650void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
651 querystring.clear();
652
653 const int argct = args.getintarg("ct");
654 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
655 int argb = args.getintarg("b");
656 text_t combine;
657 if (argct==1) {
658 combine = "&";
659 }
660 else { // lucene
661 combine = "AND";
662 }
663
664 text_t field = args["fqf"];
665 if (field.empty()) return; // no query
666 unsafe_cgi_arg("ALL", field);
667 text_tarray fields;
668 splitchar(field.begin(), field.end(), ',', fields);
669
670 text_t value = args["fqv"];
671 if (value.empty()) return; // somethings wrong
672 unsafe_cgi_arg("ALL", value);
673 text_tarray values;
674 splitchar(value.begin(), value.end(), ',', values);
675
676 text_t comb = args["fqc"];
677 if (comb.empty()) return; //somethings wrong
678 //unsafe_cgi_arg("ALL", comb);
679 text_tarray combs;
680 splitchar(comb.begin(), comb.end(), ',', combs);
681
682 text_tarray stems;
683 text_tarray casefolds;
684 text_tarray accentfolds;
685 if (argct == 1) {// mgpp - lucene doesn't do stem/case
686 if (args["ss"]=="1") { //collection has stemming
687 text_t stem = args["fqs"];
688 if (stem.empty()) return; // somethings wrong
689 splitchar(stem.begin(), stem.end(), ',', stems);
690 }
691 if (args["ks"]=="1") { // collection has case folding
692 text_t fold = args["fqk"];
693 if (fold.empty()) return; // somethings wrong
694 splitchar(fold.begin(), fold.end(), ',', casefolds);
695 }
696 if (args["afs"]=="1") {
697 text_t accent = args["fqaf"];
698 if (accent.empty()) return; // somethings wrong
699 splitchar(accent.begin(), accent.end(), ',', accentfolds);
700 }
701 }
702
703 for(int i=0; i< values.size(); ++i) {
704 if (!values[i].empty()) {
705 if (i!=0) {
706 if (argct==1) {
707 if (combs[i-1]=="and") combine = "&";
708 else if (combs[i-1]=="or")combine = "|";
709 else if (combs[i-1]=="not")combine = "!";
710 }
711 else { // lucene
712 if (combs[i-1]=="and") combine = "AND";
713 else if (combs[i-1]=="or")combine = "OR";
714 else if (combs[i-1]=="not")combine = "NOT";
715 }
716 }
717 text_t this_value = values[i];
718 // remove operators for simple search, segments text if necessary
719 format_querystring(this_value, argb, segment);
720 if (argct == 1) { // mgpp only
721 this_value = addstemcase(this_value, ((args["ss"]=="1")?stems[i]:"0"), ((args["ks"]=="1")?casefolds[i]:"0"), ((args["afs"]=="1")?accentfolds[i]:"0"), argct);
722 }
723 // add tag info for this field (and other processing)
724 format_field_info(this_value, fields[i], argct, argt, argb);
725 // add into query string
726 if (!querystring.empty()) {
727 querystring += " "+ combine+ " ";
728 }
729 querystring += this_value;
730
731 }
732 }
733}
734
735
736// SQL versions for parsing query form
737
738void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
739{
740 querystring.clear();
741
742 int argt = args.getintarg("t");// t=0 -and, t=1 - or
743 int argb = args.getintarg("b");
744
745 text_t combine;
746
747 if (argt == 0) combine = "AND";
748 else combine = "OR";
749
750 text_t field = args["sqlfqf"];
751 if (field.empty()) return; // no query
752 unsafe_cgi_arg("ALL", field); // for the slash. //unsafe_cgi_arg("/", field);
753 text_tarray fields;
754 splitchar(field.begin(), field.end(), ',', fields);
755
756 text_t sqlcomb = args["sqlfqc"];
757 if (sqlcomb.empty()) return; //somethings wrong
758 //unsafe_cgi_arg("ALL", sqlcomb);
759 text_tarray sqlcombs;
760 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
761
762 text_t value = args["fqv"];
763 if (value.empty()) return; // somethings wrong
764 unsafe_cgi_arg("ALL", value);
765 text_tarray values;
766 splitchar(value.begin(), value.end(), ',', values);
767
768
769 for (int i=0; i< values.size(); ++i) {
770 if (!values[i].empty()) {
771 text_t this_value;
772 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
773 const text_t LIKE_CONDITION = "LIKE";
774
775 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
776 //in order to search a field starting with certain words.
777 if (sqlcombs[i] == STARTINGWITH_CONDITION)
778 {this_value = values[i];
779 this_value += "%";
780 // remove operators for simple search, segments text if necessary
781 format_querystring(this_value, argb, segment);
782 // add tag info for this field (and other processing)
783 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
784
785 else
786 {this_value = values[i];
787 // remove operators for simple search, segments text if necessary
788 format_querystring(this_value, argb, segment);
789 // add tag info for this field (and other processing)
790 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
791
792
793 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
794
795 if (querystring.empty()) {
796 // first query term
797 querystring = DISTINCT_SELECT_WHERE + this_value;
798 }
799 else {
800 this_value = DISTINCT_SELECT_WHERE + this_value;
801
802 if (combine=="AND") {
803 // INNER JOIN to restrict to only matching docOIDs
804 querystring = "SELECT docOID FROM (" + querystring + ")"
805 + " INNER JOIN (" + this_value +") USING (docOID)";
806 }
807 else if (combine=="OR") {
808 // Union to allow union of the two
809 querystring = querystring + " UNION " + this_value;
810 }
811 }
812 }
813 }
814}
815
816
817void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
818 bool segment)
819{
820 querystring.clear();
821
822 int argt = 0; // set it to 0 = AND, by default
823 int argb = args.getintarg("b");
824 text_t combine = "AND";
825
826 text_t field = args["sqlfqf"];
827
828 if (field.empty()) return; // no query
829 // need to decode %2F to / in the URL, e.g. to get dc.Title/Title/ex.Title again in the fields to search in
830 unsafe_cgi_arg("ALL", field); //unsafe_cgi_arg("/", field);
831 text_tarray fields;
832 splitchar(field.begin(), field.end(), ',', fields);
833
834 text_t sqlcomb = args["sqlfqc"];
835 if (sqlcomb.empty()) return; //somethings wrong
836 //unsafe_cgi_arg("ALL", sqlcomb);
837 text_tarray sqlcombs;
838 splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
839
840 text_t value = args["fqv"];
841 if (value.empty()) return; // somethings wrong
842 unsafe_cgi_arg("ALL", value); // decode all url-encoded parts of the values to search in
843 text_tarray values;
844 splitchar(value.begin(), value.end(), ',', values);
845
846 text_t comb = args["fqc"];
847 if (comb.empty()) return; //somethings wrong
848 //unsafe_cgi_arg("ALL", comb);
849 text_tarray combs;
850 splitchar(comb.begin(), comb.end(), ',', combs);
851
852 for(int i=0; i< values.size(); ++i) {
853 if (!values[i].empty()) {
854 if (i>0) {
855 if (combs[i-1]=="and") { combine = "AND"; }
856 else if (combs[i-1]=="or") { combine = "OR"; }
857 else if (combs[i-1]=="not") { combine = "NOT"; }
858 }
859 text_t this_value;
860 const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
861 const text_t LIKE_CONDITION = "LIKE";
862
863 //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
864 //in order to search a field starting with certain words.
865 if (sqlcombs[i] == STARTINGWITH_CONDITION)
866 {this_value = values[i];
867 this_value += "%";
868 // remove operators for simple search, segments text if necessary
869 format_querystring(this_value, argb, segment);
870 // add tag info for this field (and other processing)
871 format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
872
873 else
874 {this_value = values[i];
875 // remove operators for simple search, segments text if necessary
876 format_querystring(this_value, argb, segment);
877 // add tag info for this field (and other processing)
878 format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
879
880 const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
881
882 if (querystring.empty()) {
883 // first query term
884 querystring = DISTINCT_SELECT_WHERE + this_value;
885 }
886 else {
887 this_value = DISTINCT_SELECT_WHERE + this_value;
888
889 if (combine=="AND") {
890 // INNER JOIN to restrict to only matching docOIDs
891 querystring = "SELECT docOID FROM (" + querystring + ")"
892 + " INNER JOIN (" + this_value +") USING (docOID)";
893 }
894 else if (combine=="OR") {
895 // Union to allow union of the two
896 querystring = querystring + " UNION " + this_value;
897 }
898 else {
899 cerr << "Unsupported combination operation: " << combine << endl;
900 }
901 }
902
903 }
904 }
905}
906
907
908
909
910// Extended addqueryelem for Human Info project
911void addqueryelem_ex(text_t &querystring, const text_t &tag,
912 const text_t &terms, const text_t &stem,
913 const text_t &fold,
914 const text_t& combine, const text_t& word_combine) {
915
916 if (!querystring.empty()) { // have to put and/or
917 querystring += " " + combine + " ";
918 }
919 text_t outtext; outtext.reserve(512);
920 text_t word; word.reserve(100);
921 //unsigned short c;
922 text_t::const_iterator here = terms.begin();
923 text_t::const_iterator end = terms.end();
924 bool inquote = false, firstword = true;
925
926 text_t word2; word2.reserve(256);
927
928 while (here !=end) {
929 if (is_unicode_space(*here)) {
930 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
931 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
932 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
933 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
934 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
935 if (inquote) {
936 word2.push_back(*here);
937 }
938 word.append(word2); word2.clear();
939
940 if (!inquote && !word.empty() ) {
941 // found word boundary
942
943 if (stem == "1" || fold =="1") {
944 word += "#";
945 if (stem == "1") word += "s";
946 //else word += "u";
947
948 if (fold == "1") word += "i";
949 //else word += "c";
950 }
951 if (firstword) {
952 firstword = false;
953 } else {
954 outtext += " " + word_combine + " ";
955 }
956 outtext += "[" + word + "]:"+tag;
957 word.clear();
958 }
959 ++here;
960 } else if (*here == '\"') {
961 word2.push_back(*here);
962 inquote = !inquote;
963 ++here;
964 } else {
965 // not word boundary
966 word2.push_back(*here);
967 ++here;
968 }
969 }
970
971 // get last word
972 if (!word2.empty()) {
973 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
974 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
975 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
976 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
977 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
978 word.append(word2); word2.clear();
979
980 if (stem == "1"|| fold == "1") {
981 word += "#";
982 if (stem == "1") word += "s";
983 //else word += "u";
984
985 if (fold == "1") word += "i";
986 //else word += "c";
987 }
988 if (!outtext.empty()) outtext += " " + word_combine + " ";
989 outtext += "[" + word + "]:"+tag;
990 }
991 querystring += "(" + outtext + ")";
992}
993
994void add_field_info(text_t &querystring, const text_t &tag, int type) {
995
996 if (tag == "") return; // do nothing
997 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
998 if (type == 1) { //mgpp
999 querystring = "["+querystring+"]:"+tag;
1000 } else if (type == 2) { // lucene
1001 querystring = tag+":("+querystring+")";
1002 }
1003
1004}
1005
1006
1007void add_field_info_sql(text_t &querystring, const text_t &tagseq,
1008 const text_t& sqlcomb)
1009{
1010
1011 if (tagseq == "") return; // do nothing
1012
1013 text_t element_in = "(element IN (";
1014
1015 text_tlist mdterms;
1016
1017 splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
1018
1019 text_t tags_in = "";
1020
1021 while (!mdterms.empty()) {
1022 text_t tag = mdterms.front();
1023 mdterms.pop_front();
1024
1025 if (!tag.empty()) {
1026
1027 // remove "ex." prefix, but only if there are no other metadata set qualifiers
1028 // in the metaname, since we want to retain prefixes like "ex.dc." as-is
1029 text_t::iterator period = findchar(tag.begin(), tag.end(), '.');
1030 text_t::iterator lastperiod = findlastchar(tag.begin(), tag.end(), '.');
1031
1032 if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.") && period == lastperiod) {
1033 tag = substr (tag.begin()+3, tag.end());
1034 }
1035
1036 if (!tags_in.empty()) {
1037 tags_in += ",";
1038 }
1039
1040 tags_in += "'" + tag + "'";
1041 }
1042 }
1043
1044 element_in += tags_in + ") AND (";
1045
1046
1047 if (sqlcomb == "=") {
1048 // override what it means to do equality, to make it more like full text
1049 // searching
1050
1051 text_t orterms = "";
1052 text_t term = "";
1053 bool in_phrase = false;
1054
1055 text_t::const_iterator here = querystring.begin();
1056 text_t::const_iterator end = querystring.end();
1057 while (here != end) {
1058 if (is_unicode_letdig(*here)) {
1059 term.push_back(*here);
1060 }
1061 else if (*here == '"') {
1062 term.push_back(*here);
1063 if (!in_phrase) {
1064 in_phrase = true;
1065 } else {
1066 in_phrase = false;
1067 }
1068 }
1069 else if (in_phrase) {
1070 // Found word boundary, but in a phrase, so does not complete term
1071 term.push_back(*here);
1072 }
1073 else {
1074 // Found a word boundary
1075 if (!orterms.empty()) {
1076 orterms += " OR ";
1077 }
1078 orterms += "value LIKE '%" + term + "%'";
1079 term.clear();
1080 }
1081 ++here;
1082 }
1083
1084 if (!term.empty()) {
1085 if (!orterms.empty()) {
1086 orterms += " OR ";
1087 }
1088 orterms += "value LIKE '%" + term + "%'";
1089 }
1090
1091 element_in += orterms;
1092 }
1093 //We cast the value from STRING to REAL to allow numeric sorting
1094 else if (sqlcomb == "<num") {
1095 element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
1096 }
1097 else if (sqlcomb == ">num") {
1098 element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
1099 }
1100 else if (sqlcomb == "<=num") {
1101 element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
1102 }
1103 else if (sqlcomb == ">=num") {
1104 element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
1105 }
1106 else if (sqlcomb == "=num") {
1107 element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
1108 }
1109 else {
1110 // search on value is "as is" querystring
1111 element_in += "value " + sqlcomb + " '" + querystring+"'";
1112 }
1113
1114
1115 querystring = element_in + "))";
1116
1117}
1118
1119
1120void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1121
1122 int type = 2; //lucene
1123
1124 if (argb==0) { // simple
1125 // there will be no & or | as they should have already been removed
1126 // just tag the entire thing
1127 if (tag != "") {
1128 add_field_info(querystring, tag, type);
1129 }
1130 return;
1131 }
1132
1133 // need to replace & with &&, | with ||
1134 text_t::const_iterator here = querystring.begin();
1135 text_t::const_iterator end = querystring.end();
1136
1137 text_t finalquery = "";
1138 while (here != end) {
1139 if (*here == '&') {
1140 finalquery.push_back('&');
1141 finalquery.push_back('&');
1142 while (*(here+1) == '&') {
1143 ++here;
1144 }
1145 }
1146 else if (*here == '|') {
1147 finalquery.push_back('|');
1148 finalquery.push_back('|');
1149 while (*(here+1) == '|') {
1150 ++here;
1151 }
1152 }
1153 else {
1154 finalquery.push_back(*here);
1155 }
1156 ++here;
1157 }
1158 querystring = finalquery;
1159 add_field_info(querystring, tag, type);
1160}
1161
1162
1163void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1164
1165 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
1166 if (tag == "" && argb == 1) {
1167 return; // no field specifier, advanced mode, the query stays as written
1168 }
1169
1170 int type = 1; // mgpp
1171
1172 bool simple_and = (argb==0 && argt==0);
1173 text_t finalquery = "";
1174 text_t fieldpart ="";
1175 text_t queryelem = "";
1176 bool in_phrase = false;
1177 bool in_field = false;
1178
1179 text_t::const_iterator here = querystring.begin();
1180 text_t::const_iterator end = querystring.end();
1181 while (here != end) {
1182 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
1183 queryelem.push_back(*here);
1184 }
1185 else if (*here == '|') {
1186 in_field = false;
1187 }
1188 else if (*here == '!' || *here == '(' || *here == ')') {
1189 if (!in_phrase) { // ignore these if in_phrase
1190 // output field, then output operator
1191 in_field = false;
1192 if (!queryelem.empty()) {
1193 if (!simple_and && !fieldpart.empty()) {
1194 add_field_info(fieldpart, tag, type);
1195 finalquery += fieldpart;
1196 finalquery.push_back(' ');
1197 fieldpart.clear();
1198 }
1199 fieldpart += queryelem;
1200 }
1201 if (!fieldpart.empty()) {
1202 add_field_info(fieldpart, tag, type);
1203 finalquery += fieldpart;
1204 finalquery.push_back(' ');
1205 }
1206 fieldpart.clear();
1207 queryelem.clear();
1208 finalquery.push_back(*here);
1209 finalquery.push_back(' ');
1210 }
1211 }
1212 else if (*here == '"') {
1213 queryelem.push_back(*here);
1214 if (in_phrase == false) in_phrase = true;
1215 else {
1216 in_phrase = false;
1217 }
1218 }
1219
1220 // Found word boundary, in a phrase
1221 else if (in_phrase) {
1222 queryelem.push_back(*here);
1223 }
1224 // Found a word boundary
1225 else {
1226 if (!queryelem.empty()) {
1227 if (queryelem == "&") {
1228 in_field = true;
1229 queryelem.clear();
1230 }
1231 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1232
1233 if (argb==1) {
1234 // simple search, these not allowed
1235 in_field = true;
1236 fieldpart += queryelem;
1237 fieldpart.push_back(' ');
1238 }
1239 queryelem.clear();
1240
1241 }
1242 else {
1243 if (!simple_and && !in_field) {
1244 if (!fieldpart.empty()) {
1245 add_field_info(fieldpart, tag, type);
1246 finalquery += fieldpart;
1247 finalquery.push_back(' ');
1248 fieldpart.clear();
1249 }
1250 }
1251
1252 fieldpart += queryelem;
1253 fieldpart.push_back(' ');
1254 queryelem.clear();
1255 }
1256 }
1257 }
1258 ++here;
1259 }
1260 // at the end
1261 if (!queryelem.empty()) {
1262 if (!simple_and && !in_field && !fieldpart.empty()) {
1263 add_field_info(fieldpart, tag, type);
1264 finalquery += fieldpart;
1265 finalquery.push_back(' ');
1266 fieldpart.clear();
1267 }
1268 fieldpart += queryelem;
1269 }
1270 if (!fieldpart.empty()) {
1271 add_field_info(fieldpart, tag, type);
1272 finalquery += fieldpart;
1273 fieldpart.clear();
1274
1275 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1276 // consider cutting this line
1277 finalquery.push_back(' ');
1278 }
1279
1280 querystring = finalquery;
1281}
1282
1283
1284void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1285 const text_t &sqlcomb,
1286 int argt, int argb)
1287{
1288 add_field_info_sql(querystring, tagseq, sqlcomb);
1289}
1290
1291
1292void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
1293 if (argct == 1) {
1294 format_field_info_mgpp(querystring, tag, argt, argb);
1295 } else if (argct == 2) {
1296 format_field_info_lucene(querystring, tag, argt, argb);
1297 }
1298}
1299
1300void mgpp_adddateelem(text_t& querystring, const int date)
1301{
1302 querystring.appendcstr(" [");
1303 if(date<0) {
1304 querystring.appendcstr("bc");
1305 querystring.appendint((date*-1));
1306 }
1307 else {
1308 querystring.appendint(date);
1309 }
1310 querystring.appendcstr("]:CV");
1311}
1312
1313void lucene_adddateelem(text_t& querystring, const int date)
1314{
1315 querystring.appendcstr(" CV:(");
1316 if(date<0) {
1317 querystring.appendcstr("bc");
1318 querystring.appendint((date*-1));
1319 }
1320 else {
1321 querystring.appendint(date);
1322 }
1323 querystring.appendcstr(")");
1324}
1325
1326
1327void add_dates(text_t &querystring, int startdate, int enddate,
1328 int startbc, int endbc, int ct)
1329{
1330 if(startdate)
1331 {
1332 int querystringis = 0;
1333 text_t::const_iterator here = querystring.begin();
1334 text_t::const_iterator end = querystring.end();
1335 while(here!=end)
1336 {
1337 if(!(isspace((*here)))){
1338 here = end;
1339 querystringis = 1;
1340 }
1341 else
1342 ++here;
1343 }
1344 //converting BCE dates
1345 if(startbc && startdate > 0)
1346 {
1347 startdate *= -1;
1348 }
1349 if(endbc && enddate > 0)
1350 {
1351 enddate *= -1;
1352 }
1353 if(enddate != 0 && enddate<startdate)
1354 {
1355 cout<<"enddate too small"<<endl;
1356 return;
1357 }
1358 if(querystringis)
1359 querystring.appendcstr(" AND");
1360 if(!enddate)
1361 {
1362 if (ct==1) {
1363 mgpp_adddateelem(querystring,startdate);
1364 }
1365 else { // lucene
1366 lucene_adddateelem(querystring,startdate);
1367 }
1368 }
1369 else{
1370 int nextdate = startdate;
1371 querystring.appendcstr(" (");
1372 while(nextdate<=enddate)
1373 {
1374 if(nextdate!=0) {
1375 if (ct==1) {
1376 mgpp_adddateelem(querystring,nextdate);
1377 }
1378 else { // lucene
1379 lucene_adddateelem(querystring,nextdate);
1380 }
1381 }
1382 ++nextdate;
1383 }
1384 querystring.appendcstr(" )");
1385 }
1386 }
1387
1388}
Note: See TracBrowser for help on using the repository browser.