root/main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp @ 28221

Revision 28221, 37.3 KB (checked in by kjdon, 7 years ago)

Tidying up the case/accent/stem checkboxes in the advanced mgpp search form, and adding an accent-fold checkbox. These boxes are now displayed or hidden depending on whether the corresponding options are available in the collection.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// sets the ct, qt, qto arguments
31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33  if (args["ct"].empty()) {
34    text_t build_type = cinfo->buildType;
35    if (build_type == "mgpp") {
36      args["ct"] = "1";
37    } else if (build_type == "lucene") {
38      args["ct"] = "2";
39    } else {
40      args["ct"] = "0";
41    }
42  }
43  text_t arg_ct = args["ct"];
44  if (arg_ct == "0") {
45    // mg
46    args["qt"] = "0";
47    args["qto"] = "0";
48    return;
49  }
50
51  if (!args["qt"].empty() && !args["qto"].empty()) {
52    return;
53  }
54 
55  text_tmap::iterator check = cinfo->format.find("SearchTypes");
56  text_t search_types;
57  if(check != cinfo->format.end() && !(*check).second.empty()){
58    search_types = (*check).second;
59  } else {
60    // assume plain,form
61    if (args["qto"].empty()) args["qto"] = "3";
62    if (args["qt"].empty()) {
63      int arg_qto = args.getintarg("qto");
64      if (arg_qto == 2) {
65    args["qt"] = "1";
66      } else {
67    args["qt"] = "0";
68      }
69    }
70    return;
71  }
72 
73 
74  if (args["qto"].empty()) {
75    unsigned int type = 0;
76    if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77      type |= 2;
78    }
79    if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80      type |= 1;
81    }
82    args.setintarg("qto", type);
83  }
84
85  if (args["qt"].empty()) {
86    int arg_qto = args.getintarg("qto");
87    if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88      args["qt"] = "1";
89    } else {
90      args["qt"] = "0";
91    }
92  }
93
94
95  // decide if sqlqto should be set or not
96  unsigned int sql_type = 0;
97  text_t infodb_type = cinfo->infodbType;
98  if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
99    if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
100      sql_type = 1;
101    }
102  }
103
104  if (sql_type) {
105    args["sqlqto"] = "1";
106  }
107  else {
108    args["sqlqto"] = "0";
109  }
110
111
112}
113
114// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
115void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
116  int stemIndexes = cinfo->stemIndexes;
117
118  if (stemIndexes & SIcasefold) {
119    args["ks"] = 1;
120  }
121  if (stemIndexes & SIstem) {
122    args["ss"] = 1;
123  }
124  if (stemIndexes & SIaccentfold) {
125    args["afs"] = 1;
126  }
127
128}
129
130
131
132void set_basequeryfilter_options (FilterRequest_t &request,
133                  cgiargsclass &args)
134{
135
136  OptionValue_t option;
137  int arg_m = args.getintarg("m");
138 
139  option.name = "Maxdocs";
140  option.value = arg_m;
141  request.filterOptions.push_back (option);
142
143  //  option.name = "StartResults";
144  //  option.value = args["r"];
145  //  request.filterOptions.push_back (option);
146
147  //  option.name = "EndResults";
148  //  int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
149  //  if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
150  //  option.value = endresults;
151  //  request.filterOptions.push_back (option);
152}
153
154
155// request.filterResultOptions and request.fields (if required) should
156// be set from the calling code
157void set_fulltext_queryfilter_options (FilterRequest_t &request,
158                       const text_t &querystring,
159                       cgiargsclass &args)
160{
161  // better if this function, and the two-query companion function
162  // was implemented in queryaction.cpp
163  // Has to be done here to documentaction.cpp can call it directly
164
165  request.filterName = "QueryFilter";
166
167  OptionValue_t option;
168
169  option.name = "Term";
170  option.value = querystring;
171  request.filterOptions.push_back (option);
172
173  option.name = "QueryType";
174  option.value = (args.getintarg("t")) ? "ranked" : "boolean";
175  request.filterOptions.push_back (option);
176
177  option.name = "MatchMode";
178  // mgpp in advanced mode, always use some query
179  if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
180    option.value = "some";
181  } else {
182    option.value = (args.getintarg("t")) ? "some" : "all";
183  }
184  request.filterOptions.push_back (option);
185
186  option.name = "Casefold";
187  option.value = (args.getintarg("k")) ? "true" : "false";
188  request.filterOptions.push_back (option);
189
190  option.name = "Stem";
191  option.value = (args.getintarg("s")) ? "true" : "false";
192  request.filterOptions.push_back (option);
193
194  option.name = "AccentFold";
195  option.value = (args.getintarg("af")) ? "true" : "false";
196  request.filterOptions.push_back (option);
197 
198  if (!args["h"].empty()) {
199    option.name = "Index";
200    option.value = args["h"];
201    request.filterOptions.push_back (option);
202  }
203
204  if (!args["j"].empty()) {
205    option.name = "Subcollection";
206    option.value = args["j"];
207    request.filterOptions.push_back (option);
208  }
209
210  if (!args["n"].empty()) {
211    option.name = "Language";
212    option.value = args["n"];
213    request.filterOptions.push_back (option);
214  }
215 
216  if (!args["g"].empty()) { // granularity for mgpp
217    option.name = "Level";
218    option.value = args["g"];
219    request.filterOptions.push_back (option);
220  }
221
222  if (!args["fs"].empty()) { // filter string for lucene
223    option.name = "FilterString";
224    option.value = args["fs"];
225    request.filterOptions.push_back (option);
226  }
227
228  if (!args["sf"].empty()) { // sort field for lucene
229    option.name = "SortField";
230    option.value = args["sf"];
231    request.filterOptions.push_back (option);
232  }
233  if (!args["so"].empty()) { // sort order for lucene
234    option.name = "SortOrder";
235    option.value = (args.getintarg("so")? "descending" : "ascending");
236    request.filterOptions.push_back (option);
237  }
238
239  if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
240    option.name = "Fuzziness";
241    option.value = (text_t) "0." + args["fuzziness"];
242    request.filterOptions.push_back (option);
243  }
244
245  set_basequeryfilter_options(request, args);
246}
247
248
249
250void set_fulltext_queryfilter_options (FilterRequest_t &request,
251                       const text_t &querystring1,
252                       const text_t &querystring2,
253                       cgiargsclass &args)
254{
255
256  set_fulltext_queryfilter_options (request, querystring1, args);
257
258  // fill in the second query if needed
259  if (!args["cq2"].empty()) {
260    OptionValue_t option;
261
262    option.name = "CombineQuery";
263    option.value = args["cq2"];
264    request.filterOptions.push_back (option);
265   
266    option.name = "Term";
267    option.value = querystring2;
268    request.filterOptions.push_back (option);
269   
270    option.name = "QueryType";
271    option.value = (args.getintarg("t")) ? "ranked" : "boolean";
272    request.filterOptions.push_back (option);
273
274    option.name = "Casefold";
275    option.value = (args.getintarg("k")) ? "true" : "false";
276    request.filterOptions.push_back (option);
277
278    option.name = "Stem";
279    option.value = (args.getintarg("s")) ? "true" : "false";
280    request.filterOptions.push_back (option);
281
282    option.name = "AccentFold";
283    option.value = (args.getintarg("af")) ? "true" : "false";
284    request.filterOptions.push_back (option);
285
286    if (!args["h2"].empty()) {
287      option.name = "Index";
288      option.value = args["h2"];
289      request.filterOptions.push_back (option);
290    }
291
292    if (!args["j2"].empty()) {
293      option.name = "Subcollection";
294      option.value = args["j2"];
295      request.filterOptions.push_back (option);
296    }
297
298    if (!args["n2"].empty()) {
299      option.name = "Language";
300      option.value = args["n2"];
301      request.filterOptions.push_back (option);
302    }
303  }
304
305  // this is probably redundant, as first line to this method will have
306  // already caused it to invoke set_basequeryfilter_options
307
308  set_basequeryfilter_options(request, args);
309}
310
311
312
313// request.filterResultOptions and request.fields (if required) should
314// be set from the calling code
315void set_sql_queryfilter_options (FilterRequest_t &request,
316                  cgiargsclass &args)
317{
318  if (!args["sqlsf"].empty()) { // sort field for lucene
319    OptionValue_t option;
320
321    option.name = "SortField";
322    option.value = args["sqlsf"];
323    request.filterOptions.push_back (option);
324  }
325
326  set_basequeryfilter_options(request, args);
327}
328
329
// Returns true if character is one of the query syntax characters of the
// given indexer: '#', '/' and '*' for mgpp (indexer_type 1); '?', '*', '~'
// and '^' for lucene (indexer_type 2). Any other indexer type has no
// special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
342
343// This function removes boolean operators from simple searches, and segments
344// chinese characters if segment=true
345void format_querystring (text_t &querystring, int querymode, bool segment) {
346  text_t formattedstring;
347
348  // advanced search, no segmenting, don't need to do anything
349  if (querymode == 1 && !segment) return;
350 
351  text_t::const_iterator here = querystring.begin();
352  text_t::const_iterator end = querystring.end();
353
354  // space is used to insert spaces between Chinese
355  // characters. No space is needed before the first
356  // Chinese character.
357  bool space = false;
358
359  // want to remove ()|!& from querystring so boolean queries are just
360  // "all the words" queries (unless querymode is advanced)
361  while (here != end) {
362    if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
363                 *here == '!' || *here == '&')) {
364      formattedstring.push_back(' ');
365    } else if (segment) {
366      if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
367      ( *here >= 0xf900 && *here <= 0xfa6a)) {
368    /* text_t not big enough to handle these. */
369    /*    (*here >= 0x20000 && *here <= 0x2a6d6) ||
370      (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
371   
372    // CJK character
373    if (!space) formattedstring.push_back (0x200b); // zero width space
374    formattedstring.push_back (*here);
375    formattedstring.push_back (0x200b);
376    space = true;
377      } else {
378   
379    // non-Chinese character
380    formattedstring.push_back (*here);
381    space = false;
382   
383      }
384   
385    } else {
386      formattedstring.push_back (*here);
387    }
388    ++here;
389  }
390  querystring = formattedstring;
391}
392
393// turn query string into terms separated by spaces.
394// still working on this...
395text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
396  text_t::const_iterator here = querystring.begin();
397  text_t::const_iterator end = querystring.end();
398
399  // lets look for [] and () first - these are a pain.
400  text_t::const_iterator bracket;
401  text_t query_no_brackets = "";
402
403  // mgpp brackets: [xxx]:TI
404  if (findchar(here, end, '[') != end) {
405    while ((bracket = findchar(here, end, '[')) != end) {
406      // get the first bit
407      query_no_brackets += substr(here, bracket);
408      bracket++;
409      here = bracket;
410      // get the end bracket
411      bracket = findchar(here, end, ']');
412      query_no_brackets += substr(here, bracket);
413      // skip the :TI bits
414      while (bracket != end     // do bracket != end test first, ELSE when bracket = end, we're past the string, in
415          && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
416            bracket++;
417      }
418      here = bracket;
419    }
420    if (here != end) {
421      query_no_brackets += substr(here,end);
422    }
423  } else if (findchar(here, end, '(') != end) {
424    // lucene brackets TI:(xxx)
425    while ((bracket = findchar(here, end, '(')) != end) {
426      // back up the field name
427      text_t::const_iterator old_bracket = bracket;
428      while (bracket != here && *bracket != ' ') {  // order of tests in condition matters (see long comment above)
429    --bracket;                     
430      }
431      if (bracket != here) {
432    // get the first bit
433    query_no_brackets += substr(here, bracket+1);
434      }
435      here = old_bracket +1;
436      // get the end bracket
437      bracket = findchar(here, end, ')');
438      query_no_brackets += substr(here, bracket);
439      if (bracket != end) {
440    here = bracket+1;
441      }
442    }
443    if (here != end) {
444      query_no_brackets += substr(here,end);
445    }
446  } else {
447    // was no brackets
448    query_no_brackets = querystring;
449  }
450 
451 
452  if (arg_ct == "2") { // lucene
453    // look for AND OR NOT and remove
454    here = query_no_brackets.begin();
455    end = query_no_brackets.end();
456    text_tlist terms;
457    splitword(here, end, "AND", terms);
458    joinchar(terms, ' ', query_no_brackets);
459    here = query_no_brackets.begin();
460    end = query_no_brackets.end();
461    splitword(here, end, "OR", terms);
462    joinchar(terms, ' ', query_no_brackets);
463    here = query_no_brackets.begin();
464    end = query_no_brackets.end();
465    splitword(here, end, "NOT", terms);
466    joinchar(terms, ' ', query_no_brackets);
467   
468  }
469  text_t terms = "";
470  bool space = false;
471  here = query_no_brackets.begin();
472  end = query_no_brackets.end();
473 
474  while (here != end) {
475    if (*here ==  '#' || *here == '/') {
476      // skip over #is /10 etc
477      ++here;
478      while (here != end && *here != ' ') {
479    ++here;
480      }
481      if (here == end) break;
482    }
483    if (is_unicode_letdig(*here)) {
484      terms.push_back(*here);
485      space = false;
486    } else {
487      if (!space) {
488    terms.push_back(' ');
489    space = true;
490      }
491    }
492    ++here;
493  }
494  return trim(terms);
495   
496}
497
498// search history tool
499// also used for form query macros
500text_t escape_quotes(const text_t &querystring) {
501
502  text_t::const_iterator here = querystring.begin();
503  text_t::const_iterator end = querystring.end();
504 
505  text_t escquery = "";
506  while (here != end) {
507    if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
508    else if (*here == '\n' || *here == '\r') {
509      escquery.push_back(' ');
510    } else {
511      escquery +="\\\\";
512      escquery.push_back(*here);
513    }
514
515    ++here;
516  }
517  return escquery;
518
519}
520
521// Parses the terms into words, and adds #sif if necessary
522text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &casefold, const text_t &accentfold,
523           const int indexer_type) {
524 
525  // the default stem, case and accentfold are set to 0 if this is being used, so we are only adding on qualifiers if stem,case,accent is 1.
526  if (stem == "0" && casefold == "0" && accentfold =="0") {
527    return terms;
528  }
529  // this is only for mgpp collections, shouldn't be called for anything else
530  if (indexer_type != 1) {
531    return terms;
532  }
533 
534  text_t outtext;
535  text_t word;
536
537  text_t::const_iterator here = terms.begin();
538  text_t::const_iterator end = terms.end();
539
540  text_t word_modifier = "#";
541  if (stem == "1") word_modifier += "s";
542  if (casefold == "1") word_modifier += "i";
543  if (accentfold == "1") word_modifier += "f";
544
545  while (here !=end) {
546
547    if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
548      // not word boundary
549      word.push_back(*here);
550      ++here;   
551    }
552    else {
553      // found word boundary   
554      if (!word.empty() ) {
555    if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
556      outtext += word;
557      word.clear();
558    }
559    else {
560      outtext += word+word_modifier;
561      word.clear();
562    }
563      }
564      // this only used in advanced form, so we leave in boolean operators
565      if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
566      *here == '(' || *here == ')' || is_unicode_space(*here)) {
567    outtext.push_back(*here);
568      }
569      ++here;
570    }
571  }
572   
573  // get last word
574  if (!word.empty()) {
575   outtext += word+word_modifier+" ";
576  }
577  return outtext;
578}
579
580
581// some query form parsing functions for use with mgpp & lucene
582
583void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
584{
585  querystring.clear();
586
587  int argct = args.getintarg("ct");
588  int argt = args.getintarg("t");// t=0 -and, t=1 - or
589  int argb = args.getintarg("b");
590   
591  text_t combine;
592
593  // lucene uses global combine, so only need this for mgpp
594  if (argct==1) {
595    if (argt == 0) combine = "&";
596    else combine = "|";
597  }
598 
599  text_t field = args["fqf"];
600  if (field.empty()) return; // no query
601  text_tarray fields;
602  splitchar(field.begin(), field.end(), ',', fields);
603 
604  text_t value = args["fqv"];
605  if (value.empty()) return; // somethings wrong
606  text_tarray values;
607  splitchar(value.begin(), value.end(), ',', values);
608
609
610  for (int i=0; i< values.size(); ++i) {
611    if (!values[i].empty()) {
612      text_t this_value = values[i];
613
614      // remove operators for simple search, segments text if necessary
615      format_querystring(this_value, argb, segment);
616           
617      // add tag info for this field (and other processing)
618      format_field_info(this_value, fields[i], argct, argt, argb);
619
620      // add into query string
621      if (argct == 2) {
622    // lucene
623    // we don't worry about AND/OR, cos this is done by defaultcombineoperator
624    querystring += this_value+" ";
625      } else {
626    // mgpp
627    if (!querystring.empty()) {
628      querystring += " "+ combine+ " ";
629    }
630    querystring += this_value;
631      }
632    }
633  }
634}
635
636
637void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
638  querystring.clear();
639
640  const int argct = args.getintarg("ct");
641  int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
642  int argb = args.getintarg("b");
643  text_t combine;
644  if (argct==1) {
645    combine = "&";
646  }
647  else { // lucene
648    combine = "AND";
649  }
650
651  text_t field = args["fqf"];
652  if (field.empty()) return; // no query
653  text_tarray fields;
654  splitchar(field.begin(), field.end(), ',', fields);
655 
656  text_t value = args["fqv"];
657  if (value.empty()) return; // somethings wrong
658  text_tarray values;
659  splitchar(value.begin(), value.end(), ',', values);
660
661  text_t comb = args["fqc"];
662  if (comb.empty()) return; //somethings wrong
663  text_tarray combs;
664  splitchar(comb.begin(), comb.end(), ',', combs);
665
666  text_tarray stems;
667  text_tarray casefolds;
668  text_tarray accentfolds;
669  if (argct == 1) {// mgpp - lucene doesn't do stem/case
670    if (args["ss"]=="1") { //collection has stemming
671      text_t stem = args["fqs"];
672      if (stem.empty()) return; // somethings wrong
673      splitchar(stem.begin(), stem.end(), ',', stems);
674    }
675    if (args["ks"]=="1") { // collection has case folding
676      text_t fold = args["fqk"];
677      if (fold.empty()) return; // somethings wrong
678      splitchar(fold.begin(), fold.end(), ',', casefolds);
679    }
680    if (args["afs"]=="1") {
681      text_t accent = args["fqaf"];
682      if (accent.empty()) return; // somethings wrong
683      splitchar(accent.begin(), accent.end(), ',', accentfolds);
684    }
685  }
686 
687  for(int i=0; i< values.size(); ++i) {
688    if (!values[i].empty()) {
689      if (i!=0) {
690    if (argct==1) {
691      if (combs[i-1]=="and") combine = "&";
692      else if (combs[i-1]=="or")combine = "|";
693      else if (combs[i-1]=="not")combine = "!";
694    }
695    else { // lucene
696      if (combs[i-1]=="and") combine = "AND";
697      else if (combs[i-1]=="or")combine = "OR";
698      else if (combs[i-1]=="not")combine = "NOT";
699    }
700      }
701      text_t this_value = values[i];
702      // remove operators for simple search, segments text if necessary
703      format_querystring(this_value, argb, segment);
704      if (argct == 1) { // mgpp only
705    this_value = addstemcase(this_value, ((args["ss"]=="1")?stems[i]:"0"), ((args["ks"]=="1")?casefolds[i]:"0"), ((args["afs"]=="1")?accentfolds[i]:"0"), argct);
706      }
707      // add tag info for this field (and other processing)
708      format_field_info(this_value, fields[i], argct, argt, argb);
709      // add into query string
710      if (!querystring.empty()) {
711    querystring += " "+ combine+ " ";
712      }
713      querystring += this_value;
714     
715    }
716  }
717}
718
719
720// SQL versions for parsing query form
721
722void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
723{
724  querystring.clear();
725
726  int argt = args.getintarg("t");// t=0 -and, t=1 - or
727  int argb = args.getintarg("b");
728   
729  text_t combine;
730
731  if (argt == 0) combine = "AND";
732  else combine = "OR";
733 
734  text_t field = args["sqlfqf"];
735  if (field.empty()) return; // no query
736  text_tarray fields;
737  splitchar(field.begin(), field.end(), ',', fields);
738
739  text_t sqlcomb = args["sqlfqc"];
740  if (sqlcomb.empty()) return; //somethings wrong
741  text_tarray sqlcombs;
742  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
743 
744  text_t value = args["fqv"];
745  if (value.empty()) return; // somethings wrong
746  text_tarray values;
747  splitchar(value.begin(), value.end(), ',', values);
748
749
750  for (int i=0; i< values.size(); ++i) {
751    if (!values[i].empty()) {
752      text_t this_value;
753      const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
754      const text_t LIKE_CONDITION = "LIKE";
755     
756      //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
757      //in order to search a field starting with certain words.
758      if (sqlcombs[i] == STARTINGWITH_CONDITION)
759          {this_value = values[i];
760          this_value += "%";
761          // remove operators for simple search, segments text if necessary
762          format_querystring(this_value, argb, segment);
763          // add tag info for this field (and other processing)
764          format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
765
766      else
767          {this_value = values[i];
768          // remove operators for simple search, segments text if necessary
769          format_querystring(this_value, argb, segment);
770          // add tag info for this field (and other processing)
771          format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
772
773     
774      const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
775
776      if (querystring.empty()) {
777    // first query term
778    querystring = DISTINCT_SELECT_WHERE + this_value;
779      }
780      else {
781    this_value = DISTINCT_SELECT_WHERE + this_value;
782
783    if (combine=="AND") {   
784      // INNER JOIN to restrict to only matching docOIDs
785      querystring = "SELECT docOID FROM (" + querystring + ")"
786        + " INNER JOIN (" + this_value +") USING (docOID)";
787    }
788    else if (combine=="OR") {
789      // Union to allow union of the two
790      querystring = querystring + " UNION " + this_value;
791    }
792      }
793    }
794  }
795}
796
797
798void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
799                 bool segment)
800{
801  querystring.clear();
802
803  int argt = 0; // set it to 0 = AND, by default
804  int argb = args.getintarg("b");
805  text_t combine = "AND";
806
807  text_t field = args["sqlfqf"];
808
809  if (field.empty()) return; // no query
810  text_tarray fields;
811  splitchar(field.begin(), field.end(), ',', fields);
812 
813  text_t sqlcomb = args["sqlfqc"];
814  if (sqlcomb.empty()) return; //somethings wrong
815  text_tarray sqlcombs;
816  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
817
818  text_t value = args["fqv"];
819  if (value.empty()) return; // somethings wrong
820  text_tarray values;
821  splitchar(value.begin(), value.end(), ',', values);
822
823  text_t comb = args["fqc"];
824  if (comb.empty()) return; //somethings wrong
825  text_tarray combs;
826  splitchar(comb.begin(), comb.end(), ',', combs);
827
828  for(int i=0; i< values.size(); ++i) {
829    if (!values[i].empty()) {
830      if (i>0) {
831    if (combs[i-1]=="and") { combine = "AND"; }
832    else if (combs[i-1]=="or") { combine = "OR"; }
833    else if (combs[i-1]=="not") { combine = "NOT"; }
834      }
835      text_t this_value;
836      const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
837      const text_t LIKE_CONDITION = "LIKE";
838     
839      //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
840      //in order to search a field starting with certain words.
841      if (sqlcombs[i] == STARTINGWITH_CONDITION)
842          {this_value = values[i];
843          this_value += "%";
844          // remove operators for simple search, segments text if necessary
845          format_querystring(this_value, argb, segment);
846          // add tag info for this field (and other processing)
847          format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
848
849      else
850          {this_value = values[i];
851          // remove operators for simple search, segments text if necessary
852          format_querystring(this_value, argb, segment);
853          // add tag info for this field (and other processing)
854          format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
855     
856      const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
857
858      if (querystring.empty()) {
859    // first query term
860    querystring = DISTINCT_SELECT_WHERE + this_value;
861      }
862      else {
863    this_value = DISTINCT_SELECT_WHERE + this_value;
864
865    if (combine=="AND") {   
866      // INNER JOIN to restrict to only matching docOIDs
867      querystring = "SELECT docOID FROM (" + querystring + ")"
868        + " INNER JOIN (" + this_value +") USING (docOID)";
869    }
870    else if (combine=="OR") {
871      // Union to allow union of the two
872      querystring = querystring + " UNION " + this_value;
873    }
874    else {
875      cerr << "Unsupported combination operation: " << combine << endl;
876    }
877      }
878     
879    }
880  } 
881}
882
883
884
885
886// Extended addqueryelem for Human Info project
887void addqueryelem_ex(text_t &querystring, const text_t &tag,
888             const text_t &terms, const text_t &stem,
889             const text_t &fold,
890             const text_t& combine, const text_t& word_combine) {
891
892  if (!querystring.empty()) { // have to put and/or
893    querystring += " " + combine + " ";
894  }
895  text_t outtext; outtext.reserve(512);
896  text_t word; word.reserve(100);
897  //unsigned short c;                                                           
898  text_t::const_iterator here = terms.begin();
899  text_t::const_iterator end = terms.end();
900  bool inquote = false, firstword = true;
901
902  text_t word2; word2.reserve(256);
903   
904  while (here !=end) {
905    if (is_unicode_space(*here)) {
906      if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
907      else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
908      else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
909      else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
910      else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
911      if (inquote) {
912    word2.push_back(*here);
913      }
914      word.append(word2); word2.clear();
915           
916      if (!inquote && !word.empty() ) {
917    // found word boundary   
918               
919    if (stem == "1" || fold =="1") {
920      word += "#";
921      if (stem == "1") word += "s";
922      //else word += "u";
923                   
924      if (fold == "1") word += "i";
925      //else word += "c";
926    }
927    if (firstword) {
928      firstword = false;
929    } else {
930      outtext += " " + word_combine + " ";
931    }
932    outtext += "[" + word + "]:"+tag;
933    word.clear();
934      }
935      ++here;
936    } else if (*here == '\"') {
937      word2.push_back(*here);
938      inquote = !inquote;
939      ++here;
940    } else {
941      // not word boundary
942      word2.push_back(*here);
943      ++here;   
944    }
945  }
946   
947  // get last word
948  if (!word2.empty()) {
949    if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
950    else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
951    else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
952    else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
953    else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
954    word.append(word2); word2.clear();
955       
956    if (stem == "1"|| fold == "1") {
957      word += "#";
958      if (stem == "1") word += "s";
959      //else word += "u";
960           
961      if (fold == "1") word += "i";
962      //else word += "c";
963    }
964    if (!outtext.empty()) outtext += " " + word_combine + " ";
965    outtext += "[" + word + "]:"+tag;
966  }
967  querystring += "(" + outtext + ")";
968}
969
970void add_field_info(text_t &querystring, const text_t &tag, int type) {
971
972  if (tag == "") return; // do nothing
973  if (tag == "ZZ" && type == 1) return;  // mgpp doesn't use ZZ tag internally
974  if (type == 1) { //mgpp
975    querystring = "["+querystring+"]:"+tag;
976  } else if (type == 2) { // lucene
977    querystring = tag+":("+querystring+")";
978  }
979   
980}
981
982
983void add_field_info_sql(text_t &querystring, const text_t &tagseq,
984            const text_t& sqlcomb)
985{
986
987  if (tagseq == "") return; // do nothing
988
989  text_t element_in = "(element IN (";
990
991  text_tlist mdterms;
992
993  splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
994
995  text_t tags_in = "";
996
997  while (!mdterms.empty()) {
998    text_t tag = mdterms.front();
999    mdterms.pop_front();
1000
1001    if (!tag.empty()) {
1002
1003      // remove "ex." prefix, but only if there are no other metadata set qualifiers
1004      // in the metaname, since we want to retain prefixes like "ex.dc." as-is
1005      text_t::iterator period = findchar(tag.begin(), tag.end(), '.');
1006      text_t::iterator lastperiod = findlastchar(tag.begin(), tag.end(), '.');
1007
1008      if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.") && period == lastperiod) {
1009    tag = substr (tag.begin()+3, tag.end());
1010      }
1011
1012      if (!tags_in.empty()) {
1013    tags_in += ",";
1014      }
1015     
1016      tags_in += "'" + tag + "'";
1017    }
1018  }
1019
1020  element_in += tags_in + ") AND (";
1021
1022 
1023  if (sqlcomb == "=") {
1024    // override what it means to do equality, to make it more like full text
1025    // searching
1026
1027    text_t orterms = "";
1028    text_t term = "";
1029    bool in_phrase = false;
1030   
1031    text_t::const_iterator here = querystring.begin();
1032    text_t::const_iterator end = querystring.end();
1033    while (here != end) {
1034      if (is_unicode_letdig(*here)) {
1035    term.push_back(*here);
1036      }
1037      else if (*here == '"') {
1038    term.push_back(*here);
1039    if (!in_phrase) {
1040      in_phrase = true;
1041    } else {
1042      in_phrase = false;
1043    }
1044      }     
1045      else if (in_phrase) {
1046        // Found word boundary, but in a phrase, so does not complete term
1047    term.push_back(*here);
1048      }
1049      else {
1050        // Found a word boundary
1051    if (!orterms.empty()) {
1052      orterms += " OR ";
1053    }
1054    orterms += "value LIKE '%" + term + "%'";
1055    term.clear();
1056      }
1057      ++here;
1058    }
1059
1060    if (!term.empty()) {
1061    if (!orterms.empty()) {
1062      orterms += " OR ";
1063    }
1064        orterms += "value LIKE '%" + term + "%'";
1065    }
1066
1067    element_in += orterms;
1068  }
1069  //We cast the value from STRING to REAL to allow numeric sorting
1070  else if (sqlcomb == "<num") {
1071    element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
1072  }
1073  else if (sqlcomb == ">num") {
1074    element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
1075  }
1076   else if (sqlcomb == "<=num") {
1077    element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
1078  }
1079  else if (sqlcomb == ">=num") {
1080    element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
1081  }
1082  else if (sqlcomb == "=num") {
1083    element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
1084  }
1085  else {
1086    // search on value is "as is" querystring
1087    element_in += "value " + sqlcomb + " '" + querystring+"'";
1088  }
1089
1090 
1091  querystring = element_in + "))";
1092   
1093}
1094
1095
1096void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
1097
1098  int type = 2; //lucene
1099
1100  if (argb==0) { // simple
1101    // there will be no & or | as they should have already been removed
1102    // just tag the entire thing
1103    if (tag != "") {
1104      add_field_info(querystring, tag, type);
1105    }
1106    return;
1107  }
1108
1109  // need to replace & with &&, | with ||
1110  text_t::const_iterator here = querystring.begin();
1111  text_t::const_iterator end = querystring.end();
1112
1113  text_t finalquery = "";
1114  while (here != end) {
1115    if (*here ==  '&') {
1116      finalquery.push_back('&');
1117      finalquery.push_back('&');
1118      while (*(here+1) == '&') {
1119    ++here;
1120      }
1121    }
1122    else if (*here == '|') {
1123      finalquery.push_back('|');
1124      finalquery.push_back('|');
1125      while (*(here+1) == '|') {
1126    ++here;
1127      }
1128    }
1129    else {
1130      finalquery.push_back(*here);
1131    }
1132    ++here;
1133  }
1134  querystring = finalquery;
1135  add_field_info(querystring, tag, type);
1136}
1137
1138
1139void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
1140
1141  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
1142  if (tag == "" && argb == 1) {
1143    return; // no field specifier, advanced mode, the query stays as written
1144  }
1145
1146  int type = 1; // mgpp
1147
1148  bool simple_and = (argb==0 && argt==0);
1149  text_t finalquery = "";
1150  text_t fieldpart ="";
1151  text_t queryelem = "";
1152  bool in_phrase = false;
1153  bool in_field = false;
1154
1155  text_t::const_iterator here = querystring.begin();
1156  text_t::const_iterator end = querystring.end();
1157  while (here != end) {
1158    if (is_unicode_letdig(*here)  || *here == '&' || is_special_character(type, *here)) {
1159      queryelem.push_back(*here);
1160    }
1161    else if (*here == '|') {
1162      in_field = false;
1163    }
1164    else if (*here == '!' || *here == '(' || *here == ')') {
1165      if (!in_phrase) { // ignore these if in_phrase
1166    // output field, then output operator
1167    in_field = false;
1168    if (!queryelem.empty()) {
1169      if (!simple_and && !fieldpart.empty()) {
1170        add_field_info(fieldpart, tag, type);
1171        finalquery += fieldpart;
1172        finalquery.push_back(' ');
1173        fieldpart.clear();
1174      }
1175      fieldpart += queryelem;
1176    }
1177    if (!fieldpart.empty()) {
1178      add_field_info(fieldpart, tag, type);
1179      finalquery += fieldpart;
1180      finalquery.push_back(' ');
1181    }
1182    fieldpart.clear();
1183    queryelem.clear();
1184    finalquery.push_back(*here);
1185    finalquery.push_back(' ');
1186      }
1187    }
1188    else if (*here == '"') {
1189      queryelem.push_back(*here);
1190      if (in_phrase == false) in_phrase = true;
1191      else {
1192    in_phrase = false;
1193      }
1194    }
1195
1196    // Found word boundary, in a phrase
1197    else if (in_phrase) {
1198      queryelem.push_back(*here);
1199    }
1200    // Found a word boundary
1201    else {
1202      if (!queryelem.empty()) {
1203    if (queryelem == "&") {
1204      in_field = true;
1205      queryelem.clear();
1206    }
1207    else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
1208     
1209      if (argb==1) {
1210        // simple search, these not allowed
1211        in_field = true;
1212        fieldpart += queryelem;
1213        fieldpart.push_back(' ');
1214      }
1215      queryelem.clear();
1216     
1217    }
1218    else {
1219      if (!simple_and && !in_field) {
1220        if (!fieldpart.empty()) {
1221          add_field_info(fieldpart, tag, type);
1222          finalquery += fieldpart;
1223          finalquery.push_back(' ');
1224          fieldpart.clear();
1225        }
1226      }
1227     
1228      fieldpart += queryelem;
1229      fieldpart.push_back(' ');
1230      queryelem.clear();
1231    }
1232      }
1233    }
1234    ++here;
1235  }
1236  // at the end
1237  if (!queryelem.empty()) {
1238    if (!simple_and && !in_field && !fieldpart.empty()) {
1239      add_field_info(fieldpart, tag, type);
1240      finalquery += fieldpart;
1241      finalquery.push_back(' ');
1242      fieldpart.clear();
1243    }
1244    fieldpart += queryelem;
1245  }
1246  if (!fieldpart.empty()) {
1247    add_field_info(fieldpart, tag, type);
1248    finalquery += fieldpart;
1249    fieldpart.clear();
1250
1251    // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
1252    // consider cutting this line
1253    finalquery.push_back(' ');
1254  }
1255 
1256  querystring  = finalquery;
1257}
1258
1259
1260void format_field_info_sql(text_t &querystring, const text_t &tagseq,
1261               const text_t &sqlcomb,
1262               int argt, int argb)
1263{
1264  add_field_info_sql(querystring, tagseq, sqlcomb);
1265}
1266
1267
1268void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) { 
1269  if (argct == 1) {
1270    format_field_info_mgpp(querystring, tag, argt, argb);
1271  } else if (argct == 2) {
1272    format_field_info_lucene(querystring, tag, argt, argb);
1273  }
1274}
1275
1276void mgpp_adddateelem(text_t& querystring, const int date)
1277{
1278  querystring.appendcstr(" [");
1279  if(date<0) {
1280      querystring.appendcstr("bc");
1281      querystring.appendint((date*-1));
1282  }
1283  else {
1284    querystring.appendint(date);
1285  }
1286  querystring.appendcstr("]:CV");
1287}
1288
1289void lucene_adddateelem(text_t& querystring, const int date)
1290{
1291  querystring.appendcstr(" CV:(");
1292  if(date<0) {
1293      querystring.appendcstr("bc");
1294      querystring.appendint((date*-1));
1295  }
1296  else {
1297    querystring.appendint(date);
1298  }
1299  querystring.appendcstr(")");
1300}
1301
1302
1303void add_dates(text_t &querystring, int startdate, int enddate,
1304           int startbc, int endbc, int ct)
1305{
1306  if(startdate)
1307    {
1308      int querystringis = 0;
1309      text_t::const_iterator here = querystring.begin();
1310      text_t::const_iterator end = querystring.end();
1311      while(here!=end)
1312    {
1313      if(!(isspace((*here)))){
1314        here = end;
1315        querystringis = 1;
1316      }
1317      else
1318        ++here;
1319    }
1320      //converting BCE dates
1321      if(startbc && startdate > 0)
1322    {
1323      startdate *= -1;
1324    }
1325      if(endbc && enddate > 0)
1326    {
1327      enddate *= -1;
1328    }
1329       if(enddate != 0 && enddate<startdate)
1330    {
1331      cout<<"enddate too small"<<endl;
1332      return;
1333    }
1334       if(querystringis)
1335     querystring.appendcstr(" AND");
1336       if(!enddate)
1337     {
1338       if (ct==1) {
1339         mgpp_adddateelem(querystring,startdate);
1340       }
1341       else { // lucene
1342         lucene_adddateelem(querystring,startdate);
1343       }
1344     }
1345       else{
1346     int nextdate = startdate;
1347     querystring.appendcstr(" (");
1348     while(nextdate<=enddate)
1349       {
1350         if(nextdate!=0) {
1351           if (ct==1) {
1352         mgpp_adddateelem(querystring,nextdate);
1353           }
1354           else { // lucene
1355         lucene_adddateelem(querystring,nextdate);
1356           }
1357         }
1358         ++nextdate;
1359       }
1360     querystring.appendcstr(" )");
1361       }
1362    }
1363
1364}
Note: See TracBrowser for help on using the browser.