source: gsdl/trunk/runtime-src/src/recpt/querytools.cpp@ 20602

Last change on this file since 20602 was 20602, checked in by kjdon, 15 years ago

get_plain_query_terms: first pass through to remove TI:(...) and [...]:TI, and AND,OR,NOT for lucene, then remove term modifiers etc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.1 KB
Line 
1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "querytools.h"
27#include <ctype.h>
28#include "unitool.h" // for is_unicode_letdig
29
30// sets the ct, qt, qto arguments
31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
58 search_types = (*check).second;
59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
64 if (arg_qto == 2) {
65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
69 }
70 return;
71 }
72
73
74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
84
85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
93}
94
95// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
96void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
97 int stemIndexes = cinfo->stemIndexes;
98
99 if (stemIndexes & SIcasefold) {
100 args["ks"] = 1;
101 }
102 if (stemIndexes & SIstem) {
103 args["ss"] = 1;
104 }
105 if (stemIndexes & SIaccentfold) {
106 args["afs"] = 1;
107 }
108
109}
110
111// request.filterResultOptions and request.fields (if required) should
112// be set from the calling code
113void set_queryfilter_options (FilterRequest_t &request,
114 const text_t &querystring,
115 cgiargsclass &args) {
116
117 request.filterName = "QueryFilter";
118
119 OptionValue_t option;
120
121 option.name = "Term";
122 option.value = querystring;
123 request.filterOptions.push_back (option);
124
125 option.name = "QueryType";
126 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
127 request.filterOptions.push_back (option);
128
129 option.name = "MatchMode";
130 // mgpp in advanced mode, always use some query
131 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
132 option.value = "some";
133 } else {
134 option.value = (args.getintarg("t")) ? "some" : "all";
135 }
136 request.filterOptions.push_back (option);
137
138 option.name = "Casefold";
139 option.value = (args.getintarg("k")) ? "true" : "false";
140 request.filterOptions.push_back (option);
141
142 option.name = "Stem";
143 option.value = (args.getintarg("s")) ? "true" : "false";
144 request.filterOptions.push_back (option);
145
146 option.name = "AccentFold";
147 option.value = (args.getintarg("af")) ? "true" : "false";
148 request.filterOptions.push_back (option);
149
150 if (!args["h"].empty()) {
151 option.name = "Index";
152 option.value = args["h"];
153 request.filterOptions.push_back (option);
154 }
155
156 if (!args["j"].empty()) {
157 option.name = "Subcollection";
158 option.value = args["j"];
159 request.filterOptions.push_back (option);
160 }
161
162 if (!args["n"].empty()) {
163 option.name = "Language";
164 option.value = args["n"];
165 request.filterOptions.push_back (option);
166 }
167
168 if (!args["g"].empty()) { // granularity for mgpp
169 option.name = "Level";
170 option.value = args["g"];
171 request.filterOptions.push_back (option);
172 }
173
174 if (!args["fs"].empty()) { // filter string for lucene
175 option.name = "FilterString";
176 option.value = args["fs"];
177 request.filterOptions.push_back (option);
178 }
179
180 if (!args["sf"].empty()) { // sort field for lucene
181 option.name = "SortField";
182 option.value = args["sf"];
183 request.filterOptions.push_back (option);
184 }
185
186 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
187 option.name = "Fuzziness";
188 option.value = (text_t) "0." + args["fuzziness"];
189 request.filterOptions.push_back (option);
190 }
191
192 set_more_queryfilter_options (request, args);
193}
194
195void set_queryfilter_options (FilterRequest_t &request,
196 const text_t &querystring1,
197 const text_t &querystring2, cgiargsclass &args) {
198
199 set_queryfilter_options (request, querystring1, args);
200
201 // fill in the second query if needed
202 if (!args["cq2"].empty()) {
203 OptionValue_t option;
204
205 option.name = "CombineQuery";
206 option.value = args["cq2"];
207 request.filterOptions.push_back (option);
208
209 option.name = "Term";
210 option.value = querystring2;
211 request.filterOptions.push_back (option);
212
213 option.name = "QueryType";
214 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
215 request.filterOptions.push_back (option);
216
217 option.name = "Casefold";
218 option.value = (args.getintarg("k")) ? "true" : "false";
219 request.filterOptions.push_back (option);
220
221 option.name = "Stem";
222 option.value = (args.getintarg("s")) ? "true" : "false";
223 request.filterOptions.push_back (option);
224
225 option.name = "AccentFold";
226 option.value = (args.getintarg("af")) ? "true" : "false";
227 request.filterOptions.push_back (option);
228
229 if (!args["h2"].empty()) {
230 option.name = "Index";
231 option.value = args["h2"];
232 request.filterOptions.push_back (option);
233 }
234
235 if (!args["j2"].empty()) {
236 option.name = "Subcollection";
237 option.value = args["j2"];
238 request.filterOptions.push_back (option);
239 }
240
241 if (!args["n2"].empty()) {
242 option.name = "Language";
243 option.value = args["n2"];
244 request.filterOptions.push_back (option);
245 }
246 }
247 set_more_queryfilter_options (request, args);
248}
249
250void set_more_queryfilter_options (FilterRequest_t &request,
251 cgiargsclass &args) {
252
253 OptionValue_t option;
254 int arg_m = args.getintarg("m");
255
256 option.name = "Maxdocs";
257 option.value = arg_m;
258 request.filterOptions.push_back (option);
259
260 // option.name = "StartResults";
261 // option.value = args["r"];
262 // request.filterOptions.push_back (option);
263
264 // option.name = "EndResults";
265 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
266 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
267 // option.value = endresults;
268 // request.filterOptions.push_back (option);
269}
270
// Returns true when character is one of the query-syntax characters of
// the given indexer: '#', '/' and '*' for mgpp (indexer_type 1);
// '?', '*', '~' and '^' for lucene (indexer_type 2).
// Every other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }
  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }
  default:
    return false;
  }
}
283
284// This function removes boolean operators from simple searches, and segments
285// chinese characters if segment=true
286void format_querystring (text_t &querystring, int querymode, bool segment) {
287 text_t formattedstring;
288
289 // advanced search, no segmenting, don't need to do anything
290 if (querymode == 1 && !segment) return;
291
292 text_t::const_iterator here = querystring.begin();
293 text_t::const_iterator end = querystring.end();
294
295 // space is used to insert spaces between Chinese
296 // characters. No space is needed before the first
297 // Chinese character.
298 bool space = false;
299
300 // want to remove ()|!& from querystring so boolean queries are just
301 // "all the words" queries (unless querymode is advanced)
302 while (here != end) {
303 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
304 *here == '!' || *here == '&')) {
305 formattedstring.push_back(' ');
306 } else if (segment) {
307 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
308 ( *here >= 0xf900 && *here <= 0xfa6a)) {
309 /* text_t not big enough to handle these. */
310 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
311 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
312
313 // CJK character
314 if (!space) formattedstring.push_back (0x200b); // zero width space
315 formattedstring.push_back (*here);
316 formattedstring.push_back (0x200b);
317 space = true;
318 } else {
319
320 // non-Chinese character
321 formattedstring.push_back (*here);
322 space = false;
323
324 }
325
326 } else {
327 formattedstring.push_back (*here);
328 }
329 ++here;
330 }
331 querystring = formattedstring;
332}
333
334// turn query string into terms separated by spaces.
335// still working on this...
336text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
337 text_t::const_iterator here = querystring.begin();
338 text_t::const_iterator end = querystring.end();
339
340 // lets look for [] and () first - these are a pain.
341 text_t::const_iterator bracket;
342 text_t query_no_brackets = "";
343
344 // mgpp brackets: [xxx]:TI
345 if (findchar(here, end, '[') != end) {
346 while ((bracket = findchar(here, end, '[')) != end) {
347 // get the first bit
348 query_no_brackets += substr(here, bracket);
349 bracket++;
350 here = bracket;
351 // get the end bracket
352 bracket = findchar(here, end, ']');
353 query_no_brackets += substr(here, bracket);
354 // skip the :TI bits
355 while (*bracket != ' ' && bracket != end) { bracket++;}
356 here = bracket;
357 }
358 if (here != end) {
359 query_no_brackets += substr(here,end);
360 }
361 } else if (findchar(here, end, '(') != end) {
362 // lucene brackets TI:(xxx)
363 while ((bracket = findchar(here, end, '(')) != end) {
364 // back up the field name
365 text_t::const_iterator old_bracket = bracket;
366 while (*bracket != ' ' && bracket != here) {
367 --bracket;
368 }
369 if (bracket != here) {
370 // get the first bit
371 query_no_brackets += substr(here, bracket+1);
372 }
373 here = old_bracket +1;
374 // get the end bracket
375 bracket = findchar(here, end, ')');
376 query_no_brackets += substr(here, bracket);
377 if (bracket != end) {
378 here = bracket+1;
379 }
380 }
381 if (here != end) {
382 query_no_brackets += substr(here,end);
383 }
384 } else {
385 // was no brackets
386 query_no_brackets = querystring;
387 }
388
389
390 if (arg_ct == "2") { // lucene
391 // look for AND OR NOT and remove
392 here = query_no_brackets.begin();
393 end = query_no_brackets.end();
394 text_tlist terms;
395 splitword(here, end, "AND", terms);
396 joinchar(terms, ' ', query_no_brackets);
397 here = query_no_brackets.begin();
398 end = query_no_brackets.end();
399 splitword(here, end, "OR", terms);
400 joinchar(terms, ' ', query_no_brackets);
401 here = query_no_brackets.begin();
402 end = query_no_brackets.end();
403 splitword(here, end, "NOT", terms);
404 joinchar(terms, ' ', query_no_brackets);
405
406 }
407 text_t terms = "";
408 bool space = false;
409 here = query_no_brackets.begin();
410 end = query_no_brackets.end();
411
412 while (here != end) {
413 if (*here == '#' || *here == '/') {
414 // skip over #is /10 etc
415 ++here;
416 while (here != end && *here != ' ') {
417 ++here;
418 }
419 if (here == end) break;
420 }
421 if (is_unicode_letdig(*here)) {
422 terms.push_back(*here);
423 space = false;
424 } else {
425 if (!space) {
426 terms.push_back(' ');
427 space = true;
428 }
429 }
430 ++here;
431 }
432 return terms;
433
434}
435
436// search history tool
437// also used for form query macros
438text_t escape_quotes(const text_t &querystring) {
439
440 text_t::const_iterator here = querystring.begin();
441 text_t::const_iterator end = querystring.end();
442
443 text_t escquery = "";
444 while (here != end) {
445 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
446 else if (*here == '\n' || *here == '\r') {
447 escquery.push_back(' ');
448 } else {
449 escquery +="\\\\";
450 escquery.push_back(*here);
451 }
452
453 ++here;
454 }
455 return escquery;
456
457}
458
459// Parses the terms into words, and adds #si if necessary
460text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
461 const int indexer_type) {
462
463 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
464 if (stem == "0" && fold == "0") {
465 return terms;
466 }
467 // this is only for mgpp collections, shouldn't be called for anything else
468 if (indexer_type != 1) {
469 return terms;
470 }
471
472 text_t outtext;
473 text_t word;
474
475 text_t::const_iterator here = terms.begin();
476 text_t::const_iterator end = terms.end();
477
478 while (here !=end) {
479
480 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
481 // not word boundary
482 word.push_back(*here);
483 ++here;
484 }
485 else {
486 // found word boundary
487 if (!word.empty() ) {
488 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
489 outtext += word;
490 word.clear();
491 }
492 else {
493 word += "#";
494 if (stem == "1") word += "s";
495 if (fold == "1") word += "i";
496 outtext += word;
497 word.clear();
498 }
499 }
500 // this only used in advanced form, so we leave in boolean operators
501 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
502 *here == '(' || *here == ')' || is_unicode_space(*here)) {
503 outtext.push_back(*here);
504 }
505 ++here;
506 }
507 }
508
509 // get last word
510 if (!word.empty()) {
511 word += "#";
512 if (stem == "1") word += "s";
513 if (fold == "1") word += "i";
514 word += " ";
515 outtext += word;
516 }
517 return outtext;
518}
519
520
521// some query form parsing functions for use with mgpp & lucene
522
523void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
524{
525 querystring.clear();
526
527 int argct = args.getintarg("ct");
528 int argt = args.getintarg("t");// t=0 -and, t=1 - or
529 int argb = args.getintarg("b");
530
531 text_t combine;
532
533 // lucene uses global combine, so only need this for mgpp
534 if (argct==1) {
535 if (argt == 0) combine = "&";
536 else combine = "|";
537 }
538
539 text_t field = args["fqf"];
540 if (field.empty()) return; // no query
541 text_tarray fields;
542 splitchar(field.begin(), field.end(), ',', fields);
543
544 text_t value = args["fqv"];
545 if (value.empty()) return; // somethings wrong
546 text_tarray values;
547 splitchar(value.begin(), value.end(), ',', values);
548
549
550 for (int i=0; i< values.size(); ++i) {
551 if (!values[i].empty()) {
552 text_t this_value = values[i];
553 // remove operators for simple search, segments text if necessary
554 format_querystring(this_value, argb, segment);
555 // add tag info for this field (and other processing)
556 format_field_info(this_value, fields[i], argct, argt, argb);
557 // add into query string
558 if (argct == 2) {
559 // lucene
560 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
561 querystring += this_value+" ";
562 } else {
563 // mgpp
564 if (!querystring.empty()) {
565 querystring += " "+ combine+ " ";
566 }
567 querystring += this_value;
568 }
569 }
570 }
571}
572
573
574void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
575 querystring.clear();
576
577 const int argct = args.getintarg("ct");
578 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
579 int argb = args.getintarg("b");
580 text_t combine;
581 if (argct==1) {
582 combine = "&";
583 }
584 else { // lucene
585 combine = "AND";
586 }
587
588 text_t field = args["fqf"];
589 if (field.empty()) return; // no query
590 text_tarray fields;
591 splitchar(field.begin(), field.end(), ',', fields);
592
593 text_t value = args["fqv"];
594 if (value.empty()) return; // somethings wrong
595 text_tarray values;
596 splitchar(value.begin(), value.end(), ',', values);
597
598 text_t comb = args["fqc"];
599 if (comb.empty()) return; //somethings wrong
600 text_tarray combs;
601 splitchar(comb.begin(), comb.end(), ',', combs);
602
603 text_tarray stems;
604 text_tarray folds;
605 if (argct == 1) {// mgpp - lucene doesn't do stem/case
606 text_t stem = args["fqs"];
607 if (stem.empty()) return; // somethings wrong
608 splitchar(stem.begin(), stem.end(), ',', stems);
609
610 text_t fold = args["fqk"];
611 if (fold.empty()) return; // somethings wrong
612 splitchar(fold.begin(), fold.end(), ',', folds);
613 }
614
615 for(int i=0; i< values.size(); ++i) {
616 if (!values[i].empty()) {
617 if (i!=0) {
618 if (argct==1) {
619 if (combs[i-1]=="and") combine = "&";
620 else if (combs[i-1]=="or")combine = "|";
621 else if (combs[i-1]=="not")combine = "!";
622 }
623 else { // lucene
624 if (combs[i-1]=="and") combine = "AND";
625 else if (combs[i-1]=="or")combine = "OR";
626 else if (combs[i-1]=="not")combine = "NOT";
627 }
628 }
629 text_t this_value = values[i];
630 // remove operators for simple search, segments text if necessary
631 format_querystring(this_value, argb, segment);
632 if (argct == 1) { // mgpp only
633 this_value = addstemcase(this_value, stems[i], folds[i], argct);
634 }
635 // add tag info for this field (and other processing)
636 format_field_info(this_value, fields[i], argct, argt, argb);
637 // add into query string
638 if (!querystring.empty()) {
639 querystring += " "+ combine+ " ";
640 }
641 querystring += this_value;
642
643 }
644 }
645}
646
647// Extended addqueryelem for Human Info project
648void addqueryelem_ex(text_t &querystring, const text_t &tag,
649 const text_t &terms, const text_t &stem,
650 const text_t &fold,
651 const text_t& combine, const text_t& word_combine) {
652
653 if (!querystring.empty()) { // have to put and/or
654 querystring += " " + combine + " ";
655 }
656 text_t outtext; outtext.reserve(512);
657 text_t word; word.reserve(100);
658 //unsigned short c;
659 text_t::const_iterator here = terms.begin();
660 text_t::const_iterator end = terms.end();
661 bool inquote = false, firstword = true;
662
663 text_t word2; word2.reserve(256);
664
665 while (here !=end) {
666 if (is_unicode_space(*here)) {
667 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
668 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
669 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
670 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
671 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
672 if (inquote) {
673 word2.push_back(*here);
674 }
675 word.append(word2); word2.clear();
676
677 if (!inquote && !word.empty() ) {
678 // found word boundary
679
680 if (stem == "1" || fold =="1") {
681 word += "#";
682 if (stem == "1") word += "s";
683 //else word += "u";
684
685 if (fold == "1") word += "i";
686 //else word += "c";
687 }
688 if (firstword) {
689 firstword = false;
690 } else {
691 outtext += " " + word_combine + " ";
692 }
693 outtext += "[" + word + "]:"+tag;
694 word.clear();
695 }
696 ++here;
697 } else if (*here == '\"') {
698 word2.push_back(*here);
699 inquote = !inquote;
700 ++here;
701 } else {
702 // not word boundary
703 word2.push_back(*here);
704 ++here;
705 }
706 }
707
708 // get last word
709 if (!word2.empty()) {
710 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
711 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
712 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
713 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
714 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
715 word.append(word2); word2.clear();
716
717 if (stem == "1"|| fold == "1") {
718 word += "#";
719 if (stem == "1") word += "s";
720 //else word += "u";
721
722 if (fold == "1") word += "i";
723 //else word += "c";
724 }
725 if (!outtext.empty()) outtext += " " + word_combine + " ";
726 outtext += "[" + word + "]:"+tag;
727 }
728 querystring += "(" + outtext + ")";
729}
730
731void add_field_info(text_t &querystring, const text_t &tag, int type) {
732
733 if (tag == "") return; // do nothing
734 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
735 if (type == 1) { //mgpp
736 querystring = "["+querystring+"]:"+tag;
737 } else if (type == 2) { // lucene
738 querystring = tag+":("+querystring+")";
739 }
740
741}
742
743
744void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
745
746 int type = 2; //lucene
747
748 if (argb==0) { // simple
749 // there will be no & or | as they should have already been removed
750 // just tag the entire thing
751 if (tag != "") {
752 add_field_info(querystring, tag, type);
753 }
754 return;
755 }
756
757 // need to replace & with &&, | with ||
758 text_t::const_iterator here = querystring.begin();
759 text_t::const_iterator end = querystring.end();
760
761 text_t finalquery = "";
762 while (here != end) {
763 if (*here == '&') {
764 finalquery.push_back('&');
765 finalquery.push_back('&');
766 while (*(here+1) == '&') {
767 ++here;
768 }
769 }
770 else if (*here == '|') {
771 finalquery.push_back('|');
772 finalquery.push_back('|');
773 while (*(here+1) == '|') {
774 ++here;
775 }
776 }
777 else {
778 finalquery.push_back(*here);
779 }
780 ++here;
781 }
782 querystring = finalquery;
783 add_field_info(querystring, tag, type);
784}
785
786
787void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
788
789 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
790 if (tag == "" && argb == 1) {
791 return; // no field specifier, advanced mode, the query stays as written
792 }
793
794 int type = 1; // mgpp
795
796 bool simple_and = (argb==0 && argt==0);
797 text_t finalquery = "";
798 text_t fieldpart ="";
799 text_t queryelem = "";
800 bool in_phrase = false;
801 bool in_field = false;
802
803 text_t::const_iterator here = querystring.begin();
804 text_t::const_iterator end = querystring.end();
805 while (here != end) {
806 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
807 queryelem.push_back(*here);
808 }
809 else if (*here == '|') {
810 in_field = false;
811 }
812 else if (*here == '!' || *here == '(' || *here == ')') {
813 if (!in_phrase) { // ignore these if in_phrase
814 // output field, then output operator
815 in_field = false;
816 if (!queryelem.empty()) {
817 if (!simple_and && !fieldpart.empty()) {
818 add_field_info(fieldpart, tag, type);
819 finalquery += fieldpart;
820 finalquery.push_back(' ');
821 fieldpart.clear();
822 }
823 fieldpart += queryelem;
824 }
825 if (!fieldpart.empty()) {
826 add_field_info(fieldpart, tag, type);
827 finalquery += fieldpart;
828 finalquery.push_back(' ');
829 }
830 fieldpart.clear();
831 queryelem.clear();
832 finalquery.push_back(*here);
833 finalquery.push_back(' ');
834 }
835 }
836 else if (*here == '"') {
837 queryelem.push_back(*here);
838 if (in_phrase == false) in_phrase = true;
839 else {
840 in_phrase = false;
841 }
842 }
843
844 // Found word boundary, in a phrase
845 else if (in_phrase) {
846 queryelem.push_back(*here);
847 }
848 // Found a word boundary
849 else {
850 if (!queryelem.empty()) {
851 if (queryelem == "&") {
852 in_field = true;
853 queryelem.clear();
854 }
855 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
856
857 if (argb==1) {
858 // simple search, these not allowed
859 in_field = true;
860 fieldpart += queryelem;
861 fieldpart.push_back(' ');
862 }
863 queryelem.clear();
864
865 }
866 else {
867 if (!simple_and && !in_field) {
868 if (!fieldpart.empty()) {
869 add_field_info(fieldpart, tag, type);
870 finalquery += fieldpart;
871 finalquery.push_back(' ');
872 fieldpart.clear();
873 }
874 }
875
876 fieldpart += queryelem;
877 fieldpart.push_back(' ');
878 queryelem.clear();
879 }
880 }
881 }
882 ++here;
883 }
884 // at the end
885 if (!queryelem.empty()) {
886 if (!simple_and && !in_field && !fieldpart.empty()) {
887 add_field_info(fieldpart, tag, type);
888 finalquery += fieldpart;
889 finalquery.push_back(' ');
890 fieldpart.clear();
891 }
892 fieldpart += queryelem;
893 }
894 if (!fieldpart.empty()) {
895 add_field_info(fieldpart, tag, type);
896 finalquery += fieldpart;
897 fieldpart.clear();
898
899 // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
900 // consider cutting this line
901 finalquery.push_back(' ');
902 }
903
904 querystring = finalquery;
905}
906
907
908void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
909 if (argct == 1) {
910 format_field_info_mgpp(querystring, tag, argt, argb);
911 } else if (argct == 2) {
912 format_field_info_lucene(querystring, tag, argt, argb);
913 }
914}
915
916void mgpp_adddateelem(text_t& querystring, const int date)
917{
918 querystring.appendcstr(" [");
919 if(date<0) {
920 querystring.appendcstr("bc");
921 querystring.appendint((date*-1));
922 }
923 else {
924 querystring.appendint(date);
925 }
926 querystring.appendcstr("]:CV");
927}
928
929void lucene_adddateelem(text_t& querystring, const int date)
930{
931 querystring.appendcstr(" CV:(");
932 if(date<0) {
933 querystring.appendcstr("bc");
934 querystring.appendint((date*-1));
935 }
936 else {
937 querystring.appendint(date);
938 }
939 querystring.appendcstr(")");
940}
941
942
943void add_dates(text_t &querystring, int startdate, int enddate,
944 int startbc, int endbc, int ct)
945{
946 if(startdate)
947 {
948 int querystringis = 0;
949 text_t::const_iterator here = querystring.begin();
950 text_t::const_iterator end = querystring.end();
951 while(here!=end)
952 {
953 if(!(isspace((*here)))){
954 here = end;
955 querystringis = 1;
956 }
957 else
958 ++here;
959 }
960 //converting BCE dates
961 if(startbc && startdate > 0)
962 {
963 startdate *= -1;
964 }
965 if(endbc && enddate > 0)
966 {
967 enddate *= -1;
968 }
969 if(enddate != 0 && enddate<startdate)
970 {
971 cout<<"enddate too small"<<endl;
972 return;
973 }
974 if(querystringis)
975 querystring.appendcstr(" AND");
976 if(!enddate)
977 {
978 if (ct==1) {
979 mgpp_adddateelem(querystring,startdate);
980 }
981 else { // lucene
982 lucene_adddateelem(querystring,startdate);
983 }
984 }
985 else{
986 int nextdate = startdate;
987 querystring.appendcstr(" (");
988 while(nextdate<=enddate)
989 {
990 if(nextdate!=0) {
991 if (ct==1) {
992 mgpp_adddateelem(querystring,nextdate);
993 }
994 else { // lucene
995 lucene_adddateelem(querystring,nextdate);
996 }
997 }
998 ++nextdate;
999 }
1000 querystring.appendcstr(" )");
1001 }
1002 }
1003
1004}
Note: See TracBrowser for help on using the repository browser.