source: gsdl/trunk/runtime-src/src/recpt/querytools.cpp@ 17796

Last change on this file since 17796 was 17796, checked in by kjdon, 15 years ago

In lucene, if you don't specify a tag to search on then it uses the default field (text?), so for searching in the ZZ field we do need to keep the ZZ specifier, unlike for mgpp.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.2 KB
RevLine 
[270]1/**********************************************************************
2 *
3 * querytools.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
[533]6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
[270]9 *
[533]10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
[270]24 *********************************************************************/
25
26#include "querytools.h"
[1373]27#include <ctype.h>
[1914]28#include "unitool.h" // for is_unicode_letdig
[270]29
[12784]30// sets the ct, qt, qto arguments
[11987]31void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
32
33 if (args["ct"].empty()) {
34 text_t build_type = cinfo->buildType;
35 if (build_type == "mgpp") {
36 args["ct"] = "1";
37 } else if (build_type == "lucene") {
38 args["ct"] = "2";
39 } else {
40 args["ct"] = "0";
41 }
42 }
43 text_t arg_ct = args["ct"];
44 if (arg_ct == "0") {
45 // mg
46 args["qt"] = "0";
47 args["qto"] = "0";
48 return;
49 }
50
51 if (!args["qt"].empty() && !args["qto"].empty()) {
52 return;
53 }
54
55 text_tmap::iterator check = cinfo->format.find("SearchTypes");
[12784]56 text_t search_types;
57 if(check != cinfo->format.end() && !(*check).second.empty()){
[11987]58 search_types = (*check).second;
[12784]59 } else {
60 // assume plain,form
61 if (args["qto"].empty()) args["qto"] = "3";
62 if (args["qt"].empty()) {
63 int arg_qto = args.getintarg("qto");
[12930]64 if (arg_qto == 2) {
[12784]65 args["qt"] = "1";
66 } else {
67 args["qt"] = "0";
68 }
[11987]69 }
[12784]70 return;
[11987]71 }
72
[12784]73
[11987]74 if (args["qto"].empty()) {
75 unsigned int type = 0;
76 if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
77 type |= 2;
78 }
79 if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
80 type |= 1;
81 }
82 args.setintarg("qto", type);
83 }
84
85 if (args["qt"].empty()) {
86 int arg_qto = args.getintarg("qto");
87 if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
88 args["qt"] = "1";
89 } else {
90 args["qt"] = "0";
91 }
92 }
93}
94
[12864]95// sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
96void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
97 int stemIndexes = cinfo->stemIndexes;
98
99 if (stemIndexes & SIcasefold) {
100 args["ks"] = 1;
101 }
102 if (stemIndexes & SIstem) {
103 args["ss"] = 1;
104 }
105 if (stemIndexes & SIaccentfold) {
106 args["afs"] = 1;
107 }
108
109}
110
[759]111// request.filterResultOptions and request.fields (if required) should
112// be set from the calling code
[12784]113void set_queryfilter_options (FilterRequest_t &request,
114 const text_t &querystring,
[759]115 cgiargsclass &args) {
[270]116
117 request.filterName = "QueryFilter";
118
119 OptionValue_t option;
[470]120
[270]121 option.name = "Term";
[759]122 option.value = querystring;
[270]123 request.filterOptions.push_back (option);
124
125 option.name = "QueryType";
126 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
127 request.filterOptions.push_back (option);
128
[1774]129 option.name = "MatchMode";
[11765]130 // mgpp in advanced mode, always use some query
[12428]131 if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
[11765]132 option.value = "some";
133 } else {
134 option.value = (args.getintarg("t")) ? "some" : "all";
135 }
[1774]136 request.filterOptions.push_back (option);
137
[270]138 option.name = "Casefold";
139 option.value = (args.getintarg("k")) ? "true" : "false";
140 request.filterOptions.push_back (option);
141
142 option.name = "Stem";
143 option.value = (args.getintarg("s")) ? "true" : "false";
144 request.filterOptions.push_back (option);
145
[12864]146 option.name = "AccentFold";
147 option.value = (args.getintarg("af")) ? "true" : "false";
148 request.filterOptions.push_back (option);
149
[270]150 if (!args["h"].empty()) {
151 option.name = "Index";
152 option.value = args["h"];
153 request.filterOptions.push_back (option);
154 }
155
156 if (!args["j"].empty()) {
157 option.name = "Subcollection";
158 option.value = args["j"];
159 request.filterOptions.push_back (option);
160 }
161
162 if (!args["n"].empty()) {
163 option.name = "Language";
164 option.value = args["n"];
165 request.filterOptions.push_back (option);
166 }
[1329]167
168 if (!args["g"].empty()) { // granularity for mgpp
169 option.name = "Level";
170 option.value = args["g"];
171 request.filterOptions.push_back (option);
172 }
[270]173
[12410]174 if (!args["fs"].empty()) { // filter string for lucene
175 option.name = "FilterString";
176 option.value = args["fs"];
177 request.filterOptions.push_back (option);
178 }
179
[12276]180 if (!args["sf"].empty()) { // sort field for lucene
181 option.name = "SortField";
182 option.value = args["sf"];
183 request.filterOptions.push_back (option);
184 }
185
[12771]186 if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
[12770]187 option.name = "Fuzziness";
[12771]188 option.value = (text_t) "0." + args["fuzziness"];
[12770]189 request.filterOptions.push_back (option);
190 }
[12388]191
[759]192 set_more_queryfilter_options (request, args);
193}
194
[12784]195void set_queryfilter_options (FilterRequest_t &request,
196 const text_t &querystring1,
[759]197 const text_t &querystring2, cgiargsclass &args) {
198
199 set_queryfilter_options (request, querystring1, args);
200
[349]201 // fill in the second query if needed
202 if (!args["cq2"].empty()) {
[759]203 OptionValue_t option;
204
[349]205 option.name = "CombineQuery";
206 option.value = args["cq2"];
207 request.filterOptions.push_back (option);
208
209 option.name = "Term";
[759]210 option.value = querystring2;
[349]211 request.filterOptions.push_back (option);
[759]212
[349]213 option.name = "QueryType";
214 option.value = (args.getintarg("t")) ? "ranked" : "boolean";
215 request.filterOptions.push_back (option);
216
217 option.name = "Casefold";
218 option.value = (args.getintarg("k")) ? "true" : "false";
219 request.filterOptions.push_back (option);
220
221 option.name = "Stem";
222 option.value = (args.getintarg("s")) ? "true" : "false";
223 request.filterOptions.push_back (option);
224
[12864]225 option.name = "AccentFold";
226 option.value = (args.getintarg("af")) ? "true" : "false";
227 request.filterOptions.push_back (option);
228
[349]229 if (!args["h2"].empty()) {
230 option.name = "Index";
231 option.value = args["h2"];
232 request.filterOptions.push_back (option);
233 }
234
235 if (!args["j2"].empty()) {
236 option.name = "Subcollection";
237 option.value = args["j2"];
238 request.filterOptions.push_back (option);
239 }
240
241 if (!args["n2"].empty()) {
242 option.name = "Language";
243 option.value = args["n2"];
244 request.filterOptions.push_back (option);
245 }
246 }
[759]247 set_more_queryfilter_options (request, args);
248}
[608]249
[12784]250void set_more_queryfilter_options (FilterRequest_t &request,
251 cgiargsclass &args) {
[759]252
253 OptionValue_t option;
[608]254 int arg_m = args.getintarg("m");
[759]255
[608]256 option.name = "Maxdocs";
257 option.value = arg_m;
258 request.filterOptions.push_back (option);
[1329]259
[759]260 // option.name = "StartResults";
261 // option.value = args["r"];
262 // request.filterOptions.push_back (option);
[270]263
[759]264 // option.name = "EndResults";
265 // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
266 // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
267 // option.value = endresults;
268 // request.filterOptions.push_back (option);
[270]269}
270
// Returns true when character is one of the characters this file treats as
// special (part of a query term) for the given indexer:
//   indexer_type 1 (mgpp):   # / *
//   indexer_type 2 (lucene): ? * ~ ^
// Any other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }
  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }
  default:
    return false;
  }
}
283
[12784]284// This function removes boolean operators from simple searches, and segments
285// chinese characters if segment=true
[6584]286void format_querystring (text_t &querystring, int querymode, bool segment) {
[270]287 text_t formattedstring;
288
[12784]289 // advanced search, no segmenting, don't need to do anything
[6584]290 if (querymode == 1 && !segment) return;
291
[270]292 text_t::const_iterator here = querystring.begin();
293 text_t::const_iterator end = querystring.end();
294
295 // space is used to insert spaces between Chinese
296 // characters. No space is needed before the first
297 // Chinese character.
298 bool space = false;
299
300 // want to remove ()|!& from querystring so boolean queries are just
[470]301 // "all the words" queries (unless querymode is advanced)
[270]302 while (here != end) {
[470]303 if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
304 *here == '!' || *here == '&')) {
[270]305 formattedstring.push_back(' ');
[6584]306 } else if (segment) {
[16980]307 if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
308 ( *here >= 0xf900 && *here <= 0xfa6a)) {
309 /* text_t not big enough to handle these. */
310 /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
311 (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
[16645]312
313 // CJK character
[8715]314 if (!space) formattedstring.push_back (0x200b); // zero width space
[397]315 formattedstring.push_back (*here);
316 formattedstring.push_back (0x200b);
317 space = true;
[270]318 } else {
[8715]319
[397]320 // non-Chinese character
321 formattedstring.push_back (*here);
322 space = false;
[8715]323
[270]324 }
[6584]325
326 } else {
327 formattedstring.push_back (*here);
[270]328 }
[9620]329 ++here;
[270]330 }
[397]331 querystring = formattedstring;
[270]332}
333
[1373]334
335
[1467]336
[3160]337// search history tool
338// also used for form query macros
[1914]339text_t escape_quotes(const text_t &querystring) {
340
341 text_t::const_iterator here = querystring.begin();
342 text_t::const_iterator end = querystring.end();
343
344 text_t escquery = "";
345 while (here != end) {
[1988]346 if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
347 else if (*here == '\n' || *here == '\r') {
348 escquery.push_back(' ');
349 } else {
[1914]350 escquery +="\\\\";
351 escquery.push_back(*here);
352 }
353
[9620]354 ++here;
[1914]355 }
356 return escquery;
357
358}
359
[12784]360// Parses the terms into words, and adds #si if necessary
361text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
362 const int indexer_type) {
363
364 // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
365 if (stem == "0" && fold == "0") {
[12791]366 return terms;
[12784]367 }
368 // this is only for mgpp collections, shouldn't be called for anything else
369 if (indexer_type != 1) {
[12791]370 return terms;
[12784]371 }
372
373 text_t outtext;
374 text_t word;
375
376 text_t::const_iterator here = terms.begin();
377 text_t::const_iterator end = terms.end();
378
379 while (here !=end) {
380
381 if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
382 // not word boundary
383 word.push_back(*here);
384 ++here;
385 }
386 else {
387 // found word boundary
388 if (!word.empty() ) {
389 if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
390 outtext += word;
391 word.clear();
392 }
393 else {
394 word += "#";
395 if (stem == "1") word += "s";
396 if (fold == "1") word += "i";
397 outtext += word;
398 word.clear();
399 }
400 }
401 // this only used in advanced form, so we leave in boolean operators
[12792]402 if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
403 *here == '(' || *here == ')' || is_unicode_space(*here)) {
[12784]404 outtext.push_back(*here);
405 }
406 ++here;
407 }
408 }
409
410 // get last word
411 if (!word.empty()) {
412 word += "#";
413 if (stem == "1") word += "s";
414 if (fold == "1") word += "i";
415 word += " ";
416 outtext += word;
417 }
418 return outtext;
419}
420
421
[11765]422// some query form parsing functions for use with mgpp & lucene
[1914]423
[12784]424void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
[8029]425{
426 querystring.clear();
[1914]427
[12784]428 int argct = args.getintarg("ct");
[8029]429 int argt = args.getintarg("t");// t=0 -and, t=1 - or
[12784]430 int argb = args.getintarg("b");
431
432 text_t combine;
[8029]433
[12784]434 // lucene uses global combine, so only need this for mgpp
435 if (argct==1) {
[8029]436 if (argt == 0) combine = "&";
437 else combine = "|";
438 }
[1914]439
440 text_t field = args["fqf"];
441 if (field.empty()) return; // no query
442 text_tarray fields;
443 splitchar(field.begin(), field.end(), ',', fields);
444
445 text_t value = args["fqv"];
446 if (value.empty()) return; // somethings wrong
447 text_tarray values;
448 splitchar(value.begin(), value.end(), ',', values);
449
[8029]450
[9620]451 for (int i=0; i< values.size(); ++i) {
[1914]452 if (!values[i].empty()) {
[12784]453 text_t this_value = values[i];
454 // remove operators for simple search, segments text if necessary
455 format_querystring(this_value, argb, segment);
456 // add tag info for this field (and other processing)
457 format_field_info(this_value, fields[i], argct, argt, argb);
458 // add into query string
459 if (argct == 2) {
460 // lucene
461 // we don't worry about AND/OR, cos this is done by defaultcombineoperator
462 querystring += this_value+" ";
463 } else {
464 // mgpp
465 if (!querystring.empty()) {
466 querystring += " "+ combine+ " ";
467 }
468 querystring += this_value;
[8029]469 }
[1914]470 }
471 }
472}
473
474
[12784]475void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
[1914]476 querystring.clear();
477
[12784]478 const int argct = args.getintarg("ct");
479 int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
480 int argb = args.getintarg("b");
[8029]481 text_t combine;
[12784]482 if (argct==1) {
[8029]483 combine = "&";
484 }
485 else { // lucene
486 combine = "AND";
487 }
488
[1914]489 text_t field = args["fqf"];
490 if (field.empty()) return; // no query
491 text_tarray fields;
492 splitchar(field.begin(), field.end(), ',', fields);
493
494 text_t value = args["fqv"];
495 if (value.empty()) return; // somethings wrong
496 text_tarray values;
497 splitchar(value.begin(), value.end(), ',', values);
498
499 text_t comb = args["fqc"];
500 if (comb.empty()) return; //somethings wrong
501 text_tarray combs;
502 splitchar(comb.begin(), comb.end(), ',', combs);
[12784]503
504 text_tarray stems;
505 text_tarray folds;
506 if (argct == 1) {// mgpp - lucene doesn't do stem/case
507 text_t stem = args["fqs"];
508 if (stem.empty()) return; // somethings wrong
509 splitchar(stem.begin(), stem.end(), ',', stems);
510
511 text_t fold = args["fqk"];
512 if (fold.empty()) return; // somethings wrong
513 splitchar(fold.begin(), fold.end(), ',', folds);
514 }
[1914]515
[9620]516 for(int i=0; i< values.size(); ++i) {
[1914]517 if (!values[i].empty()) {
518 if (i!=0) {
[12784]519 if (argct==1) {
[8029]520 if (combs[i-1]=="and") combine = "&";
521 else if (combs[i-1]=="or")combine = "|";
522 else if (combs[i-1]=="not")combine = "!";
523 }
524 else { // lucene
525 if (combs[i-1]=="and") combine = "AND";
526 else if (combs[i-1]=="or")combine = "OR";
527 else if (combs[i-1]=="not")combine = "NOT";
528 }
[1914]529 }
[12784]530 text_t this_value = values[i];
531 // remove operators for simple search, segments text if necessary
532 format_querystring(this_value, argb, segment);
533 if (argct == 1) { // mgpp only
534 this_value = addstemcase(this_value, stems[i], folds[i], argct);
[1914]535 }
[12784]536 // add tag info for this field (and other processing)
537 format_field_info(this_value, fields[i], argct, argt, argb);
538 // add into query string
539 if (!querystring.empty()) {
540 querystring += " "+ combine+ " ";
[2745]541 }
[12784]542 querystring += this_value;
[1914]543
544 }
545 }
546}
547
[12784]548// Extended addqueryelem for Human Info project
[7380]549void addqueryelem_ex(text_t &querystring, const text_t &tag,
[12784]550 const text_t &terms, const text_t &stem,
551 const text_t &fold,
[7380]552 const text_t& combine, const text_t& word_combine) {
[12784]553
[7380]554 if (!querystring.empty()) { // have to put and/or
555 querystring += " " + combine + " ";
556 }
557 text_t outtext; outtext.reserve(512);
558 text_t word; word.reserve(100);
559 //unsigned short c;
560 text_t::const_iterator here = terms.begin();
561 text_t::const_iterator end = terms.end();
562 bool inquote = false, firstword = true;
[1914]563
[7380]564 text_t word2; word2.reserve(256);
565
566 while (here !=end) {
567 if (is_unicode_space(*here)) {
568 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
569 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
570 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
571 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
572 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
573 if (inquote) {
574 word2.push_back(*here);
575 }
576 word.append(word2); word2.clear();
577
578 if (!inquote && !word.empty() ) {
[12784]579 // found word boundary
[7380]580
581 if (stem == "1" || fold =="1") {
582 word += "#";
583 if (stem == "1") word += "s";
584 //else word += "u";
585
586 if (fold == "1") word += "i";
587 //else word += "c";
588 }
589 if (firstword) {
590 firstword = false;
591 } else {
592 outtext += " " + word_combine + " ";
593 }
594 outtext += "[" + word + "]:"+tag;
595 word.clear();
596 }
597 ++here;
598 } else if (*here == '\"') {
599 word2.push_back(*here);
600 inquote = !inquote;
601 ++here;
602 } else {
603 // not word boundary
604 word2.push_back(*here);
605 ++here;
606 }
607 }
608
609 // get last word
610 if (!word2.empty()) {
611 if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
612 else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
613 else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
614 else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
615 else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
616 word.append(word2); word2.clear();
617
618 if (stem == "1"|| fold == "1") {
619 word += "#";
620 if (stem == "1") word += "s";
621 //else word += "u";
622
623 if (fold == "1") word += "i";
624 //else word += "c";
625 }
626 if (!outtext.empty()) outtext += " " + word_combine + " ";
627 outtext += "[" + word + "]:"+tag;
628 }
629 querystring += "(" + outtext + ")";
630}
631
[8357]632void add_field_info(text_t &querystring, const text_t &tag, int type) {
[7380]633
[17796]634 if (tag == "") return; // do nothing
635 if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
[8357]636 if (type == 1) { //mgpp
637 querystring = "["+querystring+"]:"+tag;
638 } else if (type == 2) { // lucene
639 querystring = tag+":("+querystring+")";
[4757]640 }
[8357]641
[4757]642}
[8029]643
644
[17796]645void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
646
[11765]647 int type = 2; //lucene
[8029]648
[12784]649 if (argb==0) { // simple
650 // there will be no & or | as they should have already been removed
[11765]651 // just tag the entire thing
[10995]652 if (tag != "") {
[11765]653 add_field_info(querystring, tag, type);
[10995]654 }
[8357]655 return;
656 }
[10995]657
[12784]658 // need to replace & with &&, | with ||
[8357]659 text_t::const_iterator here = querystring.begin();
660 text_t::const_iterator end = querystring.end();
[12784]661
662 text_t finalquery = "";
[10995]663 while (here != end) {
[12784]664 if (*here == '&') {
665 finalquery.push_back('&');
666 finalquery.push_back('&');
667 while (*(here+1) == '&') {
668 ++here;
[10995]669 }
[12784]670 }
671 else if (*here == '|') {
672 finalquery.push_back('|');
673 finalquery.push_back('|');
674 while (*(here+1) == '|') {
675 ++here;
676 }
677 }
[8357]678 else {
[12784]679 finalquery.push_back(*here);
[8357]680 }
[10995]681 ++here;
[8357]682 }
[11765]683 querystring = finalquery;
[12784]684 add_field_info(querystring, tag, type);
[11765]685}
686
[12784]687
688void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
689
[11765]690 if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
[12784]691 if (tag == "" && argb == 1) {
[11765]692 return; // no field specifier, advanced mode, the query stays as written
[10995]693 }
[11765]694
695 int type = 1; // mgpp
696
697 bool simple_and = (argb==0 && argt==0);
698 text_t finalquery = "";
699 text_t fieldpart ="";
700 text_t queryelem = "";
701 bool in_phrase = false;
702 bool in_field = false;
703
704 text_t::const_iterator here = querystring.begin();
705 text_t::const_iterator end = querystring.end();
706 while (here != end) {
707 if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
708 queryelem.push_back(*here);
709 }
710 else if (*here == '|') {
711 in_field = false;
712 }
713 else if (*here == '!' || *here == '(' || *here == ')') {
714 if (!in_phrase) { // ignore these if in_phrase
715 // output field, then output operator
716 in_field = false;
717 if (!queryelem.empty()) {
718 if (!simple_and && !fieldpart.empty()) {
719 add_field_info(fieldpart, tag, type);
720 finalquery += fieldpart;
721 finalquery.push_back(' ');
722 fieldpart.clear();
723 }
724 fieldpart += queryelem;
725 }
726 if (!fieldpart.empty()) {
727 add_field_info(fieldpart, tag, type);
728 finalquery += fieldpart;
729 finalquery.push_back(' ');
730 }
731 fieldpart.clear();
732 queryelem.clear();
733 finalquery.push_back(*here);
734 finalquery.push_back(' ');
735 }
736 }
737 else if (*here == '"') {
738 queryelem.push_back(*here);
739 if (in_phrase == false) in_phrase = true;
740 else {
741 in_phrase = false;
742 }
743 }
744
745 // Found word boundary, in a phrase
746 else if (in_phrase) {
747 queryelem.push_back(*here);
748 }
749 // Found a word boundary
750 else {
751 if (!queryelem.empty()) {
752 if (queryelem == "&") {
753 in_field = true;
754 queryelem.clear();
755 }
756 else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
757
758 if (argb==1) {
759 // simple search, these not allowed
760 in_field = true;
761 fieldpart += queryelem;
762 fieldpart.push_back(' ');
763 }
764 queryelem.clear();
765
766 }
767 else {
768 if (!simple_and && !in_field) {
769 if (!fieldpart.empty()) {
770 add_field_info(fieldpart, tag, type);
771 finalquery += fieldpart;
772 finalquery.push_back(' ');
773 fieldpart.clear();
774 }
775 }
776
777 fieldpart += queryelem;
778 fieldpart.push_back(' ');
779 queryelem.clear();
780 }
781 }
782 }
783 ++here;
784 }
785 // at the end
786 if (!queryelem.empty()) {
787 if (!simple_and && !in_field && !fieldpart.empty()) {
788 add_field_info(fieldpart, tag, type);
789 finalquery += fieldpart;
790 finalquery.push_back(' ');
791 fieldpart.clear();
792 }
793 fieldpart += queryelem;
794 }
795 if (!fieldpart.empty()) {
796 add_field_info(fieldpart, tag, type);
797 finalquery += fieldpart;
798 fieldpart.clear();
799 finalquery.push_back(' ');
800 }
801
802 querystring = finalquery;
[8029]803}
[8357]804
[12784]805
806void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
[11765]807 if (argct == 1) {
[12784]808 format_field_info_mgpp(querystring, tag, argt, argb);
[11765]809 } else if (argct == 2) {
[12784]810 format_field_info_lucene(querystring, tag, argt, argb);
[11765]811 }
812}
[10995]813
[12784]814void mgpp_adddateelem(text_t& querystring, const int date)
815{
816 querystring.appendcstr(" [");
817 if(date<0) {
818 querystring.appendcstr("bc");
819 querystring.appendint((date*-1));
820 }
821 else {
822 querystring.appendint(date);
823 }
824 querystring.appendcstr("]:CV");
825}
826
827void lucene_adddateelem(text_t& querystring, const int date)
828{
829 querystring.appendcstr(" CV:(");
830 if(date<0) {
831 querystring.appendcstr("bc");
832 querystring.appendint((date*-1));
833 }
834 else {
835 querystring.appendint(date);
836 }
837 querystring.appendcstr(")");
838}
839
840
841void add_dates(text_t &querystring, int startdate, int enddate,
842 int startbc, int endbc, int ct)
843{
844 if(startdate)
845 {
846 int querystringis = 0;
847 text_t::const_iterator here = querystring.begin();
848 text_t::const_iterator end = querystring.end();
849 while(here!=end)
850 {
851 if(!(isspace((*here)))){
852 here = end;
853 querystringis = 1;
854 }
855 else
856 ++here;
857 }
858 //converting BCE dates
859 if(startbc && startdate > 0)
860 {
861 startdate *= -1;
862 }
863 if(endbc && enddate > 0)
864 {
865 enddate *= -1;
866 }
867 if(enddate != 0 && enddate<startdate)
868 {
869 cout<<"enddate too small"<<endl;
870 return;
871 }
872 if(querystringis)
873 querystring.appendcstr(" AND");
874 if(!enddate)
875 {
876 if (ct==1) {
877 mgpp_adddateelem(querystring,startdate);
878 }
879 else { // lucene
880 lucene_adddateelem(querystring,startdate);
881 }
882 }
883 else{
884 int nextdate = startdate;
885 querystring.appendcstr(" (");
886 while(nextdate<=enddate)
887 {
888 if(nextdate!=0) {
889 if (ct==1) {
890 mgpp_adddateelem(querystring,nextdate);
891 }
892 else { // lucene
893 lucene_adddateelem(querystring,nextdate);
894 }
895 }
896 ++nextdate;
897 }
898 querystring.appendcstr(" )");
899 }
900 }
901
902}
Note: See TracBrowser for help on using the repository browser.