[270] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * querytools.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
[533] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[270] | 9 | *
|
---|
[533] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[270] | 24 | *********************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "querytools.h"
|
---|
[1373] | 27 | #include <ctype.h>
|
---|
[1914] | 28 | #include "unitool.h" // for is_unicode_letdig
|
---|
[270] | 29 |
|
---|
[11987] | 30 | void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
|
---|
| 31 |
|
---|
| 32 | if (args["ct"].empty()) {
|
---|
| 33 | text_t build_type = cinfo->buildType;
|
---|
| 34 | if (build_type == "mgpp") {
|
---|
| 35 | args["ct"] = "1";
|
---|
| 36 | } else if (build_type == "lucene") {
|
---|
| 37 | args["ct"] = "2";
|
---|
| 38 | } else {
|
---|
| 39 | args["ct"] = "0";
|
---|
| 40 | }
|
---|
| 41 | }
|
---|
| 42 | text_t arg_ct = args["ct"];
|
---|
| 43 | if (arg_ct == "0") {
|
---|
| 44 | // mg
|
---|
| 45 | args["qt"] = "0";
|
---|
| 46 | args["qto"] = "0";
|
---|
| 47 | return;
|
---|
| 48 | }
|
---|
| 49 |
|
---|
| 50 | if (!args["qt"].empty() && !args["qto"].empty()) {
|
---|
| 51 | return;
|
---|
| 52 | }
|
---|
| 53 |
|
---|
| 54 | text_tmap::iterator check = cinfo->format.find("SearchTypes");
|
---|
| 55 | text_t search_types = "plain,form";
|
---|
| 56 | if(check != cinfo->format.end()){
|
---|
| 57 | search_types = (*check).second;
|
---|
| 58 | if (search_types.empty()) {
|
---|
| 59 | search_types = "plain,form";
|
---|
| 60 | }
|
---|
| 61 | }
|
---|
| 62 |
|
---|
| 63 | if (args["qto"].empty()) {
|
---|
| 64 | unsigned int type = 0;
|
---|
| 65 | if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
|
---|
| 66 | type |= 2;
|
---|
| 67 | }
|
---|
| 68 | if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
|
---|
| 69 | type |= 1;
|
---|
| 70 | }
|
---|
| 71 | args.setintarg("qto", type);
|
---|
| 72 | }
|
---|
| 73 |
|
---|
| 74 | if (args["qt"].empty()) {
|
---|
| 75 | bool form_default = false;
|
---|
| 76 | int arg_qto = args.getintarg("qto");
|
---|
| 77 | if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
|
---|
| 78 | args["qt"] = "1";
|
---|
| 79 | } else {
|
---|
| 80 | args["qt"] = "0";
|
---|
| 81 | }
|
---|
| 82 | }
|
---|
| 83 | }
|
---|
| 84 |
|
---|
[759] | 85 | // request.filterResultOptions and request.fields (if required) should
|
---|
| 86 | // be set from the calling code
|
---|
| 87 | void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring,
|
---|
| 88 | cgiargsclass &args) {
|
---|
[270] | 89 |
|
---|
| 90 | request.filterName = "QueryFilter";
|
---|
| 91 |
|
---|
| 92 | OptionValue_t option;
|
---|
[470] | 93 |
|
---|
[270] | 94 | option.name = "Term";
|
---|
[759] | 95 | option.value = querystring;
|
---|
[270] | 96 | request.filterOptions.push_back (option);
|
---|
| 97 |
|
---|
| 98 | option.name = "QueryType";
|
---|
| 99 | option.value = (args.getintarg("t")) ? "ranked" : "boolean";
|
---|
| 100 | request.filterOptions.push_back (option);
|
---|
| 101 |
|
---|
[1774] | 102 | option.name = "MatchMode";
|
---|
[11765] | 103 | // mgpp in advanced mode, always use some query
|
---|
[12428] | 104 | if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
|
---|
[11765] | 105 | option.value = "some";
|
---|
| 106 | } else {
|
---|
| 107 | option.value = (args.getintarg("t")) ? "some" : "all";
|
---|
| 108 | }
|
---|
[1774] | 109 | request.filterOptions.push_back (option);
|
---|
| 110 |
|
---|
[270] | 111 | option.name = "Casefold";
|
---|
| 112 | option.value = (args.getintarg("k")) ? "true" : "false";
|
---|
| 113 | request.filterOptions.push_back (option);
|
---|
| 114 |
|
---|
| 115 | option.name = "Stem";
|
---|
| 116 | option.value = (args.getintarg("s")) ? "true" : "false";
|
---|
| 117 | request.filterOptions.push_back (option);
|
---|
| 118 |
|
---|
| 119 | if (!args["h"].empty()) {
|
---|
| 120 | option.name = "Index";
|
---|
| 121 | option.value = args["h"];
|
---|
| 122 | request.filterOptions.push_back (option);
|
---|
| 123 | }
|
---|
| 124 |
|
---|
| 125 | if (!args["j"].empty()) {
|
---|
| 126 | option.name = "Subcollection";
|
---|
| 127 | option.value = args["j"];
|
---|
| 128 | request.filterOptions.push_back (option);
|
---|
| 129 | }
|
---|
| 130 |
|
---|
| 131 | if (!args["n"].empty()) {
|
---|
| 132 | option.name = "Language";
|
---|
| 133 | option.value = args["n"];
|
---|
| 134 | request.filterOptions.push_back (option);
|
---|
| 135 | }
|
---|
[1329] | 136 |
|
---|
| 137 | if (!args["g"].empty()) { // granularity for mgpp
|
---|
| 138 | option.name = "Level";
|
---|
| 139 | option.value = args["g"];
|
---|
| 140 | request.filterOptions.push_back (option);
|
---|
| 141 | }
|
---|
[270] | 142 |
|
---|
[12410] | 143 | if (!args["fs"].empty()) { // filter string for lucene
|
---|
| 144 | option.name = "FilterString";
|
---|
| 145 | option.value = args["fs"];
|
---|
| 146 | request.filterOptions.push_back (option);
|
---|
| 147 | }
|
---|
| 148 |
|
---|
[12276] | 149 | if (!args["sf"].empty()) { // sort field for lucene
|
---|
| 150 | option.name = "SortField";
|
---|
| 151 | option.value = args["sf"];
|
---|
| 152 | request.filterOptions.push_back (option);
|
---|
| 153 | }
|
---|
| 154 |
|
---|
[12388] | 155 | // sort field for lucene
|
---|
| 156 | option.name = "FuzzySearch";
|
---|
| 157 | option.value = (args.getintarg("fuzzy")) ? "true" : "false";
|
---|
| 158 | request.filterOptions.push_back (option);
|
---|
| 159 |
|
---|
[759] | 160 | set_more_queryfilter_options (request, args);
|
---|
| 161 | }
|
---|
| 162 |
|
---|
| 163 | void set_queryfilter_options (FilterRequest_t &request, const text_t &querystring1,
|
---|
| 164 | const text_t &querystring2, cgiargsclass &args) {
|
---|
| 165 |
|
---|
| 166 | set_queryfilter_options (request, querystring1, args);
|
---|
| 167 |
|
---|
[349] | 168 | // fill in the second query if needed
|
---|
| 169 | if (!args["cq2"].empty()) {
|
---|
[759] | 170 | OptionValue_t option;
|
---|
| 171 |
|
---|
[349] | 172 | option.name = "CombineQuery";
|
---|
| 173 | option.value = args["cq2"];
|
---|
| 174 | request.filterOptions.push_back (option);
|
---|
| 175 |
|
---|
| 176 | option.name = "Term";
|
---|
[759] | 177 | option.value = querystring2;
|
---|
[349] | 178 | request.filterOptions.push_back (option);
|
---|
[759] | 179 |
|
---|
[349] | 180 | option.name = "QueryType";
|
---|
| 181 | option.value = (args.getintarg("t")) ? "ranked" : "boolean";
|
---|
| 182 | request.filterOptions.push_back (option);
|
---|
| 183 |
|
---|
| 184 | option.name = "Casefold";
|
---|
| 185 | option.value = (args.getintarg("k")) ? "true" : "false";
|
---|
| 186 | request.filterOptions.push_back (option);
|
---|
| 187 |
|
---|
| 188 | option.name = "Stem";
|
---|
| 189 | option.value = (args.getintarg("s")) ? "true" : "false";
|
---|
| 190 | request.filterOptions.push_back (option);
|
---|
| 191 |
|
---|
| 192 | if (!args["h2"].empty()) {
|
---|
| 193 | option.name = "Index";
|
---|
| 194 | option.value = args["h2"];
|
---|
| 195 | request.filterOptions.push_back (option);
|
---|
| 196 | }
|
---|
| 197 |
|
---|
| 198 | if (!args["j2"].empty()) {
|
---|
| 199 | option.name = "Subcollection";
|
---|
| 200 | option.value = args["j2"];
|
---|
| 201 | request.filterOptions.push_back (option);
|
---|
| 202 | }
|
---|
| 203 |
|
---|
| 204 | if (!args["n2"].empty()) {
|
---|
| 205 | option.name = "Language";
|
---|
| 206 | option.value = args["n2"];
|
---|
| 207 | request.filterOptions.push_back (option);
|
---|
| 208 | }
|
---|
| 209 | }
|
---|
[759] | 210 | set_more_queryfilter_options (request, args);
|
---|
| 211 | }
|
---|
[608] | 212 |
|
---|
[759] | 213 | void set_more_queryfilter_options (FilterRequest_t &request, cgiargsclass &args) {
|
---|
| 214 |
|
---|
| 215 | OptionValue_t option;
|
---|
[608] | 216 | int arg_m = args.getintarg("m");
|
---|
[759] | 217 |
|
---|
[608] | 218 | option.name = "Maxdocs";
|
---|
| 219 | option.value = arg_m;
|
---|
| 220 | request.filterOptions.push_back (option);
|
---|
[1329] | 221 |
|
---|
[759] | 222 | // option.name = "StartResults";
|
---|
| 223 | // option.value = args["r"];
|
---|
| 224 | // request.filterOptions.push_back (option);
|
---|
[270] | 225 |
|
---|
[759] | 226 | // option.name = "EndResults";
|
---|
| 227 | // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
|
---|
| 228 | // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
|
---|
| 229 | // option.value = endresults;
|
---|
| 230 | // request.filterOptions.push_back (option);
|
---|
[270] | 231 | }
|
---|
| 232 |
|
---|
// Reports whether `character` is one of the query-syntax characters of the
// given indexer: '#', '/' and '*' for mgpp (indexer_type 1); '?', '*', '~'
// and '^' for lucene (indexer_type 2). Every other indexer type has none.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }

  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }

  default:
    return false;
  }
}
| 245 |
|
---|
[6584] | 246 | void format_querystring (text_t &querystring, int querymode, bool segment) {
|
---|
[270] | 247 | text_t formattedstring;
|
---|
| 248 |
|
---|
[6584] | 249 | if (querymode == 1 && !segment) return;
|
---|
| 250 |
|
---|
[270] | 251 | text_t::const_iterator here = querystring.begin();
|
---|
| 252 | text_t::const_iterator end = querystring.end();
|
---|
| 253 |
|
---|
| 254 | // space is used to insert spaces between Chinese
|
---|
| 255 | // characters. No space is needed before the first
|
---|
| 256 | // Chinese character.
|
---|
| 257 | bool space = false;
|
---|
| 258 |
|
---|
| 259 | // want to remove ()|!& from querystring so boolean queries are just
|
---|
[470] | 260 | // "all the words" queries (unless querymode is advanced)
|
---|
[270] | 261 | while (here != end) {
|
---|
[470] | 262 | if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
|
---|
| 263 | *here == '!' || *here == '&')) {
|
---|
[270] | 264 | formattedstring.push_back(' ');
|
---|
[6584] | 265 | } else if (segment) {
|
---|
[397] | 266 | if ((*here >= 0x4e00 && *here <= 0x9fa5) ||
|
---|
| 267 | (*here >= 0xf900 && *here <= 0xfa2d)) {
|
---|
| 268 | // Chinese character
|
---|
[8715] | 269 | if (!space) formattedstring.push_back (0x200b); // zero width space
|
---|
[397] | 270 | formattedstring.push_back (*here);
|
---|
| 271 | formattedstring.push_back (0x200b);
|
---|
| 272 | space = true;
|
---|
[270] | 273 | } else {
|
---|
[8715] | 274 |
|
---|
[397] | 275 | // non-Chinese character
|
---|
| 276 | formattedstring.push_back (*here);
|
---|
| 277 | space = false;
|
---|
[8715] | 278 |
|
---|
[270] | 279 | }
|
---|
[6584] | 280 |
|
---|
| 281 | } else {
|
---|
| 282 | formattedstring.push_back (*here);
|
---|
[270] | 283 | }
|
---|
[9620] | 284 | ++here;
|
---|
[270] | 285 | }
|
---|
[397] | 286 | querystring = formattedstring;
|
---|
[270] | 287 | }
|
---|
| 288 |
|
---|
[1373] | 289 |
|
---|
| 290 |
|
---|
| 291 | void add_dates(text_t &querystring, int startdate, int enddate,
|
---|
[8029] | 292 | int startbc, int endbc, int ct)
|
---|
[1373] | 293 | {
|
---|
| 294 | if(startdate)
|
---|
| 295 | {
|
---|
| 296 | int querystringis = 0;
|
---|
| 297 | text_t::const_iterator here = querystring.begin();
|
---|
| 298 | text_t::const_iterator end = querystring.end();
|
---|
| 299 | while(here!=end)
|
---|
| 300 | {
|
---|
| 301 | if(!(isspace((*here)))){
|
---|
| 302 | here = end;
|
---|
| 303 | querystringis = 1;
|
---|
| 304 | }
|
---|
| 305 | else
|
---|
[9620] | 306 | ++here;
|
---|
[1373] | 307 | }
|
---|
| 308 | //converting BCE dates
|
---|
| 309 | if(startbc && startdate > 0)
|
---|
| 310 | {
|
---|
| 311 | startdate *= -1;
|
---|
| 312 | }
|
---|
| 313 | if(endbc && enddate > 0)
|
---|
| 314 | {
|
---|
| 315 | enddate *= -1;
|
---|
| 316 | }
|
---|
| 317 | if(enddate != 0 && enddate<startdate)
|
---|
| 318 | {
|
---|
| 319 | cout<<"enddate too small"<<endl;
|
---|
| 320 | return;
|
---|
| 321 | }
|
---|
| 322 | if(querystringis)
|
---|
| 323 | querystring.appendcstr(" AND");
|
---|
| 324 | if(!enddate)
|
---|
| 325 | {
|
---|
[8029] | 326 | if (ct==1) {
|
---|
| 327 | mgpp_adddateelem(querystring,startdate);
|
---|
| 328 | }
|
---|
| 329 | else { // lucene
|
---|
| 330 | lucene_adddateelem(querystring,startdate);
|
---|
| 331 | }
|
---|
[1373] | 332 | }
|
---|
| 333 | else{
|
---|
| 334 | int nextdate = startdate;
|
---|
| 335 | querystring.appendcstr(" (");
|
---|
| 336 | while(nextdate<=enddate)
|
---|
| 337 | {
|
---|
[8029] | 338 | if(nextdate!=0) {
|
---|
| 339 | if (ct==1) {
|
---|
| 340 | mgpp_adddateelem(querystring,nextdate);
|
---|
[1373] | 341 | }
|
---|
[8029] | 342 | else { // lucene
|
---|
| 343 | lucene_adddateelem(querystring,nextdate);
|
---|
| 344 | }
|
---|
| 345 | }
|
---|
[9620] | 346 | ++nextdate;
|
---|
[1373] | 347 | }
|
---|
| 348 | querystring.appendcstr(" )");
|
---|
| 349 | }
|
---|
| 350 | }
|
---|
[1467] | 351 |
|
---|
[1373] | 352 | }
|
---|
[1467] | 353 |
|
---|
[3160] | 354 | // search history tool
|
---|
| 355 | // also used for form query macros
|
---|
[1914] | 356 | text_t escape_quotes(const text_t &querystring) {
|
---|
| 357 |
|
---|
| 358 | text_t::const_iterator here = querystring.begin();
|
---|
| 359 | text_t::const_iterator end = querystring.end();
|
---|
| 360 |
|
---|
| 361 | text_t escquery = "";
|
---|
| 362 | while (here != end) {
|
---|
[1988] | 363 | if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
|
---|
| 364 | else if (*here == '\n' || *here == '\r') {
|
---|
| 365 | escquery.push_back(' ');
|
---|
| 366 | } else {
|
---|
[1914] | 367 | escquery +="\\\\";
|
---|
| 368 | escquery.push_back(*here);
|
---|
| 369 | }
|
---|
| 370 |
|
---|
[9620] | 371 | ++here;
|
---|
[1914] | 372 | }
|
---|
| 373 | return escquery;
|
---|
| 374 |
|
---|
| 375 | }
|
---|
| 376 |
|
---|
[11765] | 377 | // some query form parsing functions for use with mgpp & lucene
|
---|
[1914] | 378 |
|
---|
[8029] | 379 | void parse_reg_query_form(text_t &querystring, cgiargsclass &args)
|
---|
| 380 | {
|
---|
| 381 | querystring.clear();
|
---|
[1914] | 382 |
|
---|
[8029] | 383 | const int ct = args.getintarg("ct");
|
---|
| 384 | int argt = args.getintarg("t");// t=0 -and, t=1 - or
|
---|
| 385 |
|
---|
[1914] | 386 | text_t combine;
|
---|
[8029] | 387 | if (ct==1) {
|
---|
| 388 | if (argt == 0) combine = "&";
|
---|
| 389 | else combine = "|";
|
---|
| 390 | }
|
---|
| 391 | else { // lucene
|
---|
| 392 | if (argt == 0) combine = "AND";
|
---|
| 393 | else combine = "OR";
|
---|
| 394 | }
|
---|
[1914] | 395 |
|
---|
| 396 | text_t field = args["fqf"];
|
---|
| 397 | if (field.empty()) return; // no query
|
---|
| 398 | text_tarray fields;
|
---|
| 399 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 400 |
|
---|
| 401 | text_t value = args["fqv"];
|
---|
| 402 | if (value.empty()) return; // somethings wrong
|
---|
| 403 | text_tarray values;
|
---|
| 404 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 405 |
|
---|
[8029] | 406 |
|
---|
[9620] | 407 | for (int i=0; i< values.size(); ++i) {
|
---|
[1914] | 408 | if (!values[i].empty()) {
|
---|
[8029] | 409 | if (ct == 1) {
|
---|
| 410 | mgpp_addqueryelem(querystring, fields[i], values[i], combine);
|
---|
| 411 | }
|
---|
| 412 | else { // lucene
|
---|
| 413 | lucene_addqueryelem(querystring, fields[i], values[i], combine);
|
---|
| 414 | }
|
---|
[1914] | 415 | }
|
---|
| 416 | }
|
---|
| 417 |
|
---|
| 418 | }
|
---|
| 419 |
|
---|
| 420 |
|
---|
| 421 | void parse_adv_query_form(text_t &querystring, cgiargsclass &args){
|
---|
| 422 |
|
---|
| 423 | querystring.clear();
|
---|
| 424 |
|
---|
[8029] | 425 | const int ct = args.getintarg("ct");
|
---|
| 426 | text_t combine;
|
---|
| 427 | if (ct==1) {
|
---|
| 428 | combine = "&";
|
---|
| 429 | }
|
---|
| 430 | else { // lucene
|
---|
| 431 | combine = "AND";
|
---|
| 432 | }
|
---|
| 433 |
|
---|
[1914] | 434 | text_t field = args["fqf"];
|
---|
| 435 | if (field.empty()) return; // no query
|
---|
| 436 | text_tarray fields;
|
---|
| 437 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 438 |
|
---|
| 439 | text_t value = args["fqv"];
|
---|
| 440 | if (value.empty()) return; // somethings wrong
|
---|
| 441 | text_tarray values;
|
---|
| 442 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 443 |
|
---|
| 444 | text_t stem = args["fqs"];
|
---|
| 445 | if (stem.empty()) return; // somethings wrong
|
---|
| 446 | text_tarray stems;
|
---|
| 447 | splitchar(stem.begin(), stem.end(), ',', stems);
|
---|
| 448 |
|
---|
| 449 | text_t fold = args["fqk"];
|
---|
| 450 | if (fold.empty()) return; // somethings wrong
|
---|
| 451 | text_tarray folds;
|
---|
| 452 | splitchar(fold.begin(), fold.end(), ',', folds);
|
---|
| 453 |
|
---|
| 454 | text_t comb = args["fqc"];
|
---|
| 455 | if (comb.empty()) return; //somethings wrong
|
---|
| 456 | text_tarray combs;
|
---|
| 457 | splitchar(comb.begin(), comb.end(), ',', combs);
|
---|
| 458 |
|
---|
[9620] | 459 | for(int i=0; i< values.size(); ++i) {
|
---|
[1914] | 460 | if (!values[i].empty()) {
|
---|
| 461 | if (i!=0) {
|
---|
[8029] | 462 | if (ct==1) {
|
---|
| 463 | if (combs[i-1]=="and") combine = "&";
|
---|
| 464 | else if (combs[i-1]=="or")combine = "|";
|
---|
| 465 | else if (combs[i-1]=="not")combine = "!";
|
---|
| 466 | }
|
---|
| 467 | else { // lucene
|
---|
| 468 | if (combs[i-1]=="and") combine = "AND";
|
---|
| 469 | else if (combs[i-1]=="or")combine = "OR";
|
---|
| 470 | else if (combs[i-1]=="not")combine = "NOT";
|
---|
| 471 | }
|
---|
[1914] | 472 | }
|
---|
[11987] | 473 | text_t term = addstemcase(values[i], stems[i], folds[i], ct);
|
---|
[8029] | 474 | mgpp_addqueryelem(querystring, fields[i], term, combine);
|
---|
[1914] | 475 | }
|
---|
| 476 |
|
---|
| 477 | }
|
---|
| 478 | }
|
---|
| 479 |
|
---|
[11987] | 480 | text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
|
---|
| 481 | const int indexer_type) {
|
---|
[1914] | 482 |
|
---|
| 483 | text_t outtext;
|
---|
| 484 | text_t word;
|
---|
| 485 | //unsigned short c;
|
---|
[7383] | 486 | text_t::const_iterator here = terms.begin();
|
---|
| 487 | text_t::const_iterator end = terms.end();
|
---|
[1914] | 488 |
|
---|
| 489 | while (here !=end) {
|
---|
[2745] | 490 |
|
---|
[11987] | 491 | if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
|
---|
[1914] | 492 | // not word boundary
|
---|
| 493 | word.push_back(*here);
|
---|
[9620] | 494 | ++here;
|
---|
[1914] | 495 | }
|
---|
| 496 | else {
|
---|
| 497 | // found word boundary
|
---|
| 498 | if (!word.empty() ) {
|
---|
| 499 | if (stem == "1" || fold =="1") {
|
---|
| 500 | word += "#";
|
---|
| 501 | if (stem == "1") word += "s";
|
---|
| 502 | //else word += "u";
|
---|
| 503 |
|
---|
| 504 | if (fold == "1") word += "i";
|
---|
| 505 | //else word += "c";
|
---|
| 506 | }
|
---|
[2745] | 507 |
|
---|
[1914] | 508 | word += " ";
|
---|
| 509 | outtext += word;
|
---|
| 510 | word.clear();
|
---|
| 511 | }
|
---|
[2745] | 512 | if (*here == '\"') {
|
---|
| 513 | outtext.push_back(*here);
|
---|
| 514 | }
|
---|
[9620] | 515 | ++here;
|
---|
[1914] | 516 | }
|
---|
| 517 | }
|
---|
| 518 |
|
---|
| 519 | // get last word
|
---|
| 520 | if (!word.empty()) {
|
---|
| 521 | if (stem == "1"|| fold == "1") {
|
---|
| 522 | word += "#";
|
---|
| 523 | if (stem == "1") word += "s";
|
---|
| 524 | //else word += "u";
|
---|
| 525 |
|
---|
| 526 | if (fold == "1") word += "i";
|
---|
| 527 | //else word += "c";
|
---|
| 528 | }
|
---|
| 529 | word += " ";
|
---|
| 530 | outtext += word;
|
---|
| 531 | }
|
---|
| 532 | return outtext;
|
---|
| 533 | }
|
---|
| 534 |
|
---|
| 535 |
|
---|
[8029] | 536 | void mgpp_adddateelem(text_t& querystring, const int date)
|
---|
| 537 | {
|
---|
| 538 | querystring.appendcstr(" [");
|
---|
| 539 | if(date<0) {
|
---|
| 540 | querystring.appendcstr("bc");
|
---|
| 541 | querystring.appendint((date*-1));
|
---|
| 542 | }
|
---|
| 543 | else {
|
---|
| 544 | querystring.appendint(date);
|
---|
| 545 | }
|
---|
| 546 | querystring.appendcstr("]:CV");
|
---|
| 547 | }
|
---|
| 548 |
|
---|
| 549 | void lucene_adddateelem(text_t& querystring, const int date)
|
---|
| 550 | {
|
---|
| 551 | querystring.appendcstr(" CV:(");
|
---|
| 552 | if(date<0) {
|
---|
| 553 | querystring.appendcstr("bc");
|
---|
| 554 | querystring.appendint((date*-1));
|
---|
| 555 | }
|
---|
| 556 | else {
|
---|
| 557 | querystring.appendint(date);
|
---|
| 558 | }
|
---|
| 559 | querystring.appendcstr(")");
|
---|
| 560 | }
|
---|
| 561 |
|
---|
| 562 |
|
---|
| 563 | void mgpp_addqueryelem(text_t &querystring, text_t &tag,
|
---|
[7383] | 564 | text_t &query, text_t &combine) {
|
---|
[1914] | 565 | if (!querystring.empty()) { // have to put and/or
|
---|
[8029] | 566 | querystring += " " + combine + " ";
|
---|
[1914] | 567 |
|
---|
| 568 | }
|
---|
[3160] | 569 | if (tag=="ZZ" || tag=="") { // just add onto querystring
|
---|
[1914] | 570 | querystring += query;
|
---|
| 571 | }
|
---|
| 572 | else {
|
---|
| 573 | querystring += "["+query+"]:"+tag;
|
---|
| 574 | }
|
---|
| 575 |
|
---|
| 576 | }
|
---|
| 577 |
|
---|
[8029] | 578 | void lucene_addqueryelem(text_t &querystring, text_t &tag,
|
---|
| 579 | text_t &query, text_t &combine) {
|
---|
| 580 | if (!querystring.empty()) { // have to put and/or
|
---|
| 581 | querystring += " " + combine + " ";
|
---|
| 582 |
|
---|
| 583 | }
|
---|
| 584 | if (tag=="ZZ" || tag=="") { // just add onto querystring
|
---|
| 585 | querystring += query;
|
---|
| 586 | }
|
---|
| 587 | else {
|
---|
| 588 | querystring += tag+":("+query+")";
|
---|
| 589 | }
|
---|
| 590 | }
|
---|
[1914] | 591 |
|
---|
[8029] | 592 |
|
---|
[7380] | 593 | void addqueryelem_ex(text_t &querystring, const text_t &tag,
|
---|
| 594 | const text_t &terms, const text_t &stem, const text_t &fold,
|
---|
| 595 | const text_t& combine, const text_t& word_combine) {
|
---|
| 596 | if (!querystring.empty()) { // have to put and/or
|
---|
| 597 | querystring += " " + combine + " ";
|
---|
| 598 | }
|
---|
| 599 | text_t outtext; outtext.reserve(512);
|
---|
| 600 | text_t word; word.reserve(100);
|
---|
| 601 | //unsigned short c;
|
---|
| 602 | text_t::const_iterator here = terms.begin();
|
---|
| 603 | text_t::const_iterator end = terms.end();
|
---|
| 604 | bool inquote = false, firstword = true;
|
---|
[1914] | 605 |
|
---|
[7380] | 606 | text_t word2; word2.reserve(256);
|
---|
| 607 |
|
---|
| 608 | while (here !=end) {
|
---|
| 609 | if (is_unicode_space(*here)) {
|
---|
| 610 | if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
|
---|
| 611 | else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
|
---|
| 612 | else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
|
---|
| 613 | else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
|
---|
| 614 | else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
|
---|
| 615 | if (inquote) {
|
---|
| 616 | word2.push_back(*here);
|
---|
| 617 | }
|
---|
| 618 | word.append(word2); word2.clear();
|
---|
| 619 |
|
---|
| 620 | if (!inquote && !word.empty() ) {
|
---|
| 621 | // found word boundary
|
---|
| 622 |
|
---|
| 623 | if (stem == "1" || fold =="1") {
|
---|
| 624 | word += "#";
|
---|
| 625 | if (stem == "1") word += "s";
|
---|
| 626 | //else word += "u";
|
---|
| 627 |
|
---|
| 628 | if (fold == "1") word += "i";
|
---|
| 629 | //else word += "c";
|
---|
| 630 | }
|
---|
| 631 | if (firstword) {
|
---|
| 632 | firstword = false;
|
---|
| 633 | } else {
|
---|
| 634 | outtext += " " + word_combine + " ";
|
---|
| 635 | }
|
---|
| 636 | outtext += "[" + word + "]:"+tag;
|
---|
| 637 | word.clear();
|
---|
| 638 | }
|
---|
| 639 | ++here;
|
---|
| 640 | } else if (*here == '\"') {
|
---|
| 641 | word2.push_back(*here);
|
---|
| 642 | inquote = !inquote;
|
---|
| 643 | ++here;
|
---|
| 644 | } else {
|
---|
| 645 | // not word boundary
|
---|
| 646 | word2.push_back(*here);
|
---|
| 647 | ++here;
|
---|
| 648 | }
|
---|
| 649 | }
|
---|
| 650 |
|
---|
| 651 | // get last word
|
---|
| 652 | if (!word2.empty()) {
|
---|
| 653 | if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
|
---|
| 654 | else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
|
---|
| 655 | else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
|
---|
| 656 | else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
|
---|
| 657 | else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
|
---|
| 658 | word.append(word2); word2.clear();
|
---|
| 659 |
|
---|
| 660 | if (stem == "1"|| fold == "1") {
|
---|
| 661 | word += "#";
|
---|
| 662 | if (stem == "1") word += "s";
|
---|
| 663 | //else word += "u";
|
---|
| 664 |
|
---|
| 665 | if (fold == "1") word += "i";
|
---|
| 666 | //else word += "c";
|
---|
| 667 | }
|
---|
| 668 | if (!outtext.empty()) outtext += " " + word_combine + " ";
|
---|
| 669 | outtext += "[" + word + "]:"+tag;
|
---|
| 670 | }
|
---|
| 671 | querystring += "(" + outtext + ")";
|
---|
| 672 | }
|
---|
| 673 |
|
---|
| 674 |
|
---|
[8357] | 675 | void add_field_info(text_t &querystring, const text_t &tag, int type) {
|
---|
[7380] | 676 |
|
---|
[10995] | 677 | if (tag == "") return; // do nothing
|
---|
[8357] | 678 | if (type == 1) { //mgpp
|
---|
| 679 | querystring = "["+querystring+"]:"+tag;
|
---|
| 680 | } else if (type == 2) { // lucene
|
---|
| 681 | querystring = tag+":("+querystring+")";
|
---|
[4757] | 682 | }
|
---|
[8357] | 683 |
|
---|
[4757] | 684 | }
|
---|
[8029] | 685 |
|
---|
| 686 |
|
---|
[11765] | 687 | void format_field_info_lucene(text_t &querystring, cgiargsclass &args) {
|
---|
[8357] | 688 | text_t tag = args["fqf"];
|
---|
[10995] | 689 | if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
|
---|
[11765] | 690 | int type = 2; //lucene
|
---|
[8357] | 691 | int argt = args.getintarg("t");// t=0 -and, t=1 - or
|
---|
| 692 | int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
|
---|
[8029] | 693 |
|
---|
[11765] | 694 | // lucene simple OR - the string stays as is, but may need field tag
|
---|
| 695 | if (argb==0 && argt == 1) {
|
---|
| 696 | // just tag the entire thing
|
---|
[10995] | 697 | if (tag != "") {
|
---|
[11765] | 698 | add_field_info(querystring, tag, type);
|
---|
[10995] | 699 | }
|
---|
[8357] | 700 | return;
|
---|
| 701 | }
|
---|
[11765] | 702 | bool in_phrase = false;
|
---|
[8357] | 703 |
|
---|
[11765] | 704 | text_t queryelem = "";
|
---|
| 705 | text_t finalquery = "";
|
---|
| 706 |
|
---|
| 707 | // only add in + for simple AND search
|
---|
| 708 | text_t combine = ((argb==0)? "+" : "");
|
---|
[10995] | 709 |
|
---|
[11765] | 710 | // for lucene, we need to change & to && and | to || if advanced search
|
---|
| 711 | // we need to tag the entire string, if we have a field
|
---|
| 712 | // if we are simple and search, then we put && in between words
|
---|
[10995] | 713 |
|
---|
[8357] | 714 | text_t::const_iterator here = querystring.begin();
|
---|
| 715 | text_t::const_iterator end = querystring.end();
|
---|
[10995] | 716 | while (here != end) {
|
---|
[11765] | 717 | if (is_unicode_letdig(*here) || is_special_character(type, *here)) {
|
---|
| 718 | queryelem.push_back(*here);
|
---|
[10995] | 719 | }
|
---|
[8357] | 720 |
|
---|
[10995] | 721 | // Detect phrase starts/finishes
|
---|
| 722 | else if (*here == '"') {
|
---|
[11765] | 723 | queryelem.push_back(*here);
|
---|
[10995] | 724 | if (in_phrase == false) in_phrase = true;
|
---|
| 725 | else {
|
---|
[11765] | 726 | finalquery += combine + queryelem;
|
---|
| 727 | queryelem.clear();
|
---|
[10995] | 728 | in_phrase = false;
|
---|
| 729 | }
|
---|
| 730 | }
|
---|
[8357] | 731 |
|
---|
[10995] | 732 | // Found word boundary, in a phrase
|
---|
| 733 | else if (in_phrase) {
|
---|
[11765] | 734 | queryelem.push_back(*here);
|
---|
[8357] | 735 | }
|
---|
[10995] | 736 | // Word boundary, but not in a phrase
|
---|
[8357] | 737 | else {
|
---|
[11765] | 738 | if (*here == '&') {
|
---|
| 739 | queryelem.push_back('&');
|
---|
| 740 | queryelem.push_back('&');
|
---|
| 741 | } else if (*here == '|') {
|
---|
| 742 | queryelem.push_back('|');
|
---|
| 743 | queryelem.push_back('|');
|
---|
| 744 | } else {
|
---|
| 745 | if (!queryelem.empty()) {
|
---|
| 746 | finalquery += combine + queryelem;
|
---|
| 747 | queryelem.clear();
|
---|
| 748 | }
|
---|
| 749 | finalquery.push_back(*here);
|
---|
[10995] | 750 | }
|
---|
[8357] | 751 | }
|
---|
[10995] | 752 |
|
---|
| 753 | ++here;
|
---|
[8357] | 754 | }
|
---|
[10995] | 755 |
|
---|
| 756 | // Get last element
|
---|
[11765] | 757 | if (!queryelem.empty()) {
|
---|
| 758 | finalquery += combine + queryelem;
|
---|
[8357] | 759 | }
|
---|
[10411] | 760 |
|
---|
[11765] | 761 | add_field_info(finalquery, tag, type);
|
---|
| 762 | querystring = finalquery;
|
---|
| 763 | cerr << "final query = "<<finalquery<<endl;
|
---|
| 764 | }
|
---|
| 765 |
|
---|
| 766 | void format_field_info_mgpp(text_t &querystring, cgiargsclass &args) {
|
---|
| 767 | text_t tag = args["fqf"];
|
---|
| 768 | if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
|
---|
[10995] | 769 |
|
---|
[11765] | 770 | int argt = args.getintarg("t");// t=0 -and, t=1 - or
|
---|
| 771 | int argb = args.getintarg("b"); // b=0 simple, b=1 advanced
|
---|
| 772 |
|
---|
| 773 | if (tag == "" && argb ==1) {
|
---|
| 774 | return; // no field specifier, advanced mode, the query stays as written
|
---|
[10995] | 775 | }
|
---|
[11765] | 776 |
|
---|
| 777 | int type = 1; // mgpp
|
---|
| 778 |
|
---|
| 779 | bool simple_and = (argb==0 && argt==0);
|
---|
| 780 | text_t finalquery = "";
|
---|
| 781 | text_t fieldpart ="";
|
---|
| 782 | text_t queryelem = "";
|
---|
| 783 | bool in_phrase = false;
|
---|
| 784 | bool in_field = false;
|
---|
| 785 |
|
---|
| 786 | text_t::const_iterator here = querystring.begin();
|
---|
| 787 | text_t::const_iterator end = querystring.end();
|
---|
| 788 | while (here != end) {
|
---|
| 789 | if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
|
---|
| 790 | queryelem.push_back(*here);
|
---|
| 791 | }
|
---|
| 792 | else if (*here == '|') {
|
---|
| 793 | in_field = false;
|
---|
| 794 | }
|
---|
| 795 | else if (*here == '!' || *here == '(' || *here == ')') {
|
---|
| 796 | if (!in_phrase) { // ignore these if in_phrase
|
---|
| 797 | // output field, then output operator
|
---|
| 798 | in_field = false;
|
---|
| 799 | if (!queryelem.empty()) {
|
---|
| 800 | if (!simple_and && !fieldpart.empty()) {
|
---|
| 801 | add_field_info(fieldpart, tag, type);
|
---|
| 802 | finalquery += fieldpart;
|
---|
| 803 | finalquery.push_back(' ');
|
---|
| 804 | fieldpart.clear();
|
---|
| 805 | }
|
---|
| 806 | fieldpart += queryelem;
|
---|
| 807 | }
|
---|
| 808 | if (!fieldpart.empty()) {
|
---|
| 809 | add_field_info(fieldpart, tag, type);
|
---|
| 810 | finalquery += fieldpart;
|
---|
| 811 | finalquery.push_back(' ');
|
---|
| 812 | }
|
---|
| 813 | fieldpart.clear();
|
---|
| 814 | queryelem.clear();
|
---|
| 815 | finalquery.push_back(*here);
|
---|
| 816 | finalquery.push_back(' ');
|
---|
| 817 | }
|
---|
| 818 | }
|
---|
| 819 | else if (*here == '"') {
|
---|
| 820 | queryelem.push_back(*here);
|
---|
| 821 | if (in_phrase == false) in_phrase = true;
|
---|
| 822 | else {
|
---|
| 823 | in_phrase = false;
|
---|
| 824 | }
|
---|
| 825 | }
|
---|
| 826 |
|
---|
| 827 | // Found word boundary, in a phrase
|
---|
| 828 | else if (in_phrase) {
|
---|
| 829 | queryelem.push_back(*here);
|
---|
| 830 | }
|
---|
| 831 | // Found a word boundary
|
---|
| 832 | else {
|
---|
| 833 | if (!queryelem.empty()) {
|
---|
| 834 | if (queryelem == "&") {
|
---|
| 835 | in_field = true;
|
---|
| 836 | queryelem.clear();
|
---|
| 837 | }
|
---|
| 838 | else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
|
---|
| 839 |
|
---|
| 840 | if (argb==1) {
|
---|
| 841 | // simple search, these not allowed
|
---|
| 842 | in_field = true;
|
---|
| 843 | fieldpart += queryelem;
|
---|
| 844 | fieldpart.push_back(' ');
|
---|
| 845 | }
|
---|
| 846 | queryelem.clear();
|
---|
| 847 |
|
---|
| 848 | }
|
---|
| 849 | else {
|
---|
| 850 | if (!simple_and && !in_field) {
|
---|
| 851 | if (!fieldpart.empty()) {
|
---|
| 852 | add_field_info(fieldpart, tag, type);
|
---|
| 853 | finalquery += fieldpart;
|
---|
| 854 | finalquery.push_back(' ');
|
---|
| 855 | fieldpart.clear();
|
---|
| 856 | }
|
---|
| 857 | }
|
---|
| 858 |
|
---|
| 859 | fieldpart += queryelem;
|
---|
| 860 | fieldpart.push_back(' ');
|
---|
| 861 | queryelem.clear();
|
---|
| 862 | }
|
---|
| 863 | }
|
---|
| 864 | }
|
---|
| 865 | ++here;
|
---|
| 866 | }
|
---|
| 867 | // at the end
|
---|
| 868 | if (!queryelem.empty()) {
|
---|
| 869 | if (!simple_and && !in_field && !fieldpart.empty()) {
|
---|
| 870 | add_field_info(fieldpart, tag, type);
|
---|
| 871 | finalquery += fieldpart;
|
---|
| 872 | finalquery.push_back(' ');
|
---|
| 873 | fieldpart.clear();
|
---|
| 874 | }
|
---|
| 875 | fieldpart += queryelem;
|
---|
| 876 | }
|
---|
| 877 | if (!fieldpart.empty()) {
|
---|
| 878 | add_field_info(fieldpart, tag, type);
|
---|
| 879 | finalquery += fieldpart;
|
---|
| 880 | fieldpart.clear();
|
---|
| 881 | finalquery.push_back(' ');
|
---|
| 882 | }
|
---|
| 883 |
|
---|
| 884 | querystring = finalquery;
|
---|
| 885 | cerr << "final query = "<<finalquery<<endl;
|
---|
[8029] | 886 | }
|
---|
[8357] | 887 |
|
---|
[11765] | 888 | void format_field_info(text_t &querystring, cgiargsclass &args) {
|
---|
| 889 | int argct = args.getintarg("ct");
|
---|
| 890 | if (argct == 1) {
|
---|
| 891 | format_field_info_mgpp(querystring, args);
|
---|
| 892 | } else if (argct == 2) {
|
---|
| 893 | format_field_info_lucene(querystring, args);
|
---|
| 894 | }
|
---|
| 895 | }
|
---|
[10995] | 896 |
|
---|