[270] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * querytools.cpp --
|
---|
| 4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
| 5 | *
|
---|
[533] | 6 | * A component of the Greenstone digital library software
|
---|
| 7 | * from the New Zealand Digital Library Project at the
|
---|
| 8 | * University of Waikato, New Zealand.
|
---|
[270] | 9 | *
|
---|
[533] | 10 | * This program is free software; you can redistribute it and/or modify
|
---|
| 11 | * it under the terms of the GNU General Public License as published by
|
---|
| 12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | * (at your option) any later version.
|
---|
| 14 | *
|
---|
| 15 | * This program is distributed in the hope that it will be useful,
|
---|
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | * GNU General Public License for more details.
|
---|
| 19 | *
|
---|
| 20 | * You should have received a copy of the GNU General Public License
|
---|
| 21 | * along with this program; if not, write to the Free Software
|
---|
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | *
|
---|
[270] | 24 | *********************************************************************/
|
---|
| 25 |
|
---|
| 26 | #include "querytools.h"
|
---|
[1373] | 27 | #include <ctype.h>
|
---|
[1914] | 28 | #include "unitool.h" // for is_unicode_letdig
|
---|
[270] | 29 |
|
---|
[12784] | 30 | // sets the ct, qt, qto arguments
|
---|
[11987] | 31 | void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
|
---|
| 32 |
|
---|
| 33 | if (args["ct"].empty()) {
|
---|
| 34 | text_t build_type = cinfo->buildType;
|
---|
| 35 | if (build_type == "mgpp") {
|
---|
| 36 | args["ct"] = "1";
|
---|
| 37 | } else if (build_type == "lucene") {
|
---|
| 38 | args["ct"] = "2";
|
---|
| 39 | } else {
|
---|
| 40 | args["ct"] = "0";
|
---|
| 41 | }
|
---|
| 42 | }
|
---|
| 43 | text_t arg_ct = args["ct"];
|
---|
| 44 | if (arg_ct == "0") {
|
---|
| 45 | // mg
|
---|
| 46 | args["qt"] = "0";
|
---|
| 47 | args["qto"] = "0";
|
---|
| 48 | return;
|
---|
| 49 | }
|
---|
| 50 |
|
---|
| 51 | if (!args["qt"].empty() && !args["qto"].empty()) {
|
---|
| 52 | return;
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 | text_tmap::iterator check = cinfo->format.find("SearchTypes");
|
---|
[12784] | 56 | text_t search_types;
|
---|
| 57 | if(check != cinfo->format.end() && !(*check).second.empty()){
|
---|
[11987] | 58 | search_types = (*check).second;
|
---|
[12784] | 59 | } else {
|
---|
| 60 | // assume plain,form
|
---|
| 61 | if (args["qto"].empty()) args["qto"] = "3";
|
---|
| 62 | if (args["qt"].empty()) {
|
---|
| 63 | int arg_qto = args.getintarg("qto");
|
---|
[12930] | 64 | if (arg_qto == 2) {
|
---|
[12784] | 65 | args["qt"] = "1";
|
---|
| 66 | } else {
|
---|
| 67 | args["qt"] = "0";
|
---|
| 68 | }
|
---|
[11987] | 69 | }
|
---|
[12784] | 70 | return;
|
---|
[11987] | 71 | }
|
---|
| 72 |
|
---|
[12784] | 73 |
|
---|
[11987] | 74 | if (args["qto"].empty()) {
|
---|
| 75 | unsigned int type = 0;
|
---|
| 76 | if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
|
---|
| 77 | type |= 2;
|
---|
| 78 | }
|
---|
| 79 | if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
|
---|
| 80 | type |= 1;
|
---|
| 81 | }
|
---|
| 82 | args.setintarg("qto", type);
|
---|
| 83 | }
|
---|
[22046] | 84 |
|
---|
[11987] | 85 | if (args["qt"].empty()) {
|
---|
| 86 | int arg_qto = args.getintarg("qto");
|
---|
| 87 | if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
|
---|
| 88 | args["qt"] = "1";
|
---|
| 89 | } else {
|
---|
| 90 | args["qt"] = "0";
|
---|
| 91 | }
|
---|
| 92 | }
|
---|
[22046] | 93 |
|
---|
| 94 |
|
---|
| 95 | // decide if sqlqto should be set or not
|
---|
| 96 | unsigned int sql_type = 0;
|
---|
| 97 | text_t infodb_type = cinfo->infodbType;
|
---|
| 98 | if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
|
---|
| 99 | if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
|
---|
| 100 | sql_type = 1;
|
---|
| 101 | }
|
---|
| 102 | }
|
---|
| 103 |
|
---|
| 104 | if (sql_type) {
|
---|
| 105 | args["sqlqto"] = "1";
|
---|
| 106 | }
|
---|
| 107 | else {
|
---|
| 108 | args["sqlqto"] = "0";
|
---|
| 109 | }
|
---|
| 110 |
|
---|
| 111 |
|
---|
[11987] | 112 | }
|
---|
| 113 |
|
---|
[12864] | 114 | // sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
|
---|
| 115 | void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
|
---|
| 116 | int stemIndexes = cinfo->stemIndexes;
|
---|
| 117 |
|
---|
| 118 | if (stemIndexes & SIcasefold) {
|
---|
| 119 | args["ks"] = 1;
|
---|
| 120 | }
|
---|
| 121 | if (stemIndexes & SIstem) {
|
---|
| 122 | args["ss"] = 1;
|
---|
| 123 | }
|
---|
| 124 | if (stemIndexes & SIaccentfold) {
|
---|
| 125 | args["afs"] = 1;
|
---|
| 126 | }
|
---|
| 127 |
|
---|
| 128 | }
|
---|
| 129 |
|
---|
[22046] | 130 |
|
---|
| 131 |
|
---|
| 132 | void set_basequeryfilter_options (FilterRequest_t &request,
|
---|
| 133 | cgiargsclass &args)
|
---|
| 134 | {
|
---|
| 135 |
|
---|
| 136 | OptionValue_t option;
|
---|
| 137 | int arg_m = args.getintarg("m");
|
---|
| 138 |
|
---|
| 139 | option.name = "Maxdocs";
|
---|
| 140 | option.value = arg_m;
|
---|
| 141 | request.filterOptions.push_back (option);
|
---|
| 142 |
|
---|
| 143 | // option.name = "StartResults";
|
---|
| 144 | // option.value = args["r"];
|
---|
| 145 | // request.filterOptions.push_back (option);
|
---|
| 146 |
|
---|
| 147 | // option.name = "EndResults";
|
---|
| 148 | // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
|
---|
| 149 | // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
|
---|
| 150 | // option.value = endresults;
|
---|
| 151 | // request.filterOptions.push_back (option);
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 |
|
---|
[759] | 155 | // request.filterResultOptions and request.fields (if required) should
|
---|
| 156 | // be set from the calling code
|
---|
[22046] | 157 | void set_fulltext_queryfilter_options (FilterRequest_t &request,
|
---|
| 158 | const text_t &querystring,
|
---|
| 159 | cgiargsclass &args)
|
---|
| 160 | {
|
---|
| 161 | // better if this function, and the two-query companion function
|
---|
| 162 | // was implemented in queryaction.cpp
|
---|
| 163 | // Has to be done here to documentaction.cpp can call it directly
|
---|
[270] | 164 |
|
---|
| 165 | request.filterName = "QueryFilter";
|
---|
| 166 |
|
---|
| 167 | OptionValue_t option;
|
---|
[470] | 168 |
|
---|
[270] | 169 | option.name = "Term";
|
---|
[759] | 170 | option.value = querystring;
|
---|
[270] | 171 | request.filterOptions.push_back (option);
|
---|
| 172 |
|
---|
| 173 | option.name = "QueryType";
|
---|
| 174 | option.value = (args.getintarg("t")) ? "ranked" : "boolean";
|
---|
| 175 | request.filterOptions.push_back (option);
|
---|
| 176 |
|
---|
[1774] | 177 | option.name = "MatchMode";
|
---|
[11765] | 178 | // mgpp in advanced mode, always use some query
|
---|
[12428] | 179 | if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
|
---|
[11765] | 180 | option.value = "some";
|
---|
| 181 | } else {
|
---|
| 182 | option.value = (args.getintarg("t")) ? "some" : "all";
|
---|
| 183 | }
|
---|
[1774] | 184 | request.filterOptions.push_back (option);
|
---|
| 185 |
|
---|
[270] | 186 | option.name = "Casefold";
|
---|
| 187 | option.value = (args.getintarg("k")) ? "true" : "false";
|
---|
| 188 | request.filterOptions.push_back (option);
|
---|
| 189 |
|
---|
| 190 | option.name = "Stem";
|
---|
| 191 | option.value = (args.getintarg("s")) ? "true" : "false";
|
---|
| 192 | request.filterOptions.push_back (option);
|
---|
| 193 |
|
---|
[12864] | 194 | option.name = "AccentFold";
|
---|
| 195 | option.value = (args.getintarg("af")) ? "true" : "false";
|
---|
| 196 | request.filterOptions.push_back (option);
|
---|
| 197 |
|
---|
[270] | 198 | if (!args["h"].empty()) {
|
---|
| 199 | option.name = "Index";
|
---|
| 200 | option.value = args["h"];
|
---|
| 201 | request.filterOptions.push_back (option);
|
---|
| 202 | }
|
---|
| 203 |
|
---|
| 204 | if (!args["j"].empty()) {
|
---|
| 205 | option.name = "Subcollection";
|
---|
| 206 | option.value = args["j"];
|
---|
| 207 | request.filterOptions.push_back (option);
|
---|
| 208 | }
|
---|
| 209 |
|
---|
| 210 | if (!args["n"].empty()) {
|
---|
| 211 | option.name = "Language";
|
---|
| 212 | option.value = args["n"];
|
---|
| 213 | request.filterOptions.push_back (option);
|
---|
| 214 | }
|
---|
[1329] | 215 |
|
---|
| 216 | if (!args["g"].empty()) { // granularity for mgpp
|
---|
| 217 | option.name = "Level";
|
---|
| 218 | option.value = args["g"];
|
---|
| 219 | request.filterOptions.push_back (option);
|
---|
| 220 | }
|
---|
[270] | 221 |
|
---|
[12410] | 222 | if (!args["fs"].empty()) { // filter string for lucene
|
---|
| 223 | option.name = "FilterString";
|
---|
| 224 | option.value = args["fs"];
|
---|
| 225 | request.filterOptions.push_back (option);
|
---|
| 226 | }
|
---|
| 227 |
|
---|
[12276] | 228 | if (!args["sf"].empty()) { // sort field for lucene
|
---|
| 229 | option.name = "SortField";
|
---|
| 230 | option.value = args["sf"];
|
---|
| 231 | request.filterOptions.push_back (option);
|
---|
| 232 | }
|
---|
[27066] | 233 | if (!args["so"].empty()) { // sort order for lucene
|
---|
| 234 | option.name = "SortOrder";
|
---|
| 235 | option.value = (args.getintarg("so")? "descending" : "ascending");
|
---|
| 236 | request.filterOptions.push_back (option);
|
---|
| 237 | }
|
---|
[12276] | 238 |
|
---|
[12771] | 239 | if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
|
---|
[12770] | 240 | option.name = "Fuzziness";
|
---|
[12771] | 241 | option.value = (text_t) "0." + args["fuzziness"];
|
---|
[12770] | 242 | request.filterOptions.push_back (option);
|
---|
| 243 | }
|
---|
[12388] | 244 |
|
---|
[22046] | 245 | set_basequeryfilter_options(request, args);
|
---|
[759] | 246 | }
|
---|
| 247 |
|
---|
| 248 |
|
---|
| 249 |
|
---|
[22046] | 250 | void set_fulltext_queryfilter_options (FilterRequest_t &request,
|
---|
| 251 | const text_t &querystring1,
|
---|
| 252 | const text_t &querystring2,
|
---|
| 253 | cgiargsclass &args)
|
---|
| 254 | {
|
---|
| 255 |
|
---|
| 256 | set_fulltext_queryfilter_options (request, querystring1, args);
|
---|
| 257 |
|
---|
[349] | 258 | // fill in the second query if needed
|
---|
| 259 | if (!args["cq2"].empty()) {
|
---|
[759] | 260 | OptionValue_t option;
|
---|
| 261 |
|
---|
[349] | 262 | option.name = "CombineQuery";
|
---|
| 263 | option.value = args["cq2"];
|
---|
| 264 | request.filterOptions.push_back (option);
|
---|
| 265 |
|
---|
| 266 | option.name = "Term";
|
---|
[759] | 267 | option.value = querystring2;
|
---|
[349] | 268 | request.filterOptions.push_back (option);
|
---|
[759] | 269 |
|
---|
[349] | 270 | option.name = "QueryType";
|
---|
| 271 | option.value = (args.getintarg("t")) ? "ranked" : "boolean";
|
---|
| 272 | request.filterOptions.push_back (option);
|
---|
| 273 |
|
---|
| 274 | option.name = "Casefold";
|
---|
| 275 | option.value = (args.getintarg("k")) ? "true" : "false";
|
---|
| 276 | request.filterOptions.push_back (option);
|
---|
| 277 |
|
---|
| 278 | option.name = "Stem";
|
---|
| 279 | option.value = (args.getintarg("s")) ? "true" : "false";
|
---|
| 280 | request.filterOptions.push_back (option);
|
---|
| 281 |
|
---|
[12864] | 282 | option.name = "AccentFold";
|
---|
| 283 | option.value = (args.getintarg("af")) ? "true" : "false";
|
---|
| 284 | request.filterOptions.push_back (option);
|
---|
| 285 |
|
---|
[349] | 286 | if (!args["h2"].empty()) {
|
---|
| 287 | option.name = "Index";
|
---|
| 288 | option.value = args["h2"];
|
---|
| 289 | request.filterOptions.push_back (option);
|
---|
| 290 | }
|
---|
| 291 |
|
---|
| 292 | if (!args["j2"].empty()) {
|
---|
| 293 | option.name = "Subcollection";
|
---|
| 294 | option.value = args["j2"];
|
---|
| 295 | request.filterOptions.push_back (option);
|
---|
| 296 | }
|
---|
| 297 |
|
---|
| 298 | if (!args["n2"].empty()) {
|
---|
| 299 | option.name = "Language";
|
---|
| 300 | option.value = args["n2"];
|
---|
| 301 | request.filterOptions.push_back (option);
|
---|
| 302 | }
|
---|
| 303 | }
|
---|
[22046] | 304 |
|
---|
| 305 | // this is probably redundant, as first line to this method will have
|
---|
| 306 | // already caused it to invoke set_basequeryfilter_options
|
---|
| 307 |
|
---|
| 308 | set_basequeryfilter_options(request, args);
|
---|
[759] | 309 | }
|
---|
[608] | 310 |
|
---|
[759] | 311 |
|
---|
[1329] | 312 |
|
---|
[22046] | 313 | // request.filterResultOptions and request.fields (if required) should
|
---|
| 314 | // be set from the calling code
|
---|
| 315 | void set_sql_queryfilter_options (FilterRequest_t &request,
|
---|
| 316 | cgiargsclass &args)
|
---|
| 317 | {
|
---|
| 318 | if (!args["sqlsf"].empty()) { // sort field for lucene
|
---|
| 319 | OptionValue_t option;
|
---|
[270] | 320 |
|
---|
[22046] | 321 | option.name = "SortField";
|
---|
| 322 | option.value = args["sqlsf"];
|
---|
| 323 | request.filterOptions.push_back (option);
|
---|
| 324 | }
|
---|
| 325 |
|
---|
| 326 | set_basequeryfilter_options(request, args);
|
---|
[270] | 327 | }
|
---|
| 328 |
|
---|
[22046] | 329 |
|
---|
// Returns true if character is one of the query-syntax characters of the
// given indexer: # / * for mgpp (indexer_type 1), ? * ~ ^ for lucene
// (indexer_type 2). Any other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  switch (indexer_type) {
  case 1: // mgpp
    switch (character) {
    case '#':
    case '/':
    case '*':
      return true;
    default:
      return false;
    }
  case 2: // lucene
    switch (character) {
    case '?':
    case '*':
    case '~':
    case '^':
      return true;
    default:
      return false;
    }
  default:
    return false;
  }
}
| 342 |
|
---|
[12784] | 343 | // This function removes boolean operators from simple searches, and segments
|
---|
| 344 | // chinese characters if segment=true
|
---|
[6584] | 345 | void format_querystring (text_t &querystring, int querymode, bool segment) {
|
---|
[270] | 346 | text_t formattedstring;
|
---|
| 347 |
|
---|
[12784] | 348 | // advanced search, no segmenting, don't need to do anything
|
---|
[6584] | 349 | if (querymode == 1 && !segment) return;
|
---|
| 350 |
|
---|
[270] | 351 | text_t::const_iterator here = querystring.begin();
|
---|
| 352 | text_t::const_iterator end = querystring.end();
|
---|
| 353 |
|
---|
| 354 | // space is used to insert spaces between Chinese
|
---|
| 355 | // characters. No space is needed before the first
|
---|
| 356 | // Chinese character.
|
---|
| 357 | bool space = false;
|
---|
| 358 |
|
---|
| 359 | // want to remove ()|!& from querystring so boolean queries are just
|
---|
[470] | 360 | // "all the words" queries (unless querymode is advanced)
|
---|
[270] | 361 | while (here != end) {
|
---|
[470] | 362 | if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
|
---|
| 363 | *here == '!' || *here == '&')) {
|
---|
[270] | 364 | formattedstring.push_back(' ');
|
---|
[6584] | 365 | } else if (segment) {
|
---|
[16980] | 366 | if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
|
---|
| 367 | ( *here >= 0xf900 && *here <= 0xfa6a)) {
|
---|
| 368 | /* text_t not big enough to handle these. */
|
---|
| 369 | /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
|
---|
| 370 | (*here >= 0x2f800 && *here <= 0x2fa1d)) { */
|
---|
[16645] | 371 |
|
---|
| 372 | // CJK character
|
---|
[8715] | 373 | if (!space) formattedstring.push_back (0x200b); // zero width space
|
---|
[397] | 374 | formattedstring.push_back (*here);
|
---|
| 375 | formattedstring.push_back (0x200b);
|
---|
| 376 | space = true;
|
---|
[270] | 377 | } else {
|
---|
[8715] | 378 |
|
---|
[397] | 379 | // non-Chinese character
|
---|
| 380 | formattedstring.push_back (*here);
|
---|
| 381 | space = false;
|
---|
[8715] | 382 |
|
---|
[270] | 383 | }
|
---|
[6584] | 384 |
|
---|
| 385 | } else {
|
---|
| 386 | formattedstring.push_back (*here);
|
---|
[270] | 387 | }
|
---|
[9620] | 388 | ++here;
|
---|
[270] | 389 | }
|
---|
[397] | 390 | querystring = formattedstring;
|
---|
[270] | 391 | }
|
---|
| 392 |
|
---|
[20481] | 393 | // turn query string into terms separated by spaces.
|
---|
| 394 | // still working on this...
|
---|
[20602] | 395 | text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
|
---|
[20481] | 396 | text_t::const_iterator here = querystring.begin();
|
---|
| 397 | text_t::const_iterator end = querystring.end();
|
---|
[20602] | 398 |
|
---|
| 399 | // lets look for [] and () first - these are a pain.
|
---|
| 400 | text_t::const_iterator bracket;
|
---|
| 401 | text_t query_no_brackets = "";
|
---|
| 402 |
|
---|
| 403 | // mgpp brackets: [xxx]:TI
|
---|
| 404 | if (findchar(here, end, '[') != end) {
|
---|
| 405 | while ((bracket = findchar(here, end, '[')) != end) {
|
---|
| 406 | // get the first bit
|
---|
| 407 | query_no_brackets += substr(here, bracket);
|
---|
| 408 | bracket++;
|
---|
| 409 | here = bracket;
|
---|
| 410 | // get the end bracket
|
---|
| 411 | bracket = findchar(here, end, ']');
|
---|
| 412 | query_no_brackets += substr(here, bracket);
|
---|
| 413 | // skip the :TI bits
|
---|
[23635] | 414 | while (bracket != end // do bracket != end test first, ELSE when bracket = end, we're past the string, in
|
---|
| 415 | && *bracket != ' ') { // which case *bracket becomes an invalid operation that causes the server to crash
|
---|
| 416 | bracket++;
|
---|
| 417 | }
|
---|
[20602] | 418 | here = bracket;
|
---|
| 419 | }
|
---|
| 420 | if (here != end) {
|
---|
| 421 | query_no_brackets += substr(here,end);
|
---|
| 422 | }
|
---|
| 423 | } else if (findchar(here, end, '(') != end) {
|
---|
| 424 | // lucene brackets TI:(xxx)
|
---|
| 425 | while ((bracket = findchar(here, end, '(')) != end) {
|
---|
| 426 | // back up the field name
|
---|
| 427 | text_t::const_iterator old_bracket = bracket;
|
---|
[23635] | 428 | while (bracket != here && *bracket != ' ') { // order of tests in condition matters (see long comment above)
|
---|
| 429 | --bracket;
|
---|
[20602] | 430 | }
|
---|
| 431 | if (bracket != here) {
|
---|
| 432 | // get the first bit
|
---|
| 433 | query_no_brackets += substr(here, bracket+1);
|
---|
| 434 | }
|
---|
| 435 | here = old_bracket +1;
|
---|
| 436 | // get the end bracket
|
---|
| 437 | bracket = findchar(here, end, ')');
|
---|
| 438 | query_no_brackets += substr(here, bracket);
|
---|
| 439 | if (bracket != end) {
|
---|
| 440 | here = bracket+1;
|
---|
| 441 | }
|
---|
| 442 | }
|
---|
| 443 | if (here != end) {
|
---|
| 444 | query_no_brackets += substr(here,end);
|
---|
| 445 | }
|
---|
| 446 | } else {
|
---|
| 447 | // was no brackets
|
---|
| 448 | query_no_brackets = querystring;
|
---|
| 449 | }
|
---|
| 450 |
|
---|
| 451 |
|
---|
| 452 | if (arg_ct == "2") { // lucene
|
---|
| 453 | // look for AND OR NOT and remove
|
---|
| 454 | here = query_no_brackets.begin();
|
---|
| 455 | end = query_no_brackets.end();
|
---|
| 456 | text_tlist terms;
|
---|
| 457 | splitword(here, end, "AND", terms);
|
---|
| 458 | joinchar(terms, ' ', query_no_brackets);
|
---|
| 459 | here = query_no_brackets.begin();
|
---|
| 460 | end = query_no_brackets.end();
|
---|
| 461 | splitword(here, end, "OR", terms);
|
---|
| 462 | joinchar(terms, ' ', query_no_brackets);
|
---|
| 463 | here = query_no_brackets.begin();
|
---|
| 464 | end = query_no_brackets.end();
|
---|
| 465 | splitword(here, end, "NOT", terms);
|
---|
| 466 | joinchar(terms, ' ', query_no_brackets);
|
---|
| 467 |
|
---|
| 468 | }
|
---|
[20481] | 469 | text_t terms = "";
|
---|
| 470 | bool space = false;
|
---|
[20602] | 471 | here = query_no_brackets.begin();
|
---|
| 472 | end = query_no_brackets.end();
|
---|
| 473 |
|
---|
[20481] | 474 | while (here != end) {
|
---|
| 475 | if (*here == '#' || *here == '/') {
|
---|
| 476 | // skip over #is /10 etc
|
---|
| 477 | ++here;
|
---|
| 478 | while (here != end && *here != ' ') {
|
---|
| 479 | ++here;
|
---|
| 480 | }
|
---|
| 481 | if (here == end) break;
|
---|
| 482 | }
|
---|
| 483 | if (is_unicode_letdig(*here)) {
|
---|
| 484 | terms.push_back(*here);
|
---|
| 485 | space = false;
|
---|
| 486 | } else {
|
---|
| 487 | if (!space) {
|
---|
| 488 | terms.push_back(' ');
|
---|
| 489 | space = true;
|
---|
| 490 | }
|
---|
| 491 | }
|
---|
| 492 | ++here;
|
---|
| 493 | }
|
---|
[24111] | 494 | return trim(terms);
|
---|
[20481] | 495 |
|
---|
| 496 | }
|
---|
[1467] | 497 |
|
---|
[3160] | 498 | // search history tool
|
---|
| 499 | // also used for form query macros
|
---|
[1914] | 500 | text_t escape_quotes(const text_t &querystring) {
|
---|
| 501 |
|
---|
| 502 | text_t::const_iterator here = querystring.begin();
|
---|
| 503 | text_t::const_iterator end = querystring.end();
|
---|
| 504 |
|
---|
| 505 | text_t escquery = "";
|
---|
| 506 | while (here != end) {
|
---|
[1988] | 507 | if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') escquery.push_back(*here);
|
---|
| 508 | else if (*here == '\n' || *here == '\r') {
|
---|
| 509 | escquery.push_back(' ');
|
---|
| 510 | } else {
|
---|
[1914] | 511 | escquery +="\\\\";
|
---|
| 512 | escquery.push_back(*here);
|
---|
| 513 | }
|
---|
| 514 |
|
---|
[9620] | 515 | ++here;
|
---|
[1914] | 516 | }
|
---|
| 517 | return escquery;
|
---|
| 518 |
|
---|
| 519 | }
|
---|
| 520 |
|
---|
[12784] | 521 | // Parses the terms into words, and adds #si if necessary
|
---|
| 522 | text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
|
---|
| 523 | const int indexer_type) {
|
---|
| 524 |
|
---|
| 525 | // the default stem and case are set to 0 if this is being used, so we are only adding on qualifiers if stem or fold is 1.
|
---|
| 526 | if (stem == "0" && fold == "0") {
|
---|
[12791] | 527 | return terms;
|
---|
[12784] | 528 | }
|
---|
| 529 | // this is only for mgpp collections, shouldn't be called for anything else
|
---|
| 530 | if (indexer_type != 1) {
|
---|
[12791] | 531 | return terms;
|
---|
[12784] | 532 | }
|
---|
| 533 |
|
---|
| 534 | text_t outtext;
|
---|
| 535 | text_t word;
|
---|
| 536 |
|
---|
| 537 | text_t::const_iterator here = terms.begin();
|
---|
| 538 | text_t::const_iterator end = terms.end();
|
---|
| 539 |
|
---|
| 540 | while (here !=end) {
|
---|
| 541 |
|
---|
| 542 | if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
|
---|
| 543 | // not word boundary
|
---|
| 544 | word.push_back(*here);
|
---|
| 545 | ++here;
|
---|
| 546 | }
|
---|
| 547 | else {
|
---|
| 548 | // found word boundary
|
---|
| 549 | if (!word.empty() ) {
|
---|
| 550 | if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
|
---|
| 551 | outtext += word;
|
---|
| 552 | word.clear();
|
---|
| 553 | }
|
---|
| 554 | else {
|
---|
| 555 | word += "#";
|
---|
| 556 | if (stem == "1") word += "s";
|
---|
| 557 | if (fold == "1") word += "i";
|
---|
| 558 | outtext += word;
|
---|
| 559 | word.clear();
|
---|
| 560 | }
|
---|
| 561 | }
|
---|
| 562 | // this only used in advanced form, so we leave in boolean operators
|
---|
[12792] | 563 | if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
|
---|
| 564 | *here == '(' || *here == ')' || is_unicode_space(*here)) {
|
---|
[12784] | 565 | outtext.push_back(*here);
|
---|
| 566 | }
|
---|
| 567 | ++here;
|
---|
| 568 | }
|
---|
| 569 | }
|
---|
| 570 |
|
---|
| 571 | // get last word
|
---|
| 572 | if (!word.empty()) {
|
---|
| 573 | word += "#";
|
---|
| 574 | if (stem == "1") word += "s";
|
---|
| 575 | if (fold == "1") word += "i";
|
---|
| 576 | word += " ";
|
---|
| 577 | outtext += word;
|
---|
| 578 | }
|
---|
| 579 | return outtext;
|
---|
| 580 | }
|
---|
| 581 |
|
---|
| 582 |
|
---|
[11765] | 583 | // some query form parsing functions for use with mgpp & lucene
|
---|
[1914] | 584 |
|
---|
[12784] | 585 | void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
|
---|
[8029] | 586 | {
|
---|
| 587 | querystring.clear();
|
---|
[1914] | 588 |
|
---|
[12784] | 589 | int argct = args.getintarg("ct");
|
---|
[8029] | 590 | int argt = args.getintarg("t");// t=0 -and, t=1 - or
|
---|
[12784] | 591 | int argb = args.getintarg("b");
|
---|
| 592 |
|
---|
| 593 | text_t combine;
|
---|
[8029] | 594 |
|
---|
[12784] | 595 | // lucene uses global combine, so only need this for mgpp
|
---|
| 596 | if (argct==1) {
|
---|
[8029] | 597 | if (argt == 0) combine = "&";
|
---|
| 598 | else combine = "|";
|
---|
| 599 | }
|
---|
[1914] | 600 |
|
---|
| 601 | text_t field = args["fqf"];
|
---|
| 602 | if (field.empty()) return; // no query
|
---|
| 603 | text_tarray fields;
|
---|
| 604 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 605 |
|
---|
| 606 | text_t value = args["fqv"];
|
---|
| 607 | if (value.empty()) return; // somethings wrong
|
---|
| 608 | text_tarray values;
|
---|
| 609 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 610 |
|
---|
[8029] | 611 |
|
---|
[9620] | 612 | for (int i=0; i< values.size(); ++i) {
|
---|
[1914] | 613 | if (!values[i].empty()) {
|
---|
[12784] | 614 | text_t this_value = values[i];
|
---|
[22046] | 615 |
|
---|
[12784] | 616 | // remove operators for simple search, segments text if necessary
|
---|
| 617 | format_querystring(this_value, argb, segment);
|
---|
[22046] | 618 |
|
---|
[12784] | 619 | // add tag info for this field (and other processing)
|
---|
| 620 | format_field_info(this_value, fields[i], argct, argt, argb);
|
---|
[22046] | 621 |
|
---|
[12784] | 622 | // add into query string
|
---|
| 623 | if (argct == 2) {
|
---|
| 624 | // lucene
|
---|
| 625 | // we don't worry about AND/OR, cos this is done by defaultcombineoperator
|
---|
| 626 | querystring += this_value+" ";
|
---|
| 627 | } else {
|
---|
| 628 | // mgpp
|
---|
| 629 | if (!querystring.empty()) {
|
---|
| 630 | querystring += " "+ combine+ " ";
|
---|
| 631 | }
|
---|
| 632 | querystring += this_value;
|
---|
[8029] | 633 | }
|
---|
[1914] | 634 | }
|
---|
| 635 | }
|
---|
| 636 | }
|
---|
| 637 |
|
---|
| 638 |
|
---|
[12784] | 639 | void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
|
---|
[1914] | 640 | querystring.clear();
|
---|
| 641 |
|
---|
[12784] | 642 | const int argct = args.getintarg("ct");
|
---|
| 643 | int argt = 0;// arg t is either not used (lucene) or used for natural/ranked (mgpp), so we set it to 0 = AND, by default
|
---|
| 644 | int argb = args.getintarg("b");
|
---|
[8029] | 645 | text_t combine;
|
---|
[12784] | 646 | if (argct==1) {
|
---|
[8029] | 647 | combine = "&";
|
---|
| 648 | }
|
---|
| 649 | else { // lucene
|
---|
| 650 | combine = "AND";
|
---|
| 651 | }
|
---|
| 652 |
|
---|
[1914] | 653 | text_t field = args["fqf"];
|
---|
| 654 | if (field.empty()) return; // no query
|
---|
| 655 | text_tarray fields;
|
---|
| 656 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 657 |
|
---|
| 658 | text_t value = args["fqv"];
|
---|
| 659 | if (value.empty()) return; // somethings wrong
|
---|
| 660 | text_tarray values;
|
---|
| 661 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 662 |
|
---|
| 663 | text_t comb = args["fqc"];
|
---|
| 664 | if (comb.empty()) return; //somethings wrong
|
---|
| 665 | text_tarray combs;
|
---|
| 666 | splitchar(comb.begin(), comb.end(), ',', combs);
|
---|
[12784] | 667 |
|
---|
| 668 | text_tarray stems;
|
---|
| 669 | text_tarray folds;
|
---|
| 670 | if (argct == 1) {// mgpp - lucene doesn't do stem/case
|
---|
| 671 | text_t stem = args["fqs"];
|
---|
| 672 | if (stem.empty()) return; // somethings wrong
|
---|
| 673 | splitchar(stem.begin(), stem.end(), ',', stems);
|
---|
| 674 |
|
---|
| 675 | text_t fold = args["fqk"];
|
---|
| 676 | if (fold.empty()) return; // somethings wrong
|
---|
| 677 | splitchar(fold.begin(), fold.end(), ',', folds);
|
---|
| 678 | }
|
---|
[1914] | 679 |
|
---|
[9620] | 680 | for(int i=0; i< values.size(); ++i) {
|
---|
[1914] | 681 | if (!values[i].empty()) {
|
---|
| 682 | if (i!=0) {
|
---|
[12784] | 683 | if (argct==1) {
|
---|
[8029] | 684 | if (combs[i-1]=="and") combine = "&";
|
---|
| 685 | else if (combs[i-1]=="or")combine = "|";
|
---|
| 686 | else if (combs[i-1]=="not")combine = "!";
|
---|
| 687 | }
|
---|
| 688 | else { // lucene
|
---|
| 689 | if (combs[i-1]=="and") combine = "AND";
|
---|
| 690 | else if (combs[i-1]=="or")combine = "OR";
|
---|
| 691 | else if (combs[i-1]=="not")combine = "NOT";
|
---|
| 692 | }
|
---|
[1914] | 693 | }
|
---|
[12784] | 694 | text_t this_value = values[i];
|
---|
| 695 | // remove operators for simple search, segments text if necessary
|
---|
| 696 | format_querystring(this_value, argb, segment);
|
---|
| 697 | if (argct == 1) { // mgpp only
|
---|
| 698 | this_value = addstemcase(this_value, stems[i], folds[i], argct);
|
---|
[1914] | 699 | }
|
---|
[12784] | 700 | // add tag info for this field (and other processing)
|
---|
| 701 | format_field_info(this_value, fields[i], argct, argt, argb);
|
---|
| 702 | // add into query string
|
---|
| 703 | if (!querystring.empty()) {
|
---|
| 704 | querystring += " "+ combine+ " ";
|
---|
[2745] | 705 | }
|
---|
[12784] | 706 | querystring += this_value;
|
---|
[1914] | 707 |
|
---|
| 708 | }
|
---|
| 709 | }
|
---|
| 710 | }
|
---|
| 711 |
|
---|
[22046] | 712 |
|
---|
| 713 | // SQL versions for parsing query form
|
---|
| 714 |
|
---|
| 715 | void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
|
---|
| 716 | {
|
---|
| 717 | querystring.clear();
|
---|
| 718 |
|
---|
| 719 | int argt = args.getintarg("t");// t=0 -and, t=1 - or
|
---|
| 720 | int argb = args.getintarg("b");
|
---|
| 721 |
|
---|
| 722 | text_t combine;
|
---|
| 723 |
|
---|
| 724 | if (argt == 0) combine = "AND";
|
---|
| 725 | else combine = "OR";
|
---|
| 726 |
|
---|
| 727 | text_t field = args["sqlfqf"];
|
---|
| 728 | if (field.empty()) return; // no query
|
---|
| 729 | text_tarray fields;
|
---|
| 730 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 731 |
|
---|
| 732 | text_t sqlcomb = args["sqlfqc"];
|
---|
| 733 | if (sqlcomb.empty()) return; //somethings wrong
|
---|
| 734 | text_tarray sqlcombs;
|
---|
| 735 | splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
|
---|
| 736 |
|
---|
| 737 | text_t value = args["fqv"];
|
---|
| 738 | if (value.empty()) return; // somethings wrong
|
---|
| 739 | text_tarray values;
|
---|
| 740 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 741 |
|
---|
| 742 |
|
---|
| 743 | for (int i=0; i< values.size(); ++i) {
|
---|
| 744 | if (!values[i].empty()) {
|
---|
[24073] | 745 | text_t this_value;
|
---|
| 746 | const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
|
---|
| 747 | const text_t LIKE_CONDITION = "LIKE";
|
---|
| 748 |
|
---|
| 749 | //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
|
---|
| 750 | //in order to search a field starting with certain words.
|
---|
| 751 | if (sqlcombs[i] == STARTINGWITH_CONDITION)
|
---|
| 752 | {this_value = values[i];
|
---|
| 753 | this_value += "%";
|
---|
| 754 | // remove operators for simple search, segments text if necessary
|
---|
| 755 | format_querystring(this_value, argb, segment);
|
---|
| 756 | // add tag info for this field (and other processing)
|
---|
| 757 | format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
|
---|
[22046] | 758 |
|
---|
[24073] | 759 | else
|
---|
| 760 | {this_value = values[i];
|
---|
| 761 | // remove operators for simple search, segments text if necessary
|
---|
| 762 | format_querystring(this_value, argb, segment);
|
---|
| 763 | // add tag info for this field (and other processing)
|
---|
| 764 | format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
|
---|
[22046] | 765 |
|
---|
[24073] | 766 |
|
---|
| 767 | const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
|
---|
[22046] | 768 |
|
---|
| 769 | if (querystring.empty()) {
|
---|
| 770 | // first query term
|
---|
| 771 | querystring = DISTINCT_SELECT_WHERE + this_value;
|
---|
| 772 | }
|
---|
| 773 | else {
|
---|
| 774 | this_value = DISTINCT_SELECT_WHERE + this_value;
|
---|
| 775 |
|
---|
| 776 | if (combine=="AND") {
|
---|
| 777 | // INNER JOIN to restrict to only matching docOIDs
|
---|
| 778 | querystring = "SELECT docOID FROM (" + querystring + ")"
|
---|
| 779 | + " INNER JOIN (" + this_value +") USING (docOID)";
|
---|
| 780 | }
|
---|
| 781 | else if (combine=="OR") {
|
---|
| 782 | // Union to allow union of the two
|
---|
| 783 | querystring = querystring + " UNION " + this_value;
|
---|
| 784 | }
|
---|
| 785 | }
|
---|
| 786 | }
|
---|
| 787 | }
|
---|
| 788 | }
|
---|
| 789 |
|
---|
| 790 |
|
---|
| 791 | void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
|
---|
| 792 | bool segment)
|
---|
| 793 | {
|
---|
| 794 | querystring.clear();
|
---|
| 795 |
|
---|
| 796 | int argt = 0; // set it to 0 = AND, by default
|
---|
| 797 | int argb = args.getintarg("b");
|
---|
| 798 | text_t combine = "AND";
|
---|
| 799 |
|
---|
| 800 | text_t field = args["sqlfqf"];
|
---|
| 801 |
|
---|
| 802 | if (field.empty()) return; // no query
|
---|
| 803 | text_tarray fields;
|
---|
| 804 | splitchar(field.begin(), field.end(), ',', fields);
|
---|
| 805 |
|
---|
| 806 | text_t sqlcomb = args["sqlfqc"];
|
---|
| 807 | if (sqlcomb.empty()) return; //somethings wrong
|
---|
| 808 | text_tarray sqlcombs;
|
---|
| 809 | splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
|
---|
| 810 |
|
---|
| 811 | text_t value = args["fqv"];
|
---|
| 812 | if (value.empty()) return; // somethings wrong
|
---|
| 813 | text_tarray values;
|
---|
| 814 | splitchar(value.begin(), value.end(), ',', values);
|
---|
| 815 |
|
---|
| 816 | text_t comb = args["fqc"];
|
---|
| 817 | if (comb.empty()) return; //somethings wrong
|
---|
| 818 | text_tarray combs;
|
---|
| 819 | splitchar(comb.begin(), comb.end(), ',', combs);
|
---|
| 820 |
|
---|
| 821 | for(int i=0; i< values.size(); ++i) {
|
---|
| 822 | if (!values[i].empty()) {
|
---|
| 823 | if (i>0) {
|
---|
| 824 | if (combs[i-1]=="and") { combine = "AND"; }
|
---|
| 825 | else if (combs[i-1]=="or") { combine = "OR"; }
|
---|
| 826 | else if (combs[i-1]=="not") { combine = "NOT"; }
|
---|
| 827 | }
|
---|
[24073] | 828 | text_t this_value;
|
---|
| 829 | const text_t STARTINGWITH_CONDITION = "STARTINGWITH";
|
---|
| 830 | const text_t LIKE_CONDITION = "LIKE";
|
---|
| 831 |
|
---|
| 832 | //Change the STARTINGWITH operator to 'LIKE' and then adds '%' to the end of the value field
|
---|
| 833 | //in order to search a field starting with certain words.
|
---|
| 834 | if (sqlcombs[i] == STARTINGWITH_CONDITION)
|
---|
| 835 | {this_value = values[i];
|
---|
| 836 | this_value += "%";
|
---|
| 837 | // remove operators for simple search, segments text if necessary
|
---|
| 838 | format_querystring(this_value, argb, segment);
|
---|
| 839 | // add tag info for this field (and other processing)
|
---|
| 840 | format_field_info_sql(this_value, fields[i], LIKE_CONDITION, argt, argb);}
|
---|
[22046] | 841 |
|
---|
[24073] | 842 | else
|
---|
| 843 | {this_value = values[i];
|
---|
| 844 | // remove operators for simple search, segments text if necessary
|
---|
| 845 | format_querystring(this_value, argb, segment);
|
---|
| 846 | // add tag info for this field (and other processing)
|
---|
| 847 | format_field_info_sql(this_value, fields[i], sqlcombs[i], argt, argb);}
|
---|
| 848 |
|
---|
| 849 | const text_t DISTINCT_SELECT_WHERE = "SELECT DISTINCT docOID FROM document_metadata WHERE ";
|
---|
[22046] | 850 |
|
---|
| 851 | if (querystring.empty()) {
|
---|
| 852 | // first query term
|
---|
| 853 | querystring = DISTINCT_SELECT_WHERE + this_value;
|
---|
| 854 | }
|
---|
| 855 | else {
|
---|
| 856 | this_value = DISTINCT_SELECT_WHERE + this_value;
|
---|
| 857 |
|
---|
| 858 | if (combine=="AND") {
|
---|
| 859 | // INNER JOIN to restrict to only matching docOIDs
|
---|
| 860 | querystring = "SELECT docOID FROM (" + querystring + ")"
|
---|
| 861 | + " INNER JOIN (" + this_value +") USING (docOID)";
|
---|
| 862 | }
|
---|
| 863 | else if (combine=="OR") {
|
---|
| 864 | // Union to allow union of the two
|
---|
| 865 | querystring = querystring + " UNION " + this_value;
|
---|
| 866 | }
|
---|
| 867 | else {
|
---|
| 868 | cerr << "Unsupported combination operation: " << combine << endl;
|
---|
| 869 | }
|
---|
| 870 | }
|
---|
| 871 |
|
---|
| 872 | }
|
---|
| 873 | }
|
---|
| 874 | }
|
---|
| 875 |
|
---|
| 876 |
|
---|
| 877 |
|
---|
| 878 |
|
---|
[12784] | 879 | // Extended addqueryelem for Human Info project
|
---|
[7380] | 880 | void addqueryelem_ex(text_t &querystring, const text_t &tag,
|
---|
[12784] | 881 | const text_t &terms, const text_t &stem,
|
---|
| 882 | const text_t &fold,
|
---|
[7380] | 883 | const text_t& combine, const text_t& word_combine) {
|
---|
[12784] | 884 |
|
---|
[7380] | 885 | if (!querystring.empty()) { // have to put and/or
|
---|
| 886 | querystring += " " + combine + " ";
|
---|
| 887 | }
|
---|
| 888 | text_t outtext; outtext.reserve(512);
|
---|
| 889 | text_t word; word.reserve(100);
|
---|
| 890 | //unsigned short c;
|
---|
| 891 | text_t::const_iterator here = terms.begin();
|
---|
| 892 | text_t::const_iterator end = terms.end();
|
---|
| 893 | bool inquote = false, firstword = true;
|
---|
[1914] | 894 |
|
---|
[7380] | 895 | text_t word2; word2.reserve(256);
|
---|
| 896 |
|
---|
| 897 | while (here !=end) {
|
---|
| 898 | if (is_unicode_space(*here)) {
|
---|
| 899 | if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
|
---|
| 900 | else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
|
---|
| 901 | else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
|
---|
| 902 | else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
|
---|
| 903 | else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
|
---|
| 904 | if (inquote) {
|
---|
| 905 | word2.push_back(*here);
|
---|
| 906 | }
|
---|
| 907 | word.append(word2); word2.clear();
|
---|
| 908 |
|
---|
| 909 | if (!inquote && !word.empty() ) {
|
---|
[12784] | 910 | // found word boundary
|
---|
[7380] | 911 |
|
---|
| 912 | if (stem == "1" || fold =="1") {
|
---|
| 913 | word += "#";
|
---|
| 914 | if (stem == "1") word += "s";
|
---|
| 915 | //else word += "u";
|
---|
| 916 |
|
---|
| 917 | if (fold == "1") word += "i";
|
---|
| 918 | //else word += "c";
|
---|
| 919 | }
|
---|
| 920 | if (firstword) {
|
---|
| 921 | firstword = false;
|
---|
| 922 | } else {
|
---|
| 923 | outtext += " " + word_combine + " ";
|
---|
| 924 | }
|
---|
| 925 | outtext += "[" + word + "]:"+tag;
|
---|
| 926 | word.clear();
|
---|
| 927 | }
|
---|
| 928 | ++here;
|
---|
| 929 | } else if (*here == '\"') {
|
---|
| 930 | word2.push_back(*here);
|
---|
| 931 | inquote = !inquote;
|
---|
| 932 | ++here;
|
---|
| 933 | } else {
|
---|
| 934 | // not word boundary
|
---|
| 935 | word2.push_back(*here);
|
---|
| 936 | ++here;
|
---|
| 937 | }
|
---|
| 938 | }
|
---|
| 939 |
|
---|
| 940 | // get last word
|
---|
| 941 | if (!word2.empty()) {
|
---|
| 942 | if (word2 == "AND") { word2.clear(); word2.push_back(7527); word2.appendcarr("AND", 3); word2.push_back(7527); }
|
---|
| 943 | else if (word2 == "OR") { word2.clear(); word2.push_back(7527); word2.appendcarr("OR", 2); word2.push_back(7527); }
|
---|
| 944 | else if (word2 == "NOT") { word2.clear(); word2.push_back(7527); word2.appendcarr("NOT", 3); word2.push_back(7527); }
|
---|
| 945 | else if (word2 == "NEAR") { word2.clear(); word2.push_back(7527); word2.appendcarr("NEAR", 4); word2.push_back(7527); }
|
---|
| 946 | else if (word2 == "WITHIN") { word2.clear(); word2.push_back(7527); word2.appendcarr("WITHIN", 6); word2.push_back(7527); }
|
---|
| 947 | word.append(word2); word2.clear();
|
---|
| 948 |
|
---|
| 949 | if (stem == "1"|| fold == "1") {
|
---|
| 950 | word += "#";
|
---|
| 951 | if (stem == "1") word += "s";
|
---|
| 952 | //else word += "u";
|
---|
| 953 |
|
---|
| 954 | if (fold == "1") word += "i";
|
---|
| 955 | //else word += "c";
|
---|
| 956 | }
|
---|
| 957 | if (!outtext.empty()) outtext += " " + word_combine + " ";
|
---|
| 958 | outtext += "[" + word + "]:"+tag;
|
---|
| 959 | }
|
---|
| 960 | querystring += "(" + outtext + ")";
|
---|
| 961 | }
|
---|
| 962 |
|
---|
[8357] | 963 | void add_field_info(text_t &querystring, const text_t &tag, int type) {
|
---|
[7380] | 964 |
|
---|
[17796] | 965 | if (tag == "") return; // do nothing
|
---|
| 966 | if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
|
---|
[8357] | 967 | if (type == 1) { //mgpp
|
---|
| 968 | querystring = "["+querystring+"]:"+tag;
|
---|
| 969 | } else if (type == 2) { // lucene
|
---|
| 970 | querystring = tag+":("+querystring+")";
|
---|
[4757] | 971 | }
|
---|
[8357] | 972 |
|
---|
[4757] | 973 | }
|
---|
[8029] | 974 |
|
---|
| 975 |
|
---|
[22046] | 976 | void add_field_info_sql(text_t &querystring, const text_t &tagseq,
|
---|
| 977 | const text_t& sqlcomb)
|
---|
| 978 | {
|
---|
| 979 |
|
---|
| 980 | if (tagseq == "") return; // do nothing
|
---|
| 981 |
|
---|
| 982 | text_t element_in = "(element IN (";
|
---|
| 983 |
|
---|
| 984 | text_tlist mdterms;
|
---|
| 985 |
|
---|
| 986 | splitword(tagseq.begin(), tagseq.end(), "/", mdterms);
|
---|
| 987 |
|
---|
| 988 | text_t tags_in = "";
|
---|
| 989 |
|
---|
| 990 | while (!mdterms.empty()) {
|
---|
| 991 | text_t tag = mdterms.front();
|
---|
| 992 | mdterms.pop_front();
|
---|
| 993 |
|
---|
| 994 | if (!tag.empty()) {
|
---|
| 995 |
|
---|
[24306] | 996 | // remove "ex." prefix, but only if there are no other metadata set qualifiers
|
---|
| 997 | // in the metaname, since we want to retain prefixes like "ex.dc." as-is
|
---|
| 998 | text_t::iterator period = findchar(tag.begin(), tag.end(), '.');
|
---|
| 999 | text_t::iterator lastperiod = findlastchar(tag.begin(), tag.end(), '.');
|
---|
| 1000 |
|
---|
| 1001 | if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.") && period == lastperiod) {
|
---|
[22046] | 1002 | tag = substr (tag.begin()+3, tag.end());
|
---|
| 1003 | }
|
---|
| 1004 |
|
---|
| 1005 | if (!tags_in.empty()) {
|
---|
| 1006 | tags_in += ",";
|
---|
| 1007 | }
|
---|
| 1008 |
|
---|
| 1009 | tags_in += "'" + tag + "'";
|
---|
| 1010 | }
|
---|
| 1011 | }
|
---|
| 1012 |
|
---|
| 1013 | element_in += tags_in + ") AND (";
|
---|
| 1014 |
|
---|
[24073] | 1015 |
|
---|
[22046] | 1016 | if (sqlcomb == "=") {
|
---|
| 1017 | // override what it means to do equality, to make it more like full text
|
---|
| 1018 | // searching
|
---|
| 1019 |
|
---|
| 1020 | text_t orterms = "";
|
---|
| 1021 | text_t term = "";
|
---|
| 1022 | bool in_phrase = false;
|
---|
| 1023 |
|
---|
| 1024 | text_t::const_iterator here = querystring.begin();
|
---|
| 1025 | text_t::const_iterator end = querystring.end();
|
---|
| 1026 | while (here != end) {
|
---|
| 1027 | if (is_unicode_letdig(*here)) {
|
---|
| 1028 | term.push_back(*here);
|
---|
| 1029 | }
|
---|
| 1030 | else if (*here == '"') {
|
---|
| 1031 | term.push_back(*here);
|
---|
| 1032 | if (!in_phrase) {
|
---|
| 1033 | in_phrase = true;
|
---|
| 1034 | } else {
|
---|
| 1035 | in_phrase = false;
|
---|
| 1036 | }
|
---|
| 1037 | }
|
---|
| 1038 | else if (in_phrase) {
|
---|
| 1039 | // Found word boundary, but in a phrase, so does not complete term
|
---|
| 1040 | term.push_back(*here);
|
---|
| 1041 | }
|
---|
| 1042 | else {
|
---|
| 1043 | // Found a word boundary
|
---|
| 1044 | if (!orterms.empty()) {
|
---|
| 1045 | orterms += " OR ";
|
---|
| 1046 | }
|
---|
| 1047 | orterms += "value LIKE '%" + term + "%'";
|
---|
| 1048 | term.clear();
|
---|
| 1049 | }
|
---|
| 1050 | ++here;
|
---|
| 1051 | }
|
---|
| 1052 |
|
---|
| 1053 | if (!term.empty()) {
|
---|
| 1054 | if (!orterms.empty()) {
|
---|
| 1055 | orterms += " OR ";
|
---|
| 1056 | }
|
---|
| 1057 | orterms += "value LIKE '%" + term + "%'";
|
---|
| 1058 | }
|
---|
| 1059 |
|
---|
| 1060 | element_in += orterms;
|
---|
| 1061 | }
|
---|
[24073] | 1062 | //We cast the value from STRING to REAL to allow numeric sorting
|
---|
| 1063 | else if (sqlcomb == "<num") {
|
---|
| 1064 | element_in += "CAST(value as REAL) < CAST('" + querystring+"' AS REAL)";
|
---|
| 1065 | }
|
---|
| 1066 | else if (sqlcomb == ">num") {
|
---|
| 1067 | element_in += "CAST(value as REAL) > CAST('" + querystring+"' AS REAL)";
|
---|
| 1068 | }
|
---|
| 1069 | else if (sqlcomb == "<=num") {
|
---|
| 1070 | element_in += "CAST(value as REAL) <= CAST('" + querystring+"' AS REAL)";
|
---|
| 1071 | }
|
---|
| 1072 | else if (sqlcomb == ">=num") {
|
---|
| 1073 | element_in += "CAST(value as REAL) >= CAST('" + querystring+"' AS REAL)";
|
---|
| 1074 | }
|
---|
| 1075 | else if (sqlcomb == "=num") {
|
---|
| 1076 | element_in += "CAST(value as REAL) = CAST('" + querystring+"' AS REAL)";
|
---|
| 1077 | }
|
---|
[22046] | 1078 | else {
|
---|
| 1079 | // search on value is "as is" querystring
|
---|
| 1080 | element_in += "value " + sqlcomb + " '" + querystring+"'";
|
---|
| 1081 | }
|
---|
| 1082 |
|
---|
| 1083 |
|
---|
| 1084 | querystring = element_in + "))";
|
---|
| 1085 |
|
---|
| 1086 | }
|
---|
| 1087 |
|
---|
| 1088 |
|
---|
[17796] | 1089 | void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {
|
---|
| 1090 |
|
---|
[11765] | 1091 | int type = 2; //lucene
|
---|
[8029] | 1092 |
|
---|
[12784] | 1093 | if (argb==0) { // simple
|
---|
| 1094 | // there will be no & or | as they should have already been removed
|
---|
[11765] | 1095 | // just tag the entire thing
|
---|
[10995] | 1096 | if (tag != "") {
|
---|
[11765] | 1097 | add_field_info(querystring, tag, type);
|
---|
[10995] | 1098 | }
|
---|
[8357] | 1099 | return;
|
---|
| 1100 | }
|
---|
[10995] | 1101 |
|
---|
[12784] | 1102 | // need to replace & with &&, | with ||
|
---|
[8357] | 1103 | text_t::const_iterator here = querystring.begin();
|
---|
| 1104 | text_t::const_iterator end = querystring.end();
|
---|
[12784] | 1105 |
|
---|
| 1106 | text_t finalquery = "";
|
---|
[10995] | 1107 | while (here != end) {
|
---|
[12784] | 1108 | if (*here == '&') {
|
---|
| 1109 | finalquery.push_back('&');
|
---|
| 1110 | finalquery.push_back('&');
|
---|
| 1111 | while (*(here+1) == '&') {
|
---|
| 1112 | ++here;
|
---|
[10995] | 1113 | }
|
---|
[12784] | 1114 | }
|
---|
| 1115 | else if (*here == '|') {
|
---|
| 1116 | finalquery.push_back('|');
|
---|
| 1117 | finalquery.push_back('|');
|
---|
| 1118 | while (*(here+1) == '|') {
|
---|
| 1119 | ++here;
|
---|
| 1120 | }
|
---|
| 1121 | }
|
---|
[8357] | 1122 | else {
|
---|
[12784] | 1123 | finalquery.push_back(*here);
|
---|
[8357] | 1124 | }
|
---|
[10995] | 1125 | ++here;
|
---|
[8357] | 1126 | }
|
---|
[11765] | 1127 | querystring = finalquery;
|
---|
[12784] | 1128 | add_field_info(querystring, tag, type);
|
---|
[11765] | 1129 | }
|
---|
| 1130 |
|
---|
[12784] | 1131 |
|
---|
| 1132 | void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {
|
---|
| 1133 |
|
---|
[11765] | 1134 | if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
|
---|
[12784] | 1135 | if (tag == "" && argb == 1) {
|
---|
[11765] | 1136 | return; // no field specifier, advanced mode, the query stays as written
|
---|
[10995] | 1137 | }
|
---|
[11765] | 1138 |
|
---|
| 1139 | int type = 1; // mgpp
|
---|
| 1140 |
|
---|
| 1141 | bool simple_and = (argb==0 && argt==0);
|
---|
| 1142 | text_t finalquery = "";
|
---|
| 1143 | text_t fieldpart ="";
|
---|
| 1144 | text_t queryelem = "";
|
---|
| 1145 | bool in_phrase = false;
|
---|
| 1146 | bool in_field = false;
|
---|
| 1147 |
|
---|
| 1148 | text_t::const_iterator here = querystring.begin();
|
---|
| 1149 | text_t::const_iterator end = querystring.end();
|
---|
| 1150 | while (here != end) {
|
---|
| 1151 | if (is_unicode_letdig(*here) || *here == '&' || is_special_character(type, *here)) {
|
---|
| 1152 | queryelem.push_back(*here);
|
---|
| 1153 | }
|
---|
| 1154 | else if (*here == '|') {
|
---|
| 1155 | in_field = false;
|
---|
| 1156 | }
|
---|
| 1157 | else if (*here == '!' || *here == '(' || *here == ')') {
|
---|
| 1158 | if (!in_phrase) { // ignore these if in_phrase
|
---|
| 1159 | // output field, then output operator
|
---|
| 1160 | in_field = false;
|
---|
| 1161 | if (!queryelem.empty()) {
|
---|
| 1162 | if (!simple_and && !fieldpart.empty()) {
|
---|
| 1163 | add_field_info(fieldpart, tag, type);
|
---|
| 1164 | finalquery += fieldpart;
|
---|
| 1165 | finalquery.push_back(' ');
|
---|
| 1166 | fieldpart.clear();
|
---|
| 1167 | }
|
---|
| 1168 | fieldpart += queryelem;
|
---|
| 1169 | }
|
---|
| 1170 | if (!fieldpart.empty()) {
|
---|
| 1171 | add_field_info(fieldpart, tag, type);
|
---|
| 1172 | finalquery += fieldpart;
|
---|
| 1173 | finalquery.push_back(' ');
|
---|
| 1174 | }
|
---|
| 1175 | fieldpart.clear();
|
---|
| 1176 | queryelem.clear();
|
---|
| 1177 | finalquery.push_back(*here);
|
---|
| 1178 | finalquery.push_back(' ');
|
---|
| 1179 | }
|
---|
| 1180 | }
|
---|
| 1181 | else if (*here == '"') {
|
---|
| 1182 | queryelem.push_back(*here);
|
---|
| 1183 | if (in_phrase == false) in_phrase = true;
|
---|
| 1184 | else {
|
---|
| 1185 | in_phrase = false;
|
---|
| 1186 | }
|
---|
| 1187 | }
|
---|
| 1188 |
|
---|
| 1189 | // Found word boundary, in a phrase
|
---|
| 1190 | else if (in_phrase) {
|
---|
| 1191 | queryelem.push_back(*here);
|
---|
| 1192 | }
|
---|
| 1193 | // Found a word boundary
|
---|
| 1194 | else {
|
---|
| 1195 | if (!queryelem.empty()) {
|
---|
| 1196 | if (queryelem == "&") {
|
---|
| 1197 | in_field = true;
|
---|
| 1198 | queryelem.clear();
|
---|
| 1199 | }
|
---|
| 1200 | else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {
|
---|
| 1201 |
|
---|
| 1202 | if (argb==1) {
|
---|
| 1203 | // simple search, these not allowed
|
---|
| 1204 | in_field = true;
|
---|
| 1205 | fieldpart += queryelem;
|
---|
| 1206 | fieldpart.push_back(' ');
|
---|
| 1207 | }
|
---|
| 1208 | queryelem.clear();
|
---|
| 1209 |
|
---|
| 1210 | }
|
---|
| 1211 | else {
|
---|
| 1212 | if (!simple_and && !in_field) {
|
---|
| 1213 | if (!fieldpart.empty()) {
|
---|
| 1214 | add_field_info(fieldpart, tag, type);
|
---|
| 1215 | finalquery += fieldpart;
|
---|
| 1216 | finalquery.push_back(' ');
|
---|
| 1217 | fieldpart.clear();
|
---|
| 1218 | }
|
---|
| 1219 | }
|
---|
| 1220 |
|
---|
| 1221 | fieldpart += queryelem;
|
---|
| 1222 | fieldpart.push_back(' ');
|
---|
| 1223 | queryelem.clear();
|
---|
| 1224 | }
|
---|
| 1225 | }
|
---|
| 1226 | }
|
---|
| 1227 | ++here;
|
---|
| 1228 | }
|
---|
| 1229 | // at the end
|
---|
| 1230 | if (!queryelem.empty()) {
|
---|
| 1231 | if (!simple_and && !in_field && !fieldpart.empty()) {
|
---|
| 1232 | add_field_info(fieldpart, tag, type);
|
---|
| 1233 | finalquery += fieldpart;
|
---|
[18459] | 1234 | finalquery.push_back(' ');
|
---|
[11765] | 1235 | fieldpart.clear();
|
---|
| 1236 | }
|
---|
| 1237 | fieldpart += queryelem;
|
---|
| 1238 | }
|
---|
| 1239 | if (!fieldpart.empty()) {
|
---|
| 1240 | add_field_info(fieldpart, tag, type);
|
---|
| 1241 | finalquery += fieldpart;
|
---|
| 1242 | fieldpart.clear();
|
---|
[18459] | 1243 |
|
---|
| 1244 | // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
|
---|
| 1245 | // consider cutting this line
|
---|
| 1246 | finalquery.push_back(' ');
|
---|
[11765] | 1247 | }
|
---|
[22046] | 1248 |
|
---|
[11765] | 1249 | querystring = finalquery;
|
---|
[8029] | 1250 | }
|
---|
[8357] | 1251 |
|
---|
[12784] | 1252 |
|
---|
[22046] | 1253 | void format_field_info_sql(text_t &querystring, const text_t &tagseq,
|
---|
| 1254 | const text_t &sqlcomb,
|
---|
| 1255 | int argt, int argb)
|
---|
| 1256 | {
|
---|
| 1257 | add_field_info_sql(querystring, tagseq, sqlcomb);
|
---|
| 1258 | }
|
---|
| 1259 |
|
---|
| 1260 |
|
---|
[12784] | 1261 | void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
|
---|
[11765] | 1262 | if (argct == 1) {
|
---|
[12784] | 1263 | format_field_info_mgpp(querystring, tag, argt, argb);
|
---|
[11765] | 1264 | } else if (argct == 2) {
|
---|
[12784] | 1265 | format_field_info_lucene(querystring, tag, argt, argb);
|
---|
[11765] | 1266 | }
|
---|
| 1267 | }
|
---|
[10995] | 1268 |
|
---|
[12784] | 1269 | void mgpp_adddateelem(text_t& querystring, const int date)
|
---|
| 1270 | {
|
---|
| 1271 | querystring.appendcstr(" [");
|
---|
| 1272 | if(date<0) {
|
---|
| 1273 | querystring.appendcstr("bc");
|
---|
| 1274 | querystring.appendint((date*-1));
|
---|
| 1275 | }
|
---|
| 1276 | else {
|
---|
| 1277 | querystring.appendint(date);
|
---|
| 1278 | }
|
---|
| 1279 | querystring.appendcstr("]:CV");
|
---|
| 1280 | }
|
---|
| 1281 |
|
---|
| 1282 | void lucene_adddateelem(text_t& querystring, const int date)
|
---|
| 1283 | {
|
---|
| 1284 | querystring.appendcstr(" CV:(");
|
---|
| 1285 | if(date<0) {
|
---|
| 1286 | querystring.appendcstr("bc");
|
---|
| 1287 | querystring.appendint((date*-1));
|
---|
| 1288 | }
|
---|
| 1289 | else {
|
---|
| 1290 | querystring.appendint(date);
|
---|
| 1291 | }
|
---|
| 1292 | querystring.appendcstr(")");
|
---|
| 1293 | }
|
---|
| 1294 |
|
---|
| 1295 |
|
---|
| 1296 | void add_dates(text_t &querystring, int startdate, int enddate,
|
---|
| 1297 | int startbc, int endbc, int ct)
|
---|
| 1298 | {
|
---|
| 1299 | if(startdate)
|
---|
| 1300 | {
|
---|
| 1301 | int querystringis = 0;
|
---|
| 1302 | text_t::const_iterator here = querystring.begin();
|
---|
| 1303 | text_t::const_iterator end = querystring.end();
|
---|
| 1304 | while(here!=end)
|
---|
| 1305 | {
|
---|
| 1306 | if(!(isspace((*here)))){
|
---|
| 1307 | here = end;
|
---|
| 1308 | querystringis = 1;
|
---|
| 1309 | }
|
---|
| 1310 | else
|
---|
| 1311 | ++here;
|
---|
| 1312 | }
|
---|
| 1313 | //converting BCE dates
|
---|
| 1314 | if(startbc && startdate > 0)
|
---|
| 1315 | {
|
---|
| 1316 | startdate *= -1;
|
---|
| 1317 | }
|
---|
| 1318 | if(endbc && enddate > 0)
|
---|
| 1319 | {
|
---|
| 1320 | enddate *= -1;
|
---|
| 1321 | }
|
---|
| 1322 | if(enddate != 0 && enddate<startdate)
|
---|
| 1323 | {
|
---|
| 1324 | cout<<"enddate too small"<<endl;
|
---|
| 1325 | return;
|
---|
| 1326 | }
|
---|
| 1327 | if(querystringis)
|
---|
| 1328 | querystring.appendcstr(" AND");
|
---|
| 1329 | if(!enddate)
|
---|
| 1330 | {
|
---|
| 1331 | if (ct==1) {
|
---|
| 1332 | mgpp_adddateelem(querystring,startdate);
|
---|
| 1333 | }
|
---|
| 1334 | else { // lucene
|
---|
| 1335 | lucene_adddateelem(querystring,startdate);
|
---|
| 1336 | }
|
---|
| 1337 | }
|
---|
| 1338 | else{
|
---|
| 1339 | int nextdate = startdate;
|
---|
| 1340 | querystring.appendcstr(" (");
|
---|
| 1341 | while(nextdate<=enddate)
|
---|
| 1342 | {
|
---|
| 1343 | if(nextdate!=0) {
|
---|
| 1344 | if (ct==1) {
|
---|
| 1345 | mgpp_adddateelem(querystring,nextdate);
|
---|
| 1346 | }
|
---|
| 1347 | else { // lucene
|
---|
| 1348 | lucene_adddateelem(querystring,nextdate);
|
---|
| 1349 | }
|
---|
| 1350 | }
|
---|
| 1351 | ++nextdate;
|
---|
| 1352 | }
|
---|
| 1353 | querystring.appendcstr(" )");
|
---|
| 1354 | }
|
---|
| 1355 | }
|
---|
| 1356 |
|
---|
| 1357 | }
|
---|