/**********************************************************************
 *
 * querytools.cpp --
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "querytools.h"
#include <ctype.h> // for isspace (used by add_dates)
#include "unitool.h" // for is_unicode_letdig

// Sets the ct, qt, qto (and sqlqto) arguments.
//
//  ct     - indexer type, derived from cinfo->buildType when not supplied:
//           "1" for mgpp, "2" for lucene, "0" for anything else (mg)
//  qto    - bit mask built from the collection's "SearchTypes" format
//           setting: bit 1 = "plain" listed, bit 2 = "form" listed
//  qt     - "1" when the form interface is selected, "0" otherwise
//  sqlqto - "1" when the info database is sqlite or mssql and "sqlform" is
//           listed in SearchTypes, "0" otherwise
//
// Arguments that already have a value are left alone, except for mg
// collections (ct == "0") where qt and qto are always forced to "0".
void set_query_type_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {

  if (args["ct"].empty()) {
    text_t build_type = cinfo->buildType;
    if (build_type == "mgpp") {
      args["ct"] = "1";
    } else if (build_type == "lucene") {
      args["ct"] = "2";
    } else {
      args["ct"] = "0";
    }
  }

  text_t arg_ct = args["ct"];
  if (arg_ct == "0") {
    // mg
    args["qt"] = "0";
    args["qto"] = "0";
    return;
  }

  // both already decided by the caller - nothing left to work out
  // (note that sqlqto is not touched on this path)
  if (!args["qt"].empty() && !args["qto"].empty()) {
    return;
  }

  text_tmap::iterator check = cinfo->format.find("SearchTypes");
  text_t search_types;
  if (check != cinfo->format.end() && !(*check).second.empty()) {
    search_types = (*check).second;
  } else {
    // no SearchTypes setting: assume plain,form
    if (args["qto"].empty()) args["qto"] = "3";
    if (args["qt"].empty()) {
      int arg_qto = args.getintarg("qto");
      if (arg_qto == 2) {
        args["qt"] = "1";
      } else {
        args["qt"] = "0";
      }
    }
    return;
  }

  if (args["qto"].empty()) {
    unsigned int type = 0;
    if (findword(search_types.begin(), search_types.end(), "form") != search_types.end()) {
      type |= 2;
    }
    if (findword(search_types.begin(), search_types.end(), "plain") != search_types.end()) {
      type |= 1;
    }
    args.setintarg("qto", type);
  }

  if (args["qt"].empty()) {
    int arg_qto = args.getintarg("qto");
    // when both interfaces are available, the one listed first wins
    if (arg_qto == 2 || (arg_qto == 3 && starts_with(search_types, "form"))) {
      args["qt"] = "1";
    } else {
      args["qt"] = "0";
    }
  }

  // decide if sqlqto should be set or not
  unsigned int sql_type = 0;
  text_t infodb_type = cinfo->infodbType;
  if ((infodb_type == "sqlite") || (infodb_type == "mssql")) {
    if (findword(search_types.begin(), search_types.end(), "sqlform") != search_types.end()) {
      sql_type = 1;
    }
  }

  if (sql_type) {
    args["sqlqto"] = "1";
  } else {
    args["sqlqto"] = "0";
  }
}

// Sets the ks, ss, afs (casesupport, stemsupport, accentfoldsupport) args
// according to the SIcasefold / SIstem / SIaccentfold bits of
// cinfo->stemIndexes. Args whose bit is not set are left untouched.
void set_stem_index_args(ColInfoResponse_t *cinfo, cgiargsclass &args) {
  int stemIndexes = cinfo->stemIndexes;

  if (stemIndexes & SIcasefold) {
    args["ks"] = 1;
  }
  if (stemIndexes & SIstem) {
    args["ss"] = 1;
  }
  if (stemIndexes & SIaccentfold) {
    args["afs"] = 1;
  }
}

// Appends the filter options shared by every kind of query to
// request.filterOptions; currently that is just "Maxdocs", taken from
// the "m" argument.
void set_basequeryfilter_options (FilterRequest_t &request,
                                  cgiargsclass &args)
{
  OptionValue_t option;
  int arg_m = args.getintarg("m");

  option.name = "Maxdocs";
  option.value = arg_m;
  request.filterOptions.push_back (option);

  // option.name = "StartResults";
  // option.value = args["r"];
  // request.filterOptions.push_back (option);

  // option.name = "EndResults";
  // int endresults = args.getintarg("o") + (args.getintarg("r") - 1);
  // if ((endresults > arg_m) && (arg_m != -1)) endresults = arg_m;
  // option.value = endresults;
  // request.filterOptions.push_back (option);
}

// Fills in request for a single full text query: sets the filter name to
// "QueryFilter" and appends the query term plus the search options taken
// from the cgi arguments (t, ct, b, k, s, af, h, j, n, g, fs, sf,
// fuzziness, m).
//
// request.filterResultOptions and request.fields (if required) should
// be set from the calling code
void set_fulltext_queryfilter_options (FilterRequest_t &request,
                                       const text_t &querystring,
                                       cgiargsclass &args)
{
  // better if this function, and the two-query companion function
  // was implemented in queryaction.cpp
  // Has to be done here so documentaction.cpp can call it directly

  request.filterName = "QueryFilter";

  OptionValue_t option;

  option.name = "Term";
  option.value = querystring;
  request.filterOptions.push_back (option);

  option.name = "QueryType";
  option.value = (args.getintarg("t")) ? "ranked" : "boolean";
  request.filterOptions.push_back (option);

  option.name = "MatchMode";
  // mgpp in advanced mode, always use some query
  if (args.getintarg("ct") == 1 && args.getintarg("b") == 1) {
    option.value = "some";
  } else {
    option.value = (args.getintarg("t")) ? "some" : "all";
  }
  request.filterOptions.push_back (option);

  option.name = "Casefold";
  option.value = (args.getintarg("k")) ? "true" : "false";
  request.filterOptions.push_back (option);

  option.name = "Stem";
  option.value = (args.getintarg("s")) ? "true" : "false";
  request.filterOptions.push_back (option);

  option.name = "AccentFold";
  option.value = (args.getintarg("af")) ? "true" : "false";
  request.filterOptions.push_back (option);

  if (!args["h"].empty()) {
    option.name = "Index";
    option.value = args["h"];
    request.filterOptions.push_back (option);
  }

  if (!args["j"].empty()) {
    option.name = "Subcollection";
    option.value = args["j"];
    request.filterOptions.push_back (option);
  }

  if (!args["n"].empty()) {
    option.name = "Language";
    option.value = args["n"];
    request.filterOptions.push_back (option);
  }

  if (!args["g"].empty()) { // granularity for mgpp
    option.name = "Level";
    option.value = args["g"];
    request.filterOptions.push_back (option);
  }

  if (!args["fs"].empty()) { // filter string for lucene
    option.name = "FilterString";
    option.value = args["fs"];
    request.filterOptions.push_back (option);
  }

  if (!args["sf"].empty()) { // sort field for lucene
    option.name = "SortField";
    option.value = args["sf"];
    request.filterOptions.push_back (option);
  }

  if (!args["fuzziness"].empty() && args["fuzziness"] != "100") { // fuzziness value for lucene
    option.name = "Fuzziness";
    // the argument is simply appended after "0."
    option.value = (text_t) "0." + args["fuzziness"];
    request.filterOptions.push_back (option);
  }

  set_basequeryfilter_options(request, args);
}

// Two-query variant: sets up the first query exactly as the single-query
// version does and then, when the "cq2" argument is non-empty, appends a
// "CombineQuery" option followed by the options describing the second
// query (index, subcollection and language come from h2, j2 and n2).
void set_fulltext_queryfilter_options (FilterRequest_t &request,
                                       const text_t &querystring1,
                                       const text_t &querystring2,
                                       cgiargsclass &args)
{
  set_fulltext_queryfilter_options (request, querystring1, args);

  // fill in the second query if needed
  if (!args["cq2"].empty()) {
    OptionValue_t option;

    option.name = "CombineQuery";
    option.value = args["cq2"];
    request.filterOptions.push_back (option);

    option.name = "Term";
    option.value = querystring2;
    request.filterOptions.push_back (option);

    option.name = "QueryType";
    option.value = (args.getintarg("t")) ? "ranked" : "boolean";
    request.filterOptions.push_back (option);

    option.name = "Casefold";
    option.value = (args.getintarg("k")) ? "true" : "false";
    request.filterOptions.push_back (option);

    option.name = "Stem";
    option.value = (args.getintarg("s")) ? "true" : "false";
    request.filterOptions.push_back (option);

    option.name = "AccentFold";
    option.value = (args.getintarg("af")) ? "true" : "false";
    request.filterOptions.push_back (option);

    if (!args["h2"].empty()) {
      option.name = "Index";
      option.value = args["h2"];
      request.filterOptions.push_back (option);
    }

    if (!args["j2"].empty()) {
      option.name = "Subcollection";
      option.value = args["j2"];
      request.filterOptions.push_back (option);
    }

    if (!args["n2"].empty()) {
      option.name = "Language";
      option.value = args["n2"];
      request.filterOptions.push_back (option);
    }
  }

  // this is probably redundant, as first line to this method will have
  // already caused it to invoke set_basequeryfilter_options
  set_basequeryfilter_options(request, args);
}

// Appends the options for an SQL query: an optional "SortField" (taken
// from the sqlsf argument) followed by the base query options.
//
// request.filterResultOptions and request.fields (if required) should
// be set from the calling code
void set_sql_queryfilter_options (FilterRequest_t &request,
                                  cgiargsclass &args)
{
  if (!args["sqlsf"].empty()) { // sort field for the sql query
    OptionValue_t option;

    option.name = "SortField";
    option.value = args["sqlsf"];
    request.filterOptions.push_back (option);
  }

  set_basequeryfilter_options(request, args);
}

// Returns true if character is a query-syntax character for the given
// indexer: # / * for mgpp (indexer_type 1), ? * ~ ^ for lucene
// (indexer_type 2). Any other indexer type has no special characters.
bool is_special_character(int indexer_type, unsigned short character) {
  // mgpp
  if (indexer_type == 1) {
    return (character == '#' || character == '/' || character == '*');
  }
  // lucene
  else if (indexer_type == 2) {
    return (character == '?' || character == '*' || character == '~' ||
            character == '^');
  }
  return false;
}

// This function removes boolean operators from simple searches, and segments
// chinese characters if segment=true
//
// querymode 0 is a simple search: each of ( ) | ! & is replaced by a space.
// With segment set, every character in the ranges 0x2e80-0xd7a3 and
// 0xf900-0xfa6a is surrounded by zero width spaces (0x200b).
// The result is written back into querystring.
void format_querystring (text_t &querystring, int querymode, bool segment) {
  text_t formattedstring;

  // advanced search, no segmenting, don't need to do anything
  if (querymode == 1 && !segment) return;

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  // space is used to insert spaces between Chinese
  // characters. No space is needed before the first
  // Chinese character.
  bool space = false;

  // want to remove ()|!& from querystring so boolean queries are just
  // "all the words" queries (unless querymode is advanced)
  while (here != end) {
    if ((querymode == 0) && (*here == '(' || *here == ')' || *here == '|' ||
                             *here == '!' || *here == '&')) {
      formattedstring.push_back(' ');
    } else if (segment) {
      if ((*here >= 0x2e80 && *here <= 0xd7a3) ||
          ( *here >= 0xf900 && *here <= 0xfa6a)) {
        /* text_t not big enough to handle these. */
        /* (*here >= 0x20000 && *here <= 0x2a6d6) ||
           (*here >= 0x2f800 && *here <= 0x2fa1d)) { */

        // CJK character
        if (!space) formattedstring.push_back (0x200b); // zero width space
        formattedstring.push_back (*here);
        formattedstring.push_back (0x200b);
        space = true;
      } else {
        // non-Chinese character
        formattedstring.push_back (*here);
        space = false;
      }
    } else {
      formattedstring.push_back (*here);
    }
    ++here;
  }
  querystring = formattedstring;
}

// turn query string into terms separated by spaces.
// still working on this...
//
// Field wrappers ([xxx]:TI for mgpp, TI:(xxx) for lucene) are stripped,
// for lucene (arg_ct == "2") the words AND, OR and NOT are removed, mgpp
// modifiers introduced by # or / are skipped, and every run of characters
// that are not letters or digits collapses into a single space.
text_t get_plain_query_terms(const text_t &querystring, const text_t &arg_ct) {
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  // lets look for [] and () first - these are a pain.
  text_t::const_iterator bracket;
  text_t query_no_brackets = "";

  // mgpp brackets: [xxx]:TI
  if (findchar(here, end, '[') != end) {
    while ((bracket = findchar(here, end, '[')) != end) {
      // get the first bit
      query_no_brackets += substr(here, bracket);
      bracket++;
      here = bracket;
      // get the end bracket
      bracket = findchar(here, end, ']');
      query_no_brackets += substr(here, bracket);
      // skip the :TI bits
      // do the bracket != end test first, ELSE when bracket = end, we're
      // past the string, in which case *bracket becomes an invalid
      // operation that causes the server to crash
      while (bracket != end && *bracket != ' ') {
        bracket++;
      }
      here = bracket;
    }
    if (here != end) {
      query_no_brackets += substr(here,end);
    }
  } else if (findchar(here, end, '(') != end) {
    // lucene brackets TI:(xxx)
    while ((bracket = findchar(here, end, '(')) != end) {
      // back up the field name
      text_t::const_iterator old_bracket = bracket;
      // order of tests in condition matters (see long comment above)
      while (bracket != here && *bracket != ' ') {
        --bracket;
      }
      if (bracket != here) {
        // get the first bit
        query_no_brackets += substr(here, bracket+1);
      }
      here = old_bracket +1;
      // get the end bracket
      bracket = findchar(here, end, ')');
      query_no_brackets += substr(here, bracket);
      if (bracket != end) {
        here = bracket+1;
      } else {
        // unmatched '(': everything up to the end has just been copied,
        // so move to the end rather than copying that text a second time
        here = end;
      }
    }
    if (here != end) {
      query_no_brackets += substr(here,end);
    }
  } else {
    // was no brackets
    query_no_brackets = querystring;
  }

  if (arg_ct == "2") { // lucene
    // look for AND OR NOT and remove
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    text_tlist terms;
    splitword(here, end, "AND", terms);
    joinchar(terms, ' ', query_no_brackets);
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    splitword(here, end, "OR", terms);
    joinchar(terms, ' ', query_no_brackets);
    here = query_no_brackets.begin();
    end = query_no_brackets.end();
    splitword(here, end, "NOT", terms);
    joinchar(terms, ' ', query_no_brackets);
  }

  text_t terms = "";
  bool space = false;
  here = query_no_brackets.begin();
  end = query_no_brackets.end();

  while (here != end) {
    if (*here == '#' || *here == '/') {
      // skip over #is /10 etc
      ++here;
      while (here != end && *here != ' ') {
        ++here;
      }
      if (here == end) break;
    }
    if (is_unicode_letdig(*here)) {
      terms.push_back(*here);
      space = false;
    } else {
      if (!space) {
        terms.push_back(' ');
        space = true;
      }
    }
    ++here;
  }

  return trim(terms);
}

// search history tool
// also used for form query macros
//
// Returns a copy of querystring in which every single or double quote is
// preceded by two backslash characters and every newline or carriage
// return is replaced by a space.
text_t escape_quotes(const text_t &querystring) {
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  text_t escquery = "";
  while (here != end) {
    if (*here != '\'' && *here != '\"' && *here != '\n' && *here != '\r') {
      escquery.push_back(*here);
    } else if (*here == '\n' || *here == '\r') {
      escquery.push_back(' ');
    } else {
      escquery +="\\\\";
      escquery.push_back(*here);
    }
    ++here;
  }
  return escquery;
}

// Parses the terms into words, and adds #si if necessary
//
// Each word gets "#" followed by "s" when stem is "1" and "i" when fold is
// "1". Words starting with NEAR or WITHIN are passed through unchanged
// (except when they are the final word). Of the characters between words
// only " & | ! ( ) and white space are copied to the result.
text_t addstemcase(const text_t &terms, const text_t &stem, const text_t &fold,
                   const int indexer_type) {

  // the default stem and case are set to 0 if this is being used, so we are
  // only adding on qualifiers if stem or fold is 1.
  if (stem == "0" && fold == "0") {
    return terms;
  }
  // this is only for mgpp collections, shouldn't be called for anything else
  if (indexer_type != 1) {
    return terms;
  }

  text_t outtext;
  text_t word;

  text_t::const_iterator here = terms.begin();
  text_t::const_iterator end = terms.end();

  while (here !=end) {
    if (is_unicode_letdig(*here) || is_special_character(indexer_type, *here)) {
      // not word boundary
      word.push_back(*here);
      ++here;
    } else {
      // found word boundary
      if (!word.empty() ) {
        if (starts_with(word, "NEAR") || starts_with(word, "WITHIN")) {
          outtext += word;
          word.clear();
        } else {
          word += "#";
          if (stem == "1") word += "s";
          if (fold == "1") word += "i";
          outtext += word;
          word.clear();
        }
      }
      // this only used in advanced form, so we leave in boolean operators
      if (*here == '\"' || *here == '&' || *here == '|' || *here == '!' ||
          *here == '(' || *here == ')' || is_unicode_space(*here)) {
        outtext.push_back(*here);
      }
      ++here;
    }
  }

  // get last word
  if (!word.empty()) {
    word += "#";
    if (stem == "1") word += "s";
    if (fold == "1") word += "i";
    word += " ";
    outtext += word;
  }
  return outtext;
}

// some query form parsing functions for use with mgpp & lucene

// Builds querystring from the simple query form: "fqf" holds the comma
// separated field tags and "fqv" the matching comma separated values.
// Empty values are skipped. For mgpp the terms are joined with & (t=0) or
// | (t=1); for lucene they are simply separated by spaces.
void parse_reg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
{
  querystring.clear();

  int argct = args.getintarg("ct");
  int argt = args.getintarg("t");// t=0 -and, t=1 - or
  int argb = args.getintarg("b");

  text_t combine;

  // lucene uses global combine, so only need this for mgpp
  if (argct==1) {
    if (argt == 0) combine = "&";
    else combine = "|";
  }

  text_t field = args["fqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields);

  text_t value = args["fqv"];
  if (value.empty()) return; // something's wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  // the lists come straight from the cgi arguments and need not be the
  // same length: only look at rows that have both a value and a field
  int num_rows = static_cast<int>(values.size());
  if (static_cast<int>(fields.size()) < num_rows) {
    num_rows = static_cast<int>(fields.size());
  }

  for (int i=0; i< num_rows; ++i) {
    if (!values[i].empty()) {
      text_t this_value = values[i];

      // remove operators for simple search, segments text if necessary
      format_querystring(this_value, argb, segment);

      // add tag info for this field (and other processing)
      format_field_info(this_value, fields[i], argct, argt, argb);

      // add into query string
      if (argct == 2) {
        // lucene
        // we don't worry about AND/OR, cos this is done by defaultcombineoperator
        querystring += this_value+" ";
      } else {
        // mgpp
        if (!querystring.empty()) {
          querystring += " "+ combine+ " ";
        }
        querystring += this_value;
      }
    }
  }
}

// Builds querystring from the advanced query form. On top of "fqf" and
// "fqv" it reads "fqc" (comma separated and/or/not combiners, entry i-1
// joining value i to what came before) and, for mgpp only, "fqs" and
// "fqk" (per-value stem and casefold flags).
void parse_adv_query_form(text_t &querystring, cgiargsclass &args, bool segment){
  querystring.clear();

  const int argct = args.getintarg("ct");
  // arg t is either not used (lucene) or used for natural/ranked (mgpp),
  // so we set it to 0 = AND, by default
  int argt = 0;
  int argb = args.getintarg("b");
  text_t combine;
  if (argct==1) {
    combine = "&";
  }
  else { // lucene
    combine = "AND";
  }

  text_t field = args["fqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields);

  text_t value = args["fqv"];
  if (value.empty()) return; // something's wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  text_t comb = args["fqc"];
  if (comb.empty()) return; // something's wrong
  text_tarray combs;
  splitchar(comb.begin(), comb.end(), ',', combs);

  text_tarray stems;
  text_tarray folds;
  if (argct == 1) {// mgpp - lucene doesn't do stem/case
    text_t stem = args["fqs"];
    if (stem.empty()) return; // something's wrong
    splitchar(stem.begin(), stem.end(), ',', stems);

    text_t fold = args["fqk"];
    if (fold.empty()) return; // something's wrong
    splitchar(fold.begin(), fold.end(), ',', folds);
  }

  // the lists come straight from the cgi arguments and need not be the
  // same length: only look at rows for which every list indexed by row
  // has an entry
  int num_rows = static_cast<int>(values.size());
  if (static_cast<int>(fields.size()) < num_rows) {
    num_rows = static_cast<int>(fields.size());
  }
  if (argct == 1) {
    if (static_cast<int>(stems.size()) < num_rows) {
      num_rows = static_cast<int>(stems.size());
    }
    if (static_cast<int>(folds.size()) < num_rows) {
      num_rows = static_cast<int>(folds.size());
    }
  }
  const int num_combs = static_cast<int>(combs.size());

  for(int i=0; i< num_rows; ++i) {
    if (!values[i].empty()) {
      // a missing or unrecognised combiner keeps the operator used last
      if (i!=0 && i-1 < num_combs) {
        if (argct==1) {
          if (combs[i-1]=="and") combine = "&";
          else if (combs[i-1]=="or")combine = "|";
          else if (combs[i-1]=="not")combine = "!";
        }
        else { // lucene
          if (combs[i-1]=="and") combine = "AND";
          else if (combs[i-1]=="or")combine = "OR";
          else if (combs[i-1]=="not")combine = "NOT";
        }
      }
      text_t this_value = values[i];

      // remove operators for simple search, segments text if necessary
      format_querystring(this_value, argb, segment);

      if (argct == 1) { // mgpp only
        this_value = addstemcase(this_value, stems[i], folds[i], argct);
      }
      // add tag info for this field (and other processing)
      format_field_info(this_value, fields[i], argct, argt, argb);

      // add into query string
      if (!querystring.empty()) {
        querystring += " "+ combine+ " ";
      }
      querystring += this_value;
    }
  }
}

// Turns one row of an SQL query form into a complete
// "SELECT DISTINCT docOID ..." statement for that row.
// The STARTINGWITH operator is changed to 'LIKE' with '%' added to the end
// of the value, in order to search a field starting with certain words.
static text_t build_sql_select_term(const text_t &value, const text_t &field,
                                    const text_t &sqlcomb, int argt, int argb,
                                    bool segment) {
  text_t this_value = value;
  text_t condition = sqlcomb;
  if (condition == "STARTINGWITH") {
    this_value += "%";
    condition = "LIKE";
  }

  // remove operators for simple search, segments text if necessary
  format_querystring(this_value, argb, segment);

  // add tag info for this field (and other processing)
  format_field_info_sql(this_value, field, condition, argt, argb);

  return "SELECT DISTINCT docOID FROM document_metadata WHERE " + this_value;
}

// SQL versions for parsing query form

// Builds an SQL statement from the simple SQL query form: "sqlfqf" holds
// the comma separated fields, "sqlfqc" the per-row comparison operators
// and "fqv" the values. Rows are combined with INNER JOIN (t=0, AND) or
// UNION (t=1, OR). Empty values are skipped.
void parse_sqlreg_query_form(text_t &querystring, cgiargsclass &args, bool segment)
{
  querystring.clear();

  int argt = args.getintarg("t");// t=0 -and, t=1 - or
  int argb = args.getintarg("b");

  text_t combine;

  if (argt == 0) combine = "AND";
  else combine = "OR";

  text_t field = args["sqlfqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields);

  text_t sqlcomb = args["sqlfqc"];
  if (sqlcomb.empty()) return; // something's wrong
  text_tarray sqlcombs;
  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);

  text_t value = args["fqv"];
  if (value.empty()) return; // something's wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  // the lists come straight from the cgi arguments and need not be the
  // same length: only look at rows present in all three of them
  int num_rows = static_cast<int>(values.size());
  if (static_cast<int>(fields.size()) < num_rows) {
    num_rows = static_cast<int>(fields.size());
  }
  if (static_cast<int>(sqlcombs.size()) < num_rows) {
    num_rows = static_cast<int>(sqlcombs.size());
  }

  for (int i=0; i< num_rows; ++i) {
    if (!values[i].empty()) {
      text_t this_value = build_sql_select_term(values[i], fields[i],
                                                sqlcombs[i], argt, argb,
                                                segment);

      if (querystring.empty()) {
        // first query term
        querystring = this_value;
      }
      else {
        if (combine=="AND") {
          // INNER JOIN to restrict to only matching docOIDs
          querystring = "SELECT docOID FROM (" + querystring + ")"
            + " INNER JOIN (" + this_value +") USING (docOID)";
        }
        else if (combine=="OR") {
          // Union to allow union of the two
          querystring = querystring + " UNION " + this_value;
        }
      }
    }
  }
}

// Builds an SQL statement from the advanced SQL query form. Same inputs as
// the simple SQL form plus "fqc", whose entry i-1 (and/or/not) says how
// row i is combined with what came before. "and" becomes an INNER JOIN and
// "or" a UNION; "not" is reported on cerr as unsupported and the row is
// left out.
void parse_sqladv_query_form(text_t &querystring, cgiargsclass &args,
                             bool segment)
{
  querystring.clear();

  int argt = 0; // set it to 0 = AND, by default
  int argb = args.getintarg("b");
  text_t combine = "AND";

  text_t field = args["sqlfqf"];
  if (field.empty()) return; // no query
  text_tarray fields;
  splitchar(field.begin(), field.end(), ',', fields);

  text_t sqlcomb = args["sqlfqc"];
  if (sqlcomb.empty()) return; // something's wrong
  text_tarray sqlcombs;
  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);

  text_t value = args["fqv"];
  if (value.empty()) return; // something's wrong
  text_tarray values;
  splitchar(value.begin(), value.end(), ',', values);

  text_t comb = args["fqc"];
  if (comb.empty()) return; // something's wrong
  text_tarray combs;
  splitchar(comb.begin(), comb.end(), ',', combs);

  // the lists come straight from the cgi arguments and need not be the
  // same length: only look at rows present in values, fields and sqlcombs
  int num_rows = static_cast<int>(values.size());
  if (static_cast<int>(fields.size()) < num_rows) {
    num_rows = static_cast<int>(fields.size());
  }
  if (static_cast<int>(sqlcombs.size()) < num_rows) {
    num_rows = static_cast<int>(sqlcombs.size());
  }
  const int num_combs = static_cast<int>(combs.size());

  for(int i=0; i< num_rows; ++i) {
    if (!values[i].empty()) {
      // a missing or unrecognised combiner keeps the operator used last
      if (i>0 && i-1 < num_combs) {
        if (combs[i-1]=="and") {
          combine = "AND";
        }
        else if (combs[i-1]=="or") {
          combine = "OR";
        }
        else if (combs[i-1]=="not") {
          combine = "NOT";
        }
      }

      text_t this_value = build_sql_select_term(values[i], fields[i],
                                                sqlcombs[i], argt, argb,
                                                segment);

      if (querystring.empty()) {
        // first query term
        querystring = this_value;
      }
      else {
        if (combine=="AND") {
          // INNER JOIN to restrict to only matching docOIDs
          querystring = "SELECT docOID FROM (" + querystring + ")"
            + " INNER JOIN (" + this_value +") USING (docOID)";
        }
        else if (combine=="OR") {
          // Union to allow union of the two
          querystring = querystring + " UNION " + this_value;
        }
        else {
          cerr << "Unsupported combination operation: " << combine << endl;
        }
      }
    }
  }
}

// If word is exactly one of AND, OR, NOT, NEAR or WITHIN, rewrites it so
// that it is enclosed in a pair of characters with code 7527.
// Any other word is left unchanged.
static void wrap_reserved_query_word(text_t &word) {
  static const struct {
    const char *text;
    int length;
  } reserved[] = {
    {"AND", 3}, {"OR", 2}, {"NOT", 3}, {"NEAR", 4}, {"WITHIN", 6}
  };
  const unsigned int num_reserved = sizeof(reserved) / sizeof(reserved[0]);

  for (unsigned int i = 0; i < num_reserved; ++i) {
    if (word == reserved[i].text) {
      word.clear();
      word.push_back(7527);
      word.appendcarr(reserved[i].text, reserved[i].length);
      word.push_back(7527);
      return;
    }
  }
}

// Extended addqueryelem for Human Info project
//
// Appends "(...)" to querystring, where the contents are the white space
// separated words of terms (quoted phrases stay together), each written as
// [word]:tag and joined by word_combine. When querystring already holds
// something, combine is inserted first. "#" plus "s" and/or "i" is added
// to each word when stem and/or fold is "1".
void addqueryelem_ex(text_t &querystring, const text_t &tag, const text_t &terms,
                     const text_t &stem, const text_t &fold,
                     const text_t& combine, const text_t& word_combine) {

  if (!querystring.empty()) { // have to put and/or
    querystring += " " + combine + " ";
  }
  text_t outtext; outtext.reserve(512);
  text_t word; word.reserve(100);
  //unsigned short c;
  text_t::const_iterator here = terms.begin();
  text_t::const_iterator end = terms.end();
  bool inquote = false, firstword = true;

  // word2 collects the characters since the last white space; word collects
  // the pieces of a quoted phrase until the closing quote has been seen
  text_t word2; word2.reserve(256);

  while (here !=end) {
    if (is_unicode_space(*here)) {
      wrap_reserved_query_word(word2);

      if (inquote) {
        word2.push_back(*here);
      }
      word.append(word2);
      word2.clear();

      if (!inquote && !word.empty() ) {
        // found word boundary

        if (stem == "1" || fold =="1") {
          word += "#";
          if (stem == "1") word += "s";
          //else word += "u";

          if (fold == "1") word += "i";
          //else word += "c";
        }
        if (firstword) {
          firstword = false;
        } else {
          outtext += " " + word_combine + " ";
        }
        outtext += "[" + word + "]:"+tag;
        word.clear();
      }
    } else if (*here == '\"') {
      word2.push_back(*here);
      inquote = !inquote;
    } else {
      // not word boundary
      word2.push_back(*here);
    }
    ++here;
  }

  // get last word
  if (!word2.empty()) {
    wrap_reserved_query_word(word2);
    word.append(word2);
    word2.clear();

    if (stem == "1"|| fold == "1") {
      word += "#";
      if (stem == "1") word += "s";
      //else word += "u";

      if (fold == "1") word += "i";
      //else word += "c";
    }
    if (!outtext.empty()) outtext += " " + word_combine + " ";
    outtext += "[" + word + "]:"+tag;
  }
  querystring += "(" + outtext + ")";
}

// Wraps querystring in the field syntax of the indexer given by type:
// [query]:TAG for mgpp (type 1), TAG:(query) for lucene (type 2).
// Nothing happens for an empty tag, for the ZZ tag with mgpp, or for any
// other type.
void add_field_info(text_t &querystring, const text_t &tag, int type) {

  if (tag == "") return; // do nothing
  if (tag == "ZZ" && type == 1) return; // mgpp doesn't use ZZ tag internally
  if (type == 1) { //mgpp
    querystring = "["+querystring+"]:"+tag;
  } else if (type == 2) { // lucene
    querystring = tag+":("+querystring+")";
  }
}

// Returns a copy of text with every single quote doubled, so that the
// text can be placed between single quotes in an SQL statement.
static text_t sql_escape_quotes(const text_t &text) {
  text_t escaped;
  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  while (here != end) {
    if (*here == '\'') escaped.push_back('\'');
    escaped.push_back(*here);
    ++here;
  }
  return escaped;
}

// Adds "value LIKE '%term%'" to the OR-separated list in orterms.
static void append_sql_like_clause(text_t &orterms, const text_t &term) {
  text_t safe_term = sql_escape_quotes(term);
  if (!orterms.empty()) {
    orterms += " OR ";
  }
  orterms += "value LIKE '%" + safe_term + "%'";
}

// Replaces querystring (a search value) with an SQL condition of the form
//   (element IN ('tag1','tag2',...) AND (<test on value>))
// tagseq is a '/' separated list of metadata element names; a leading
// "ex." is removed from each name. sqlcomb selects the test: "=" is turned
// into a LIKE '%term%' test per word (OR-ed together), the *num operators
// compare numerically, and anything else is used as the SQL operator
// as is. Nothing happens when tagseq is empty.
//
// Values and element names are escaped for use inside quotes.
// NOTE(review): sqlcomb itself is inserted into the statement verbatim in
// the final branch - callers pass it from the sqlfqc cgi argument, so it
// should be checked against the list of supported operators.
void add_field_info_sql(text_t &querystring, const text_t &tagseq,
                        const text_t& sqlcomb) {

  if (tagseq == "") return; // do nothing

  text_t element_in = "(element IN (";

  text_tlist mdterms;

  splitword(tagseq.begin(), tagseq.end(), "/", mdterms);

  text_t tags_in = "";

  while (!mdterms.empty()) {
    text_t tag = mdterms.front();
    mdterms.pop_front();

    if (!tag.empty()) {
      if (tag.size()>3 && (substr(tag.begin(), tag.begin()+3) == "ex.")) {
        tag = substr (tag.begin()+3, tag.end());
      }
      tag = sql_escape_quotes(tag);

      if (!tags_in.empty()) {
        tags_in += ",";
      }

      tags_in += "'" + tag + "'";
    }
  }

  element_in += tags_in + ") AND (";

  // the value as it may appear between single quotes
  text_t quoted_value = sql_escape_quotes(querystring);

  if (sqlcomb == "=") {
    // override what it means to do equality, to make it more like full text
    // searching

    text_t orterms = "";
    text_t term = "";
    bool in_phrase = false;

    text_t::const_iterator here = querystring.begin();
    text_t::const_iterator end = querystring.end();
    while (here != end) {
      if (is_unicode_letdig(*here)) {
        term.push_back(*here);
      }
      else if (*here == '"') {
        term.push_back(*here);
        in_phrase = !in_phrase;
      }
      else if (in_phrase) {
        // Found word boundary, but in a phrase, so does not complete term
        term.push_back(*here);
      }
      else {
        // Found a word boundary
        // (a run of separators gives an empty term; testing for that would
        // produce LIKE '%%', which matches every value)
        if (!term.empty()) {
          append_sql_like_clause(orterms, term);
          term.clear();
        }
      }
      ++here;
    }

    if (!term.empty()) {
      append_sql_like_clause(orterms, term);
    }

    if (orterms.empty() && !querystring.empty()) {
      // nothing but separators in the value: fall back to the single
      // match-anything test so that the statement stays well formed
      orterms = "value LIKE '%%'";
    }

    element_in += orterms;
  }
  //We cast the value from STRING to REAL to allow numeric sorting
  else if (sqlcomb == "num") {
    element_in += "CAST(value as REAL) > CAST('" + quoted_value+"' AS REAL)";
  }
  else if (sqlcomb == "<=num") {
    element_in += "CAST(value as REAL) <= CAST('" + quoted_value+"' AS REAL)";
  }
  else if (sqlcomb == ">=num") {
    element_in += "CAST(value as REAL) >= CAST('" + quoted_value+"' AS REAL)";
  }
  else if (sqlcomb == "=num") {
    element_in += "CAST(value as REAL) = CAST('" + quoted_value+"' AS REAL)";
  }
  else {
    // search on value is "as is" querystring
    element_in += "value " + sqlcomb + " '" + quoted_value+"'";
  }

  querystring = element_in + "))";
}

// Prepares one form value for lucene. In simple mode (argb == 0) the value
// is just wrapped as TAG:(value). In advanced mode every run of & is
// replaced by && and every run of | by ||, and the result is then wrapped.
// argt is not used.
void format_field_info_lucene(text_t &querystring, text_t &tag, int argt, int argb) {

  int type = 2; //lucene

  if (argb==0) { // simple
    // there will be no & or | as they should have already been removed
    // just tag the entire thing
    if (tag != "") {
      add_field_info(querystring, tag, type);
    }
    return;
  }

  // need to replace & with &&, | with ||
  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  text_t finalquery = "";
  while (here != end) {
    if (*here == '&') {
      finalquery.push_back('&');
      finalquery.push_back('&');
      // swallow the rest of the run; check for the end of the string
      // before looking at the next character
      while (here+1 != end && *(here+1) == '&') {
        ++here;
      }
    }
    else if (*here == '|') {
      finalquery.push_back('|');
      finalquery.push_back('|');
      while (here+1 != end && *(here+1) == '|') {
        ++here;
      }
    }
    else {
      finalquery.push_back(*here);
    }
    ++here;
  }
  querystring = finalquery;
  add_field_info(querystring, tag, type);
}

// Prepares one form value for mgpp by attaching the field tag to the parts
// of the query. Words are gathered into fieldpart, which is wrapped with
// add_field_info and moved to the output whenever a new part starts; the
// operators ! ( ) are copied through between parts. For a simple "all
// words" search (argb == 0 and argt == 0) consecutive words stay together
// in one part, otherwise each word normally becomes its own part unless it
// follows a lone & or a NEAR/WITHIN operator.
void format_field_info_mgpp(text_t &querystring, text_t tag, int argt, int argb) {

  if (tag == "ZZ") tag = ""; // ZZ is a special tag meaning no tag (all fields)
  if (tag == "" && argb == 1) {
    return; // no field specifier, advanced mode, the query stays as written
  }

  int type = 1; // mgpp

  bool simple_and = (argb==0 && argt==0);
  text_t finalquery = "";
  text_t fieldpart ="";
  text_t queryelem = "";
  bool in_phrase = false;
  bool in_field = false;

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();
  while (here != end) {
    if (is_unicode_letdig(*here) || *here == '&' ||
        is_special_character(type, *here)) {
      queryelem.push_back(*here);
    }
    else if (*here == '|') {
      in_field = false;
    }
    else if (*here == '!' || *here == '(' || *here == ')') {
      if (!in_phrase) { // ignore these if in_phrase
        // output field, then output operator
        in_field = false;
        if (!queryelem.empty()) {
          if (!simple_and && !fieldpart.empty()) {
            add_field_info(fieldpart, tag, type);
            finalquery += fieldpart;
            finalquery.push_back(' ');
            fieldpart.clear();
          }
          fieldpart += queryelem;
        }
        if (!fieldpart.empty()) {
          add_field_info(fieldpart, tag, type);
          finalquery += fieldpart;
          finalquery.push_back(' ');
        }
        fieldpart.clear();
        queryelem.clear();
        finalquery.push_back(*here);
        finalquery.push_back(' ');
      }
    }
    else if (*here == '"') {
      queryelem.push_back(*here);
      if (in_phrase == false) in_phrase = true;
      else {
        in_phrase = false;
      }
    }

    // Found word boundary, in a phrase
    else if (in_phrase) {
      queryelem.push_back(*here);
    }
    // Found a word boundary
    else {
      if (!queryelem.empty()) {
        if (queryelem == "&") {
          in_field = true;
          queryelem.clear();
        }
        else if (starts_with(queryelem, "NEAR") || starts_with(queryelem, "WITHIN")) {

          if (argb==1) {
            // advanced search: keep the operator inside the current part
            // (in a simple search these are not allowed, so it is dropped)
            in_field = true;
            fieldpart += queryelem;
            fieldpart.push_back(' ');
          }
          queryelem.clear();

        }
        else {
          if (!simple_and && !in_field) {
            if (!fieldpart.empty()) {
              add_field_info(fieldpart, tag, type);
              finalquery += fieldpart;
              finalquery.push_back(' ');
              fieldpart.clear();
            }
          }

          fieldpart += queryelem;
          fieldpart.push_back(' ');
          queryelem.clear();
        }
      }
    }
    ++here;
  }
  // at the end
  if (!queryelem.empty()) {
    if (!simple_and && !in_field && !fieldpart.empty()) {
      add_field_info(fieldpart, tag, type);
      finalquery += fieldpart;
      finalquery.push_back(' ');
      fieldpart.clear();
    }
    fieldpart += queryelem;
  }
  if (!fieldpart.empty()) {
    add_field_info(fieldpart, tag, type);
    finalquery += fieldpart;
    fieldpart.clear();

    // doesn't the following just leave a dangling space at the end ?? (used to make mgpp crash)
    // consider cutting this line
    finalquery.push_back(' ');
  }

  querystring = finalquery;
}

// SQL counterpart of format_field_info: hands the value straight to
// add_field_info_sql. argt and argb are not used.
void format_field_info_sql(text_t &querystring, const text_t &tagseq,
                           const text_t &sqlcomb,
                           int argt, int argb)
{
  add_field_info_sql(querystring, tagseq, sqlcomb);
}

// Adds field information to querystring using the formatter for the
// indexer given by argct (1 = mgpp, 2 = lucene). Any other argct leaves
// querystring unchanged.
void format_field_info(text_t &querystring, text_t tag, int argct, int argt, int argb) {
  if (argct == 1) {
    format_field_info_mgpp(querystring, tag, argt, argb);
  } else if (argct == 2) {
    format_field_info_lucene(querystring, tag, argt, argb);
  }
}

// Appends " [date]:CV" to querystring (mgpp field syntax). A negative date
// is written as "bc" followed by its absolute value.
void mgpp_adddateelem(text_t& querystring, const int date)
{
  querystring.appendcstr(" [");
  if(date<0) {
    querystring.appendcstr("bc");
    querystring.appendint((date*-1));
  }
  else {
    querystring.appendint(date);
  }
  querystring.appendcstr("]:CV");
}

// Appends " CV:(date)" to querystring (lucene field syntax). A negative
// date is written as "bc" followed by its absolute value.
void lucene_adddateelem(text_t& querystring, const int date)
{
  querystring.appendcstr(" CV:(");
  if(date<0) {
    querystring.appendcstr("bc");
    querystring.appendint((date*-1));
  }
  else {
    querystring.appendint(date);
  }
  querystring.appendcstr(")");
}

void add_dates(text_t &querystring, int startdate, int enddate,
               int startbc, int endbc, int ct)
{
  if(startdate) {
    int querystringis = 0;
    text_t::const_iterator here = querystring.begin();
    text_t::const_iterator end = querystring.end();
    while(here!=end) {
      if(!(isspace((*here)))){
        here = end;
        querystringis = 1;
      }
      else
        ++here;
    }
    //converting BCE dates
    if(startbc && startdate > 0) {
      startdate *= -1;
    }
    if(endbc && enddate > 0) {
      enddate *= -1;
    }
    if(enddate != 0 && enddate