Changeset 28841


Ignore:
Timestamp:
2014-02-21T18:46:01+13:00 (10 years ago)
Author:
ak19
Message:

Fixing up URL encoding of cgi args so that phrase searching works again. Tested MGPP, Lucene and SQLite searching. Tested simple search, fielded search, advanced single field and multi-field as well as running a query.

Location:
main/trunk/greenstone2/runtime-src/src/recpt
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/runtime-src/src/recpt/cgiutils.cpp

    r26560 r28841  
    4343#endif
    4444
     45// set to false to undo security changes (url-encoding arguments)
     46static bool do_safe_cgi_args = true;
    4547
    4648static unsigned short hexdigit (unsigned short c) {
     
    336338// This function encodes <>, &, ", ', / which are scripting chars or chars which can be used to
    337339// break out of an html/XML/javascript context.
    338 void safe_cgi_arg (text_t &argstr) {
     340void safe_cgi_arg (const text_t &key, text_t &argstr) {
     341  if(!do_safe_cgi_args) {
     342    return;
     343  }
     344
    339345  text_t::iterator in = argstr.begin();
    340346  text_t out = "";
     
    350356    else { // append whatever char is in *in, but as a char, not int
    351357            //out += *in; // appends as int
    352       out += " "; // append placeholder character
    353       out[out.size()-1] = *in; // now set location containing placeholder to what's in *in
     358      out.push_back(*in);
    354359    }
    355360    ++in;
     
    359364  argstr += out; 
    360365}
     366
     367
     368// given a list of characters (or "all") to decode, and given the string, str, where those
     369// characters are to be decoded, this method replaces any occurrences of the url-encoded
     370// variants of those characters with their actual characters in the given string str.
     371void unsafe_cgi_arg(const text_t &chars, text_t &str) {
     372  if(!do_safe_cgi_args) {
     373    return;
     374  }
     375
     376  text_t allchars = "<>&\"\'/";
     377
     378  text_t chars_to_decode = (chars == "all" || chars == "ALL") ? allchars : chars;
     379
     380  text_t::iterator in = chars_to_decode.begin();
     381  text_t::iterator end = chars_to_decode.end();
     382
     383  char hex_char[4];
     384
     385  // using sprint to urlencode a character. See http://www.programmingforums.org/thread15443.html
     386
     387  while (in != end) {
     388   
     389    // *in is a character from the accepted list of chars_to_decode list
     390   
     391    // 1. create the url-encoded value of the char *in in variable hex_char
     392    // sprintf adds in a null byte at the end
     393    sprintf(hex_char,"%%%02X",*in);
     394   
     395    // 2. Need the actual char to be decoded as a text_t string, so we can do a string replace with it
     396    text_t tmp = "";
     397    tmp.push_back(*in);
     398   
     399    // 3. replaces occurrences of hex_char (the url_encoded version of the char *in) in str with its decoded version   
     400    str.replace(hex_char, tmp);
     401
     402    ++in;
     403  } 
     404}
     405
    361406
    362407// split up the cgi arguments
     
    378423    decode_cgi_arg (value);
    379424
    380     safe_cgi_arg(value); // mitigate obvious cross-site scripting hacks in URL cgi-params
     425    safe_cgi_arg(key, value); // mitigate obvious cross-site scripting hacks in URL cgi-params
    381426
    382427    value.setencoding(1); // other encoding
  • main/trunk/greenstone2/runtime-src/src/recpt/cgiutils.h

    r13456 r28841  
    4242void split_cgi_args (const cgiargsinfoclass &argsinfo, text_t argstr,
    4343             cgiargsclass &args);
     44
     45// url-decode selected chars of a given string
     46void unsafe_cgi_arg(const text_t &chars_to_decode, text_t &str);
    4447
    4548text_t encode_commas (const text_t &intext);
  • main/trunk/greenstone2/runtime-src/src/recpt/queryaction.cpp

    r28220 r28841  
    13421342    formattedstring = args["q"];
    13431343    // remove & | ! for simple search,do segmentation if necessary
     1344    // To url-decode the '&', format_querystring() will call unsafe_cgi_arg() first
    13441345    format_querystring (formattedstring, args.getintarg("b"), segment);
    13451346    if (args["ct"]!=0) { // mgpp and lucene - need to add in tag info if appropriate
     
    13581359    if (args["b"]=="1" && args["fqa"]=="1") { // explicit query
    13591360      formattedstring = args["q"];
     1361
     1362      // Decode all url-encoded characters (e.g. %22 -> " and %26 -> &), since these characters have meaning
     1363      // in queries: " is used in phrases and & is used in boolean advanced searches.
     1364      // For form searches below, unsafe_cgi_arg is called in the parse_..._form() functions
     1365
     1366      unsafe_cgi_arg("ALL", formattedstring);
    13601367    }
    13611368    else { // form search
    13621369      if (args["b"]=="0") { // regular form
    1363     parse_reg_query_form(formattedstring, args, segment);
     1370    parse_reg_query_form(formattedstring, args, segment); // will call unsafe_cgi_arg to decode url encoding
    13641371      }
    13651372      else  { // advanced form
    1366     parse_adv_query_form(formattedstring, args, segment);
     1373    parse_adv_query_form(formattedstring, args, segment); // will call unsafe_cgi_arg to decode url encoding
    13671374      }
    13681375      args["q"] = formattedstring;
  • main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp

    r28222 r28841  
    2525
    2626#include "querytools.h"
     27#include "cgiutils.h"
    2728#include <ctype.h>
    2829#include "unitool.h" // for is_unicode_letdig
     
    343344// This function removes boolean operators from simple searches, and segments
    344345// chinese characters if segment=true
     346// Called by several parse_..._form methods here, this function decodes &
     347// to undo the URL encoding done in cgiutils.cpp for security purposes
    345348void format_querystring (text_t &querystring, int querymode, bool segment) {
    346349  text_t formattedstring;
     350
     351  // & has meaning in boolean searches and can be %26 encoded at this point, need to decode them now.
     352  // Also decode any " here, so that the entire search phrase is highlighted and not just the final word
     353  unsafe_cgi_arg("ALL", querystring);
    347354
    348355  // advanced search, no segmenting, don't need to do anything
     
    449456  }
    450457 
    451  
    452458  if (arg_ct == "2") { // lucene
    453459    // look for AND OR NOT and remove
     
    579585
    580586
     587// The following parse_..._form functions first decode various fields for
     588// both simple and advanced searches to undo the URL encoding.
     589// E.g. quotes have meaning in phrase searches and these have to be decoded
     590// before sending the search off to the index.
     591
    581592// some query form parsing functions for use with mgpp & lucene
    582593
     
    599610  text_t field = args["fqf"];
    600611  if (field.empty()) return; // no query
     612  unsafe_cgi_arg("ALL", field);
    601613  text_tarray fields;
    602614  splitchar(field.begin(), field.end(), ',', fields);
     
    604616  text_t value = args["fqv"];
    605617  if (value.empty()) return; // somethings wrong
     618  unsafe_cgi_arg("ALL", value);
    606619  text_tarray values;
    607620  splitchar(value.begin(), value.end(), ',', values);
     
    651664  text_t field = args["fqf"];
    652665  if (field.empty()) return; // no query
     666  unsafe_cgi_arg("ALL", field);
    653667  text_tarray fields;
    654668  splitchar(field.begin(), field.end(), ',', fields);
     
    656670  text_t value = args["fqv"];
    657671  if (value.empty()) return; // somethings wrong
     672  unsafe_cgi_arg("ALL", value);
    658673  text_tarray values;
    659674  splitchar(value.begin(), value.end(), ',', values);
     
    661676  text_t comb = args["fqc"];
    662677  if (comb.empty()) return; //somethings wrong
     678  //unsafe_cgi_arg("ALL", comb);
    663679  text_tarray combs;
    664680  splitchar(comb.begin(), comb.end(), ',', combs);
     
    734750  text_t field = args["sqlfqf"];
    735751  if (field.empty()) return; // no query
     752  unsafe_cgi_arg("ALL", field); // for the slash. //unsafe_cgi_arg("/", field);
    736753  text_tarray fields;
    737754  splitchar(field.begin(), field.end(), ',', fields);
     
    739756  text_t sqlcomb = args["sqlfqc"];
    740757  if (sqlcomb.empty()) return; //somethings wrong
     758  //unsafe_cgi_arg("ALL", sqlcomb);
    741759  text_tarray sqlcombs;
    742760  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
     
    744762  text_t value = args["fqv"];
    745763  if (value.empty()) return; // somethings wrong
     764  unsafe_cgi_arg("ALL", value);
    746765  text_tarray values;
    747766  splitchar(value.begin(), value.end(), ',', values);
     
    808827
    809828  if (field.empty()) return; // no query
     829  // need to decode %2F to / in the URL, e.g. to get dc.Title/Title/ex.Title again in the fields to search in
     830  unsafe_cgi_arg("ALL", field); //unsafe_cgi_arg("/", field);
    810831  text_tarray fields;
    811832  splitchar(field.begin(), field.end(), ',', fields);
     
    813834  text_t sqlcomb = args["sqlfqc"];
    814835  if (sqlcomb.empty()) return; //somethings wrong
     836  //unsafe_cgi_arg("ALL", sqlcomb);
    815837  text_tarray sqlcombs;
    816838  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs);
     
    818840  text_t value = args["fqv"];
    819841  if (value.empty()) return; // somethings wrong
     842  unsafe_cgi_arg("ALL", value); // decode all url-encoded parts of the values to search in
    820843  text_tarray values;
    821844  splitchar(value.begin(), value.end(), ',', values);
     
    823846  text_t comb = args["fqc"];
    824847  if (comb.empty()) return; //somethings wrong
     848  //unsafe_cgi_arg("ALL", comb);
    825849  text_tarray combs;
    826850  splitchar(comb.begin(), comb.end(), ',', combs);
  • main/trunk/greenstone2/runtime-src/src/recpt/sqlqueryaction.cpp

    r23398 r28841  
    260260                          ostream& logout)
    261261{
     262  // A great many characters have meanings in SQL queries, including > and %,
     263  // where % stands for a multi-char wildcard
     264  // http://docs.oracle.com/cd/B10501_01/text.920/a96518/cqspcl.htm
     265  // Further, Greenstone's Advanced SQLite Search allows <, >, %, ', round brackets and more.
     266  // So it's best to url-decode all encoded cgi-args
     267  // We do so here if normal text search or explicit query, and in the
     268  // parse_sql_query_form functions if dealing with forms.
     269
    262270  if (args["qt"]=="0" && args["sqlqto"] != "1") { // normal text search
     271    unsafe_cgi_arg("ALL", args["q"]);
    263272    formattedstring = "SELECT DISTINCT docOID FROM document_metadata WHERE " + args["q"];   
    264273  }
     
    267276    if (args["b"]=="1" && args["fqa"]=="1") { // explicit query
    268277      formattedstring = args["q"];
     278      unsafe_cgi_arg("ALL", formattedstring);
    269279    }
    270280    else { // form search
Note: See TracChangeset for help on using the changeset viewer.