Show
Ignore:
Timestamp:
21.02.2014 18:46:01 (6 years ago)
Author:
ak19
Message:

Fixing up URL encoding of cgi args so that phrase searching works again. Tested MGPP, Lucene and SQLite searching. Tested simple search, fielded search, advanced single field and multi-field as well as running a query.

Location:
main/trunk/greenstone2/runtime-src/src/recpt
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/runtime-src/src/recpt/cgiutils.cpp

    r26560 r28841  
    4343#endif 
    4444 
     45// set to false to undo security changes (url-encoding arguments) 
     46static bool do_safe_cgi_args = true; 
    4547 
    4648static unsigned short hexdigit (unsigned short c) { 
     
    336338// This function encodes <>, &, ", ', / which are scripting chars or chars which can be used to 
    337339// break out of an html/XML/javascript context. 
    338 void safe_cgi_arg (text_t &argstr) { 
     340void safe_cgi_arg (const text_t &key, text_t &argstr) { 
     341  if(!do_safe_cgi_args) { 
     342    return; 
     343  } 
     344 
    339345  text_t::iterator in = argstr.begin(); 
    340346  text_t out = ""; 
     
    350356    else { // append whatever char is in *in, but as a char, not int 
    351357            //out += *in; // appends as int 
    352       out += " "; // append placeholder character 
    353       out[out.size()-1] = *in; // now set location containing placeholder to what's in *in 
     358      out.push_back(*in); 
    354359    } 
    355360    ++in; 
     
    359364  argstr += out;   
    360365} 
     366 
     367 
     368// given a list of characters (or "all") to decode, and given the string, str, where those  
     369// characters are to be decoded, this method replaces any occurrences of the url-encoded  
     370// variants of those characters with their actual characters in the given string str. 
     371void unsafe_cgi_arg(const text_t &chars, text_t &str) { 
     372  if(!do_safe_cgi_args) { 
     373    return; 
     374  } 
     375 
     376  text_t allchars = "<>&\"\'/"; 
     377 
     378  text_t chars_to_decode = (chars == "all" || chars == "ALL") ? allchars : chars; 
     379 
     380  text_t::iterator in = chars_to_decode.begin(); 
     381  text_t::iterator end = chars_to_decode.end(); 
     382 
     383  char hex_char[4]; 
     384 
     385  // using sprintf to urlencode a character. See http://www.programmingforums.org/thread15443.html 
     386 
     387  while (in != end) {  
     388     
     389    // *in is a character from the accepted list of chars_to_decode list 
     390     
     391    // 1. create the url-encoded value of the char *in in variable hex_char 
     392    // sprintf adds in a null byte at the end 
     393    sprintf(hex_char,"%%%02X",*in); 
     394     
     395    // 2. Need the actual char to be decoded as a text_t string, so we can do a string replace with it 
     396    text_t tmp = ""; 
     397    tmp.push_back(*in); 
     398     
     399    // 3. replaces occurrences of hex_char (the url_encoded version of the char *in) in str with its decoded version     
     400    str.replace(hex_char, tmp); 
     401 
     402    ++in; 
     403  }   
     404} 
     405 
    361406 
    362407// split up the cgi arguments 
     
    378423    decode_cgi_arg (value); 
    379424 
    380     safe_cgi_arg(value); // mitigate obvious cross-site scripting hacks in URL cgi-params 
     425    safe_cgi_arg(key, value); // mitigate obvious cross-site scripting hacks in URL cgi-params 
    381426 
    382427    value.setencoding(1); // other encoding 
  • main/trunk/greenstone2/runtime-src/src/recpt/cgiutils.h

    r13456 r28841  
    4242void split_cgi_args (const cgiargsinfoclass &argsinfo, text_t argstr,  
    4343             cgiargsclass &args); 
     44 
     45// url-decode selected chars of a given string 
     46void unsafe_cgi_arg(const text_t &chars_to_decode, text_t &str); 
    4447 
    4548text_t encode_commas (const text_t &intext); 
  • main/trunk/greenstone2/runtime-src/src/recpt/queryaction.cpp

    r28220 r28841  
    13421342    formattedstring = args["q"]; 
    13431343    // remove & | ! for simple search,do segmentation if necessary 
     1344    // To url-decode the '&', format_querystring() will call unsafe_cgi_arg() first 
    13441345    format_querystring (formattedstring, args.getintarg("b"), segment); 
    13451346    if (args["ct"]!=0) { // mgpp and lucene - need to add in tag info if appropriate 
     
    13581359    if (args["b"]=="1" && args["fqa"]=="1") { // explicit query 
    13591360      formattedstring = args["q"]; 
     1361 
     1362      // Replace %22 and %26 with " and & respectively, since these characters have meaning  
     1363      // in queries: " are used in phrases and & is used in boolean advanced searches. 
     1364      // For form searches below, unsafe_cgi_arg is called in the parse_..._form() functions 
     1365 
     1366      unsafe_cgi_arg("ALL", formattedstring); 
    13601367    } 
    13611368    else { // form search 
    13621369      if (args["b"]=="0") { // regular form 
    1363     parse_reg_query_form(formattedstring, args, segment); 
     1370    parse_reg_query_form(formattedstring, args, segment); // will call unsafe_cgi_arg to decode url encoding 
    13641371      } 
    13651372      else  { // advanced form 
    1366     parse_adv_query_form(formattedstring, args, segment); 
     1373    parse_adv_query_form(formattedstring, args, segment); // will call unsafe_cgi_arg to decode url encoding 
    13671374      } 
    13681375      args["q"] = formattedstring; 
  • main/trunk/greenstone2/runtime-src/src/recpt/querytools.cpp

    r28222 r28841  
    2525 
    2626#include "querytools.h" 
     27#include "cgiutils.h" 
    2728#include <ctype.h> 
    2829#include "unitool.h" // for is_unicode_letdig 
     
    343344// This function removes boolean operators from simple searches, and segments 
    344345// chinese characters if segment=true 
     346// Called by several parse_..._form methods here, this function url-decodes the query string 
     347// (e.g. & and ") to undo the URL encoding done in cgiutils.cpp for security purposes 
    345348void format_querystring (text_t &querystring, int querymode, bool segment) { 
    346349  text_t formattedstring; 
     350 
     351  // & has meaning in boolean searches and can be %26 encoded at this point, need to decode them now.  
     352  // Also decode any " here, so that the entire search phrase is highlighted and not just the final word 
     353  unsafe_cgi_arg("ALL", querystring); 
    347354 
    348355  // advanced search, no segmenting, don't need to do anything 
     
    449456  } 
    450457   
    451    
    452458  if (arg_ct == "2") { // lucene 
    453459    // look for AND OR NOT and remove 
     
    579585 
    580586 
     587// The following parse_..._form functions first decode various fields for  
     588// both simple and advanced searches to undo the URL encoding.  
     589// E.g. quotes have meaning in phrase searches and these have to be decoded  
     590// before sending the search off to the index. 
     591 
    581592// some query form parsing functions for use with mgpp & lucene 
    582593 
     
    599610  text_t field = args["fqf"]; 
    600611  if (field.empty()) return; // no query 
     612  unsafe_cgi_arg("ALL", field); 
    601613  text_tarray fields; 
    602614  splitchar(field.begin(), field.end(), ',', fields);  
     
    604616  text_t value = args["fqv"]; 
    605617  if (value.empty()) return; // somethings wrong 
     618  unsafe_cgi_arg("ALL", value); 
    606619  text_tarray values; 
    607620  splitchar(value.begin(), value.end(), ',', values); 
     
    651664  text_t field = args["fqf"]; 
    652665  if (field.empty()) return; // no query 
     666  unsafe_cgi_arg("ALL", field); 
    653667  text_tarray fields; 
    654668  splitchar(field.begin(), field.end(), ',', fields);  
     
    656670  text_t value = args["fqv"]; 
    657671  if (value.empty()) return; // somethings wrong 
     672  unsafe_cgi_arg("ALL", value); 
    658673  text_tarray values; 
    659674  splitchar(value.begin(), value.end(), ',', values); 
     
    661676  text_t comb = args["fqc"]; 
    662677  if (comb.empty()) return; //somethings wrong 
     678  //unsafe_cgi_arg("ALL", comb); 
    663679  text_tarray combs; 
    664680  splitchar(comb.begin(), comb.end(), ',', combs); 
     
    734750  text_t field = args["sqlfqf"]; 
    735751  if (field.empty()) return; // no query 
     752  unsafe_cgi_arg("ALL", field); // for the slash. //unsafe_cgi_arg("/", field); 
    736753  text_tarray fields; 
    737754  splitchar(field.begin(), field.end(), ',', fields);  
     
    739756  text_t sqlcomb = args["sqlfqc"]; 
    740757  if (sqlcomb.empty()) return; //somethings wrong 
     758  //unsafe_cgi_arg("ALL", sqlcomb); 
    741759  text_tarray sqlcombs; 
    742760  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs); 
     
    744762  text_t value = args["fqv"]; 
    745763  if (value.empty()) return; // somethings wrong 
     764  unsafe_cgi_arg("ALL", value); 
    746765  text_tarray values; 
    747766  splitchar(value.begin(), value.end(), ',', values); 
     
    808827 
    809828  if (field.empty()) return; // no query 
     829  // need to decode %2F to / in the URL, e.g. to get dc.Title/Title/ex.Title again in the fields to search in 
     830  unsafe_cgi_arg("ALL", field); //unsafe_cgi_arg("/", field); 
    810831  text_tarray fields; 
    811832  splitchar(field.begin(), field.end(), ',', fields);  
     
    813834  text_t sqlcomb = args["sqlfqc"]; 
    814835  if (sqlcomb.empty()) return; //somethings wrong 
     836  //unsafe_cgi_arg("ALL", sqlcomb); 
    815837  text_tarray sqlcombs; 
    816838  splitchar(sqlcomb.begin(), sqlcomb.end(), ',', sqlcombs); 
     
    818840  text_t value = args["fqv"]; 
    819841  if (value.empty()) return; // somethings wrong 
     842  unsafe_cgi_arg("ALL", value); // decode all url-encoded parts of the values to search in 
    820843  text_tarray values; 
    821844  splitchar(value.begin(), value.end(), ',', values); 
     
    823846  text_t comb = args["fqc"]; 
    824847  if (comb.empty()) return; //somethings wrong 
     848  //unsafe_cgi_arg("ALL", comb); 
    825849  text_tarray combs; 
    826850  splitchar(comb.begin(), comb.end(), ',', combs); 
  • main/trunk/greenstone2/runtime-src/src/recpt/sqlqueryaction.cpp

    r23398 r28841  
    260260                          ostream& logout)  
    261261{ 
     262  // A great many characters have meanings in SQL queries, including > and %,  
     263  // where % stands for a multi-char wildcard  
     264  // http://docs.oracle.com/cd/B10501_01/text.920/a96518/cqspcl.htm 
     265  // Further, Greenstone's Advanced SQLite Search allows <, >, %, ' (rounded brackets and more)  
     266  // So it's best to url-decode all encoded cgi-args  
     267  // We do so here if normal text search or explicit query, and in the 
     268  // parse_sql_query_form functions if dealing with forms. 
     269 
    262270  if (args["qt"]=="0" && args["sqlqto"] != "1") { // normal text search 
     271    unsafe_cgi_arg("ALL", args["q"]); 
    263272    formattedstring = "SELECT DISTINCT docOID FROM document_metadata WHERE " + args["q"];     
    264273  } 
     
    267276    if (args["b"]=="1" && args["fqa"]=="1") { // explicit query 
    268277      formattedstring = args["q"]; 
     278      unsafe_cgi_arg("ALL", formattedstring); 
    269279    } 
    270280    else { // form search