/********************************************************************** * * cgiutils.cpp -- general cgi utilities * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #include "cgiutils.h" #include "fileutil.h" #include "gsdlunicode.h" #include "fileutil.h" #include "unitool.h" // in mg, for output_utf8_char #include #include #if defined(GSDL_USE_OBJECTSPACE) # include # include #elif defined(GSDL_USE_IOS_H) # include # include #else # include # include #endif static unsigned short hexdigit (unsigned short c) { if (c >= '0' && c <= '9') return (c-'0'); if (c >= 'a' && c <= 'f') return (c-'a'+10); if (c >= 'A' && c <= 'F') return (c-'A'+10); return c; } static void c2hex (unsigned short c, text_t &t) { t.clear(); if (c >= 256) { t = "20"; // ' ' return; } unsigned short o1, o2; o1 = (c/16) % 16; o2 = c % 16; if (o1 >= 10) o1 += 'a' - 10; else o1 += '0'; if (o2 >= 10) o2 += 'a' - 10; else o2 += '0'; t.push_back(o1); t.push_back(o2); } static text_t::iterator getline (text_t::iterator first, text_t::iterator last, bool include_crlf) { while (first != last) { if (((first+1) != last) && (*first == 13) && (*(first+1) == 10)) { // found if (include_crlf) first += 2; break; } first++; } return first; } static void process_post_section (text_t &argname, text_t &argdata, text_t &filename, text_t &filedata, text_t &filetype, bool &isfile, text_t &argstr, fileupload_tmap &fileuploads, const text_t &gsdlhome) { if (!argname.empty()) { if (!isfile) { // argdata includes a trailing that we must remove if ((argdata.size() > 1) && (*(argdata.end()-2) == 13) && (*(argdata.end()-1) == 10)) { argdata.erase(argdata.end()-2, argdata.end()); } if (!argstr.empty()) argstr += "&"; argstr += argname + "=" + argdata; } else if (!filename.empty()) { // filedata includes a trailing that we must remove if ((filedata.size() > 1) && (*(filedata.end()-2) == 13) && (*(filedata.end()-1) == 10)) { filedata.erase(filedata.end()-2, filedata.end()); } // create tmp_name for storing the file on disk, using the current timestamp text_t tmp_name(time(NULL)); tmp_name = filename_cat(gsdlhome, "tmp", tmp_name); char *tmp_name_c = tmp_name.getcstr(); // write the file data to disk outconvertclass out; ofstream filestream(tmp_name_c, ios::out | ios::binary); filestream << out << filedata; filestream.close(); delete tmp_name_c; // populate the fields of a fileupload_t and put it in the // fileuploads map fileupload_t fu; // note that filename currently may or may not include the path since // some browsers (e.g. IE) include the path while others // (e.g. mozilla) do not. we should probably remove the path from // this field here to get a consistent value across all browsers. text_t::iterator slash = findlastchar(filename.begin(), filename.end(), '\\'); if (slash != filename.end()) { filename = substr(slash+1, filename.end()); } fu.name = filename; fu.type = filetype; // size has yet to be implemented fu.size = filedata.size(); fu.tmp_name = tmp_name; fileuploads[argname] = fu; } } isfile = false; argname.clear(); argdata.clear(); filename.clear(); filedata.clear(); filetype.clear(); } // parse data obtained through a CGI POST request text_t parse_post_data (text_t &content_type, text_t &raw_post_data, fileupload_tmap &fileuploads, const text_t &gsdlhome) { text_t argstr; text_t::iterator content_type_begin = content_type.begin(); text_t::iterator content_type_end = content_type.end(); if (findword(content_type_begin, content_type_end, "multipart/form-data") == content_type_end) { // a simple post request return raw_post_data; } else { // multipart/form data - may contain one or more uploaded files /* content_type should look something like the following multipart/form-data; boundary=---------------------------7d411e1a50330 while raw_post_data will be as follows -----------------------------7d43e73450330CRLF Content-Disposition: form-data; name="e" d-0testss--1-0-00---4----0--0-110--1en-Zz-1---10-about-0--00031-001utfZz-8-0 -----------------------------7d43e73450330 Content-Disposition: form-data; name="afile"; filename="C:\somedoc.doc" Content-Type: application/msword */ // first get the boundary from content-type text_t::iterator boundary_begin = findword(content_type_begin, content_type_end, "boundary="); if (boundary_begin+9 < content_type_end) { // skip over "boundary=" part of string boundary_begin += 9; } else { // error cerr << "Error: malformed boundary? '" << content_type << "'" << endl; return ""; } text_t boundary = substr(boundary_begin, getline(boundary_begin, content_type_end, false)); int boundary_len = boundary.size(); text_t argname, argdata, filename, filedata, filetype; bool isfile = false; text_t::iterator data_here = raw_post_data.begin(); text_t::iterator data_end = raw_post_data.end(); while (data_here != data_end) { // get the next available line (including the trailing text_t line = substr(data_here, getline(data_here, data_end, true)); data_here += line.size(); text_t::iterator line_begin = line.begin(); text_t::iterator line_end = line.end(); if (findword(line_begin, line_end, boundary) != line_end) { // we've found a boundary process_post_section(argname, argdata, filename, filedata, filetype, isfile, argstr, fileuploads, gsdlhome); } else if (findword(line_begin, line_end, "Content-Disposition: form-data") != line_end) { // we've found the the beginning of a new section argname.clear(); argdata.clear(); // get the name of this piece of form data text_t::iterator it = findword(line_begin, line_end, "name=\""); if (it == line_end) break; // error - this shouldn't happen it = findchar(it, line_end, '"'); if ((it != line_end) && (it+1 != line_end)) { argname = substr(it+1, findchar(it+1, line_end, '"')); } // if this piece of form data contains filename="" it's a file // upload and needs to be treated special it = (findword(line_begin, line_end, "filename=\"")); if (it != line_end) { // we've found a file upload isfile = true; it = findchar(it, line_end, '"'); if ((it != line_end) && (it+1 != line_end)) { filename = substr(it+1, findchar(it+1, line_end, '"')); } // the next line is the content-type of this section line = substr(data_here, getline(data_here, data_end, true)); data_here += line.size(); line_begin = line.begin(); line_end = line.end(); it = (findword(line_begin, line_end, "Content-Type: ")); if (it != line_end) { filetype = substr(it+14, getline(it, line_end, false)); } } // eat up the next line as it's just a on it's own data_here += 2; } else { if (isfile) filedata += line; else argdata += line; } } // process last section process_post_section(argname, argdata, filename, filedata, filetype, isfile, argstr, fileuploads, gsdlhome); return argstr; } } // convert %xx and + to their appropriate equivalents // IE 6.0 and later use "%u" followed by 4 hex digits... MS IIS extension! // NOTE: this method is crap. It assumes the input encoding is utf-8. If it // actually was, then this returns utf-8, and needs to_uni on the // result to get it back to unicode. If the encoding wasn't utf-8, then the // output may be crap. Seems to work for 8 bit encodings. // Really, this should be given the encoding, and should always return unicode. void decode_cgi_arg (text_t &argstr) { text_t::iterator in = argstr.begin(); text_t::iterator out = in; text_t::iterator end = argstr.end(); while (in != end) { if (*in == '+') *out = ' '; else if (*in == '%') { unsigned short c = '%'; ++in; if (in != end) { // this is an encoding... if (*in == 'u') { // convert %uHHHH to unicode then current encoding // this assumes a short int is at least 16 bits... ++in; if (in != end) c=hexdigit(*in++) << 12; if (in != end) c+=hexdigit(*in++) << 8; if (in != end) c+=hexdigit(*in++) << 4; if (in != end) c+=hexdigit(*in); /* BAD!! The following assumes the interface is using utf-8. But at this point we don't know what encoding we are using, unless we can parse it out of the string we are currently decoding... */ text_t uni=" "; uni[0]=c; text_t utf8=to_utf8(uni); int last_byte=utf8.size()-1; for (int i=0;ialert("hacked")) and log poisoning (apache writes unrecognised URLs // into log. If the user entered c=garbage in the URL, it gets written out into the // apache log and that log file can be included in a local file inclusion (LFI) or // remote file include (RFI) attack. // This function encodes <>, &, ", ', / which are scripting chars or chars which can be used to // break out of an html/XML/javascript context. void safe_cgi_arg (text_t &argstr) { text_t::iterator in = argstr.begin(); text_t out = ""; text_t::iterator end = argstr.end(); while (in != end) { if (*in == '<') out += "%3C"; else if (*in == '>') out += "%3E"; else if (*in == '&') out += "%26"; else if (*in == '\"') out += "%22"; else if (*in == '\'') out += "%27"; else if (*in == '/') out += "%2F"; else { // append whatever char is in *in, but as a char, not int //out += *in; // appends as int out += " "; // append placeholder character out[out.size()-1] = *in; // now set location containing placeholder to what's in *in } ++in; } argstr.erase (argstr.begin(), end); argstr += out; } // split up the cgi arguments void split_cgi_args (const cgiargsinfoclass &argsinfo, text_t argstr, cgiargsclass &args) { args.clear(); text_t::const_iterator here = argstr.begin(); text_t::const_iterator end = argstr.end(); text_t key, value; // extract out the key=value pairs while (here != end) { // get the next key and value pair here = getdelimitstr (here, end, '=', key); here = getdelimitstr (here, end, '&', value); // convert %xx and + to their appropriate equivalents decode_cgi_arg (value); safe_cgi_arg(value); // mitigate obvious cross-site scripting hacks in URL cgi-params value.setencoding(1); // other encoding // store this key=value pair if (!key.empty()) { // if arg occurs multiple times (as is the case with multiple // checkboxes using the same name) we'll create a comma separated // list of all the values (this uses a hack that encodes naturally // occurring commas as %2C - values will therefore need to be decoded // again before use) - it should use an array instead const cgiarginfo *info = argsinfo.getarginfo (key); if (info==NULL) { // If info is NULL, we can't tell if the arg is multiple value or not // Because we need to have dynamically named arguments multivalued, we // will always assume multiplevalue = true // If the arg is not multi valued, then you need to decode the commas. if (args.getarg(key)==NULL) { args.setarg (key, encode_commas(value), cgiarg_t::cgi_arg); } else { text_t newvalue = args[key]; newvalue += "," + encode_commas(value); newvalue.setencoding(1); // other encoding args.setarg (key, newvalue, cgiarg_t::cgi_arg); } } else { if (info->multiplevalue) { text_t newvalue = args[key]; if (args.lookupcgiarg(key).source == cgiarg_t::cgi_arg) newvalue += ","; newvalue += encode_commas(value); newvalue.setencoding(1); // other encoding args.setarg (key, newvalue, cgiarg_t::cgi_arg); } else { args.setarg (key, value, cgiarg_t::cgi_arg); } } } } } text_t encode_commas (const text_t &intext) { text_t outtext; text_t::const_iterator here = intext.begin (); text_t::const_iterator end = intext.end (); while (here != end) { if (*here == ',') outtext += "%2C"; else outtext.push_back (*here); ++here; } return outtext; } text_t decode_commas (const text_t &intext) { text_t outtext; text_t::const_iterator here = intext.begin (); text_t::const_iterator end = intext.end (); // for loop int intext_len = intext.size(); for(int i = 0; i < intext_len; i++) { if ((i+2)= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9')) || (c == '%') || (c == '-')) { // alphanumeric character outtext.push_back(c); } else if (c == ' ') { // space outtext.push_back('+'); } else if (c > 255) { // not utf-8 character cerr << "WARNING: expected utf-8 char, but got unicode!!\n"; } else { // everything else outtext.push_back('%'); c2hex(c, ttmp); outtext += ttmp; } ++here; } return outtext; } // takes unicode input text_t cgi_safe_unicode (const text_t &intext) { text_t outtext; text_t::const_iterator here = intext.begin (); text_t::const_iterator end = intext.end (); unsigned short c; text_t ttmp; while (here != end) { c = *here; if (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9')) || (c == '%') || (c == '-')) { // alphanumeric character outtext.push_back(c); } else if (c == ' ') { // space outtext.push_back('+'); } else if (c > 127) { // unicode character unsigned char buf[3]; // up to 3 bytes buf[0]='\0';buf[1]='\0';buf[2]='\0'; output_utf8_char(c,buf, buf+2); outtext.push_back('%'); c2hex(buf[0], ttmp); outtext += ttmp; outtext.push_back('%'); c2hex(buf[1], ttmp); outtext += ttmp; if (buf[2]) { outtext.push_back('%'); c2hex(buf[2], ttmp); outtext += ttmp; } } else { // everything else outtext.push_back('%'); c2hex(c, ttmp); outtext += ttmp; } ++here; } return outtext; } static text_t::const_iterator get_next_save_arg (text_t::const_iterator first, text_t::const_iterator last, text_t &argname) { first = getdelimitstr (first, last, '-', argname); return first; } // check_save_conf_str checks the configuration string for // the saved args and makes sure it does not conflict with // the information about the arguments. If an error is encountered // it will return false and the program should not produce any // output. bool check_save_conf_str (const text_t &saveconf, const cgiargsinfoclass &argsinfo, ostream &logout) { outconvertclass text_t2ascii; text_tset argsset; text_t::const_iterator saveconfhere = saveconf.begin (); text_t::const_iterator saveconfend = saveconf.end (); text_t argname; const cgiarginfo *info; // first check to make sure all saved arguments can be saved while (saveconfhere != saveconfend) { saveconfhere = get_next_save_arg (saveconfhere, saveconfend, argname); if (!argname.empty()) { // save the argument name for later argsset.insert (argname); // check the argument info = argsinfo.getarginfo (argname); if (info == NULL) { logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" is used in the configuration string for the\n" << "saved arguments but does not exist as a valid argument.\n\n"; return false; } if (info->savedarginfo == cgiarginfo::mustnot) { logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" is used in the configuration string for the\n" << "saved arguments but has been specified as an argument whose\n" << "state must not be saved.\n\n"; return false; } } } // next check that all saved arguments that should be saved // are saved cgiargsinfoclass::const_iterator argsinfohere = argsinfo.begin (); cgiargsinfoclass::const_iterator argsinfoend = argsinfo.end (); while (argsinfohere != argsinfoend) { if (((*argsinfohere).second.savedarginfo == cgiarginfo::must) && (argsset.find((*argsinfohere).second.shortname) == argsset.end())) { logout << text_t2ascii << "Error: the cgi argument \"" << (*argsinfohere).second.shortname << "\" was specified as needing to\n" << "be save but was not listed in the saved arguments.\n\n"; return false; } ++argsinfohere; } return true; // made it, no clashes } // create_save_conf_str will create a configuration string // based on the information in argsinfo. This method of configuration // is not recomended as small changes can produce large changes in // the resulting configuration string (for instance a totally different // ordering). Only arguments which "must" be saved are included in // the resulting string. text_t create_save_conf_str (const cgiargsinfoclass &argsinfo, ostream &/*logout*/) { cgiargsinfoclass::const_iterator argsinfohere = argsinfo.begin (); cgiargsinfoclass::const_iterator argsinfoend = argsinfo.end (); text_t saveconf; bool first = true; while (argsinfohere != argsinfoend) { // save this argument if it must be saved if ((*argsinfohere).second.savedarginfo == cgiarginfo::must) { if (!first) saveconf.push_back ('-'); else first = false; saveconf += (*argsinfohere).second.shortname; } ++argsinfohere; } return saveconf; } // expand_save_args will expand the saved arguments based // on saveconf placing the results in args if they are not // already defined. If it encounters an error it will return false // and output more information to logout. bool expand_save_args (const cgiargsinfoclass &argsinfo, const text_t &saveconf, cgiargsclass &args, ostream &logout) { outconvertclass text_t2ascii; text_t *arg_e = args.getarg("e"); if (arg_e == NULL) return true; // no compressed arguments if (arg_e->empty()) return true; // no compressed arguments text_t argname, argvalue; const cgiarginfo *argnameinfo; text_t::const_iterator saveconfhere = saveconf.begin(); text_t::const_iterator saveconfend = saveconf.end(); text_t::iterator arg_ebegin = arg_e->begin(); text_t::iterator arg_eend = arg_e->end(); text_t::iterator arg_ehere = arg_ebegin; while (saveconfhere != saveconfend && arg_ehere != arg_eend) { saveconfhere = get_next_save_arg (saveconfhere, saveconfend, argname); if (!argname.empty()) { // found another entry argnameinfo = argsinfo.getarginfo (argname); if (argnameinfo == NULL) { // no information about the argument could be found // we can't keep going because we don't know whether // this argument is a single or multiple character value logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" was specified as being a compressed argument\n" << "but no information about it could be found within the " << "cgiargsinfoclass.\n"; return false; } else { // found the argument information if (argnameinfo->multiplechar) { text_t::const_iterator sav = arg_ehere; arg_ehere = getdelimitstr (arg_ehere, arg_eend, '-', argvalue); if (distance(arg_ebegin, arg_ehere) > 2) { // replace any '-' chars escaped with 'Zz' bool first = true; while ((*(arg_ehere-3) == 'Z') && (*(arg_ehere-2) == 'z')) { if (first) argvalue.clear(); // Hey, here's a wild idea. Why don't we check that there is // another hyphen in the cgiarge before we get a pointer to it and // add one. That way we are far less likely to wander off into // random memory merrily parsing arguments that are then lovingly // spewed all over the HTML page returned at the usage logs. text_t::iterator minus_itr = findchar (arg_ehere, arg_eend, '-'); if (minus_itr == arg_eend) { logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" was specified as being a compressed argument but we have run out of cgiarge to decompress!\n"; return false; } arg_ehere = minus_itr + 1; while (sav != (arg_ehere-1)) { if (!((*sav == 'Z') && (*(sav+1) == 'z') && (*(sav+2) == '-')) && !((*(sav-1) == 'Z') && (*sav == 'z') && (*(sav+1) == '-'))) argvalue.push_back (*sav); ++sav; } first = false; } } argvalue.setencoding(1); // other encoding if (!argvalue.empty()) args.setdefaultarg (argname, argvalue, cgiarg_t::compressed_arg); } else { args.setdefaultcarg (argname,*arg_ehere, cgiarg_t::compressed_arg); ++arg_ehere; } } } } return true; } // adds the default values for those arguments which have not // been specified void add_default_args (const cgiargsinfoclass &argsinfo, cgiargsclass &args, ostream &/*logout*/) { cgiargsinfoclass::const_iterator argsinfohere = argsinfo.begin (); cgiargsinfoclass::const_iterator argsinfoend = argsinfo.end (); while (argsinfohere != argsinfoend) { if ((*argsinfohere).second.defaultstatus != cgiarginfo::none) { args.setdefaultarg ((*argsinfohere).second.shortname, (*argsinfohere).second.argdefault, cgiarg_t::default_arg); } ++argsinfohere; } } void add_fileupload_args (const cgiargsinfoclass &argsinfo, cgiargsclass &args, fileupload_tmap &fileuploads, ostream &logout) { const cgiarginfo *info = argsinfo.getarginfo("a"); fileupload_tmap::const_iterator this_file = fileuploads.begin(); fileupload_tmap::const_iterator end_file = fileuploads.end(); while (this_file != end_file) { const cgiarginfo *info = argsinfo.getarginfo((*this_file).first); if (info != NULL) { if ((*info).fileupload && (file_exists((*this_file).second.tmp_name))) { args.setargfile((*this_file).first, (*this_file).second); } } this_file++; } } // compress_save_args will compress the arguments and return // them in compressed_args. If an error was encountered // compressed_args will be set to to "", an error will be // written to logout, and the function will return false. bool compress_save_args (const cgiargsinfoclass &argsinfo, const text_t &saveconf, cgiargsclass &args, text_t &compressed_args, outconvertclass &outconvert, ostream &logout) { outconvertclass text_t2ascii; compressed_args.clear(); text_t argname, argvalue; const cgiarginfo *argnameinfo; text_t::const_iterator saveconfhere = saveconf.begin(); text_t::const_iterator saveconfend = saveconf.end(); while (saveconfhere != saveconfend) { saveconfhere = get_next_save_arg (saveconfhere, saveconfend, argname); if (!argname.empty()) { // found another entry argnameinfo = argsinfo.getarginfo (argname); if (argnameinfo == NULL) { // no information about the argument could be found // we can't keep going because we don't know whether // this argument is a single or multiple character value logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" was specified as being a compressed argument\n" << "but no information about it could be found within the " << "cgiargsinfoclass.\n"; compressed_args.clear(); return false; } else { // found the argument information if (argnameinfo->multiplechar) { // multiple character argument -- sort out any '-' chars if (args["w"]=="utf-16be") // browsers don't like \0 in urls... compressed_args += minus_safe (args[argname], false); else compressed_args += minus_safe (outconvert.convert(args[argname]), true); if (saveconfhere != saveconfend) compressed_args.push_back ('-'); } else { // single character argument if (args[argname].size() == 0) { logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" was specified as being a compressed argument which\n" << "should have a one character value but it was empty.\n\n"; compressed_args.clear (); return false; } else if (args[argname].size() > 1) { logout << text_t2ascii << "Error: the cgi argument \"" << argname << "\" was specified as being a compressed argument which\n" << "should have a one character value but it had multiple characters.\n\n"; compressed_args.clear (); return false; } // everything is ok compressed_args += args[argname]; } } } } return true; } // args_tounicode converts any arguments which are not in unicode // to unicode using inconvert void args_tounicode (cgiargsclass &args, inconvertclass &inconvert) { cgiargsclass::iterator here = args.begin(); cgiargsclass::iterator end = args.end(); while (here != end) { if ((*here).second.value.getencoding() > 0) { // Call reset() before converting each argument, to prevent problems when converting the last // argument left the converter in a bad state inconvert.reset(); (*here).second.value = inconvert.convert((*here).second.value); } ++here; } } // fcgienv will be loaded with environment name-value pairs // if using fastcgi (had to do this as getenv doesn't work // with our implementation of fastcgi). if fcgienv is empty // we'll simply use getenv text_t gsdl_getenv (const text_t &name, text_tmap &fcgienv) { if (fcgienv.empty()) { char *n = name.getcstr(); char *v = getenv(n); delete []n; if (v != NULL) return v; return g_EmptyText; } else return fcgienv[name]; }