Changeset 33045


Ignore:
Timestamp:
2019-05-03T21:46:42+12:00 (5 years ago)
Author:
ak19
Message:
  1. Dr Bainbridge's fix for the Windows encoding issue, which occurred when using the online doc editor to edit Title meta containing non-ASCII/non-basic unicode chars, was to URL encode the QUERY_STRING so that CGI.pm would then do the right thing. This was because the problem had turned out not to be env vars on Windows, which could set and recall unicode and win-1252 chars just fine and therefore retained what was entered. The problem was that on Windows, perl did not receive the actual chars transmitted in the case of UTF-8, whereas win-1252 chars were received looking apparently like unicode codepoints, but in the latter case the utf82unicode call in metadataaction would then clobber the codepoint in attempting to do utf82unicode on it. On linux, perl happened to receive the chars as utf8-encoded bytes and so utf82unicode worked (to make them unicode aware strings?). The real problem was that it could go wrong in different ways on Windows: since utf8 chars weren't even received properly by perl/CGI, there was no point yet in worrying about them sometimes getting clobbered in metadataaction. URL encoding the QUERY_STRING was meant to solve this. However, URL encoding the whole QUERY_STRING made CGI.pm choke on the equals signs between param name and param value, and possibly on other chars. I don't know why. I found that URL encoding just the param values in the QUERY_STRING works, so I am committing that. 2. Renaming the recently introduced string2hex() in JavaScript to debug_unicode_string() and stringToHex() in Java to debugUnicodeString(), to be more consistent with the perl variant, debug_unicode_string. Also, as in the perl, the JavaScript and Java now print the unicode value inside curly braces for better legibility. 3. Leaving in some commented out encoding debugging statements in the Java and JavaScript code, but not committing the debugging on the perl side. 4. 
Some further improvements to overloaded functions in GSXML using debug_unicode_string for converting XML elements or printing them to logs.
Location:
main/trunk/greenstone3
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/build/GS2PerlConstructor.java

    r32892 r33045  
    2020import java.io.InputStream;
    2121import java.io.IOException;
     22//import java.io.UnsupportedEncodingException;
     23//import java.net.URLEncoder;
     24//import java.nio.charset.StandardCharsets;
    2225import java.util.ArrayList;
    2326import java.util.Vector;
     27
    2428
    2529import org.apache.log4j.*;
     
    325329        command.add(cgi_directory + File.separator + "metadata-server.pl");
    326330       
     331       
    327332        // Need to set QUERY_STRING and REQUEST_METHOD=GET in environment
    328333        // Also set GS3_AUTHENTICATED, to allow running metadata-server.pl with mod (set and remove) commands
    329334        // http://www.cgi101.com/class/ch3/text.html
     335       
     336        // And need to ensure that special characters won't get clobbered on Windows by perl/CGI.pm (https://www.nntp.perl.org/group/perl.perl5.porters/2016/10/msg240120.html),
     337        // URL encode the query_string, as at https://stackoverflow.com/questions/10786042/java-url-encoding-of-query-string-parameters
     338       
     339        // perl/CGI.pm doesn't like us URL encoding the entire query string such as the equal sign between each paramName and paramValue.
     340        // So we URL encode each paramValue separately, which is done in GS2Construct.java::runCommand()
     341        /*
     342        String old_query_string = this.query_string;
     343        try{
     344            this.query_string = URLEncoder.encode(this.query_string, StandardCharsets.UTF_8.name());
     345            //this.query_string = this.query_string.replace("+","%2B"); // https://stackoverflow.com/questions/1211229/in-a-url-should-spaces-be-encoded-using-20-or
     346        } catch(UnsupportedEncodingException uee) {
     347            logger.warn("**** Unable to encode query_string in UTF-8, so attempting to continue with the unencoded value of query_string");
     348            this.query_string = old_query_string;
     349        }
     350        */
     351       
    330352        String[] envvars = {
    331353            "QUERY_STRING=" + this.query_string,
     
    464486   
    465487    ///logger.info("### Running logged command = " + command_str);
    466 
     488    /*
     489    // USEFUL DEBUGGING WHEN USING DOC EDITOR TO MODIFY ex.Title META THAT CONTAINS NON-BASIC ASCII CHARS
     490    logger.error("### Running logged command = " + command_str);
     491    logger.error("### DEBUG Running logged command = " + Misc.stringToHex(command_str));
     492    if(envvars != null) {
     493        for(int i = 0; i < envvars.length; i++) {
     494            logger.error("### envvar = " + envvars[i]);
     495            logger.error("### DEBUG envvar = " + Misc.stringToHex(envvars[i]));
     496        }
     497    }
     498    */
     499   
    467500    // This is where we create and run our perl process safely
    468501    SafeProcess perlProcess = createPerlProcess(command, envvars, dir); //  dir can be null
     
    755788            if(this.source == SafeProcess.STDERR) {
    756789                System.err.println("STDERR: " + line);
     790                logger.info("@@@@@@@@ STDERR: " + line );
    757791            } else {
    758792                System.err.println("STDOUT: " + line);
     793                logger.info("@@@@@@@@ STDOUT: " + line );
    759794            }   
    760795           
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2Construct.java

    r32892 r33045  
    2323import java.io.FileWriter;
    2424import java.io.Serializable;
     25import java.io.UnsupportedEncodingException;
     26import java.net.URLEncoder;
     27import java.nio.charset.StandardCharsets;
    2528import java.util.Collections;
    2629import java.util.Iterator;
     
    746749
    747750        Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     751       
     752        //GSXML.elementToLogAsString("###      Extracted param_list: ", param_list, true);
     753        //GSXML.elementToLogAsUnicodeDebugString("### DEBUG Extracted param_list: ", param_list, true);
     754       
    748755        HashMap<String, Serializable> params = GSXML.extractParams(param_list, false);
    749756
     
    838845                String paramvalue = (String) entry.getValue();
    839846
    840                 querystring.append(paramname + "=" + paramvalue);
     847                // And need to ensure that special characters won't get clobbered on Windows by perl/CGI.pm (https://www.nntp.perl.org/group/perl.perl5.porters/2016/10/msg240120.html),
     848                // URL encode the query_string, as at https://stackoverflow.com/questions/10786042/java-url-encoding-of-query-string-parameters
     849       
     850                // perl/CGI.pm doesn't like us URL encoding the entire query string such as the equal sign between each paramName and paramValue.
     851                // So we URL encode each paramValue separately, which is done in GS2Construct.java::runCommand()
     852                querystring.append(paramname + "=" + urlEncodeValue(paramname, paramvalue));
    841853                if (i.hasNext()) {
    842854                    querystring.append("&");
     
    882894    //************************
    883895
     896    private String urlEncodeValue(String paramName, String paramVal) {
     897        String oldParamVal = paramVal;
     898        try{
     899            paramVal = URLEncoder.encode(paramVal, StandardCharsets.UTF_8.name());         
     900        } catch(UnsupportedEncodingException uee) {
     901            logger.warn("**** Unable to encode query_string param " + paramName + " in UTF-8, so attempting to continue with its unencoded value."); // don't output param value to log, in case of sensitive data?
     902            paramVal = oldParamVal;
     903        }
     904        return paramVal;
     905    }
     906   
    884907    /** parse the collect directory and return a list of collection names */
    885908    protected String[] getCollectionList()
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/GSXML.java

    r33043 r33045  
    16241624    }
    16251625
    1626     public static void elementToLogAsString(Element e, boolean indent)
    1627     {
    1628         String str = elementToString(e, indent);
     1626    private static void elementToLogAsString(String prefix, Element e, boolean indent, boolean debugEncoding)
     1627    {
     1628        String str = prefix + "\n" + elementToString(e, indent, debugEncoding);
    16291629        System.err.println(str);
    1630         logger.error(str);
     1630        logger.info(str);
     1631    }
     1632   
     1633    // hex/unicode codepoint used only for those chars that are beyond printable/basic ASCII
     1634    public static void elementToLogAsUnicodeDebugString(String prefix, Element e, boolean indent)
     1635    {
     1636        elementToLogAsString(prefix, e, indent, true);
     1637    }
     1638   
     1639    public static void elementToLogAsString(String prefix, Element e, boolean indent)
     1640    {
     1641        elementToLogAsString(prefix, e, indent, false);
    16311642    }
    16321643
    16331644    // pass in debugEncoding=true to investigate encoding issues. This function will then return non-basic ASCII characters in hex
    1634     public static String elementToString(Element e, boolean indent, boolean debugEncoding)
     1645    private static String elementToString(Element e, boolean indent, boolean debugEncoding)
    16351646    {
    16361647        String str = "";
     
    16521663           
    16531664            // if debugging encoding issues, then encode unicode code pts as hex for all chars except printable ASCII and tab/newline/carriage-return chars
    1654             if(debugEncoding) str = Misc.stringToHex(str);
     1665            if(debugEncoding) str = Misc.debugUnicodeString(str);
    16551666        }
    16561667        catch (Exception ex)
     
    16671678    {
    16681679        return elementToString(e, indent, false);
     1680    }
     1681   
     1682    // hex/unicode codepoint used only for those chars that are beyond printable/basic ASCII
     1683    public static String elementToUnicodeDebugString(Element e, boolean indent)
     1684    {
     1685        return elementToString(e, indent, true);
    16691686    }
    16701687   
  • main/trunk/greenstone3/src/java/org/greenstone/util/Misc.java

    r33043 r33045  
    5858   
    5959   
    60     // Debugging function to print a string's non-basic chars in hex
     60    // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII
     61    // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII
    6162    // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java
    62     public static String stringToHex(String str) {
     63    public static String debugUnicodeString(String str) {
    6364      String result = "";
    6465      for(int i = 0; i < str.length(); i++) {
     
    6768            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
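    6869            // If the unicode character code pt is less than the ASCII code for space or greater than that for tilde, let's display the char in hex (x{0000} format). NOTE(review): space is decimal 32 (0x20), but the check below compares against decimal 20 -- confirm which was intended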
    69             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space to tilda, TAB, LF, CR are printable
     70            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
    7071                result += str.charAt(i);
    7172            } else {
    72                 result += "x" + String.format("%04x", charCode);
     73                result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint}
    7374            }
    7475      }
  • main/trunk/greenstone3/web/interfaces/default/js/javascript-global-functions.js

    r33043 r33045  
    1313}
    1414
    15 // Debugging function to print a string's non-basic chars in hex
     15// Debugging function to print a string's non-basic chars in hex. So does string2hex on all non-basic and non-printable ASCII chars
     16// Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII
    1617// Based on https://stackoverflow.com/questions/36637146/javascript-encode-string-to-hex/36637293
    1718// https://stackoverflow.com/questions/21647928/javascript-unicode-string-to-hex
    18 gs.functions.string2hex = function(str) {
     19gs.functions.debug_unicode_string = function(str) {
    1920    var hex, i;
    2021
     
    2627        if(charcode < 20 || charcode > 126) { //doesn't work: if(str.charAt(i) < ' ' || str.charAt(i) > '~') {
    2728            hex = charcode.toString(16);
    28             result += "x" + ("000"+hex).slice(-4);
     29            result += "x{" + ("000"+hex).slice(-4) + "}"; // looks like: x{4-char-codepoint}
    2930        }
    3031        else {
     
    785786gs.functions.setArchivesMetadata = function(collection, site, documentID, metadataName, metadataPosition, metadataValue, prevMetadataValue, metamode, successResponseFunction, errorResponseFunction)
    786787{
     788    if(metadataValue) console.log("metaval: " + metadataValue + " | " + gs.functions.debug_unicode_string(metadataValue)); //metadataValue.hexEncode()
     789    if(prevMetadataValue) console.log("prevmetaval: " + prevMetadataValue + " | " + gs.functions.debug_unicode_string(prevMetadataValue));
     790   
    787791    if( typeof errorResponseFunction === 'undefined' ) { errorResponseFunction = null; } // force error callback to be defined: either null or has value
    788792   
Note: See TracChangeset for help on using the changeset viewer.