Changeset 33045 for main

Show
Ignore:
Timestamp:
03.05.2019 21:46:42 (6 months ago)
Author:
ak19
Message:

1. Dr Bainbridge's fix for the Windows encoding issue when online doc editing Title meta containing non-ASCII/non-basic unicode chars, was to URL encode the QUERY_STRING so that CGI.pm would then do the right thing. This was because the problem had turned out not to be env vars on Windows, which could set and recall unicode and win-1252 chars just fine and therefore retained what was entered. The problem was that on Windows, the perl did not get the actual chars transmitted in the case of UTF-8 whereas win-1252 was received looking apparently like a unicode codepoint, but then in the latter case the utf82unicode call in metadataaction would then clobber the codepoint in attempting to do utf82unicode on it. On linux, perl happened to receive the chars as utf8-encoded bytes and so utf82unicode worked (to make them unicode aware strings?). The real problem was that it could go wrong in different ways on windows, since utf8 chars weren't even received properly by perl/CGI, so we didn't even need to start worrying about them getting sometimes clobbered in metadataaction. URL encoding the QUERY_STRING was meant to solve this. Except that URL encoding the whole QUERY_STRING made CGI.pm choke on the equals signs between param name and param value and possibly other chars. I don't know why. I found that URL encoding just the param values in the QUERY_STRING works, so I am committing that. 2. Renaming the recently introduced string2hex() in JavaScript to debug_unicode_string and stringToHex() in Java to debugUnicodeString() to be more consistent with the perl variant, debug_unicode_string. Also like in the perl, the JavaScript and Java now print the unicode value inside curly braces for better legibility. 3. Leaving in some commented out encoding debugging statements in the Java and JavaScript code, but not committing the debugging on the perl side.
4. Some further improvements to overloaded functions in GSXML using debug_unicode_string for converting XML elements or printing them to logs.

Location:
main/trunk/greenstone3
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/build/GS2PerlConstructor.java

    r32892 r33045  
    2020import java.io.InputStream; 
    2121import java.io.IOException; 
     22//import java.io.UnsupportedEncodingException; 
     23//import java.net.URLEncoder; 
     24//import java.nio.charset.StandardCharsets; 
    2225import java.util.ArrayList; 
    2326import java.util.Vector; 
     27 
    2428 
    2529import org.apache.log4j.*; 
     
    325329        command.add(cgi_directory + File.separator + "metadata-server.pl"); 
    326330         
     331         
    327332        // Need to set QUERY_STRING and REQUEST_METHOD=GET in environment 
    328333        // Also set GS3_AUTHENTICATED, to allow running metadata-server.pl with mod (set and remove) commands 
    329334        // http://www.cgi101.com/class/ch3/text.html 
     335         
     336        // And need to ensure that special characters won't get clobbered on Windows by perl/CGI.pm (https://www.nntp.perl.org/group/perl.perl5.porters/2016/10/msg240120.html),  
     337        // URL encode the query_string, as at https://stackoverflow.com/questions/10786042/java-url-encoding-of-query-string-parameters 
     338         
     339        // perl/CGI.pm doesn't like us URL encoding the entire query string such as the equal sign between each paramName and paramValue. 
     340        // So we URL encode each paramValue separately, which is done in GS2Construct.java::runCommand() 
     341        /* 
     342        String old_query_string = this.query_string; 
     343        try{ 
     344            this.query_string = URLEncoder.encode(this.query_string, StandardCharsets.UTF_8.name()); 
     345            //this.query_string = this.query_string.replace("+","%2B"); // https://stackoverflow.com/questions/1211229/in-a-url-should-spaces-be-encoded-using-20-or 
     346        } catch(UnsupportedEncodingException uee) { 
     347            logger.warn("**** Unable to encode query_string in UTF-8, so attempting to continue with the unencoded value of query_string"); 
     348            this.query_string = old_query_string;  
     349        } 
     350        */ 
     351         
    330352        String[] envvars = { 
    331353            "QUERY_STRING=" + this.query_string, 
     
    464486     
    465487    ///logger.info("### Running logged command = " + command_str); 
    466  
     488    /* 
     489    // USEFUL DEBUGGING WHEN USING DOC EDITOR TO MODIFY ex.Title META THAT CONTAINS NON-BASIC ASCII CHARS 
     490    logger.error("### Running logged command = " + command_str); 
     491    logger.error("### DEBUG Running logged command = " + Misc.stringToHex(command_str)); 
     492    if(envvars != null) { 
     493        for(int i = 0; i < envvars.length; i++) { 
     494            logger.error("### envvar = " + envvars[i]); 
     495            logger.error("### DEBUG envvar = " + Misc.stringToHex(envvars[i])); 
     496        } 
     497    } 
     498    */ 
     499     
    467500    // This is where we create and run our perl process safely 
    468501    SafeProcess perlProcess = createPerlProcess(command, envvars, dir); //  dir can be null 
     
    755788            if(this.source == SafeProcess.STDERR) { 
    756789                System.err.println("STDERR: " + line); 
     790                logger.info("@@@@@@@@ STDERR: " + line ); 
    757791            } else { 
    758792                System.err.println("STDOUT: " + line); 
     793                logger.info("@@@@@@@@ STDOUT: " + line ); 
    759794            }    
    760795             
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2Construct.java

    r32892 r33045  
    2323import java.io.FileWriter; 
    2424import java.io.Serializable; 
     25import java.io.UnsupportedEncodingException; 
     26import java.net.URLEncoder; 
     27import java.nio.charset.StandardCharsets; 
    2528import java.util.Collections; 
    2629import java.util.Iterator; 
     
    746749 
    747750        Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
     751         
     752        //GSXML.elementToLogAsString("###      Extracted param_list: ", param_list, true); 
     753        //GSXML.elementToLogAsUnicodeDebugString("### DEBUG Extracted param_list: ", param_list, true); 
     754         
    748755        HashMap<String, Serializable> params = GSXML.extractParams(param_list, false); 
    749756 
     
    838845                String paramvalue = (String) entry.getValue(); 
    839846 
    840                 querystring.append(paramname + "=" + paramvalue); 
     847                // And need to ensure that special characters won't get clobbered on Windows by perl/CGI.pm (https://www.nntp.perl.org/group/perl.perl5.porters/2016/10/msg240120.html),  
     848                // URL encode the query_string, as at https://stackoverflow.com/questions/10786042/java-url-encoding-of-query-string-parameters 
     849         
     850                // perl/CGI.pm doesn't like us URL encoding the entire query string such as the equal sign between each paramName and paramValue. 
     851                // So we URL encode each paramValue separately, which is done in GS2Construct.java::runCommand() 
     852                querystring.append(paramname + "=" + urlEncodeValue(paramname, paramvalue)); 
    841853                if (i.hasNext()) { 
    842854                    querystring.append("&"); 
     
    882894    //************************ 
    883895 
     896    private String urlEncodeValue(String paramName, String paramVal) { 
     897        String oldParamVal = paramVal; 
     898        try{ 
     899            paramVal = URLEncoder.encode(paramVal, StandardCharsets.UTF_8.name());           
     900        } catch(UnsupportedEncodingException uee) { 
     901            logger.warn("**** Unable to encode query_string param " + paramName + " in UTF-8, so attempting to continue with its unencoded value."); // don't output param value to log, in case of sensitive data? 
     902            paramVal = oldParamVal;  
     903        } 
     904        return paramVal; 
     905    } 
     906     
    884907    /** parse the collect directory and return a list of collection names */ 
    885908    protected String[] getCollectionList() 
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/GSXML.java

    r33043 r33045  
    16241624    } 
    16251625 
    1626     public static void elementToLogAsString(Element e, boolean indent) 
    1627     { 
    1628         String str = elementToString(e, indent); 
     1626    private static void elementToLogAsString(String prefix, Element e, boolean indent, boolean debugEncoding) 
     1627    { 
     1628        String str = prefix + "\n" + elementToString(e, indent, debugEncoding); 
    16291629        System.err.println(str); 
    1630         logger.error(str); 
     1630        logger.info(str); 
     1631    } 
     1632     
     1633    // hex/unicode codepoint used only for those chars that are beyond printable/basic ASCII 
     1634    public static void elementToLogAsUnicodeDebugString(String prefix, Element e, boolean indent) 
     1635    { 
     1636        elementToLogAsString(prefix, e, indent, true); 
     1637    } 
     1638     
     1639    public static void elementToLogAsString(String prefix, Element e, boolean indent) 
     1640    { 
     1641        elementToLogAsString(prefix, e, indent, false); 
    16311642    } 
    16321643 
    16331644    // pass in debugEncoding=true to investigate encoding issues. This function will then return non-basic ASCII characters in hex 
    1634     public static String elementToString(Element e, boolean indent, boolean debugEncoding) 
     1645    private static String elementToString(Element e, boolean indent, boolean debugEncoding) 
    16351646    { 
    16361647        String str = ""; 
     
    16521663             
    16531664            // if debugging encoding issues, then encode unicode code pts as hex for all but non-alphanumeric and space/tab/newline chars 
    1654             if(debugEncoding) str = Misc.stringToHex(str); 
     1665            if(debugEncoding) str = Misc.debugUnicodeString(str); 
    16551666        } 
    16561667        catch (Exception ex) 
     
    16671678    { 
    16681679        return elementToString(e, indent, false); 
     1680    } 
     1681     
     1682    // hex/unicode codepoint used only for those chars that are beyond printable/basic ASCII 
     1683    public static String elementToUnicodeDebugString(Element e, boolean indent) 
     1684    { 
     1685        return elementToString(e, indent, true); 
    16691686    } 
    16701687     
  • main/trunk/greenstone3/src/java/org/greenstone/util/Misc.java

    r33043 r33045  
    5858     
    5959     
    60     // Debugging function to print a string's non-basic chars in hex 
     60    // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII 
     61    // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 
    6162    // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java 
    62     public static String stringToHex(String str) { 
     63    public static String debugUnicodeString(String str) { 
    6364      String result = ""; 
    6465      for(int i = 0; i < str.length(); i++) { 
     
    6768            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
    6869            // If the unicode character code pt is less than the ASCII code for space or greater than that for tilde, let's display the char in hex (x0000 format) 
    69             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space to tilda, TAB, LF, CR are printable 
     70            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
    7071                result += str.charAt(i); 
    7172            } else { 
    72                 result += "x" + String.format("%04x", charCode); 
     73                result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint} 
    7374            } 
    7475      } 
  • main/trunk/greenstone3/web/interfaces/default/js/javascript-global-functions.js

    r33043 r33045  
    1313} 
    1414 
    15 // Debugging function to print a string's non-basic chars in hex 
     15// Debugging function to print a string's non-basic chars in hex. So does string2hex on all non-basic and non-printable ASCII chars 
     16// Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 
    1617// Based on https://stackoverflow.com/questions/36637146/javascript-encode-string-to-hex/36637293 
    1718// https://stackoverflow.com/questions/21647928/javascript-unicode-string-to-hex 
    18 gs.functions.string2hex = function(str) { 
     19gs.functions.debug_unicode_string = function(str) { 
    1920    var hex, i; 
    2021 
     
    2627        if(charcode < 20 || charcode > 126) { //doesn't work: if(str.charAt(i) < ' ' || str.charAt(i) > '~') { 
    2728            hex = charcode.toString(16); 
    28             result += "x" + ("000"+hex).slice(-4); 
     29            result += "x{" + ("000"+hex).slice(-4) + "}"; // looks like: x{4-char-codepoint} 
    2930        } 
    3031        else { 
     
    785786gs.functions.setArchivesMetadata = function(collection, site, documentID, metadataName, metadataPosition, metadataValue, prevMetadataValue, metamode, successResponseFunction, errorResponseFunction) 
    786787{ 
     788    if(metadataValue) console.log("metaval: " + metadataValue + " | " + gs.functions.debug_unicode_string(metadataValue)); //metadataValue.hexEncode() 
     789    if(prevMetadataValue) console.log("prevmetaval: " + prevMetadataValue + " | " + gs.functions.debug_unicode_string(prevMetadataValue)); 
     790     
    787791    if( typeof errorResponseFunction === 'undefined' ) { errorResponseFunction = null; } // force error callback to be defined: either null or has value 
    788792