Changeset 33739

Show
Ignore:
Timestamp:
02.12.2019 23:15:50 (4 days ago)
Author:
ak19
Message:

Bugfix to GLI so that it is able to parse metadata.xml files containing & chars. Basically, after FilenameEncoding.fileToURLEncoding() calls URI.toASCIIString(), I replace all non-hex-entity ampersands with their hex entity. This preserves them correctly in the metadata.xml files. Some details: Because we need to use URI.toASCIIString(), which only converts char values > 127 in the URI to their URL encoded hex values (and not to hex entities of the form &#x....; as I now find, despite having coded carefully around hex entities with extra unnecessary effort), the code for encoding values less than 127 as their URL hex code has to be written manually. And escaping & to its hex entity becomes complicated by the fact that we don't want to modify any existing hex entities in the ASCII string produced by toASCIIString(). (This is not a complication when we're dealing with strings containing URL/% encoded hex values.) If more chars in metadata.xml pose a problem for GLI when parsing them, then a better solution should be invented to replace URI.toASCIIString(). If only stringToHex() would suffice. I think, however, that toASCIIString() properly URL encodes unicode codepoints, so that one can have 3 or 4 %XX in sequence, like %xx%xx%xx. Work to be done: plus signs in filenames still need to be handled. GLI handles them okay so far as loading the correct file-level metadata attached to a filename containing + signs, but when the collection is built, the metadata for such a file does not appear in doc.xml. This may be a perl issue if it happens on build?

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33738 r33739  
    355355    }    
    356356 
    357     /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */ 
    358      public static String fileNameToHex(String filename) { 
    359  
    360         String hexFilename = ""; 
    361         for(int i = 0; i < filename.length(); i++) { 
    362             int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code 
     357    /** 
     358     * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter 
     359     * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method. 
     360    */ 
     361    public static String stringToHex(String str) { 
     362 
     363        String hex_str = ""; 
     364        for(int i = 0; i < str.length(); i++) { 
     365            int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 
    363366             
    364367            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
    365368            // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 
    366             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too  
    367                 hexFilename += filename.charAt(i); 
     369            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too  
     370                hex_str += str.charAt(i); 
    368371            } else { 
    369                 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 
     372                hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 
    370373            } 
    371374        } 
    372375           
    373         return hexFilename; 
     376        return hex_str; 
    374377     } 
    375  
     378      
     379     
     380    /** Takes a String containing a single char and returns the hex entity for it */ 
     381    public static String hexEntityForChar(String char_as_string) { 
     382        int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code      
     383        String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";"; 
     384        return hexCodeStr; 
     385     }  
     386     
     387    /** 
     388     * Given a String containing 0 or more occurrences of CHARACTER, 
     389     * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;" 
     390     * Special care is taken where the CHARACTER to be replaced is &, 
     391     * as in that case, we don't want to replace any existing hex entities already present in the String. 
     392    */ 
     393    public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) { 
     394        String char_as_string = Character.toString(CHARACTER); 
     395        String hexCodeString = hexEntityForChar(char_as_string); 
     396         
     397        //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString); 
     398         
     399        Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
     400        Matcher hexPatternMatch = hexPattern.matcher(str);       
     401         
     402        // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match hexPattern 
     403        int searchIndex = 0; 
     404         
     405        boolean finished = false; 
     406        while(!finished) {           
     407             
     408            searchIndex = str.indexOf(CHARACTER, searchIndex); 
     409             
     410            if(searchIndex == -1) { 
     411                finished = true; 
     412            }    
     413            else {               
     414                 
     415                // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string: 
     416                if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) { 
     417                    searchIndex = hexPatternMatch.end(); 
     418                } else { 
     419                     
     420                    String tmp = str.substring(0, searchIndex) + hexCodeString; 
     421                    searchIndex++; 
     422                    if(str.length() > searchIndex) { 
     423                        tmp += str.substring(searchIndex); 
     424                    } 
     425                    str = tmp; 
     426                    searchIndex = searchIndex+ hexCodeString.length() - 1; 
     427                     
     428                    // String has been modified, so have to update Matcher 
     429                    hexPatternMatch = hexPattern.matcher(str); 
     430                     
     431                    if(searchIndex >= str.length()) { 
     432                        finished = true; 
     433                    } 
     434                } 
     435            } 
     436        } 
     437         
     438        return str; 
     439    } 
     440     
    376441     
    377442    // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter 
     
    428493            return filename; 
    429494        } 
    430          
    431495        File file = new File (filename); 
    432496        return fileToURLEncoding(file); 
     
    454518        } 
    455519 
     520        // we'll want to protect & by replacing with &'s hex value 
     521        // but we don't want to replace &#x....; with the same! 
     522        Pattern plain_ampersand_not_hex_prefix_Pattern = Pattern.compile("&[^#]"); 
     523         
     524         
     525        int containsAmp = 0; 
     526        if(file.getName().contains("&amp;")) { 
     527            System.err.println("@@@ 1 to encode " + file.getName()); 
     528            containsAmp = 1; 
     529        } else if(file.getName().contains("&")) { 
     530            System.err.println("@@@ 2 to encode " + file.getName()); 
     531            containsAmp = 2; 
     532        } else { 
     533            System.err.println("@@@ 0 to encode " + file.getName()); 
     534        } 
     535         
     536         
    456537        String filename_url_encoded = ""; 
    457538         
     
    489570            // then used to determine how to URL encode it 
    490571             
    491             String filename_ascii = filename_uri.toASCIIString(); 
    492             String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 
     572            String filename_ascii = filename_uri.toASCIIString(); 
     573            // protect & and + in the filename too 
     574            filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); 
     575             
     576            if(containsAmp > 0) System.err.println("@@@ filename_ascii: " + filename_ascii); 
     577             
     578             
     579            //if(containsAmp > 0) System.err.println("@@@ filename_ascii with hexed &: " + filename_ascii); 
     580            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");          
    493581            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 
     582             
     583             
     584            //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex 
     585            //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 
     586            //if(containsAmp > 0) System.err.println("@@@ filename_url_encoded: " + filename_url_encoded); 
    494587             
    495588        } 
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33738 r33739  
    668668        } 
    669669         
    670         String metadata_xml_file_directory_path = FilenameEncoding.filenameToURLEncoding("."); 
    671         metadata_xml_file_directory_path = metadata_xml_file_directory_path.substring(0, metadata_xml_file_directory_path.length()-2); // cut off /. at end 
    672         System.err.println("@@@ metadata_xml_file_directory_path: " + metadata_xml_file_directory_path); 
     670        String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 
     671        curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 
     672        //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 
    673673         
    674674        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     
    695695 
    696696                    // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 
    697                     encoded_filename = encoded_filename.substring(metadata_xml_file_directory_path.length()); 
     697                    encoded_filename = encoded_filename.substring(curr_directory_path.length()); 
    698698                    if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
    699699                        encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());