Changeset 33739


Ignore:
Timestamp:
2019-12-02T23:15:50+13:00 (4 years ago)
Author:
ak19
Message:

Bugfix to GLI so that it is able to parse metadata.xml files containing & chars. Basically, after FilenameEncoding.fileToURLEncoding() calls URI.toASCIIString(), I replace all non-hex-entity ampersands with their hex entity. This preserves them correctly in the metadata.xml files. Some details: Because we need to use URI.toASCIIString(), which only converts char values > 127 in the URI to their URL encoded hex value (and not to a hex entity &#x....; as I now find, despite having coded carefully around hex entities with extra unnecessary effort), the code for encoding values less than 127 as their URL hex code has to be written manually. And escaping & to its hex entity is complicated by the fact that we don't want to modify any existing hex entities in the ASCII string produced by toASCIIString(). (Not a complication when we're dealing with strings containing URL/% encoded hex values.) If more chars in metadata.xml pose a problem for GLI parsing them, then a better solution should be invented to replace URI.toASCIIString(). If only stringToHex() would suffice. I think however that toASCIIString() properly URL encodes unicode codepoints, so that one can have 3 or 4 %XX in sequence, like %xx%xx%xx. Work to be done: plus signs in filenames still need to be handled. GLI handles them okay so far as loading the correct file level meta attached to a filename containing + signs, but when built, metadata for such a file does not appear in doc.xml. This may be a perl issue if it happens on build?

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33738 r33739  
    355355    }   
    356356
    357     /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
    358      public static String fileNameToHex(String filename) {
    359 
    360         String hexFilename = "";
    361         for(int i = 0; i < filename.length(); i++) {
    362             int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
     357    /**
     358     * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
     359     * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
     360    */
     361    public static String stringToHex(String str) {
     362
     363        String hex_str = "";
     364        for(int i = 0; i < str.length(); i++) {
     365            int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
    363366           
    364367            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
    365368            // If the unicode character code pt is less than the ASCII code for space or greater than that for tilde, let's display the char as a hex entity (&#x0000; format)
    366             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
    367                 hexFilename += filename.charAt(i);
     369            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
     370                hex_str += str.charAt(i);
    368371            } else {
    369                 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
     372                hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
    370373            }
    371374        }
    372375         
    373         return hexFilename;
     376        return hex_str;
    374377     }
    375 
     378     
     379   
     380    /** Takes a String containing a single char and returns the hex entity for it */
     381    public static String hexEntityForChar(String char_as_string) {
     382        int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code     
     383        String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
     384        return hexCodeStr;
     385     }
     386   
     387    /**
     388     * Given a String containing 0 or more occurrences of CHARACTER,
     389     * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
     390     * Special care is taken where the CHARACTER to be replaced is &,
     391     * as in that case, we don't want to replace any existing hex entities already present in the String.
     392    */
     393    public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) {
     394        String char_as_string = Character.toString(CHARACTER);
     395        String hexCodeString = hexEntityForChar(char_as_string);
     396       
     397        //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString);
     398       
     399        Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
     400        Matcher hexPatternMatch = hexPattern.matcher(str);     
     401       
     402        // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match hexPattern
     403        int searchIndex = 0;
     404       
     405        boolean finished = false;
     406        while(!finished) {         
     407           
     408            searchIndex = str.indexOf(CHARACTER, searchIndex);
     409           
     410            if(searchIndex == -1) {
     411                finished = true;
     412            }   
     413            else {             
     414               
     415                // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
     416                if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
     417                    searchIndex = hexPatternMatch.end();
     418                } else {
     419                   
     420                    String tmp = str.substring(0, searchIndex) + hexCodeString;
     421                    searchIndex++;
     422                    if(str.length() > searchIndex) {
     423                        tmp += str.substring(searchIndex);
     424                    }
     425                    str = tmp;
     426                    searchIndex = searchIndex+ hexCodeString.length() - 1;
     427                   
     428                    // String has been modified, so have to update Matcher
     429                    hexPatternMatch = hexPattern.matcher(str);
     430                   
     431                    if(searchIndex >= str.length()) {
     432                        finished = true;
     433                    }
     434                }
     435            }
     436        }
     437       
     438        return str;
     439    }
     440   
    376441   
    377442    // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
     
    428493            return filename;
    429494        }
    430        
    431495        File file = new File (filename);
    432496        return fileToURLEncoding(file);
     
    454518        }
    455519
     520        // we'll want to protect & by replacing with &'s hex value
     521        // but we don't want to replace &#x....; with the same!
     522        Pattern plain_ampersand_not_hex_prefix_Pattern = Pattern.compile("&[^#]");
     523       
     524       
     525        int containsAmp = 0;
     526        if(file.getName().contains("&amp;")) {
     527            System.err.println("@@@ 1 to encode " + file.getName());
     528            containsAmp = 1;
     529        } else if(file.getName().contains("&")) {
     530            System.err.println("@@@ 2 to encode " + file.getName());
     531            containsAmp = 2;
     532        } else {
     533            System.err.println("@@@ 0 to encode " + file.getName());
     534        }
     535       
     536       
    456537        String filename_url_encoded = "";
    457538       
     
    489570            // then used to determine how to URL encode it
    490571           
    491             String filename_ascii = filename_uri.toASCIIString();
    492             String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
     572            String filename_ascii = filename_uri.toASCIIString();
     573            // protect & and + in the filename too
     574            filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&');
     575           
     576            if(containsAmp > 0) System.err.println("@@@ filename_ascii: " + filename_ascii);
     577           
     578           
     579            //if(containsAmp > 0) System.err.println("@@@ filename_ascii with hexed &: " + filename_ascii);
     580            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");         
    493581            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
     582           
     583           
     584            //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
     585            //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
     586            //if(containsAmp > 0) System.err.println("@@@ filename_url_encoded: " + filename_url_encoded);
    494587           
    495588        }
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33738 r33739  
    668668        }
    669669       
    670         String metadata_xml_file_directory_path = FilenameEncoding.filenameToURLEncoding(".");
    671         metadata_xml_file_directory_path = metadata_xml_file_directory_path.substring(0, metadata_xml_file_directory_path.length()-2); // cut off /. at end
    672         System.err.println("@@@ metadata_xml_file_directory_path: " + metadata_xml_file_directory_path);
     670        String curr_directory_path = FilenameEncoding.filenameToURLEncoding(".");
     671        curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end
     672        //System.err.println("@@@ curr_directory_path: " + curr_directory_path);
    673673       
    674674        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     
    695695
    696696                    // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
    697                     encoded_filename = encoded_filename.substring(metadata_xml_file_directory_path.length());
     697                    encoded_filename = encoded_filename.substring(curr_directory_path.length());
    698698                    if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
    699699                        encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
Note: See TracChangeset for help on using the changeset viewer.