Changeset 33748

Show
Ignore:
Timestamp:
03.12.2019 21:06:44 (3 days ago)
Author:
ak19
Message:

Linux bugfixes to recent commits to do with getting file-level meta assigned to non-ascii filenames or filenames containing plus/ampersand signs to work. Cumulative past commits were sufficient for fixing these issues on Windows. All those changes plus the current ones get it all working on Linux too.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33747 r33748  
    7676    /** Compiled pattern for hex entities of characters. These are of the form "&#x....;" with 1 to 4 digits */ 
    7777    public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
    78      
     78 
     79    /** The hex entity version of the ampersand character. 
     80     * We use this in place of the ampersand character in filenames in metadata.xml files to 
     81     * preserve the reference to the literal ampersand in the real file name on the file system. 
     82     */ 
     83    public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&"; 
     84     
    7985 
    8086//*********************** BUSY REFRESHING / REQUIRING  REFRESH ********************* 
     
    371377     
    372378    public static String fileToURLEncoding(File file) { 
     379    // on a UTF-8 file system, DO NOT do the stuff further below, 
     380    // just return input filename param, but with any & in the filename replaced with its hex entity 
    373381        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    374             return file.getAbsolutePath(); 
     382            // protect ampersands in filenames by converting it to its hex entity 
     383            String filepath = file.getAbsolutePath(); 
     384            filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND); 
     385            return filepath; 
    375386        } 
    376387         
     
    419430             
    420431            // Before proceeding, protect & in the filename too. 
    421             // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & 
     432            // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with its hex entity (HEX_ENTITY_AMPERSAND) 
    422433            // But dangerous to do simple replace if there are &#x...; entities in the filename already! 
    423434            // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! 
     
    434445            //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 
    435446            filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 
    436             filename_url_encoded = filename_url_encoded.replace("%26", "&"); // convert URL encoding for ampersand into hex entity for ampersand 
     447            filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand 
    437448        } 
    438449        catch (Exception e) { 
     
    530541    */ 
    531542    public static String fullFilepathToURLEncoding(String filename) { 
    532         if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 
    533             return filename; 
     543        // on a UTF-8 file system, DO NOT do the stuff further below, 
     544        // just return input filename param, but with any & in the filename replaced with its hex entity 
     545        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
     546            return filename.replace("&", HEX_ENTITY_AMPERSAND); 
    534547        } 
    535548         
     
    554567    public static String relativeFilenameToURLEncoding(String filename) { 
    555568        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param but with any & in the filename replaced with its hex entity 
    556             return filename; 
     569            return filename.replace("&", HEX_ENTITY_AMPERSAND); 
    557570        } 
    558571         
     
    567580    public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 
    568581        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param but with any & in the filename replaced with its hex entity 
    569             return filename; 
     582            return filename.replace("&", HEX_ENTITY_AMPERSAND); 
    570583        } 
    571584         
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33747 r33748  
    8787    // Determine the file's path relative to the location of the metadata.xml file 
    8888    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 
    89     String file_relative_path = file_node.getURLEncodedFilePath().substring(metadata_xml_file_directory_path.length()); 
     89 
     90    String file_relative_path = file_node.getURLEncodedFilePath(); 
     91    if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
     92        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
     93    } 
     94    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 
     95     
    9096    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
    9197        file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 
     
    103109    } 
    104110 
     111    // LEAVE THIS DEBUGGING STATEMENT IN - USEFUL TO DEBUG FILENAME ENCODING ISSUES WHEN META ASSIGNED 
    105112    //System.err.println("MetadataXMLFile.addMetadata() Adding meta for file regexp: " 
    106113    //  + file_path_regexp + " - " + org.greenstone.gatherer.util.Utility.debugUnicodeString(file_path_regexp)); 
     
    220227 
    221228 
     229    // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 
     230    // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand 
    222231    public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly) 
    223232    { 
     
    652661     * At the end of this function, the doc will be modified with the re-encoded filenames. 
    653662     *  
     663     * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL 
     664     * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES 
     665     * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM. 
    654666    */ 
    655667    static private void reEncodeFilenamesInMetadataXML(Document doc) { 
    656         if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    657             return; 
    658         } 
    659668         
    660669        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end         
     
    677686                     
    678687                    //System.err.println("Filename before reencoding was: " + filename);                     
    679                      
     688 
     689                    String encoded_filename = filename; 
     690                    if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
     691                    encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
     692                    } else { 
    680693                    // Can't convert to URI with backslash-escaped chars (backslashes used in a regexed filename are illegal in the URI object 
    681694                    // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 
    682                     String encoded_filename = filename.replace("\\", "%5C"); 
     695                    encoded_filename = filename.replace("\\", "%5C"); 
    683696                     
    684697                    // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed 
     
    687700                    // Reintroduce the backslash characters in place of their %5C hex placeholders 
    688701                    encoded_filename = encoded_filename.replace("%5C", "\\"); 
    689                      
    690                     // Update filename element in DOM 
    691                     XMLTools.setElementTextValue(filename_element, encoded_filename); 
    692                     //System.err.println("Filename after reencoding was: " + encoded_filename); 
     702                    } 
     703                    // Update filename element in DOM 
     704                    XMLTools.setElementTextValue(filename_element, encoded_filename); 
     705                    //System.err.println("Filename after reencoding was: " + encoded_filename); 
    693706                } 
    694707            }