Changeset 33748 for main/trunk


Ignore:
Timestamp:
2019-12-03T21:06:44+13:00 (4 years ago)
Author:
ak19
Message:

Linux bugfixes to recent commits to do with getting file-level meta assigned to non-ascii filenames or filenames containing plus/ampersand signs to work. Cumulative past commits were sufficient for fixing these issues on Windows. All those changes plus the current ones get it all working on Linux too.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33747 r33748  
    7676    /** Compiled pattern for hex entities of characters. These are of the form "&#x....;" with 1 to 4 digits */
    7777    public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
    78    
     78
     79    /** The hex entity version of the ampersand character.
     80     * We use this in place of the ampersand character in filenames in metadata.xml files to
     81     * preserve the reference to the literal ampersand in the real file name on the file system.
     82     */
     83    public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&#x26;";
     84   
    7985
    8086//*********************** BUSY REFRESHING / REQUIRING  REFRESH *********************
     
    371377   
    372378    public static String fileToURLEncoding(File file) {
     379    // on a UTF-8 file system, DO NOT do the stuff further below,
     380    // just return input filename param, but with any & in the filename replaced with its hex entity
    373381        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    374             return file.getAbsolutePath();
     382            // protect ampersands in filenames by converting it to its hex entity
     383            String filepath = file.getAbsolutePath();
     384            filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND);
     385            return filepath;
    375386        }
    376387       
     
    419430           
    420431            // Before proceeding, protect & in the filename too.
    421             // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26;
     432            // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26; (HEX_ENTITY_AMPERSAND)
    422433            // But dangerous to do simple replace if there are &#x...; entities in the filename already!
    423434            // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
     
    434445            //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
    435446            filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
    436             filename_url_encoded = filename_url_encoded.replace("%26", "&"); // convert URL encoding for ampersand into hex entity for ampersand
     447            filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand
    437448        }
    438449        catch (Exception e) {
     
    530541    */
    531542    public static String fullFilepathToURLEncoding(String filename) {
    532         if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
    533             return filename;
     543        // on a UTF-8 file system, DO NOT do the stuff further below,
     544        // just return input filename param, but with any & in the filename replaced with its hex entity
     545        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
     546            return filename.replace("&", HEX_ENTITY_AMPERSAND);
    534547        }
    535548       
     
    554567    public static String relativeFilenameToURLEncoding(String filename) {
    555568        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
    556             return filename;
     569            return filename.replace("&", HEX_ENTITY_AMPERSAND);
    557570        }
    558571       
     
    567580    public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
    568581        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
    569             return filename;
     582            return filename.replace("&", HEX_ENTITY_AMPERSAND);
    570583        }
    571584       
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33747 r33748  
    8787    // Determine the file's path relative to the location of the metadata.xml file
    8888    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
    89     String file_relative_path = file_node.getURLEncodedFilePath().substring(metadata_xml_file_directory_path.length());
     89
     90    String file_relative_path = file_node.getURLEncodedFilePath();
     91    if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
     92        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
     93    }
     94    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
     95   
    9096    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
    9197        file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
     
    103109    }
    104110
     111    // LEAVE THIS DEBUGGING STATEMENT IN - USEFUL TO DEBUG FILENAME ENCODING ISSUES WHEN META ASSIGNED
    105112    //System.err.println("MetadataXMLFile.addMetadata() Adding meta for file regexp: "
    106113    //  + file_path_regexp + " - " + org.greenstone.gatherer.util.Utility.debugUnicodeString(file_path_regexp));
     
    220227
    221228
     229    // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING
     230    // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand
    222231    public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly)
    223232    {
     
    652661     * At the end of this function, the doc will be modified with the re-encoded filenames.
    653662     *
     663     * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL
     664     * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES
     665     * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM.
    654666    */
    655667    static private void reEncodeFilenamesInMetadataXML(Document doc) {
    656         if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    657             return;
    658         }
    659668       
    660669        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end       
     
    677686                   
    678687                    //System.err.println("Filename before reencoding was: " + filename);                   
    679                    
     688
     689                    String encoded_filename = filename;
     690                    if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
     691                    encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
     692                    } else {
    680693                    // Can't convert to URI with backslash-escaped chars (backslashes used in a regexed filename are illegal in the URI object
    681694                    // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C.
    682                     String encoded_filename = filename.replace("\\", "%5C");
     695                    encoded_filename = filename.replace("\\", "%5C");
    683696                   
    684697                    // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed
     
    687700                    // Reintroduce the backslash characters in place of their %5C hex placeholders
    688701                    encoded_filename = encoded_filename.replace("%5C", "\\");
    689                    
    690                     // Update filename element in DOM
    691                     XMLTools.setElementTextValue(filename_element, encoded_filename);
    692                     //System.err.println("Filename after reencoding was: " + encoded_filename);
     702                    }
     703                    // Update filename element in DOM
     704                    XMLTools.setElementTextValue(filename_element, encoded_filename);
     705                    //System.err.println("Filename after reencoding was: " + encoded_filename);
    693706                }
    694707            }
Note: See TracChangeset for help on using the changeset viewer.