Ignore:
Timestamp:
2019-12-02T20:03:57+13:00 (4 years ago)
Author:
ak19
Message:

A larger, but not yet complete, fix for the problem of attaching and retaining file-level assigned metadata for filenames containing non-ASCII characters. 1. Committing an intermediate version of the bugfix containing the idea suggested by Kathy: reuse the steps in fileToURLEncoding(File) for a String parameter, since she felt that, as the String represents a filename, a URI object should be instantiable from a String. This worked with some massaging. Can't yet get the new fileToURLEncoding(String) to work by calling fileToURLEncoding(File), so am committing the version of fileToURLEncoding(String) that is largely a copy of fileToURLEncoding(File), until I can get the simpler variant working. 2. The new method is called after each successful parseXML call from MetadataXMLFile, so that the decoded entities resulting from parseXML() are re-encoded in the DOM. This allows us to retain the correct filenames originally mentioned in metadata.xml files and to do proper comparisons against them when attaching/modifying further metadata, and it ensures that the correct values get written out again into metadata.xml files. 3. Still want to get the simpler version of fileToURLEncoding(String) to work that reuses fileToURLEncoding(File). 4. Want to get ampersands and plus signs in filenames to work (+ signs in filenames are lost when filenames are converted to URL). 5. Still need to investigate the missing ex. metadata for filenames containing non-ASCII characters.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33730 r33737  
    364364            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
    365365            // If the unicode character code pt is less than the ASCII code for space or greater than that for tilde, let's display the char in hex (x0000 format)
    366             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
     366            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
    367367                hexFilename += filename.charAt(i);
    368368            } else {
     
    374374     }
    375375
     376   
     377    // Follows Dr Bainbridge's fileToURLEncoding(File) method below, but takes the filename as a String instead of a File. Returns the URL-encoded filename, or the input unchanged if MULTIPLE_FILENAME_ENCODINGS_SUPPORTED is false or the conversion throws.
     378    public static String filenameToURLEncoding(String filename) {
     379        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // encoding conversion is switched off: hand the name back untouched
     380            return filename;
     381        }
     382
     383         // Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20
     384        String filename_url_encoded = filename.replace(" ", "%20");
     385        //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex
     386        //filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
     387
     388        try {
     389            URI filename_uri = new URI(filename_url_encoded); // throws URISyntaxException if the string is still not a valid URI; handled by the catch below
     390            // The trick:
     391            //  1. toASCIIString() will %xx encode values > 127
     392            //  2. Decode the result to "ISO-8859-1"
     393            //  3. URL encode the bytes to string
     394           
     395            // Step 2 forces the string to be 8-bit values.  It
     396            // doesn't matter if the starting raw filename was *not*
     397            // in the ISO-8859-1 encoding, the effect is to ensure
     398            // we have an 8-bit byte string that (numerically)
     399            // captures the right value.  These numerical values are
     400            // then used to determine how to URL encode it
     401           
     402            String filename_ascii = filename_uri.toASCIIString();
     403            //filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex
     404            //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
     405            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); // NOTE(review): URLDecoder.decode() also converts '+' to a space, which is likely why plus signs in filenames get lost - confirm
     406            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
     407           
     408            // DEALING WITH & and + in filenames: NOT WORKING YET (NOTE(review): '&' is 0x26, so the escapes sketched below should be %26 rather than %36)
     409            //if(filename_url_encoded.contains("&amp;")) {
     410            //  filename_url_encoded = filename_url_encoded.replace("&amp;", "%36amp;");
     411            //} else if(filename_url_encoded.contains("&")) {
     412            //  filename_url_encoded = filename_url_encoded.replace("&", "%36");
     413            //}
     414           
     415        }
     416        catch (Exception e) {
     417            e.printStackTrace();
     418            // Give up trying to convert: fall back to the filename exactly as it was passed in
     419            filename_url_encoded = filename;
     420        }
     421        return filename_url_encoded;
     422    }
     423   
     424   
     425    // Simpler variant of the String-to-URL-encoding conversion: wraps the filename in a File and delegates to Dr Bainbridge's fileToURLEncoding(File) below. NOTE(review): the commit message says this delegating approach does not work yet, which is presumably why it sits under an underscore-prefixed name - confirm before using.
     426    public static String _filenameToURLEncoding(String filename) {
     427        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
     428            return filename;
     429        }
     430       
     431        File file = new File (filename);
     432        return fileToURLEncoding(file); // all the actual encoding work happens in the File-based method
     433    }       
     434
     435   
    376436    // Dr Bainbridge's methods
    377437    /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
Note: See TracChangeset for help on using the changeset viewer.