Changeset 33727 for main/trunk


Ignore:
Timestamp:
2019-11-28T22:17:15+13:00 (4 years ago)
Author:
ak19
Message:

Experimental encoding-related bugfix to GLI. In GLI, meta assigned at file level to filenames with non-ASCII chars was not sticking to the file, because repeated entries were written out to metadata.xml under 2 variants of the filename but never loaded back into GLI again. This problem was not apparent with the old FilenameEncodings test set of docs or Kathy's complex test case of Russian filenames gathered into a folder structure. In the latter case, meta was assigned at folder level and so the regex to match was .* which is just ASCII. Neither test document set was tested with meta assigned at file level. I can't now remember whether we checked today that assigning file level meta to docs in the FilenameEncodings test set worked or not, but if it did, maybe that was because the special characters were not too complex and just Latin-1 or Win codepage 850 (like 1252) for the docs where meta was assigned. In any case, with test docs where filenames had A-macrons in them, the problem showed up, and it also showed up in the Russian test set if meta got assigned at doc level. GLI was correctly saving filenames that had meta into metadata.xml as hex-encoded filenames the first time around. It just wasn't comparing them to hex values on subsequent times, and thus was not finding a match. Method FilenameEncoding.fileNameToHex() introduced to fix this (experimental, need to run some questions by Dr Bainbridge). For all current tests, this appears to have fixed it. However, there must be somewhere else that ex.meta is being loaded in, as that is still not appearing for specially named files.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r29815 r33727  
    359359    }
    360360
     361    /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
     362     public static String fileNameToHex(String filename) {
     363        /*String filename_url_encoded = "";
     364        try {
     365            URI filename_uri = new URI(filename);
     366            String filename_ascii = filename_uri.toASCIIString();
     367            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
     368            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
     369            return filename_url_encoded;
     370        } catch (Exception e) {
     371            e.printStackTrace();
     372            // Give up trying to convert
     373            filename_url_encoded = filename;
     374        }
     375        return filename_url_encoded;
     376        */
     377       
     378        String hexFilename = "";
     379        for(int i = 0; i < filename.length(); i++) {
     380            int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
     381           
     382            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
     383            // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
     384            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
     385                hexFilename += filename.charAt(i);
     386            } else {
     387                hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
     388            }
     389        }
     390         
     391        return hexFilename;
     392     }
     393   
    361394   
    362395    // For unicode codepoints see:
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r29793 r33727  
    100100    }
    101101
     102    //System.err.println("MetadataXMLFile.addMetadata() Adding meta for file regexp: "
     103    //  + file_path_regexp + " - " + org.greenstone.gatherer.util.Utility.debugUnicodeString(file_path_regexp));
     104   
    102105    // Find the appropriate FileSet element for this file
    103106    Element appropriate_fileset_element = null;
     
    259262        String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
    260263
     264        String regexed_file_relative_path = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
     265        //System.err.println("Looking in meta.xml for regexed version of filename: " + regexed_file_relative_path);
     266   
    261267        // Does this fileset specify metadata for one file only?
    262268        is_one_file_only_metadata = true;
     
    266272        }
    267273
     274        String current_filename_element_value_hex = FilenameEncoding.fileNameToHex(current_filename_element_value);
     275       
    268276        // This fileset specifies metadata for the file
    269         if (file_relative_path.matches(current_filename_element_value)) {
     277        // MetadataXMLFile.addMetadata(CollectionTreeNode, ArrayList) stored filename in uppercase hex, so need to compare with the same
     278        if (file_relative_path.matches(current_filename_element_value_hex)) { //if (file_relative_path.matches(current_filename_element_value)) {
     279            //System.err.println("Found a match in meta.xml for file name: " + regexed_file_relative_path);
    270280            current_fileset_matches = true;
    271281            if (!file_relative_path.equals("") && current_filename_element_value.equals(DIRECTORY_FILENAME)) {
     
    273283            }
    274284            break;
    275         }
    276 
     285        }       
     286       
    277287        // This fileset specifies metadata for the folder the file is in
    278         if (file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {
     288        if (regexed_file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {
    279289            current_fileset_matches = true;
    280290            folder_metadata_inherited_from = new File(metadata_xml_file_directory, current_filename_element_value);
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFileManager.java

    r23433 r33727  
    3737import org.greenstone.gatherer.util.XMLTools;
    3838
     39import org.greenstone.gatherer.util.Utility;
     40
    3941
    4042/** This class is a static class that manages the metadata.xml files */
     
    7880    for (int i = 0; i < file_nodes.length; i++) {
    7981        File current_file = file_nodes[i].getFile();
    80         DebugStream.println("Adding metadata to " + current_file.getAbsolutePath());
     82        DebugStream.println("Adding metadata to " + current_file.getAbsolutePath() + " - hex: " + Utility.debugUnicodeString(current_file.getAbsolutePath()));     
    8183
    8284        // Find which metadata.xml file needs editing
Note: See TracChangeset for help on using the changeset viewer.