Changeset 33737

Show
Ignore:
Timestamp:
02.12.2019 20:03:57 (4 days ago)
Author:
ak19
Message:

A larger, but not complete, fix to the problem of attaching and retaining file-level assigned meta for filenames containing non-ASCII characters. 1. Committing intermediate version of bugfix containing the idea suggested by Kathy to reuse the steps in fileToURLEncoding(File) for a String parameter, as she felt that, since the String represents a filename, a URI object should be instantiable on a String. Worked with some massaging. Can't yet get the new fileToURLEncoding(String) to work by calling fileToURLEncoding(File), so am committing the version of fileToURLEncoding(String) that is largely a copy of fileToURLEncoding(File), until I can get the simpler variant working. 2. The new method is called after each successful parseXML call from MetadataXMLFile, so that the decoded entities resulting from parseXML() are re-encoded in the DOM. This allows us to retain the correct filenames originally mentioned in metadata.xml files, do proper comparisons against them to attach/modify further metadata, and ensure that the correct values get written out again into metadata.xml files. 3. Still want to get simpler version of fileToURLEncoding(String) to work that reuses fileToURLEncoding(File). 4. Want to get ampersand and plus signs in filenames to work (+ signs in filenames are lost when filenames are converted to URL). 5. Still need to investigate the missing ex. metadata for filenames containing non-ASCII characters.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33730 r33737  
    364364            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
    365365            // If the unicode character code point is less than the ASCII code for space or greater than that for tilde, let's display the char in hex (x0000 format) 
    366             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
     366            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too  
    367367                hexFilename += filename.charAt(i); 
    368368            } else { 
     
    374374     } 
    375375 
     376     
     377    // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter 
            //
            // Converts a filename string into its URL-encoded form.
            // - Returns the input unchanged when MULTIPLE_FILENAME_ENCODINGS_SUPPORTED is false.
            // - Otherwise: spaces are replaced by %20, the result is parsed as a java.net.URI,
            //   URI.toASCIIString() is decoded as ISO-8859-1, and that string is handed to
            //   iso_8859_1_filename_to_url_encoded() to produce the return value.
            // - If any step throws, the stack trace is printed and the ORIGINAL filename is returned.
            // NOTE(review): filename is dereferenced before the try block, so a null argument
            // would throw NullPointerException to the caller rather than being caught below.
     378    public static String filenameToURLEncoding(String filename) { 
     379        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
     380            return filename; 
     381        } 
     382 
     383         // Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20 
     384        String filename_url_encoded = filename.replace(" ", "%20"); 
     385        //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex 
     386        //filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 
     387 
     388        try { 
     389            URI filename_uri = new URI(filename_url_encoded); 
     390            // The trick: 
     391            //  1. toASCIIString() will %xx encode values > 127 
     392            //  2. Decode the result to "ISO-8859-1" 
     393            //  3. URL encode the bytes to string 
     394             
     395            // Step 2 forces the string to be 8-bit values.  It 
     396            // doesn't matter if the starting raw filename was *not* 
     397            // in the ISO-8859-1 encoding, the effect is to ensure 
     398            // we have an 8-bit byte string that (numerically) 
     399            // captures the right value.  These numerical values are 
     400            // then used to determine how to URL encode it 
     401             
     402            String filename_ascii = filename_uri.toASCIIString(); 
     403            //filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex 
     404            //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 
                    // NOTE(review): URLDecoder.decode() turns a literal '+' into a space, which is
                    // presumably why plus signs in filenames are lost here -- confirm.
     405            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 
     406            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 
     407             
     408            // DEALING WITH & and + in filenames: NOT WORKING YET 
                    // NOTE(review): '&' is 0x26 (38 decimal), so its percent-escape is %26; the
                    // "%36" used in the disabled code below would decode to the digit '6'.
     409            //if(filename_url_encoded.contains("&amp;")) { 
     410            //  filename_url_encoded = filename_url_encoded.replace("&amp;", "%36amp;"); 
     411            //} else if(filename_url_encoded.contains("&")) { 
     412            //  filename_url_encoded = filename_url_encoded.replace("&", "%36"); 
     413            //} 
     414             
     415        } 
     416        catch (Exception e) { 
                    // Best effort: any failure (e.g. a string that is not a valid URI) is reported
                    // on stderr and the caller gets the unconverted filename back.
     417            e.printStackTrace(); 
     418            // Give up trying to convert 
     419            filename_url_encoded = filename;  
     420        } 
     421        return filename_url_encoded; 
     422    } 
     423     
     424     
     425    // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter     
            //
            // Simpler variant of filenameToURLEncoding(String): wraps the filename in a
            // java.io.File and delegates to fileToURLEncoding(File).
            // Returns the input unchanged when MULTIPLE_FILENAME_ENCODINGS_SUPPORTED is false.
            // NOTE(review): according to the changeset message this delegating variant does not
            // work yet; the leading underscore presumably marks it as not-for-use -- confirm
            // before calling it.
     426    public static String _filenameToURLEncoding(String filename) { 
     427        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 
     428            return filename; 
     429        } 
     430         
     431        File file = new File (filename); 
     432        return fileToURLEncoding(file); 
     433    }        
     434 
     435     
    376436    // Dr Bainbridge's methods 
    377437    /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,  
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33730 r33737  
    8181        loaded_file = this; 
    8282        loaded_file_document = document; 
     83        reEncodeFilenamesInMetadataXML(loaded_file_document); 
     84         
    8385    } 
    8486 
     
    234236        loaded_file = this; 
    235237        loaded_file_document = document; 
     238         
     239        reEncodeFilenamesInMetadataXML(loaded_file_document); 
    236240    } 
    237241 
     
    395399        loaded_file = this; 
    396400        loaded_file_document = document; 
     401         
     402        reEncodeFilenamesInMetadataXML(loaded_file_document); 
    397403    } 
    398404 
     
    512518        loaded_file = this; 
    513519        loaded_file_document = document; 
     520         
     521        reEncodeFilenamesInMetadataXML(loaded_file_document); 
    514522    } 
    515523 
     
    629637        XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements); 
    630638     
     639    /*  // DEBUGGING: 
    631640        Document doc = XMLTools.parseXMLFile(loaded_file); 
    632         //System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
    633          
     641        System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     642         
     643        reEncodeFilenamesInMetadataXML(doc); 
     644        System.err.println("AT END saveLoadedFile(), RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     645    */   
    634646        loaded_file_changed = false; 
    635647    } 
     648     
     649    //System.err.println("@@@@ END of saveLoadedFile()"); 
     650    //Utility.printCaller(); 
    636651    } 
    637652 
    638  
     653    /**  
     654     * parseXML(metadata.xml) has the side-effect of resolving html entities. 
     655     * Although this is not done by the GLIEntityResolver usage in parseXML(), something 
     656     * in parseXML() is resolving the html entities, including those used in carefully 
     657     * html-entity-escaped filenames. 
     658     * We need to get the filenames in the DOM correct after parsing a metadata.xml file 
     659     * into memory, so that we have the correct filenames and so that we'll write it out correctly. 
     660     * Therefore, always call this method after a successful parseXML() call on a metadata.xml. 
             * <p>
             * Does nothing when FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED is false.
             * Otherwise every FILENAME_ELEMENT found under a FILESET_ELEMENT, except those whose
             * text equals DIRECTORY_FILENAME, has its text replaced by the result of
             * FilenameEncoding.filenameToURLEncoding(), with backslashes shielded as %5C around
             * that call.
     661     * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded. 
     662     * At the end of this function, the doc will be modified with the re-encoded filenames. 
     663     *  
     664    */ 
     665    static private void reEncodeFilenamesInMetadataXML(Document doc) { 
     666        if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
     667            return; 
     668        } 
     669         
     670        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     671         
     672        // Read all the FileSet elements in the file 
     673        NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT); 
     674        for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) { 
     675            Element current_fileset_element = (Element) fileset_elements_nodelist.item(i); 
     676 
     677            // get the value of all FileName elements 
     678            NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT); 
     679            for (int j = 0; j < filename_elements_nodelist.getLength(); j++) { 
     680                Element filename_element = (Element) filename_elements_nodelist.item(j); 
     681                String filename = XMLTools.getElementTextValue(filename_element); 
                        // Entries equal to DIRECTORY_FILENAME are left exactly as parsed.
     682                if(!filename.equals(DIRECTORY_FILENAME)) { 
     683                    //System.err.println("Filename before reencoding was: " + filename); 
     684                    // reencode filename                     
     685                    // can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object 
     686                    // created by filenameToURLEncoding). 
     687                    String encoded_filename = filename.replace("\\", "%5C");  
     688                    encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename); 
     689                    // escape chars for regex again 
                            // NOTE(review): this restores EVERY "%5C" in the encoded result to a
                            // backslash, including any that did not come from the replace above
                            // (e.g. a filename that already contained the text "%5C") -- presumably
                            // acceptable; confirm.
     690                    encoded_filename = encoded_filename.replace("%5C", "\\"); 
     691                    XMLTools.setElementTextValue(filename_element, encoded_filename); 
     692                    //System.err.println("Filename after reencoding was: " + encoded_filename); 
     693                } 
     694            } 
     695        }        
     696        //System.err.println("RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     697    } 
     698     
    639699    /** 
    640700     * Every metadata.xml file must be skimmed when a collection is opened, for three very important reasons: 
     
    655715        return; 
    656716    } 
    657  
     717    // Always call this method after calling parseXMLFile 
     718    reEncodeFilenamesInMetadataXML(document); 
     719     
    658720    // Read all the Metadata elements in the file 
    659721    HashMap target_metadata_element_name_attrs_cache = new HashMap();