Ignore:
Timestamp:
2019-12-02T20:03:57+13:00 (4 years ago)
Author:
ak19
Message:

A larger, but not yet complete, fix to the problem of attaching and retaining file-level assigned meta to filenames containing non-ASCII characters. 1. Committing an intermediate version of the bugfix containing the idea suggested by Kathy to reuse the steps in fileToURLEncoding(File) for a String parameter, as she felt that since the String represents a filename, a URI object should be instantiable on a String. Worked with some massaging. Can't yet get the new fileToURLEncoding(String) to work by calling fileToURLEncoding(File). So am committing the version of fileToURLEncoding(String) that is largely a copy of fileToURLEncoding(File), until I can get the simpler variant working. 2. The new method is called after each successful parseXML call from MetadataXMLFile, so that the decoded entities resulting from parseXML() are re-encoded in the DOM. This allows us to retain the correct filenames originally mentioned in metadata.xml files, to do proper comparisons against them to attach/modify further metadata, and ensures that the correct values get written out again into metadata.xml files. 3. Still want to get a simpler version of fileToURLEncoding(String) to work that reuses fileToURLEncoding(File). 4. Want to get ampersand and plus signs in filenames to work (+ signs in filenames are lost when filenames are converted to URL). 5. Still need to investigate the missing ex. metadata for filenames containing non-ASCII characters.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33730 r33737  
    8181        loaded_file = this;
    8282        loaded_file_document = document;
     83        reEncodeFilenamesInMetadataXML(loaded_file_document);
     84       
    8385    }
    8486
     
    234236        loaded_file = this;
    235237        loaded_file_document = document;
     238       
     239        reEncodeFilenamesInMetadataXML(loaded_file_document);
    236240    }
    237241
     
    395399        loaded_file = this;
    396400        loaded_file_document = document;
     401       
     402        reEncodeFilenamesInMetadataXML(loaded_file_document);
    397403    }
    398404
     
    512518        loaded_file = this;
    513519        loaded_file_document = document;
     520       
     521        reEncodeFilenamesInMetadataXML(loaded_file_document);
    514522    }
    515523
     
    629637        XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements);
    630638   
     639    /*  // DEBUGGING:
    631640        Document doc = XMLTools.parseXMLFile(loaded_file);
    632         //System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
    633        
     641        System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     642       
     643        reEncodeFilenamesInMetadataXML(doc);
     644        System.err.println("AT END saveLoadedFile(), RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     645    */ 
    634646        loaded_file_changed = false;
    635647    }
     648   
     649    //System.err.println("@@@@ END of saveLoadedFile()");
     650    //Utility.printCaller();
    636651    }
    637652
    638 
     653    /**
     654     * Re-encodes the filenames stored in the FileName elements of a parsed metadata.xml DOM.
     655     * parseXML(metadata.xml) has the side-effect of resolving HTML entities. Although this is not
     656     * done by the GLIEntityResolver usage in parseXML(), something in parseXML() is resolving the
     657     * HTML entities, including those used in carefully HTML-entity-escaped filenames.
     658     * We need to get the filenames in the DOM correct after parsing a metadata.xml file
     659     * into memory, so that we have the correct filenames and so that we'll write it out correctly.
     660     * Therefore, always call this method after a successful parseXML() call on a metadata.xml.
     661     * Returns immediately if FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED is false.
     662     * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded.
     663     * At the end of this function, the doc will be modified with the re-encoded filenames.
     664    */
     665    static private void reEncodeFilenamesInMetadataXML(Document doc) {
     666        if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
     667            return;
     668        }
     669       
     670        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     671       
     672        // Read all the FileSet elements in the file
     673        NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT);
     674        for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
     675            Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
     676
     677            // Visit every FileName element inside this FileSet and rewrite its text value in place
     678            NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
     679            for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
     680                Element filename_element = (Element) filename_elements_nodelist.item(j);
     681                String filename = XMLTools.getElementTextValue(filename_element);
     682                if(!filename.equals(DIRECTORY_FILENAME)) { // entries equal to DIRECTORY_FILENAME are left untouched
     683                    //System.err.println("Filename before reencoding was: " + filename);
     684                    // Re-encode the filename.
     685                    // A URI cannot be created from a string containing backslashes (the backslashes used to regex-escape
     686                    // filenames are illegal in the URI object created by filenameToURLEncoding), so swap them for %5C first.
     687                    String encoded_filename = filename.replace("\\", "%5C");
     688                    encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename);
     689                    // Restore the regex-escaping backslashes. NOTE(review): any %5C in the encoded result becomes a backslash, presumably including one that was literally in the original filename - confirm.
     690                    encoded_filename = encoded_filename.replace("%5C", "\\");
     691                    XMLTools.setElementTextValue(filename_element, encoded_filename);
     692                    //System.err.println("Filename after reencoding was: " + encoded_filename);
     693                }
     694            }
     695        }       
     696        //System.err.println("RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     697    }
     698   
    639699    /**
    640700     * Every metadata.xml file must be skimmed when a collection is opened, for three very important reasons:
     
    655715        return;
    656716    }
    657 
     717    // Always call this method after calling parseXMLFile
     718    reEncodeFilenamesInMetadataXML(document);
     719   
    658720    // Read all the Metadata elements in the file
    659721    HashMap target_metadata_element_name_attrs_cache = new HashMap();
Note: See TracChangeset for help on using the changeset viewer.