Ignore:
Timestamp:
2020-09-22T00:57:33+12:00 (4 years ago)
Author:
ak19
Message:

Bugfix for slowdown when assigning meta to multiple gathered docs in GLI's Enrich pane. Tested on Windows. This is the simplest way I could think of to solve the problem: XMLParsing always resolves html entities (unless possibly when using the StAX parser, but that may not return the Document object as code expects). Entities start with ampersand and are resolved upon parsing, so too standalone ampersand signs. The earlier code, a bugfix for metadata not sticking to filenames/import folder structures containing non-ASCII or ampersands or plus signs, had caused the slow-down, as after each XML parse of the current metadata.xml file, the code would loop through each FileName element of the metadata.xml file and reintroduce the resolved html entities. The best and simplest solution that worked is simply to escape ampersands with %26 when writing out values for the FileName element and compare against filenames that have a similar substitution done. Still to test on Linux, but this reincorporates recent ideas for the bugfix that had worked on Linux (but then broke on Windows) so I feel somewhat confident that this commit is likely to largely work on Linux when I test it tomorrow.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r34414 r34415  
    7979        loaded_file = this;
    8080        loaded_file_document = document;
    81        
    82         reEncodeFilenamesInMetadataXML(loaded_file_document);
    8381    }
    8482
     
    108106        loaded_file = this;
    109107        loaded_file_document = document;
    110         reEncodeFilenamesInMetadataXML(loaded_file_document);
    111        
    112108    }
    113109
    114110    // Determine the file's path relative to the location of the metadata.xml file
    115111    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
    116 
    117112    String file_relative_path = file_node.getURLEncodedFilePath();
    118     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    119         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
    120     }
    121113    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
    122114   
     
    132124    }
    133125    else {
     126        // When XML files are parsed, predefined XML entities get resolved, which includes & in &amp; and &#x...;
     127        // see https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML
     128        // (and https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en)
     129        // We don't want &/entities in FileName elements stored in metadata.xml, as we'd have to put the entities
     130        // back (undo the xml entity resolution) after each XML parse operation, which is costly and slows GLI down
     131        // when assigning meta to multiple docs.
     132        // Instead, when writing out or comparing against FileName elements in metadata.xml, we ensure all
     133        // ampersands are replaced by their hex URL encoded value of %26.       
     134        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
     135       
    134136        // Convert the file path into a regular expression that will match it
    135137        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
     
    153155        Element current_filename_element = (Element) filename_elements_nodelist.item(j);
    154156        String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
    155 
     157       
    156158        // Only exact matches can be extended with new metadata
    157159        if (current_filename_element_value.equals(file_path_regexp)) {
     
    200202        // changed, it must be applied on the file(name) whose metadata has been adjusted
    201203        if(metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) {
    202             metadata_value_string = processFilenameEncoding(file_path_regexp,
     204            metadata_value_string = processFilenameEncoding(file_path_regexp, // file_path_regexp has & replaced by HEX_AMPERSAND but processFilenameEncoding doesn't use param
    203205                                        file_node, metadata_value_string, false);
    204206                              // true only if removing meta
     
    252254    loaded_file_changed = true;
    253255    }
    254 
    255 
    256     // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING
     256   
     257    // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING
    257258    // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand
     259   
     260   
     261    // By default, XML parsing automatically resolves certain predefined XML entities including the ampersand.
     262    // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML
     263    // "The XML specification defines five "predefined entities" representing special characters, and requires that all XML processors honor them. The entities can be explicitly declared in a DTD, as well, but if this is done, the replacement text must be the same as the built-in definitions. XML also allows other named entities of any size to be defined on a per-document basis."
     264    // Also https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en
     265    // which suggests using StAX instead of SAX or DOM parsers allows us to bypass automatic entity resolution.
     266    // However, https://docs.oracle.com/javase/tutorial/jaxp/stax/why.html and
     267    // https://docs.oracle.com/javase/tutorial/jaxp/stax/api.html show that StAX works like SAX rather than DOM parser
     268    // while the XMLTools.parseXML() that we use throughout this file relies on DOMParser behaviour to get access to the
     269    // XML DOM Document, so that it's not straightforward to replace DOMParser's use in Document XMLTools.parseXML() with
     270    // an equivalent using a streambased StAX parser.
     271    // Instead, method reEncodeFilenamesInMetadataXML(Doc doc) has been removed, as the solution is to no longer store
     272    // ampersands: no longer encoding ampersands to entities but to %26, and all hex entities in filenames are further
     273    // protected from XML's entity resolution because their ampersand prefixes are encoded as %26 (i.e. &#xDDDD; is
     274    // stored as %26#xDDDD;) and therefore we no longer need to go over the XML Doc reinstating entities after parseXML
     275    // either, entities being now preserved though with %26 prefixed in place of the & prefix. 
     276   
    258277    public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly)
    259278    {
     
    272291        loaded_file = this;
    273292        loaded_file_document = document;
    274        
    275         reEncodeFilenamesInMetadataXML(loaded_file_document);
    276293    }
    277294
     
    285302        file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
    286303    }
    287     String hexdecoded_regexed_file_relative_path = FilenameEncoding.decodeStringContainingHexEntities(file_relative_path);
    288 
     304   
     305    // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
     306    // To compare apples with apples convert any & to its hex url encoded value of %26 
     307    file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
     308   
    289309    // Build up a list of metadata assigned to this file
    290310    ArrayList metadata_values = new ArrayList();
     
    305325
    306326        //System.err.println("\n  Original TAIL filename was: " + Utility.debugUnicodeString(file.getName()));
    307         //System.err.println("Looking in meta.xml for hexdecoded_regexed_file_RELATIVE_path: " + hexdecoded_regexed_file_relative_path
    308             //+ " - debug version: " + Utility.debugUnicodeString(hexdecoded_regexed_file_relative_path));     
     327        //System.err.println("Looking in meta.xml for file_relative_path: " + file_relative_path);
     328            //+ " - debug version: " + Utility.debugUnicodeString(file_relative_path));     
    309329   
    310330        // Does this fileset specify metadata for one file only?
     
    315335        }
    316336
    317         String hexdecoded_current_filename_element_value = FilenameEncoding.decodeStringContainingHexEntities(current_filename_element_value);
    318         //System.err.println("   Checking to see if it matches " + hexdecoded_current_filename_element_value + " - debug: " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value));
    319             //System.err.println("   Checking to see if it matches " + current_filename_element_value + " - debug: " + Utility.debugUnicodeString(current_filename_element_value));
     337        //System.err.println("   Checking to see if it matches " + current_filename_element_value);// + " - debug: " + Utility.debugUnicodeString(current_filename_element_value));
    320338       
    321339        // This fileset specifies metadata for the file
    322340        // MetadataXMLFile.addMetadata(CollectionTreeNode, ArrayList) stored filename in uppercase hex
    323341        // so need to make sure everything hex has been decoded (no more hex) to compare apples with apples     
    324         if (hexdecoded_regexed_file_relative_path.matches(hexdecoded_current_filename_element_value)) { //if (file_relative_path.matches(current_filename_element_value)) {
    325                 //System.err.println("   @@@ Found a match in meta.xml for hexdecoded_regexed_file_relative_path: " + hexdecoded_regexed_file_relative_path + "\n");
     342        if (file_relative_path.matches(current_filename_element_value)) {
     343                //System.err.println("   @@@ Found a match in meta.xml for file_relative_path: " + file_relative_path + "\n");
    326344            current_fileset_matches = true;
    327345            if (!file_relative_path.equals("") && current_filename_element_value.equals(DIRECTORY_FILENAME)) {
     
    330348            break;
    331349        } //else {
    332             //System.err.println( hexdecoded_regexed_file_relative_path + " does not match " + hexdecoded_current_filename_element_value);
    333             //System.err.println( Utility.debugUnicodeString(hexdecoded_regexed_file_relative_path) + " does not match " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value));
     350            //System.err.println("    ###" + file_relative_path + " does not match " + current_filename_element_value);
     351            //System.err.println( Utility.debugUnicodeString(file_relative_path) + " does not match " + Utility.debugUnicodeString(current_filename_element_value));
    334352        //}
    335353       
    336354        // This fileset specifies metadata for the folder the file is in
    337         if (hexdecoded_regexed_file_relative_path.startsWith(hexdecoded_current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {
     355        if (file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {
    338356            current_fileset_matches = true;
    339357            folder_metadata_inherited_from = new File(metadata_xml_file_directory, current_filename_element_value);
     
    435453        loaded_file = this;
    436454        loaded_file_document = document;
    437        
    438         reEncodeFilenamesInMetadataXML(loaded_file_document);
    439455    }
    440456
     
    442458    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
    443459    String file_relative_path = file_node.getURLEncodedFilePath();
    444     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    445         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
    446     }
    447460    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
    448461    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
     
    457470    }
    458471    else {
     472        // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
     473        // To compare apples with apples convert any & to its hex url encoded value of %26 
     474        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
     475       
    459476        // Convert the file path into a regular expression that will match it
    460477        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
     
    558575        loaded_file = this;
    559576        loaded_file_document = document;
    560        
    561         reEncodeFilenamesInMetadataXML(loaded_file_document);
    562577    }
    563578
     
    565580    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
    566581    String file_relative_path = file_node.getURLEncodedFilePath();
    567     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    568         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
    569     }
    570582    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());   
    571583    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
     
    580592    }
    581593    else {
     594        // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
     595        // To compare apples with apples convert any & to its hex url encoded value of %26 
     596        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
     597       
    582598        // Convert the file path into a regular expression that will match it
    583599        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
     
    684700        }
    685701    }
    686 
    687     /**
    688      * parseXML(metadata.xml) has the side-effect of resolving html entities.
    689      * Although this is not done by the GLIEntityResolver usage in parseXML(), something
    690      * in parseXML() is resolving the html entities, including those used in carefully
    691      * html-entity-escaped filenames.
    692      * We need to get the filenames in the DOM correct after parsing a metadata.xml file
    693      * into memory, so that we have the correct filenames and so that we'll write it out correctly.
    694      * Therefore, always call this method after a successful parseXML() call on a metadata.xml.
    695      * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded.
    696      * At the end of this function, the doc will be modified with the re-encoded filenames.
    697      *
    698      * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL
    699      * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES
    700      * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM.
    701     */
    702     static private void reEncodeFilenamesInMetadataXML(Document doc) {
    703        
    704         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end       
    705         //System.err.println("@@@ curr_directory_path: " + curr_directory_path);
    706        
    707         //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
    708        
    709         // Read all the FileSet elements in the file
    710         NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT);
    711         for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
    712             Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
    713 
    714             // get the value of all FileName elements
    715             NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
    716             for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
    717                 Element filename_element = (Element) filename_elements_nodelist.item(j);
    718                 String filename = XMLTools.getElementTextValue(filename_element);
    719                 if(!filename.equals(DIRECTORY_FILENAME)) {
    720                     // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements
    721                    
    722                     //System.err.println("Filename before reencoding was: " + filename);                   
    723 
    724                     String encoded_filename = filename;
    725                     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
    726                     encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);
    727                     } else {
    728                     // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object
    729                     // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C.
    730                     encoded_filename = filename.replace("\\", "%5C");
    731                    
    732                     // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed
    733                     encoded_filename = FilenameEncoding.filenameToURLEncodingWithPrefixRemoved(encoded_filename, curr_directory_path);
    734                    
    735                     // Reintroduce the backslash characters in place of their %5C hex placeholders
    736                     encoded_filename = encoded_filename.replace("%5C", "\\");
    737                     }
    738                     // Update filename element in DOM
    739                     XMLTools.setElementTextValue(filename_element, encoded_filename);
    740                     //System.err.println("Filename after reencoding was: " + encoded_filename);
    741                 }
    742             }
    743         }       
    744         //System.err.println("RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
    745     }
    746702   
    747703    /**
     
    763719        return;
    764720    }
    765     // Always call this method after calling parseXMLFile
    766     reEncodeFilenamesInMetadataXML(document);
    767721   
    768722    // Read all the Metadata elements in the file
Note: See TracChangeset for help on using the changeset viewer.