Changeset 34415

Show
Ignore:
Timestamp:
22.09.2020 00:57:33 (5 weeks ago)
Author:
ak19
Message:

Bugfix for slowdown when assigning meta to multiple gathered docs in GLI's Enrich pane. Tested on Windows. This is the simplest way I could think of to solve the problem: XML parsing always resolves html entities (unless possibly when using the StAX parser, but that may not return the Document object as the code expects). Entities start with an ampersand and are resolved upon parsing, as are standalone ampersand signs. The earlier code, a bugfix for metadata not sticking to filenames/import folder structures containing non-ASCII characters, ampersands or plus signs, had caused the slow-down: after each XML parse of the current metadata.xml file, the code would loop through each FileName element of the metadata.xml file and reintroduce the resolved html entities. The best and simplest solution that worked is to escape ampersands as %26 when writing out values for the FileName element, and to compare against filenames that have had the same substitution done. Still to test on Linux, but this reincorporates recent ideas for the bugfix that had worked on Linux (but then broke on Windows), so I feel somewhat confident that this commit is likely to largely work on Linux when I test it tomorrow.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33748 r34415  
    7777    public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
    7878 
    79     /** The hex entity version of the ampersand character. 
     79    /** The hex version of the ampersand character: previously hex entity (&#x26) now hex url encoded (%26). 
    8080     * We use this in place of the ampersand character in filenames in metadata.xml files to 
    8181     * preserve the reference to the literal ampersand in the real file name on the file system. 
    8282     */ 
    83     public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&#x26;"; 
     83    public static final String HEX_AMPERSAND = "%26"; //= FilenameEncoding.hexEntityForChar("&"); //"&#x26;"; 
    8484     
    8585 
     
    257257     
    258258    /** URL encoded version of the byte codes of the given file's name */ 
    259     public static String calcURLEncodedFilePath(File file) {     
    260         if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    261             return file.getAbsolutePath(); 
    262         } 
    263         else { 
    264             String filename = fileToURLEncoding(file); 
    265             return filename; 
    266         } 
     259    public static String calcURLEncodedFilePath(File file) { 
     260        return fileToURLEncoding(file); 
    267261    } 
    268262 
     
    380374    // just return input filename param, but with any & in the filename replaced with its hex entity 
    381375        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    382             // protect ampersands in filenames by converting it to its hex entity 
    383376            String filepath = file.getAbsolutePath(); 
    384             filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND); 
    385377            return filepath; 
    386378        } 
     
    430422             
    431423            // Before proceeding, protect & in the filename too. 
    432             // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26; (HEX_ENTITY_AMPERSAND) 
     424            // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with %26 (HEX_AMPERSAND) 
    433425            // But dangerous to do simple replace if there are &#x...; entities in the filename already! 
    434426            // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! 
     
    445437            //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 
    446438            filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 
    447             filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand 
     439            filename_url_encoded = filename_url_encoded.replace("%26", "&"); // now putting back ampersands too, instead of replacing with HEX_ENTITY_AMPERSAND (&#x26;) 
    448440        } 
    449441        catch (Exception e) { 
     
    544536        // just return input filename param, but with any & in the filename replaced with its hex entity 
    545537        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    546             return filename.replace("&", HEX_ENTITY_AMPERSAND); 
     538            return filename; //return filename.replace("&", HEX_AMPERSAND); 
    547539        } 
    548540         
     
    567559    public static String relativeFilenameToURLEncoding(String filename) { 
    568560        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 
    569             return filename.replace("&", HEX_ENTITY_AMPERSAND); 
     561            return filename; // return filename.replace("&", HEX_AMPERSAND); 
    570562        } 
    571563         
     
    580572    public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 
    581573        if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 
    582             return filename.replace("&", HEX_ENTITY_AMPERSAND); 
     574            return filename; //return filename.replace("&", HEX_AMPERSAND); 
    583575        } 
    584576         
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r34414 r34415  
    7979        loaded_file = this; 
    8080        loaded_file_document = document; 
    81          
    82         reEncodeFilenamesInMetadataXML(loaded_file_document); 
    8381    } 
    8482 
     
    108106        loaded_file = this; 
    109107        loaded_file_document = document; 
    110         reEncodeFilenamesInMetadataXML(loaded_file_document); 
    111          
    112108    } 
    113109 
    114110    // Determine the file's path relative to the location of the metadata.xml file 
    115111    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 
    116  
    117112    String file_relative_path = file_node.getURLEncodedFilePath(); 
    118     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    119         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
    120     } 
    121113    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 
    122114     
     
    132124    } 
    133125    else { 
     126        // When XML files are parsed, predefined XML entities get resolved, which includes &amp; (resolved to &) and &#x...; character references 
     127        // see https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML 
     128        // (and https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en) 
     129        // We don't want &/entities in FileName elements stored in metadata.xml, as we'd have to put the entities 
     130        // back (undo the xml entity resolution) after each XML parse operation, which is costly and slows GLI down 
     131        // when assigning meta to multiple docs. 
     132        // Instead, when writing out or comparing against FileName elements in metadata.xml, we ensure all 
     133        // ampersands are replaced by their hex URL encoded value of %26.        
     134        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 
     135         
    134136        // Convert the file path into a regular expression that will match it 
    135137        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); 
     
    153155        Element current_filename_element = (Element) filename_elements_nodelist.item(j); 
    154156        String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element); 
    155  
     157         
    156158        // Only exact matches can be extended with new metadata 
    157159        if (current_filename_element_value.equals(file_path_regexp)) { 
     
    200202        // changed, it must be applied on the file(name) whose metadata has been adjusted 
    201203        if(metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) { 
    202             metadata_value_string = processFilenameEncoding(file_path_regexp,  
     204            metadata_value_string = processFilenameEncoding(file_path_regexp, // file_path_regexp has & replaced by HEX_AMPERSAND but processFilenameEncoding doesn't use param 
    203205                                        file_node, metadata_value_string, false); 
    204206                              // true only if removing meta 
     
    252254    loaded_file_changed = true; 
    253255    } 
    254  
    255  
    256     // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 
     256     
     257    // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 
    257258    // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand 
     259     
     260     
     261    // By default, XML parsing automatically resolves certain predefined XML entities including the ampersand. 
     262    // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML 
     263    // "The XML specification defines five "predefined entities" representing special characters, and requires that all XML processors honor them. The entities can be explicitly declared in a DTD, as well, but if this is done, the replacement text must be the same as the built-in definitions. XML also allows other named entities of any size to be defined on a per-document basis." 
     264    // Also https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en 
     265    // which suggests using StAX instead of SAX or DOM parsers allows us to bypass automatic entity resolution. 
     266    // However, https://docs.oracle.com/javase/tutorial/jaxp/stax/why.html and 
     267    // https://docs.oracle.com/javase/tutorial/jaxp/stax/api.html show that StAX works like SAX rather than DOM parser 
     268    // while the XMLTools.parseXML() that we use throughout this file relies on DOMParser behaviour to get access to the 
     269    // XML DOM Document, so that it's not straightforward to replace DOMParser's use in Document XMLTools.parseXML() with 
     270    // an equivalent using a streambased StAX parser. 
     271    // Instead, method reEncodeFilenamesInMetadataXML(Doc doc) has been removed, as the solution is to no longer store 
     272    // ampersands: no longer encoding ampersands to entities but to %26, and all hex entities in filenames are further 
     273    // protected from XML's entity resolution because their ampersand prefixes are encoded as %26 (i.e. &#xDDDD; is 
     274    // stored as %26#xDDDD;) and therefore we no longer need to go over the XML Doc reinstating entities after parseXML 
     275    // either, entities being now preserved though with %26 prefixed in place of the & prefix.   
     276     
    258277    public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly) 
    259278    { 
     
    272291        loaded_file = this; 
    273292        loaded_file_document = document; 
    274          
    275         reEncodeFilenamesInMetadataXML(loaded_file_document); 
    276293    } 
    277294 
     
    285302        file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 
    286303    } 
    287     String hexdecoded_regexed_file_relative_path = FilenameEncoding.decodeStringContainingHexEntities(file_relative_path); 
    288  
     304     
     305    // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 
     306    // To compare apples with apples convert any & to its hex url encoded value of %26   
     307    file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 
     308     
    289309    // Build up a list of metadata assigned to this file 
    290310    ArrayList metadata_values = new ArrayList(); 
     
    305325 
    306326        //System.err.println("\n  Original TAIL filename was: " + Utility.debugUnicodeString(file.getName())); 
    307         //System.err.println("Looking in meta.xml for hexdecoded_regexed_file_RELATIVE_path: " + hexdecoded_regexed_file_relative_path 
    308             //+ " - debug version: " + Utility.debugUnicodeString(hexdecoded_regexed_file_relative_path));       
     327        //System.err.println("Looking in meta.xml for file_relative_path: " + file_relative_path); 
     328            //+ " - debug version: " + Utility.debugUnicodeString(file_relative_path));      
    309329     
    310330        // Does this fileset specify metadata for one file only? 
     
    315335        } 
    316336 
    317         String hexdecoded_current_filename_element_value = FilenameEncoding.decodeStringContainingHexEntities(current_filename_element_value); 
    318         //System.err.println("   Checking to see if it matches " + hexdecoded_current_filename_element_value + " - debug: " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value)); 
    319             //System.err.println("   Checking to see if it matches " + current_filename_element_value + " - debug: " + Utility.debugUnicodeString(current_filename_element_value)); 
     337        //System.err.println("   Checking to see if it matches " + current_filename_element_value);// + " - debug: " + Utility.debugUnicodeString(current_filename_element_value)); 
    320338         
    321339        // This fileset specifies metadata for the file 
    322340        // MetadataXMLFile.addMetadata(CollectionTreeNode, ArrayList) stored filename in uppercase hex 
    323341        // so need to make sure everything hex has been decoded (no more hex) to compare apples with apples      
    324         if (hexdecoded_regexed_file_relative_path.matches(hexdecoded_current_filename_element_value)) { //if (file_relative_path.matches(current_filename_element_value)) { 
    325                 //System.err.println("   @@@ Found a match in meta.xml for hexdecoded_regexed_file_relative_path: " + hexdecoded_regexed_file_relative_path + "\n"); 
     342        if (file_relative_path.matches(current_filename_element_value)) { 
     343                //System.err.println("   @@@ Found a match in meta.xml for file_relative_path: " + file_relative_path + "\n"); 
    326344            current_fileset_matches = true; 
    327345            if (!file_relative_path.equals("") && current_filename_element_value.equals(DIRECTORY_FILENAME)) { 
     
    330348            break; 
    331349        } //else { 
    332             //System.err.println( hexdecoded_regexed_file_relative_path + " does not match " + hexdecoded_current_filename_element_value); 
    333             //System.err.println( Utility.debugUnicodeString(hexdecoded_regexed_file_relative_path) + " does not match " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value)); 
     350            //System.err.println("    ###" + file_relative_path + " does not match " + current_filename_element_value); 
     351            //System.err.println( Utility.debugUnicodeString(file_relative_path) + " does not match " + Utility.debugUnicodeString(current_filename_element_value)); 
    334352        //} 
    335353         
    336354        // This fileset specifies metadata for the folder the file is in 
    337         if (hexdecoded_regexed_file_relative_path.startsWith(hexdecoded_current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) { 
     355        if (file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) { 
    338356            current_fileset_matches = true; 
    339357            folder_metadata_inherited_from = new File(metadata_xml_file_directory, current_filename_element_value); 
     
    435453        loaded_file = this; 
    436454        loaded_file_document = document; 
    437          
    438         reEncodeFilenamesInMetadataXML(loaded_file_document); 
    439455    } 
    440456 
     
    442458    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 
    443459    String file_relative_path = file_node.getURLEncodedFilePath(); 
    444     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    445         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
    446     } 
    447460    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 
    448461    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
     
    457470    } 
    458471    else { 
     472        // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 
     473        // To compare apples with apples convert any & to its hex url encoded value of %26   
     474        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 
     475         
    459476        // Convert the file path into a regular expression that will match it 
    460477        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); 
     
    558575        loaded_file = this; 
    559576        loaded_file_document = document; 
    560          
    561         reEncodeFilenamesInMetadataXML(loaded_file_document); 
    562577    } 
    563578 
     
    565580    String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 
    566581    String file_relative_path = file_node.getURLEncodedFilePath(); 
    567     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    568         file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
    569     } 
    570582    file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());    
    571583    if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
     
    580592    } 
    581593    else { 
     594        // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 
     595        // To compare apples with apples convert any & to its hex url encoded value of %26   
     596        file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 
     597         
    582598        // Convert the file path into a regular expression that will match it 
    583599        file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); 
     
    684700        } 
    685701    } 
    686  
    687     /**  
    688      * parseXML(metadata.xml) has the side-effect of resolving html entities. 
    689      * Although this is not done by the GLIEntityResolver usage in parseXML(), something 
    690      * in parseXML() is resolving the html entities, including those used in carefully 
    691      * html-entity-escaped filenames. 
    692      * We need to get the filenames in the DOM correct after parsing a metadata.xml file 
    693      * into memory, so that we have the correct filenames and so that we'll write it out correctly. 
    694      * Therefore, always call this method after a successful parseXML() call on a metadata.xml. 
    695      * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded. 
    696      * At the end of this function, the doc will be modified with the re-encoded filenames. 
    697      *  
    698      * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL 
    699      * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES 
    700      * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM. 
    701     */ 
    702     static private void reEncodeFilenamesInMetadataXML(Document doc) { 
    703          
    704         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end         
    705         //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 
    706          
    707         //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
    708          
    709         // Read all the FileSet elements in the file 
    710         NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT); 
    711         for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) { 
    712             Element current_fileset_element = (Element) fileset_elements_nodelist.item(i); 
    713  
    714             // get the value of all FileName elements 
    715             NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT); 
    716             for (int j = 0; j < filename_elements_nodelist.getLength(); j++) { 
    717                 Element filename_element = (Element) filename_elements_nodelist.item(j); 
    718                 String filename = XMLTools.getElementTextValue(filename_element); 
    719                 if(!filename.equals(DIRECTORY_FILENAME)) { 
    720                     // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements 
    721                      
    722                     //System.err.println("Filename before reencoding was: " + filename);                     
    723  
    724                     String encoded_filename = filename; 
    725                     if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 
    726                     encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 
    727                     } else { 
    728                     // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object 
    729                     // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 
    730                     encoded_filename = filename.replace("\\", "%5C"); 
    731                      
    732                     // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed 
    733                     encoded_filename = FilenameEncoding.filenameToURLEncodingWithPrefixRemoved(encoded_filename, curr_directory_path); 
    734                      
    735                     // Reintroduce the backslash characters in place of their %5C hex placeholders 
    736                     encoded_filename = encoded_filename.replace("%5C", "\\"); 
    737                     } 
    738                     // Update filename element in DOM 
    739                     XMLTools.setElementTextValue(filename_element, encoded_filename); 
    740                     //System.err.println("Filename after reencoding was: " + encoded_filename); 
    741                 } 
    742             } 
    743         }        
    744         //System.err.println("RE-ENCODED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
    745     } 
    746702     
    747703    /** 
     
    763719        return; 
    764720    } 
    765     // Always call this method after calling parseXMLFile 
    766     reEncodeFilenamesInMetadataXML(document); 
    767721     
    768722    // Read all the Metadata elements in the file