Changeset 34415
- Timestamp:
- 2020-09-22T00:57:33+12:00 (4 years ago)
- Location:
- main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33748 r34415 77 77 public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 78 78 79 /** The hex entity version of the ampersand character. 79 /** The hex version of the ampersand character: previously hex entity (&#x26;) now hex url encoded (%26). 80 80 * We use this in place of the ampersand character in filenames in metadata.xml files to 81 81 * preserve the reference to the literal ampersand in the real file name on the file system. 82 82 */ 83 public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&#x26;"; 83 public static final String HEX_AMPERSAND = "%26"; //= FilenameEncoding.hexEntityForChar("&"); //"&#x26;"; 84 84 85 85 … … 257 257 258 258 /** URL encoded version of the byte codes of the given file's name */ 259 public static String calcURLEncodedFilePath(File file) { 260 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 261 return file.getAbsolutePath(); 262 } 263 else { 264 String filename = fileToURLEncoding(file); 265 return filename; 266 } 259 public static String calcURLEncodedFilePath(File file) { 260 return fileToURLEncoding(file); 267 261 } 268 262 … … 380 374 // just return input filename param, but with any & in the filename replaced with its hex entity 381 375 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 382 // protect ampersands in filenames by converting it to its hex entity 383 376 String filepath = file.getAbsolutePath(); 384 filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND); 385 377 return filepath; 386 378 } … … 430 422 431 423 // Before proceeding, protect & in the filename too. 432 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &#x26; (HEX_ENTITY_AMPERSAND) 424 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &#x26; (HEX_AMPERSAND) 433 425 // But dangerous to do simple replace if there are &#x...; entities in the filename already! 
434 426 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! … … 445 437 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 446 438 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 447 filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand 439 filename_url_encoded = filename_url_encoded.replace("%26", "&"); // now putting back ampersands too, instead of replacing with HEX_ENTITY_AMPERSAND (&#x26;) 448 440 } 449 441 catch (Exception e) { … … 544 536 // just return input filename param, but with any & in the filename replaced with its hex entity 545 537 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 546 return filename.replace("&", HEX_ENTITY_AMPERSAND); 538 return filename; //return filename.replace("&", HEX_AMPERSAND); 547 539 } 548 540 … … 567 559 public static String relativeFilenameToURLEncoding(String filename) { 568 560 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 569 return filename.replace("&", HEX_ENTITY_AMPERSAND); 561 return filename; // return filename.replace("&", HEX_AMPERSAND); 570 562 } 571 563 … … 580 572 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 581 573 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 582 return filename.replace("&", HEX_ENTITY_AMPERSAND); 574 return filename; //return filename.replace("&", HEX_AMPERSAND); 583 575 } 584 576 - 
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r34414 r34415 79 79 loaded_file = this; 80 80 loaded_file_document = document; 81 82 reEncodeFilenamesInMetadataXML(loaded_file_document);83 81 } 84 82 … … 108 106 loaded_file = this; 109 107 loaded_file_document = document; 110 reEncodeFilenamesInMetadataXML(loaded_file_document);111 112 108 } 113 109 114 110 // Determine the file's path relative to the location of the metadata.xml file 115 111 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 116 117 112 String file_relative_path = file_node.getURLEncodedFilePath(); 118 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {119 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);120 }121 113 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 122 114 … … 132 124 } 133 125 else { 126 // When XML files are parsed, predefined XML entities get resolved, which includes & in & and &#x...; 127 // see https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML 128 // (and https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en) 129 // We don't want &/entities in FileName elements stored in metadata.xml, as we'd have to put the entities 130 // back (undo the xml entity resolution) after each XML parse operation, which is costly and slows GLI down 131 // when assigning meta to multiple docs. 132 // Instead, when writing out or comparing against FileName elements in metadata.xml, we ensure all 133 // ampersands are replaced by their hex URL encoded value of %26. 
134 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 135 134 136 // Convert the file path into a regular expression that will match it 135 137 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); … … 153 155 Element current_filename_element = (Element) filename_elements_nodelist.item(j); 154 156 String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element); 155 157 156 158 // Only exact matches can be extended with new metadata 157 159 if (current_filename_element_value.equals(file_path_regexp)) { … … 200 202 // changed, it must be applied on the file(name) whose metadata has been adjusted 201 203 if(metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) { 202 metadata_value_string = processFilenameEncoding(file_path_regexp, 204 metadata_value_string = processFilenameEncoding(file_path_regexp, // file_path_regexp has & replaced by HEX_AMPERSAND but processFilenameEncoding doesn't use param 203 205 file_node, metadata_value_string, false); 204 206 // true only if removing meta … … 252 254 loaded_file_changed = true; 253 255 } 254 255 256 // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 256 257 // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 257 258 // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand 259 260 261 // By default, XML parsing automatically resolves certain predefined XML entities including the ampersand. 262 // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML 263 // "The XML specification defines five "predefined entities" representing special characters, and requires that all XML processors honor them. 
The entities can be explicitly declared in a DTD, as well, but if this is done, the replacement text must be the same as the built-in definitions. XML also allows other named entities of any size to be defined on a per-document basis." 264 // Also https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en 265 // which suggests using StAX instead of SAX or DOM parsers allows us to bypass automatic entity resolution. 266 // However, https://docs.oracle.com/javase/tutorial/jaxp/stax/why.html and 267 // https://docs.oracle.com/javase/tutorial/jaxp/stax/api.html show that StAX works like SAX rather than DOM parser 268 // while the XMLTools.parseXML() that we use throughout this file relies on DOMParser behaviour to get access to the 269 // XML DOM Document, so that it's not straightforward to replace DOMParser's use in Document XMLTools.parseXML() with 270 // an equivalent using a streambased StAX parser. 271 // Instead, method reEncodeFilenamesInMetadataXML(Doc doc) has been removed, as the solution is to no longer store 272 // ampersands: no longer encoding ampersands to entities but to %26, and all hex entities in filenames are further 273 // protected from XML's entity resolution because their ampersand prefixes are encoded as %26 (i.e. &#xDDDD; is 274 // stored as %26#xDDDD;) and therefore we no longer need to go over the XML Doc reinstating entities after parseXML 275 // either, entities being now preserved though with %26 prefixed in place of the & prefix. 
276 258 277 public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly) 259 278 { … … 272 291 loaded_file = this; 273 292 loaded_file_document = document; 274 275 reEncodeFilenamesInMetadataXML(loaded_file_document);276 293 } 277 294 … … 285 302 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 286 303 } 287 String hexdecoded_regexed_file_relative_path = FilenameEncoding.decodeStringContainingHexEntities(file_relative_path); 288 304 305 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 306 // To compare apples with apples convert any & to its hex url encoded value of %26 307 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 308 289 309 // Build up a list of metadata assigned to this file 290 310 ArrayList metadata_values = new ArrayList(); … … 305 325 306 326 //System.err.println("\n Original TAIL filename was: " + Utility.debugUnicodeString(file.getName())); 307 //System.err.println("Looking in meta.xml for hexdecoded_regexed_file_RELATIVE_path: " + hexdecoded_regexed_file_relative_path308 //+ " - debug version: " + Utility.debugUnicodeString( hexdecoded_regexed_file_relative_path));327 //System.err.println("Looking in meta.xml for file_relative_path: " + file_relative_path); 328 //+ " - debug version: " + Utility.debugUnicodeString(file_relative_path)); 309 329 310 330 // Does this fileset specify metadata for one file only? 
… … 315 335 } 316 336 317 String hexdecoded_current_filename_element_value = FilenameEncoding.decodeStringContainingHexEntities(current_filename_element_value); 318 //System.err.println(" Checking to see if it matches " + hexdecoded_current_filename_element_value + " - debug: " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value)); 319 //System.err.println(" Checking to see if it matches " + current_filename_element_value + " - debug: " + Utility.debugUnicodeString(current_filename_element_value)); 337 //System.err.println(" Checking to see if it matches " + current_filename_element_value);// + " - debug: " + Utility.debugUnicodeString(current_filename_element_value)); 320 338 321 339 // This fileset specifies metadata for the file 322 340 // MetadataXMLFile.addMetadata(CollectionTreeNode, ArrayList) stored filename in uppercase hex 323 341 // so need to make sure everything hex has been decoded (no more hex) to compare apples with apples 324 if ( hexdecoded_regexed_file_relative_path.matches(hexdecoded_current_filename_element_value)) { //if (file_relative_path.matches(current_filename_element_value)) {325 //System.err.println(" @@@ Found a match in meta.xml for hexdecoded_regexed_file_relative_path: " + hexdecoded_regexed_file_relative_path + "\n");342 if (file_relative_path.matches(current_filename_element_value)) { 343 //System.err.println(" @@@ Found a match in meta.xml for file_relative_path: " + file_relative_path + "\n"); 326 344 current_fileset_matches = true; 327 345 if (!file_relative_path.equals("") && current_filename_element_value.equals(DIRECTORY_FILENAME)) { … … 330 348 break; 331 349 } //else { 332 //System.err.println( hexdecoded_regexed_file_relative_path + " does not match " + hexdecoded_current_filename_element_value);333 //System.err.println( Utility.debugUnicodeString( hexdecoded_regexed_file_relative_path) + " does not match " + Utility.debugUnicodeString(hexdecoded_current_filename_element_value));350 
//System.err.println(" ###" + file_relative_path + " does not match " + current_filename_element_value); 351 //System.err.println( Utility.debugUnicodeString(file_relative_path) + " does not match " + Utility.debugUnicodeString(current_filename_element_value)); 334 352 //} 335 353 336 354 // This fileset specifies metadata for the folder the file is in 337 if ( hexdecoded_regexed_file_relative_path.startsWith(hexdecoded_current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {355 if (file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) { 338 356 current_fileset_matches = true; 339 357 folder_metadata_inherited_from = new File(metadata_xml_file_directory, current_filename_element_value); … … 435 453 loaded_file = this; 436 454 loaded_file_document = document; 437 438 reEncodeFilenamesInMetadataXML(loaded_file_document);439 455 } 440 456 … … 442 458 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 443 459 String file_relative_path = file_node.getURLEncodedFilePath(); 444 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {445 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);446 }447 460 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 448 461 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { … … 457 470 } 458 471 else { 472 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 
473 // To compare apples with apples convert any & to its hex url encoded value of %26 474 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 475 459 476 // Convert the file path into a regular expression that will match it 460 477 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); … … 558 575 loaded_file = this; 559 576 loaded_file_document = document; 560 561 reEncodeFilenamesInMetadataXML(loaded_file_document);562 577 } 563 578 … … 565 580 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 566 581 String file_relative_path = file_node.getURLEncodedFilePath(); 567 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {568 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);569 }570 582 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 571 583 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { … … 580 592 } 581 593 else { 594 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML. 
595 // To compare apples with apples convert any & to its hex url encoded value of %26 596 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND); 597 582 598 // Convert the file path into a regular expression that will match it 583 599 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path); … … 684 700 } 685 701 } 686 687 /**688 * parseXML(metadata.xml) has the side-effect of resolving html entities.689 * Although this is not done by the GLIEntityResolver usage in parseXML(), something690 * in parseXML() is resolving the html entities, including those used in carefully691 * html-entity-escaped filenames.692 * We need to get the filenames in the DOM correct after parsing a metadata.xml file693 * into memory, so that we have the correct filenames and so that we'll write it out correctly.694 * Therefore, always call this method after a successful parseXML() call on a metadata.xml.695 * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded.696 * At the end of this function, the doc will be modified with the re-encoded filenames.697 *698 * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL699 * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES700 * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM.701 */702 static private void reEncodeFilenamesInMetadataXML(Document doc) {703 704 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end705 //System.err.println("@@@ curr_directory_path: " + curr_directory_path);706 707 //System.err.println("PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));708 709 // Read all the FileSet elements in the file710 NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT);711 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {712 Element 
current_fileset_element = (Element) fileset_elements_nodelist.item(i);713 714 // get the value of all FileName elements715 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);716 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {717 Element filename_element = (Element) filename_elements_nodelist.item(j);718 String filename = XMLTools.getElementTextValue(filename_element);719 if(!filename.equals(DIRECTORY_FILENAME)) {720 // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements721 722 //System.err.println("Filename before reencoding was: " + filename);723 724 String encoded_filename = filename;725 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {726 encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND);727 } else {728 // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object729 // created by filenameToURLEncoding). 
So replace backslashes in regex with url-encoded hex-value of backslash, %5C.730 encoded_filename = filename.replace("\\", "%5C");731 732 // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed733 encoded_filename = FilenameEncoding.filenameToURLEncodingWithPrefixRemoved(encoded_filename, curr_directory_path);734 735 // Reintrodudce the backslash characters in place of their %5C hex placeholders736 encoded_filename = encoded_filename.replace("%5C", "\\");737 }738 // Update filename element in DOM739 XMLTools.setElementTextValue(filename_element, encoded_filename);740 //System.err.println("Filename after reencoding was: " + encoded_filename);741 }742 }743 }744 //System.err.println("RE-ENCODED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));745 }746 702 747 703 /** … … 763 719 return; 764 720 } 765 // Always call this method after calling parseXMLFile766 reEncodeFilenamesInMetadataXML(document);767 721 768 722 // Read all the Metadata elements in the file
Note:
See TracChangeset
for help on using the changeset viewer.