Changeset 33748 for main/trunk
- Timestamp:
- 2019-12-03T21:06:44+13:00 (4 years ago)
- Location:
- main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33747 r33748 76 76 /** Compiled pattern for hex entities of characters. These are of the form "&#x....;" with 1 to 4 digits */ 77 77 public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 78 78 79 /** The hex entity version of the ampersand character. 80 * We use this in place of the ampersand character in filenames in metadata.xml files to 81 * preserve the reference to the literal ampersand in the real file name on the file system. 82 */ 83 public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); //"&#x26;"; 84 79 85 80 86 //*********************** BUSY REFRESHING / REQUIRING REFRESH ********************* … … 371 377 372 378 public static String fileToURLEncoding(File file) { 379 // on a UTF-8 file system, DO NOT do the stuff further below, 380 // just return input filename param, but with any & in the filename replaced with its hex entity 373 381 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 374 return file.getAbsolutePath(); 382 // protect ampersands in filenames by converting them to their hex entity 383 String filepath = file.getAbsolutePath(); 384 filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND); 385 return filepath; 375 386 } 376 387 … … 419 430 420 431 // Before proceeding, protect & in the filename too. 421 // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26; 432 // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26; (HEX_ENTITY_AMPERSAND) 422 433 // But dangerous to do simple replace if there are &#x...; entities in the filename already! 423 434 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
… … 434 445 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 435 446 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 436 filename_url_encoded = filename_url_encoded.replace("%26", "&#x26;"); // convert URL encoding for ampersand into hex entity for ampersand 447 filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand 437 448 } 438 449 catch (Exception e) { … … 530 541 */ 531 542 public static String fullFilepathToURLEncoding(String filename) { 532 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 533 return filename; 543 // on a UTF-8 file system, DO NOT do the stuff further below, 544 // just return input filename param, but with any & in the filename replaced with its hex entity 545 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 546 return filename.replace("&", HEX_ENTITY_AMPERSAND); 534 547 } 535 548 … … 554 567 public static String relativeFilenameToURLEncoding(String filename) { 555 568 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 556 return filename; 569 return filename.replace("&", HEX_ENTITY_AMPERSAND); 557 570 } 558 571 … … 567 580 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 568 581 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 569 return filename; 582 return filename.replace("&", HEX_ENTITY_AMPERSAND); 570 583 } 571 584 -
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33747 r33748 87 87 // Determine the file's path relative to the location of the metadata.xml file 88 88 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile()); 89 String file_relative_path = file_node.getURLEncodedFilePath().substring(metadata_xml_file_directory_path.length()); 89 90 String file_relative_path = file_node.getURLEncodedFilePath(); 91 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 92 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 93 } 94 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length()); 95 90 96 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 91 97 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); … … 103 109 } 104 110 111 // LEAVE THIS DEBUGGING STATEMENT IN - USEFUL TO DEBUG FILENAME ENCODING ISSUES WHEN META ASSIGNED 105 112 //System.err.println("MetadataXMLFile.addMetadata() Adding meta for file regexp: " 106 113 // + file_path_regexp + " - " + org.greenstone.gatherer.util.Utility.debugUnicodeString(file_path_regexp)); … … 220 227 221 228 229 // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING 230 // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand 222 231 public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly) 223 232 { … … 652 661 * At the end of this function, the doc will be modified with the re-encoded filenames. 653 662 * 663 * DO NOT REMOVE THE DEBUGGING STATEMENTS IN THIS FUNCTION, AS THEY'RE USEFUL 664 * FOR DEBUGGING ENCODING ISSUES TO DO WITH FILE LEVEL META ASSIGNED TO FILENAMES 665 * THAT ARE NON-ASCII OR CONTAIN +/ampersands IN THEM. 
654 666 */ 655 667 static private void reEncodeFilenamesInMetadataXML(Document doc) { 656 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 657 return; 658 } 659 668 660 669 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end … … 677 686 678 687 //System.err.println("Filename before reencoding was: " + filename); 679 688 689 String encoded_filename = filename; 690 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 691 encoded_filename = encoded_filename.replace("&", FilenameEncoding.HEX_ENTITY_AMPERSAND); 692 } else { 680 693 // Can't convert to URI with backslash-escaped chars (backslashes used in a regexed filename are illegal in the URI object 681 694 // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 682 String encoded_filename = filename.replace("\\", "%5C"); 695 encoded_filename = filename.replace("\\", "%5C"); 683 696 684 697 // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed … … 687 700 // Reintroduce the backslash characters in place of their %5C hex placeholders 688 701 encoded_filename = encoded_filename.replace("%5C", "\\"); 689 690 691 692 702 } 703 // Update filename element in DOM 704 XMLTools.setElementTextValue(filename_element, encoded_filename); 705 //System.err.println("Filename after reencoding was: " + encoded_filename); 693 706 } 694 707 }
Note:
See TracChangeset
for help on using the changeset viewer.