Changeset 33737
- Timestamp: 2019-12-02T20:03:57+13:00 (4 years ago)
- Location: main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33730 r33737
364 364 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
365 365 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
366 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 ) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
366 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
367 367 hexFilename += filename.charAt(i);
368 368 } else {
… …
374 374 }
375 375 }
376
377 // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
378 public static String filenameToURLEncoding(String filename) {
379 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
380 return filename;
381 }
382
383 // Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20
384 String filename_url_encoded = filename.replace(" ", "%20");
385 //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
386 //filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
387
388 try {
389 URI filename_uri = new URI(filename_url_encoded);
390 // The trick:
391 // 1. toASCIIString() will %xx encode values > 127
392 // 2. Decode the result to "ISO-8859-1"
393 // 3. URL encode the bytes to string
394
395 // Step 2 forces the string to be 8-bit values. It
396 // doesn't matter if the starting raw filename was *not*
397 // in the ISO-8859-1 encoding, the effect is to ensure
398 // we have an 8-bit byte string that (numerically)
399 // captures the right value. These numerical values are
400 // then used to determine how to URL encode it
401
402 String filename_ascii = filename_uri.toASCIIString();
403 //filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
404 //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
405 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
406 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
407
408 // DEALING WITH & and + in filenames: NOT WORKING YET
409 //if(filename_url_encoded.contains("&")) {
410 // filename_url_encoded = filename_url_encoded.replace("&", "%36amp;");
411 //} else if(filename_url_encoded.contains("&")) {
412 // filename_url_encoded = filename_url_encoded.replace("&", "%36");
413 //}
414
415 }
416 catch (Exception e) {
417 e.printStackTrace();
418 // Give up trying to convert
419 filename_url_encoded = filename;
420 }
421 return filename_url_encoded;
422 }
423
424
425 // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
426 public static String _filenameToURLEncoding(String filename) {
427 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
428 return filename;
429 }
430
431 File file = new File (filename);
432 return fileToURLEncoding(file);
433 }
434
435
376 436 // Dr Bainbridge's methods
377 437 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33730 r33737
81 81 loaded_file = this;
82 82 loaded_file_document = document;
83 reEncodeFilenamesInMetadataXML(loaded_file_document);
84
83 85 }
84 86
… …
234 236 loaded_file = this;
235 237 loaded_file_document = document;
238
239 reEncodeFilenamesInMetadataXML(loaded_file_document);
236 240 }
237 241
… …
395 399 loaded_file = this;
396 400 loaded_file_document = document;
401
402 reEncodeFilenamesInMetadataXML(loaded_file_document);
397 403 }
398 404
… …
512 518 loaded_file = this;
513 519 loaded_file_document = document;
520
521 reEncodeFilenamesInMetadataXML(loaded_file_document);
514 522 }
515 523
… …
629 637 XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements);
630 638
639 /* // DEBUGGING:
631 640 Document doc = XMLTools.parseXMLFile(loaded_file);
632 //System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));
633 641 System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));
642
643 reEncodeFilenamesInMetadataXML(doc);
644 System.err.println("AT END saveLoadedFile(), RE-ENCODED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));
645 */
634 646 loaded_file_changed = false;
635 647 }
648
649 //System.err.println("@@@@ END of saveLoadedFile()");
650 //Utility.printCaller();
636 651 }
637 652
638 653 /**
654 * parseXML(metadata.xml) has the side-effect of resolving html entities.
655 * Although this is not done by the GLIEntityResolver usage in parseXML(), something
656 * in parseXML() is resolving the html entities, including those used in carefully
657 * html-entity-escaped filenames.
658 * We need to get the filenames in the DOM correct after parsing a metadata.xml file
659 * into memory, so that we have the correct filenames and so that we'll write it out correctly.
660 * Therefore, always call this method after a successful parseXML() call on a metadata.xml.
661 * @param doc is the Document where the FILENAME_ELEMENTs need to be re-encoded.
662 * At the end of this function, the doc will be modified with the re-encoded filenames.
663 *
664 */
665 static private void reEncodeFilenamesInMetadataXML(Document doc) {
666 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
667 return;
668 }
669
670 //System.err.println("PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));
671
672 // Read all the FileSet elements in the file
673 NodeList fileset_elements_nodelist = doc.getElementsByTagName(FILESET_ELEMENT);
674 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
675 Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
676
677 // get the value of all FileName elements
678 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
679 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
680 Element filename_element = (Element) filename_elements_nodelist.item(j);
681 String filename = XMLTools.getElementTextValue(filename_element);
682 if(!filename.equals(DIRECTORY_FILENAME)) {
683 //System.err.println("Filename before reencoding was: " + filename);
684 // reencode filename
685 // can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object
686 // created by filenameToURLEncoding).
687 String encoded_filename = filename.replace("\\", "%5C");
688 encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename);
689 // escape chars for regex again
690 encoded_filename = encoded_filename.replace("%5C", "\\");
691 XMLTools.setElementTextValue(filename_element, encoded_filename);
692 //System.err.println("Filename after reencoding was: " + encoded_filename);
693 }
694 }
695 }
696 //System.err.println("RE-ENCODED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true));
697 }
698
639 699 /**
640 700 * Every metadata.xml file must be skimmed when a collection is opened, for three very important reasons:
… …
655 715 return;
656 716 }
657 717 // Always call this method after calling parseXMLFile
718 reEncodeFilenamesInMetadataXML(document);
719
658 720 // Read all the Metadata elements in the file
659 721 HashMap target_metadata_element_name_attrs_cache = new HashMap();
Note: See TracChangeset for help on using the changeset viewer.