- Timestamp:
- 2019-12-03T15:04:32+13:00 (4 years ago)
- Location:
- main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33739 r33744 438 438 return str; 439 439 } 440 441 442 // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter443 public static String UNUSED_filenameToURLEncoding(String filename) {444 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {445 return filename;446 }447 448 // Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20449 String filename_url_encoded = filename.replace(" ", "%20");450 //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex451 //filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased452 453 try {454 URI filename_uri = new URI(filename_url_encoded);455 // The trick:456 // 1. toASCIIString() will %xx encode values > 127457 // 2. Decode the result to "ISO-8859-1"458 // 3. URL encode the bytes to string459 460 // Step 2 forces the string to be 8-bit values. It461 // doesn't matter if the starting raw filename was *not*462 // in the ISO-8859-1 encoding, the effect is to ensure463 // we have an 8-bit byte string that (numerically)464 // captures the right value. 
These numerical values are465 // then used to determine how to URL encode it466 467 String filename_ascii = filename_uri.toASCIIString();468 //filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex469 //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased470 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");471 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);472 473 // DEALING WITH & and + in filenames: NOT WORKING YET474 //if(filename_url_encoded.contains("&")) {475 // filename_url_encoded = filename_url_encoded.replace("&", "%36amp;");476 //} else if(filename_url_encoded.contains("&")) {477 // filename_url_encoded = filename_url_encoded.replace("&", "%36");478 //}479 480 }481 catch (Exception e) {482 e.printStackTrace();483 // Give up trying to convert484 filename_url_encoded = filename;485 }486 return filename_url_encoded;487 }488 489 490 // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter491 public static String filenameToURLEncoding(String filename) {492 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param493 return filename;494 }495 File file = new File (filename);496 return fileToURLEncoding(file);497 }498 440 499 441 … … 517 459 return file.getAbsolutePath(); 518 460 } 519 520 // we'll want to protect & by replacing with &'s hex value521 // but we don't want to replace &#x....; with the same!522 Pattern plain_ampersand_not_hex_prefix_Pattern = Pattern.compile("&[^#]");523 524 525 int containsAmp = 0;526 if(file.getName().contains("&")) {527 System.err.println("@@@ 1 to encode " + file.getName());528 containsAmp = 1;529 } else if(file.getName().contains("&")) {530 System.err.println("@@@ 2 to encode " + file.getName());531 containsAmp = 2;532 } else {533 System.err.println("@@@ 0 to 
encode " + file.getName());534 }535 536 461 537 462 String filename_url_encoded = ""; … … 571 496 572 497 String filename_ascii = filename_uri.toASCIIString(); 573 // protect & and + in the filename too 574 filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); 575 576 if(containsAmp > 0) System.err.println("@@@ filename_ascii: " + filename_ascii); 577 578 579 //if(containsAmp > 0) System.err.println("@@@ filename_ascii with hexed &: " + filename_ascii); 498 499 // Before proceeding, protect & in the filename too. 500 // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26; 501 // But dangerous to do simple replace if there are &#x...; entities in the filename already! 502 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! 503 //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD 504 //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD 505 filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities 506 507 580 508 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 581 509 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 582 583 584 //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 38 in decimal, and 26 in hex585 //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased586 //if(containsAmp > 0) System.err.println("@@@ filename_url_encoded: " + filename_url_encoded);587 510 588 511 } … … 672 595 } 673 596 597 // FURTHER HELPER METHODS 598 599 /** 600 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter. 601 * If filename is relative, then the current directory (gli?)
will be prefixed to what is returned 602 * and should be removed manually by the caller. Alternatively, for relative paths, call the variant 603 * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix. 604 */ 605 public static String fullFilepathToURLEncoding(String filename) { 606 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 607 return filename; 608 } 609 610 File file = new File (filename); 611 //return fileToURLEncoding(file); 612 String filename_url_encoded = fileToURLEncoding(file); 613 614 // if the current directory (".") was passed in as filename, 615 // then the filename_url_encoded looks like /full/path/./ 616 // In that case, remove the ./ at the end 617 if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) { 618 filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end 619 } 620 621 return filename_url_encoded; 622 } 623 624 /** 625 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter 626 * If filename is a relative path, call this method to get it specially URL encoded. 627 * This method will remove the current directory that is prefixed as an intermediary step. 
628 */ 629 public static String relativeFilenameToURLEncoding(String filename) { 630 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 631 return filename; 632 } 633 634 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 635 return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path); 636 637 /* 638 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 639 if (curr_directory_path.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) { 640 curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 641 } 642 643 File file = new File (filename); 644 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 645 646 // now lop off the current dir prefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 647 filename_url_encoded = filename_url_encoded.substring(curr_directory_path.length()); 648 // remove any remaining slash prefix 649 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 650 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 651 } 652 653 return filename_url_encoded; 654 */ 655 } 656 657 /** 658 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter 659 * Convenience method that will return the specially URL encoded version of filename 660 * with the provided removeFilePathPrefix removed */ 661 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 662 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 663 return filename; 664 } 665 666 /*if (removeFilePathPrefix.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) { 667 removeFilePathPrefix = removeFilePathPrefix.substring(0, 
removeFilePathPrefix.length()-2); // cut off /. at end 668 } 669 */ 670 671 File file = new File (filename); 672 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 673 674 System.err.println("@@@ full url encoded filename: " + filename_url_encoded); 675 676 // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 677 filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length()); 678 // remove any remaining slash prefix 679 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 680 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 681 } 682 683 return filename_url_encoded; 684 } 674 685 } -
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33739 r33744 668 668 } 669 669 670 String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 671 curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 672 //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 670 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 671 672 //String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 673 //curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 674 System.err.println("@@@ curr_directory_path: " + curr_directory_path); 673 675 674 676 //System.err.println("PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true)); … … 687 689 // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements 688 690 689 //System.err.println("Filename before reencoding was: " + filename);691 System.err.println("Filename before reencoding was: " + filename); 690 692 691 693 // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object 692 694 // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 
693 695 String encoded_filename = filename.replace("\\", "%5C"); 696 697 /* 694 698 encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename); 695 696 699 // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 697 700 encoded_filename = encoded_filename.substring(curr_directory_path.length()); … … 699 702 encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 700 703 } 701 704 */ 705 706 // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed 707 encoded_filename = FilenameEncoding.filenameToURLEncodingWithPrefixRemoved(encoded_filename, curr_directory_path); 708 702 709 // Reintroduce the backslash characters in place of their %5C hex placeholders 703 710 encoded_filename = encoded_filename.replace("%5C", "\\"); … … 705 712 // Update filename element in DOM 706 713 XMLTools.setElementTextValue(filename_element, encoded_filename); 707 //System.err.println("Filename after reencoding was: " + encoded_filename);714 System.err.println("Filename after reencoding was: " + encoded_filename); 708 715 } 709 716 }
Note:
See TracChangeset
for help on using the changeset viewer.