Changeset 33746 for main/trunk
- Timestamp:
- 2019-12-03T17:31:17+13:00 (4 years ago)
- Location:
- main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33745 r33746 73 73 * a file, it means this still needs to be retrieved. */ 74 74 public static Map map = new HashMap(); 75 76 /** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */ 77 public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 78 75 79 76 80 //*********************** BUSY REFRESHING / REQUIRING REFRESH ********************* … … 310 314 */ 311 315 public static String decodeStringContainingHexEntities(String str) { 312 String result = ""; 313 Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 314 Matcher matcher = hexPattern.matcher(str); 316 String result = ""; 317 Matcher matcher = HEX_PATTERN.matcher(str); 315 318 316 319 int searchFromIndex = 0; … … 327 330 String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match 328 331 // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string 329 // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int 330 331 //System.err.println("hexNumberStr so far: " + hexNumberStr); 332 // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int 333 334 //System.err.println("hexNumberStr so far: " + hexNumberStr); 332 335 hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD" 333 336 //int hexNumber = Integer.parseInt(hexNumberStr); … … 388 391 */ 389 392 public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) { 393 394 if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done 395 return str; 396 } 397 390 398 String char_as_string = Character.toString(CHARACTER); 391 399 String hexCodeString = hexEntityForChar(char_as_string); … … 393 401 //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString); 394 402 395 Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 396 Matcher hexPatternMatch 
= hexPattern.matcher(str); 397 398 // want to replace all & with &#x26; (the hex for ampersand) IFF the & is not already a hexcode/doesn't already match hexPattern 403 Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;" 404 405 // want to replace all & with &#x26; (the hex for ampersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN 399 406 int searchIndex = 0; 400 407 … … 423 430 424 431 // String has been modified, so have to update Matcher 425 hexPatternMatch = hexPattern.matcher(str);432 hexPatternMatch = HEX_PATTERN.matcher(str); 426 433 427 434 if(searchIndex >= str.length()) { … … 493 500 String filename_ascii = filename_uri.toASCIIString(); 494 501 502 // The URI.toASCIIString() call above only encodes values > 127. 503 // But we also need to protect + and & signs in filenames 504 filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 505 filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 38 in decimal, and 26 in hex 506 495 507 // Before proceeding, protect & in the filename too. 
496 508 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & … … 499 511 //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD 500 512 //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD 501 filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities 502 513 ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities 503 514 504 515 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 505 516 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 506 517 518 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 519 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 520 filename_url_encoded = filename_url_encoded.replace("%26", "&"); 507 521 } 508 522 catch (Exception e) { … … 605 619 606 620 File file = new File (filename); 607 //return fileToURLEncoding(file);608 621 String filename_url_encoded = fileToURLEncoding(file); 609 622 … … 630 643 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 631 644 return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path); 632 633 /*634 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");635 if (curr_directory_path.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) {636 curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. 
at end637 }638 639 File file = new File (filename);640 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath641 642 // now lop off the current dir prefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added643 filename_url_encoded = filename_url_encoded.substring(curr_directory_path.length());644 // remove any remaining slash prefix645 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {646 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());647 }648 649 return filename_url_encoded;650 */651 645 } 652 646 … … 660 654 } 661 655 662 /*if (removeFilePathPrefix.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) {663 removeFilePathPrefix = removeFilePathPrefix.substring(0, removeFilePathPrefix.length()-2); // cut off /. at end664 }665 */666 667 656 File file = new File (filename); 668 657 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 669 670 System.err.println("@@@ full url encoded filename: " + filename_url_encoded);671 658 672 659 // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added -
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33744 r33746 668 668 } 669 669 670 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 671 672 //String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 673 //curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 674 System.err.println("@@@ curr_directory_path: " + curr_directory_path); 670 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end 671 //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 675 672 676 673 //System.err.println("PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true)); … … 689 686 // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements 690 687 691 System.err.println("Filename before reencoding was: " + filename);688 //System.err.println("Filename before reencoding was: " + filename); 692 689 693 690 // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object 694 691 // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 
695 String encoded_filename = filename.replace("\\", "%5C"); 696 697 /* 698 encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename); 699 // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 700 encoded_filename = encoded_filename.substring(curr_directory_path.length()); 701 if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 702 encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 703 } 704 */ 692 String encoded_filename = filename.replace("\\", "%5C"); 705 693 706 694 // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed … … 708 696 709 697 // Reintroduce the backslash characters in place of their %5C hex placeholders 710 encoded_filename = encoded_filename.replace("%5C", "\\"); 698 encoded_filename = encoded_filename.replace("%5C", "\\"); 711 699 712 700 // Update filename element in DOM 713 701 XMLTools.setElementTextValue(filename_element, encoded_filename); 714 System.err.println("Filename after reencoding was: " + encoded_filename);702 //System.err.println("Filename after reencoding was: " + encoded_filename); 715 703 } 716 704 }
Note:
See TracChangeset
for help on using the changeset viewer.