Changeset 33747
- Timestamp:
- 2019-12-03T17:50:30+13:00 (4 years ago)
- Location:
- main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33746 r33747 354 354 } 355 355 356 357 // Dr Bainbridge's methods 358 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding, 359 * we can work with URL-encoded filenames in Java. Java works with whatever 360 * encoding the filesystem uses. Unlike systems working with UTF-8, where Java 361 * interprets filenames as UTF-8 (a destructive process since characters invalid 362 * for UTF-8 are replaced with the invalid character, which means the original 363 * character's byte codes can not be regained), working with an ISO-8859-1 364 * system means the original byte codes of the characters are preserved, 365 * regardless of whether the characters represent ISO-8859-1 or not. Such byte 366 * codes are converted by the following method to the correct URL versions of 367 * the strings that the filenames represent (that is, the correct URL representations 368 * of the filenames in their original encodings). This is useful for interactions with 369 * Perl as Java and Perl can use URL-encoded filenames to talk about the same files 370 * on the file system, instead of having to work out what encoding they are in. */ 371 372 public static String fileToURLEncoding(File file) { 373 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { 374 return file.getAbsolutePath(); 375 } 376 377 String filename_url_encoded = ""; 378 379 // The following test for whether the file exists or not is a problem 380 // when a File object--whose actual file is in the process of being moved 381 // and therefore temporarily does not 'exist' on the actual system--can't 382 // be URL encoded: the following would return "" when a file doesn't exist. 383 // So commenting out the test. 384 /* 385 if(!file.getName().equals("recycle")) { 386 if(!file.isFile() && !file.isDirectory()) { 387 System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath()); 388 return ""; 389 } 390 391 if(!file.exists()) { 392 System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath()); 393 return ""; //file.getName(); 394 } 395 } 396 */ 397 398 URI filename_uri = file.toURI(); 399 try { 400 // The trick: 401 // 1. toASCIIString() will %xx encode values > 127 402 // 2. Decode the result to "ISO-8859-1" 403 // 3. URL encode the bytes to string 404 405 // Step 2 forces the string to be 8-bit values. It 406 // doesn't matter if the starting raw filename was *not* 407 // in the ISO-8859-1 encoding, the effect is to ensure 408 // we have an 8-bit byte string that (numerically) 409 // captures the right value. These numerical values are 410 // then used to determine how to URL encode it 411 412 String filename_ascii = filename_uri.toASCIIString(); 413 414 // The URI.toASCIIString() call above only encodes values > 127. 415 // But we also need to protect + and & signs in filenames. Do this by URL encoding. 416 // But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter. 417 filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 418 filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex 419 420 // Before proceeding, protect & in the filename too. 421 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & 422 // But dangerous to do simple replace if there are &#x...; entities in the filename already! 423 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same! 424 //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD 425 //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD 426 ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities 427 428 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 429 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 430 431 // For chars that were protected by being URL encoded, now convert them to the correct version we want them in. 432 // For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex 433 // For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now. 434 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 435 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 436 filename_url_encoded = filename_url_encoded.replace("%26", "&"); // convert URL encoding for ampersand into hex entity for ampersand 437 } 438 catch (Exception e) { 439 e.printStackTrace(); 440 // Give up trying to convert 441 filename_url_encoded = file.getAbsolutePath(); 442 } 443 return filename_url_encoded; 444 } 445 446 // For unicode codepoints see: 447 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1) 448 // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex" 449 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek) 450 // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta" 451 public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename) 452 throws Exception 453 { 454 String urlEncoded = ""; 455 456 try { 457 // By this point we have a UTF-8 encoded string that captures 458 // what the ISO-8859-1 (Latin-1) character is that corresponded to the 459 // 8-bit numeric value for that character in the filename 460 // on the file system 461 462 // For example: 463 // File system char: <lower-case beta char in Latin-7> = %E2 464 // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2 465 // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2> 466 467 // Our task is to take the string the contains <C3><A2> and ensure that 468 // we "see" it as <E2> 469 470 byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1"); 471 String unicode_filename = new String(raw_bytes,"UTF-8"); 472 473 for(int i = 0; i < unicode_filename.length(); i++) { 474 char charVal = unicode_filename.charAt(i); 475 if ((int)charVal > 255) { 476 urlEncoded += String.format("&#x%02X;", (int)charVal); 477 } 478 else if((int)charVal > 127) { 479 urlEncoded += String.format("%%%02X", (int)charVal); 480 } else { 481 urlEncoded += String.format("%c", (char)charVal); 482 } 483 } 484 } 485 catch (Exception e) { 486 //e.printStackTrace(); 487 throw(e); 488 } 489 490 return urlEncoded; 491 } 492 493 // unused for now 494 public static String raw_filename_to_url_encoded(String fileName) 495 throws Exception 496 { 497 String urlEncoded = ""; 498 try { 499 byte[] bytes = fileName.getBytes(); 500 501 for(int i = 0; i < bytes.length; i++) { 502 // mask each byte (by applying & 0xFF) to make the signed 503 // byte (in the range -128 to 127) unsigned (in the range 504 // 0 to 255). 505 506 int byteVal = (int)(bytes[i] & 0xFF); 507 508 if(byteVal > 127) { 509 urlEncoded += String.format("%%%02X", (int)byteVal); 510 } else { 511 urlEncoded += String.format("%c",(char)byteVal); 512 } 513 } 514 } 515 catch (Exception e) { 516 //e.printStackTrace(); 517 throw(e); 518 } 519 520 return urlEncoded; 521 } 522 523 // FURTHER HELPER METHODS 524 525 /** 526 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter. 527 * If filename is relative, then the current directory (gli?) will be prefixed to what is returned 528 * and should be removed manually by the caller. Alternatively, for relative paths, call the variant 529 * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix. 530 */ 531 public static String fullFilepathToURLEncoding(String filename) { 532 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 533 return filename; 534 } 535 536 File file = new File (filename); 537 String filename_url_encoded = fileToURLEncoding(file); 538 539 // if the current directory (".") was passed in as filename, 540 // then the filename_url_encoded looks like /full/path/./ 541 // In that case, remove the ./ at the end 542 if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) { 543 filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end 544 } 545 546 return filename_url_encoded; 547 } 548 549 /** 550 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter 551 * If filename is a relative path, call this method to get it specially URL encoded. 552 * This method will remove the current directory that is prefixed as an intermediary step. 553 */ 554 public static String relativeFilenameToURLEncoding(String filename) { 555 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 556 return filename; 557 } 558 559 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 560 return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path); 561 } 562 563 /** 564 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter 565 * Convenience method that will return the specially URL encoded version of filename 566 * with the provided removeFilePathPrefix removed */ 567 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) { 568 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param 569 return filename; 570 } 571 572 File file = new File (filename); 573 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 574 575 // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 576 filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length()); 577 // remove any remaining slash prefix 578 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 579 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 580 } 581 582 return filename_url_encoded; 583 } 584 585 // UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write. 586 356 587 /** 357 588 * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter … … 375 606 return hex_str; 376 607 } 377 378 608 379 609 /** Takes a String containing a single char and returns the hex entity for it */ … … 390 620 * as in that case, we don't want to replace any existing hex entities already present in the String. 391 621 */ 392 public static String escapeAllCharWithHexEntity(String str, char CHARACTER /*, String hexCodeString*/) {622 public static String escapeAllCharWithHexEntity(String str, char CHARACTER) { 393 623 394 624 if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done … … 398 628 String char_as_string = Character.toString(CHARACTER); 399 629 String hexCodeString = hexEntityForChar(char_as_string); 400 401 //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString);402 630 403 631 Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;" … … 441 669 return str; 442 670 } 443 444 445 // Dr Bainbridge's methods446 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,447 * we can work with URL-encoded filenames in Java. Java works with whatever448 * encoding the filesystem uses. Unlike systems working with UTF-8, where Java449 * interprets filenames as UTF-8 (a destructive process since characters invalid450 * for UTF-8 are replaced with the invalid character, which means the original451 * character's byte codes can not be regained), working with an ISO-8859-1452 * system means the original byte codes of the characters are preserved,453 * regardless of whether the characters represent ISO-8859-1 or not. Such byte454 * codes are converted by the following method to the correct URL versions of455 * the strings that the filenames represent (that is, the correct URL representations456 * of the filenames in their original encodings). This is useful for interactions with457 * Perl as Java and Perl can use URL-encoded filenames to talk about the same files458 * on the file system, instead of having to work out what encoding they are in. */459 460 public static String fileToURLEncoding(File file) {461 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {462 return file.getAbsolutePath();463 }464 465 String filename_url_encoded = "";466 467 // The following test for whether the file exists or not is a problem468 // when a File object--whose actual file is in the process of being moved469 // and therefore temporarily does not 'exist' on the actual system--can't470 // be URL encoded: the following would return "" when a file doesn't exist.471 // So commenting out the test.472 /*473 if(!file.getName().equals("recycle")) {474 if(!file.isFile() && !file.isDirectory()) {475 System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());476 return "";477 }478 479 if(!file.exists()) {480 System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());481 return ""; //file.getName();482 }483 }484 */485 486 URI filename_uri = file.toURI();487 try {488 // The trick:489 // 1. toASCIIString() will %xx encode values > 127490 // 2. Decode the result to "ISO-8859-1"491 // 3. URL encode the bytes to string492 493 // Step 2 forces the string to be 8-bit values. It494 // doesn't matter if the starting raw filename was *not*495 // in the ISO-8859-1 encoding, the effect is to ensure496 // we have an 8-bit byte string that (numerically)497 // captures the right value. These numerical values are498 // then used to determine how to URL encode it499 500 String filename_ascii = filename_uri.toASCIIString();501 502 // The URI.toASCIIString() call above only encodes values > 127.503 // But we also need to protect + and & signs in filenames504 filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased505 filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex506 507 // Before proceeding, protect & in the filename too.508 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &509 // But dangerous to do simple replace if there are &#x...; entities in the filename already!510 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!511 //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD512 //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD513 ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities514 515 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");516 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);517 518 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller519 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex520 filename_url_encoded = filename_url_encoded.replace("%26", "&");521 }522 catch (Exception e) {523 e.printStackTrace();524 // Give up trying to convert525 filename_url_encoded = file.getAbsolutePath();526 }527 return filename_url_encoded;528 }529 530 // For unicode codepoints see:531 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)532 // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"533 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)534 // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"535 public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)536 throws Exception537 {538 String urlEncoded = "";539 540 try {541 // By this point we have a UTF-8 encoded string that captures542 // what the ISO-8859-1 (Latin-1) character is that corresponded to the543 // 8-bit numeric value for that character in the filename544 // on the file system545 546 // For example:547 // File system char: <lower-case beta char in Latin-7> = %E2548 // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2549 // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>550 551 // Our task is to take the string the contains <C3><A2> and ensure that552 // we "see" it as <E2>553 554 byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");555 String unicode_filename = new String(raw_bytes,"UTF-8");556 557 for(int i = 0; i < unicode_filename.length(); i++) {558 char charVal = unicode_filename.charAt(i);559 if ((int)charVal > 255) {560 urlEncoded += String.format("&#x%02X;", (int)charVal);561 }562 else if((int)charVal > 127) {563 urlEncoded += String.format("%%%02X", (int)charVal);564 } else {565 urlEncoded += String.format("%c", (char)charVal);566 }567 }568 }569 catch (Exception e) {570 //e.printStackTrace();571 throw(e);572 }573 574 return urlEncoded;575 }576 577 // unused for now578 public static String raw_filename_to_url_encoded(String fileName)579 throws Exception580 {581 String urlEncoded = "";582 try {583 byte[] bytes = fileName.getBytes();584 585 for(int i = 0; i < bytes.length; i++) {586 // mask each byte (by applying & 0xFF) to make the signed587 // byte (in the range -128 to 127) unsigned (in the range588 // 0 to 255).589 590 int byteVal = (int)(bytes[i] & 0xFF);591 592 if(byteVal > 127) {593 urlEncoded += String.format("%%%02X", (int)byteVal);594 } else {595 urlEncoded += String.format("%c",(char)byteVal);596 }597 }598 }599 catch (Exception e) {600 //e.printStackTrace();601 throw(e);602 }603 604 return urlEncoded;605 }606 607 // FURTHER HELPER METHODS608 609 /**610 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.611 * If filename is relative, then the current directory (gli?) will be prefixed to what is returned612 * and should be removed manually by the caller. Alternatively, for relative paths, call the variant613 * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.614 */615 public static String fullFilepathToURLEncoding(String filename) {616 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param617 return filename;618 }619 620 File file = new File (filename);621 String filename_url_encoded = fileToURLEncoding(file);622 623 // if the current directory (".") was passed in as filename,624 // then the filename_url_encoded looks like /full/path/./625 // In that case, remove the ./ at the end626 if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {627 filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end628 }629 630 return filename_url_encoded;631 }632 633 /**634 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter635 * If filename is a relative path, call this method to get it specially URL encoded.636 * This method will remove the current directory that is prefixed as an intermediary step.637 */638 public static String relativeFilenameToURLEncoding(String filename) {639 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param640 return filename;641 }642 643 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");644 return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);645 }646 647 /**648 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter649 * Convenience method that will return the specially URL encoded version of filename650 * with the provided removeFilePathPrefix removed */651 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {652 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param653 return filename;654 }655 656 File file = new File (filename);657 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath658 659 // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added660 filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());661 // remove any remaining slash prefix662 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {663 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());664 }665 666 return filename_url_encoded;667 }668 671 } -
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33746 r33747 631 631 static public void saveLoadedFile() 632 632 { 633 // If we have a file loaded into memory and it has been modified, save it now 634 if (loaded_file != null && loaded_file_changed == true) { 635 //System.err.println("START saveLoadedFile(), loaded_file_document:\n" + XMLTools.elementToString(loaded_file_document.getDocumentElement(), true)); 636 637 XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements); 638 639 /* // DEBUGGING: 640 Document doc = XMLTools.parseXMLFile(loaded_file); 641 System.err.println("AT END saveLoadedFile(), PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true)); 642 643 reEncodeFilenamesInMetadataXML(doc); 644 System.err.println("AT END saveLoadedFile(), RE-ENCODED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true)); 645 */ 646 loaded_file_changed = false; 647 } 648 649 //System.err.println("@@@@ END of saveLoadedFile()"); 650 //Utility.printCaller(); 633 // If we have a file loaded into memory and it has been modified, save it now 634 if (loaded_file != null && loaded_file_changed == true) { 635 //System.err.println("START saveLoadedFile(), loaded_file_document:\n" + XMLTools.elementToString(loaded_file_document.getDocumentElement(), true)); 636 637 XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements); 638 639 loaded_file_changed = false; 640 } 651 641 } 652 642
Note:
See TracChangeset
for help on using the changeset viewer.