Changeset 33739
- Timestamp: 2019-12-02T23:15:50+13:00 (4 years ago)
- Location: main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33738 r33739 355 355 } 356 356 357 /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */ 358 public static String fileNameToHex(String filename) { 359 360 String hexFilename = ""; 361 for(int i = 0; i < filename.length(); i++) { 362 int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code 357 /** 358 * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter 359 * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method. 360 */ 361 public static String stringToHex(String str) { 362 363 String hex_str = ""; 364 for(int i = 0; i < str.length(); i++) { 365 int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 363 366 364 367 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 365 368 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 366 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too367 hex Filename += filename.charAt(i);369 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. 
And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too 370 hex_str += str.charAt(i); 368 371 } else { 369 hex Filename+= "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"372 hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 370 373 } 371 374 } 372 375 373 return hex Filename;376 return hex_str; 374 377 } 375 378 379 380 /** Takes a String containing a single char and returns the hex entity for it */ 381 public static String hexEntityForChar(String char_as_string) { 382 int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code 383 String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";"; 384 return hexCodeStr; 385 } 386 387 /** 388 * Given a String containing 0 or more occurrences of CHARACTER, 389 * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;" 390 * Special care is taken where the CHARACTER to be replaced is &, 391 * as in that case, we don't want to replace any existing hex entities already present in the String. 
392 */ 393 public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) { 394 String char_as_string = Character.toString(CHARACTER); 395 String hexCodeString = hexEntityForChar(char_as_string); 396 397 //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString); 398 399 Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 400 Matcher hexPatternMatch = hexPattern.matcher(str); 401 402 // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match hexPattern 403 int searchIndex = 0; 404 405 boolean finished = false; 406 while(!finished) { 407 408 searchIndex = str.indexOf(CHARACTER, searchIndex); 409 410 if(searchIndex == -1) { 411 finished = true; 412 } 413 else { 414 415 // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string: 416 if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) { 417 searchIndex = hexPatternMatch.end(); 418 } else { 419 420 String tmp = str.substring(0, searchIndex) + hexCodeString; 421 searchIndex++; 422 if(str.length() > searchIndex) { 423 tmp += str.substring(searchIndex); 424 } 425 str = tmp; 426 searchIndex = searchIndex+ hexCodeString.length() - 1; 427 428 // String has been modified, so have to update Matcher 429 hexPatternMatch = hexPattern.matcher(str); 430 431 if(searchIndex >= str.length()) { 432 finished = true; 433 } 434 } 435 } 436 } 437 438 return str; 439 } 440 376 441 377 442 // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter … … 428 493 return filename; 429 494 } 430 431 495 File file = new File (filename); 432 496 return fileToURLEncoding(file); … … 454 518 } 455 519 520 // we'll want to protect & by replacing with &'s hex value 521 // but we don't want to replace &#x....; with the same! 
522 Pattern plain_ampersand_not_hex_prefix_Pattern = Pattern.compile("&[^#]"); 523 524 525 int containsAmp = 0; 526 if(file.getName().contains("&")) { 527 System.err.println("@@@ 1 to encode " + file.getName()); 528 containsAmp = 1; 529 } else if(file.getName().contains("&")) { 530 System.err.println("@@@ 2 to encode " + file.getName()); 531 containsAmp = 2; 532 } else { 533 System.err.println("@@@ 0 to encode " + file.getName()); 534 } 535 536 456 537 String filename_url_encoded = ""; 457 538 … … 489 570 // then used to determine how to URL encode it 490 571 491 String filename_ascii = filename_uri.toASCIIString(); 492 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 572 String filename_ascii = filename_uri.toASCIIString(); 573 // protect & and + in the filename too 574 filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); 575 576 if(containsAmp > 0) System.err.println("@@@ filename_ascii: " + filename_ascii); 577 578 579 //if(containsAmp > 0) System.err.println("@@@ filename_ascii with hexed &: " + filename_ascii); 580 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 493 581 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 582 583 584 //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex 585 //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 586 //if(containsAmp > 0) System.err.println("@@@ filename_url_encoded: " + filename_url_encoded); 494 587 495 588 } -
main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java
r33738 r33739 668 668 } 669 669 670 String metadata_xml_file_directory_path = FilenameEncoding.filenameToURLEncoding(".");671 metadata_xml_file_directory_path = metadata_xml_file_directory_path.substring(0, metadata_xml_file_directory_path.length()-2); // cut off /. at end672 System.err.println("@@@ metadata_xml_file_directory_path: " + metadata_xml_file_directory_path);670 String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 671 curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 672 //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 673 673 674 674 //System.err.println("PARSED loaded_file contains:\n" + XMLTools.elementToString(doc.getDocumentElement(), true)); … … 695 695 696 696 // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 697 encoded_filename = encoded_filename.substring( metadata_xml_file_directory_path.length());697 encoded_filename = encoded_filename.substring(curr_directory_path.length()); 698 698 if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 699 699 encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
Note: See TracChangeset for help on using the changeset viewer.