Changeset 33728
- Timestamp:
- 2019-11-29T21:46:20+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java
r33727 r33728 34 34 import org.greenstone.gatherer.DebugStream; 35 35 36 import java.util.regex.Matcher; 37 import java.util.regex.Pattern; 38 39 40 36 41 /** Static access class that contains many of the methods used to work with filename encodings. 37 42 * Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager … … 291 296 292 297 //************************* GETTING THE URL ENCODING OF FILENAMES ********************************* 298 299 /** 300 * Given a String containing hexentities, will convert back into the unicode version of the String. 301 * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a 302 * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern 303 * and contains a copy of Utility.debugUnicodeString(String) with the following main function: 304 public static void main(String args[]) { 305 String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt"; 306 System.err.println("About to decode hex string: " + str); 307 String result = decodeStringContainingHexEntities(str); 308 System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result)); 309 } 310 */ 311 public static String decodeStringContainingHexEntities(String str) { 312 String result = ""; 313 boolean done = false; 314 Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 315 Matcher matcher = hexPattern.matcher(str); 316 317 int searchFromIndex = 0; 318 int endMatchIndex = -1; 319 320 while(matcher.find(searchFromIndex)) { 321 String hexPart = matcher.group(); 322 //System.err.println("Found hexpart match: " + hexPart); 323 324 int startMatchIndex = matcher.start(); 325 endMatchIndex = matcher.end(); 326 result += str.substring(searchFromIndex, startMatchIndex); 327 328 String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match 329 // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string 330 // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int 331 332 //System.err.println("hexNumberStr so far: " + hexNumberStr); 333 int tmpDigit = Integer.parseInt(hexNumberStr); 334 //System.err.println("As digit: " + tmpDigit); 335 hexNumberStr = String.format("%04d", tmpDigit); 336 //System.err.println("2 hexNumberStr so far: " + hexNumberStr); 337 hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD" 338 //int hexNumber = Integer.parseInt(hexNumberStr); 339 int hexNumber = Integer.decode(hexNumberStr); 340 String hexNumberAsChar = Character.toString((char) hexNumber); 341 result += hexNumberAsChar; 342 343 searchFromIndex = endMatchIndex; 344 345 } 346 347 if(endMatchIndex != -1) { 348 result += str.substring(endMatchIndex); 349 //System.err.println("suffix: " + str.substring(endMatchIndex)); 350 } 351 352 return result; 353 } 354 355 /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */ 356 public static String fileNameToHex(String filename) { 357 /*String filename_url_encoded = ""; 358 try { 359 URI filename_uri = new URI(filename); 360 String filename_ascii = filename_uri.toASCIIString(); 361 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 362 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 363 return filename_url_encoded; 364 } catch (Exception e) { 365 e.printStackTrace(); 366 // Give up trying to convert 367 filename_url_encoded = filename; 368 } 369 return filename_url_encoded; 370 */ 371 372 String hexFilename = ""; 373 for(int i = 0; i < filename.length(); i++) { 374 int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code 375 376 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 377 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 378 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 379 hexFilename += filename.charAt(i); 380 } else { 381 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 382 } 383 } 384 385 return hexFilename; 386 } 387 293 388 // Dr Bainbridge's methods 294 389 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding, … … 358 453 return filename_url_encoded; 359 454 } 360 361 /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */362 public static String fileNameToHex(String filename) {363 /*String filename_url_encoded = "";364 try {365 URI filename_uri = new URI(filename);366 String filename_ascii = filename_uri.toASCIIString();367 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");368 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);369 return filename_url_encoded;370 } catch (Exception e) {371 e.printStackTrace();372 // Give up trying to convert373 filename_url_encoded = filename;374 }375 return filename_url_encoded;376 */377 378 String hexFilename = "";379 for(int i = 0; i < filename.length(); i++) {380 int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code381 382 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png383 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)384 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing385 hexFilename += filename.charAt(i);386 } else {387 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"388 }389 }390 391 return hexFilename;392 }393 394 455 395 456 // For unicode codepoints see:
Note:
See TracChangeset
for help on using the changeset viewer.