[23433] | 1 | /**
|
---|
| 2 | *############################################################################
|
---|
| 3 | * A component of the Greenstone Librarian Interface, part of the Greenstone
|
---|
| 4 | * digital library suite from the New Zealand Digital Library Project at the
|
---|
| 5 | * University of Waikato, New Zealand.
|
---|
| 6 | *
|
---|
| 7 | * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
|
---|
| 8 | *
|
---|
| 9 | * Copyright (C) 2010 Greenstone Digital Library Project
|
---|
| 10 | *
|
---|
| 11 | * This program is free software; you can redistribute it and/or modify
|
---|
| 12 | * it under the terms of the GNU General Public License as published by
|
---|
| 13 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 14 | * (at your option) any later version.
|
---|
| 15 | *
|
---|
| 16 | * This program is distributed in the hope that it will be useful,
|
---|
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 19 | * GNU General Public License for more details.
|
---|
| 20 | *
|
---|
| 21 | * You should have received a copy of the GNU General Public License
|
---|
| 22 | * along with this program; if not, write to the Free Software
|
---|
| 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 24 | *############################################################################
|
---|
| 25 | */
|
---|
| 26 |
|
---|
| 27 | package org.greenstone.gatherer.metadata;
|
---|
| 28 |
|
---|
| 29 | import java.io.File;
|
---|
| 30 | import java.net.*;
|
---|
| 31 | import java.nio.charset.*;
|
---|
| 32 | import java.util.*;
|
---|
| 33 | import org.greenstone.gatherer.collection.CollectionManager;
|
---|
[29793] | 34 | import org.greenstone.gatherer.DebugStream;
|
---|
[23433] | 35 |
|
---|
[33728] | 36 | import java.util.regex.Matcher;
|
---|
| 37 | import java.util.regex.Pattern;
|
---|
| 38 |
|
---|
| 39 |
|
---|
| 40 |
|
---|
[23433] | 41 | /** Static access class that contains many of the methods used to work with filename encodings.
|
---|
| 42 | * Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
|
---|
| 43 | * to maintain a map of URLEncodedFilenames to their filename encodings.
|
---|
| 44 | * The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
|
---|
| 45 | * FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
|
---|
| 46 |
|
---|
| 47 | public class FilenameEncoding {
|
---|
| 48 | /** When debugging, filenames in the trees are displayed in their URL-encoded form. */
|
---|
| 49 | public static boolean DEBUGGING = false;
|
---|
| 50 |
|
---|
| 51 | /** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
|
---|
| 52 | * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
|
---|
| 53 | * converted so that the bytecodes in the filename are not preserved. */
|
---|
| 54 | public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
|
---|
| 55 |
|
---|
| 56 | /** Also set by Gatherer.
|
---|
| 57 | * If the OS supports multiple filename encodings, we will be working with URL strings
|
---|
| 58 | * and the applicable separators are always the forward slash ("/") not File.separator.
|
---|
| 59 | * If multiple filename encodings are not supported, we're dealing with File.separator. */
|
---|
| 60 | public static String URL_FILE_SEPARATOR = File.separator;
|
---|
| 61 |
|
---|
| 62 |
|
---|
| 63 | /** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
|
---|
| 64 | * with a file, but is to be applied in real-time on the file's name in the CollectionTree
|
---|
| 65 | * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
|
---|
| 66 | * its nodes are expanded and contracted, storing the filename encodings of each file along
|
---|
| 67 | * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
|
---|
| 68 | * the encoding at every stage by querying the metadataXML file, we store the encodings for
|
---|
| 69 | * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
|
---|
| 70 | * The current design of the map is to only store any active filename metadata assigned
|
---|
| 71 | * directly at that file/folder's level, and if there is none discovered at that level, then
|
---|
| 72 | * storing the empty string for it. Therefore, if the hashmap contains no entry for
|
---|
| 73 | * a file, it means this still needs to be retrieved. */
|
---|
| 74 | public static Map map = new HashMap();
|
---|
| 75 |
|
---|
[23436] | 76 | //*********************** BUSY REFRESHING / REQUIRING REFRESH *********************
|
---|
| 77 |
|
---|
[23433] | 78 | /** True if filename encoding metadata was changed. Set (via setRefreshRequired) on the enter keyPress
|
---|
| 79 | * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
|
---|
| 80 | private static boolean refreshRequired = false;
|
---|
| 81 |
|
---|
[23436] | 82 | synchronized public static boolean isRefreshRequired() {
|
---|
[23433] | 83 | return refreshRequired;
|
---|
| 84 | }
|
---|
| 85 |
|
---|
[23436] | 86 | synchronized public static void setRefreshRequired(boolean state) {
|
---|
[23433] | 87 | if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
| 88 | refreshRequired = state;
|
---|
| 89 | } else {
|
---|
| 90 | refreshRequired = false;
|
---|
| 91 | }
|
---|
| 92 | }
|
---|
| 93 |
|
---|
| 94 | //************************** MAP RETRIEVAL METHODS ******************************
|
---|
| 95 |
|
---|
| 96 | /** Returns the cumulative gs.filenameEncoding metadata
|
---|
| 97 | * assigned to a file inside the collection. */
|
---|
| 98 | public static String findFilenameEncoding(
|
---|
| 99 | File file, String urlEncodedFilePath, boolean bruteForceLookup)
|
---|
| 100 | {
|
---|
| 101 | //if(bruteForceLookup) {
|
---|
| 102 | // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
|
---|
| 103 | //}
|
---|
| 104 |
|
---|
| 105 | String encoding = "";
|
---|
| 106 |
|
---|
| 107 | // Check any assigned encoding at this level, starting with the map first
|
---|
| 108 | // and else retrieving the filename encoding from the metadata file
|
---|
| 109 | if(!map.containsKey(urlEncodedFilePath)) {
|
---|
| 110 |
|
---|
| 111 | // Check for filename encoding metadata *directly* associated with the file
|
---|
| 112 | // Now don't need to get any inherited encoding metadata here, because of
|
---|
| 113 | // the way we're storing and retrieving encoding information from the map.
|
---|
| 114 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
|
---|
| 115 | if(!list.isEmpty()) {
|
---|
| 116 | MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
|
---|
| 117 | encoding = metavalue.getValue();
|
---|
| 118 | } // else no filename encoding set yet at this level
|
---|
| 119 |
|
---|
| 120 | // Now we've done a lookup at this level cache the result in the map,
|
---|
| 121 | // including empty strings, to indicate that we've done a full lookup
|
---|
| 122 | map.put(urlEncodedFilePath, encoding);
|
---|
| 123 | }
|
---|
| 124 | else { // an entry exists in the map, get it from there
|
---|
| 125 | encoding = (String)map.get(urlEncodedFilePath);
|
---|
| 126 | }
|
---|
| 127 |
|
---|
| 128 | // if no meta was specified at at the file level, look for any inherited metadata
|
---|
| 129 | if(encoding.equals("")) {
|
---|
| 130 | encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
|
---|
| 131 | }
|
---|
| 132 |
|
---|
| 133 | //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding);
|
---|
| 134 | return encoding; // found something in map, may still be "", but it's what was stored
|
---|
| 135 | }
|
---|
| 136 |
|
---|
| 137 | /** Checks the file-to-encoding map for all the superfolders of the given
|
---|
| 138 | * filename in sequence for an applicable encoding. Note that the file/folder
|
---|
| 139 | * at the level of urlFoldername (and dir) has already been inspected. */
|
---|
| 140 | static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
|
---|
| 141 | {
|
---|
| 142 | String encoding = "";
|
---|
| 143 | boolean done = false;
|
---|
| 144 |
|
---|
| 145 | // don't want to search past import folder which is as
|
---|
| 146 | // far as we need to go to determine inherited encodings
|
---|
| 147 | File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
|
---|
| 148 | if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
|
---|
| 149 | done = true;
|
---|
| 150 | }
|
---|
| 151 |
|
---|
| 152 | // For directories, first remove trailing file separator in order to start checking from higher level folders
|
---|
| 153 | int lastIndex = urlFoldername.length()-1;
|
---|
| 154 | char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
|
---|
| 155 | if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
|
---|
| 156 | urlFoldername = urlFoldername.substring(0, lastIndex);
|
---|
| 157 | }
|
---|
| 158 |
|
---|
| 159 | while(!done) {
|
---|
| 160 | // get the folder that's one level up
|
---|
| 161 | dir = dir.getParentFile();
|
---|
| 162 |
|
---|
| 163 | int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
|
---|
| 164 | if(index == -1) { // no more slashes
|
---|
| 165 | done = true;
|
---|
| 166 | } else {
|
---|
| 167 | urlFoldername = urlFoldername.substring(0, index);
|
---|
| 168 | }
|
---|
| 169 |
|
---|
| 170 | // now look in the map to see whether there's an encoding for this folder
|
---|
| 171 | String folder = urlFoldername + URL_FILE_SEPARATOR;
|
---|
| 172 | if(map.containsKey(folder)) {
|
---|
| 173 | encoding = (String)map.get(folder); // may be ""
|
---|
| 174 | } else { // no entry in map, so look in the metadata.xml at this folder level
|
---|
| 175 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
|
---|
| 176 | dir, true); // true: gets gs.filenameEncoding only
|
---|
| 177 | if(!list.isEmpty()) {
|
---|
| 178 | MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
|
---|
| 179 | encoding = metavalue.getValue();
|
---|
| 180 | }
|
---|
| 181 | map.put(folder, encoding); // may be ""
|
---|
| 182 | }
|
---|
| 183 |
|
---|
| 184 | if(!encoding.equals("")){
|
---|
| 185 | done = true;
|
---|
| 186 | } // else if "", loop to check next folder up
|
---|
| 187 | else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
|
---|
| 188 | done = true;
|
---|
| 189 | }
|
---|
| 190 | }
|
---|
| 191 |
|
---|
| 192 | return encoding;
|
---|
| 193 | }
|
---|
| 194 |
|
---|
| 195 | /** Called by GUIManager when a collection is closed. This then empties the
|
---|
| 196 | * file-to-encoding map which is applicable only on a per-collection basis */
|
---|
| 197 | static public void closeCollection() {
|
---|
| 198 | //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
|
---|
| 199 | map.clear();
|
---|
| 200 | }
|
---|
| 201 |
|
---|
| 202 | // Useful for debugging: prints contents of file-to-encoding map
|
---|
| 203 | static public void printFilenameMap(String heading) {
|
---|
| 204 | System.err.println("\n********************************************");
|
---|
| 205 | System.err.println(heading.toUpperCase());
|
---|
| 206 | Iterator entries = map.entrySet().iterator();
|
---|
| 207 | while(entries.hasNext()) {
|
---|
| 208 | Map.Entry entry = (Map.Entry)entries.next();
|
---|
| 209 | System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
|
---|
| 210 | }
|
---|
| 211 | System.err.println("********************************************\n");
|
---|
| 212 | }
|
---|
| 213 |
|
---|
| 214 | // UNUSED at present. Brute force version of the findFilenameEncoding() method
|
---|
| 215 | // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to
|
---|
| 216 | // work out the encoding applicable to a file/folder.
|
---|
| 217 | public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
|
---|
| 218 | boolean bruteForceLookup)
|
---|
| 219 | {
|
---|
| 220 | System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
|
---|
| 221 |
|
---|
| 222 |
|
---|
| 223 | String encoding = "";
|
---|
| 224 |
|
---|
| 225 | // Check for filename encoding metadata *directly* associated with the file
|
---|
| 226 | // Now don't need to get any inherited encoding metadata here, because of
|
---|
| 227 | // the way we're storing and retrieving encoding information from the map.
|
---|
| 228 |
|
---|
| 229 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
|
---|
| 230 | if(!list.isEmpty()) {
|
---|
| 231 | // try to get the filename encoding meta that was assigned last to this
|
---|
| 232 | // file, even though it makes no sense to have multiple values for it
|
---|
| 233 | MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
|
---|
| 234 | encoding = metavalue.getValue();
|
---|
| 235 |
|
---|
| 236 | if(encoding == null) { // unlikely ???
|
---|
| 237 | System.err.println("**** ERROR: encoding for "
|
---|
| 238 | + urlEncodedFilename + " is NULL!");
|
---|
| 239 | encoding = "";
|
---|
| 240 | }
|
---|
| 241 | } // else no filename encoding set yet, perhaps
|
---|
| 242 | //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
|
---|
| 243 | return encoding;
|
---|
| 244 | }
|
---|
| 245 |
|
---|
| 246 | //****************************** APPLYING ENCODINGS TO FILENAMES *****************************
|
---|
| 247 |
|
---|
| 248 | /** URL encoded version of the byte codes of the given file's name */
|
---|
| 249 | public static String calcURLEncodedFilePath(File file) {
|
---|
| 250 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
| 251 | return file.getAbsolutePath();
|
---|
| 252 | }
|
---|
| 253 | else {
|
---|
| 254 | String filename = fileToURLEncoding(file);
|
---|
| 255 | return filename;
|
---|
| 256 | }
|
---|
| 257 | }
|
---|
| 258 |
|
---|
| 259 | /** URL encoded version of the byte codes of this file's name */
|
---|
| 260 | public static String calcURLEncodedFileName(String urlfilepath) {
|
---|
| 261 | String filename = urlfilepath;
|
---|
| 262 | if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
|
---|
| 263 | filename = filename.substring(0, filename.length() - 1);
|
---|
| 264 | }
|
---|
| 265 |
|
---|
| 266 | // remove the directory prefix (if any) to get the filename
|
---|
| 267 | int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
|
---|
| 268 | if(index != -1) {
|
---|
| 269 | filename = filename.substring(index+1); // skip separator
|
---|
| 270 | }
|
---|
| 271 |
|
---|
| 272 | return filename;
|
---|
| 273 | }
|
---|
| 274 |
|
---|
| 275 | /** Given a string representing an alias to an official encoding (and unofficial ones
|
---|
| 276 | * starting with "Latin-"), attempts to work out what the canonical encoding for that is.
|
---|
| 277 | * If the given encoding is unrecognised, it is returned as is. */
|
---|
| 278 | public static String canonicalEncodingName(String encoding) {
|
---|
| 279 | String canonicalEncoding = encoding;
|
---|
| 280 | try {
|
---|
| 281 | // Latin-1 -> ISO-8859-1
|
---|
| 282 | String alias = canonicalEncoding.toLowerCase();
|
---|
| 283 | if(alias.startsWith("latin")){
|
---|
| 284 | canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
|
---|
| 285 | }
|
---|
| 286 |
|
---|
| 287 | // canonical encoding for official aliases
|
---|
| 288 | canonicalEncoding = Charset.forName(canonicalEncoding).name();
|
---|
| 289 | return canonicalEncoding;
|
---|
| 290 | } catch (Exception e) {
|
---|
| 291 | System.err.println("(Could not recognise encoding (alias): "
|
---|
| 292 | + encoding + ".)");
|
---|
| 293 | return encoding; // no alias could be found, return the original parameter
|
---|
| 294 | }
|
---|
| 295 | }
|
---|
| 296 |
|
---|
| 297 | //************************* GETTING THE URL ENCODING OF FILENAMES *********************************
|
---|
[33728] | 298 |
|
---|
| 299 | /**
|
---|
| 300 | * Given a String containing hexentities, will convert back into the unicode version of the String.
|
---|
| 301 | * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
|
---|
| 302 | * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
|
---|
| 303 | * and contains a copy of Utility.debugUnicodeString(String) with the following main function:
|
---|
| 304 | public static void main(String args[]) {
|
---|
| 305 | String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
|
---|
| 306 | System.err.println("About to decode hex string: " + str);
|
---|
| 307 | String result = decodeStringContainingHexEntities(str);
|
---|
| 308 | System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
|
---|
| 309 | }
|
---|
| 310 | */
|
---|
| 311 | public static String decodeStringContainingHexEntities(String str) {
|
---|
| 312 | String result = "";
|
---|
| 313 | Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
|
---|
| 314 | Matcher matcher = hexPattern.matcher(str);
|
---|
| 315 |
|
---|
| 316 | int searchFromIndex = 0;
|
---|
| 317 | int endMatchIndex = -1;
|
---|
| 318 |
|
---|
| 319 | while(matcher.find(searchFromIndex)) {
|
---|
| 320 | String hexPart = matcher.group();
|
---|
| 321 | //System.err.println("Found hexpart match: " + hexPart);
|
---|
| 322 |
|
---|
| 323 | int startMatchIndex = matcher.start();
|
---|
| 324 | endMatchIndex = matcher.end();
|
---|
| 325 | result += str.substring(searchFromIndex, startMatchIndex);
|
---|
| 326 |
|
---|
| 327 | String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
|
---|
| 328 | // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
|
---|
| 329 | // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
|
---|
| 330 |
|
---|
| 331 | //System.err.println("hexNumberStr so far: " + hexNumberStr);
|
---|
| 332 | int tmpDigit = Integer.parseInt(hexNumberStr);
|
---|
| 333 | //System.err.println("As digit: " + tmpDigit);
|
---|
| 334 | hexNumberStr = String.format("%04d", tmpDigit);
|
---|
| 335 | //System.err.println("2 hexNumberStr so far: " + hexNumberStr);
|
---|
| 336 | hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
|
---|
| 337 | //int hexNumber = Integer.parseInt(hexNumberStr);
|
---|
| 338 | int hexNumber = Integer.decode(hexNumberStr);
|
---|
| 339 | String hexNumberAsChar = Character.toString((char) hexNumber);
|
---|
| 340 | result += hexNumberAsChar;
|
---|
| 341 |
|
---|
| 342 | searchFromIndex = endMatchIndex;
|
---|
| 343 |
|
---|
| 344 | }
|
---|
| 345 |
|
---|
[33730] | 346 | if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
|
---|
[33728] | 347 | result += str.substring(endMatchIndex);
|
---|
| 348 | //System.err.println("suffix: " + str.substring(endMatchIndex));
|
---|
| 349 | }
|
---|
[33730] | 350 | else { // there were no hex codes to decode, return string as is
|
---|
| 351 | result = str;
|
---|
| 352 | }
|
---|
[33728] | 353 |
|
---|
| 354 | return result;
|
---|
| 355 | }
|
---|
| 356 |
|
---|
| 357 | /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
|
---|
| 358 | public static String fileNameToHex(String filename) {
|
---|
[33730] | 359 |
|
---|
[33728] | 360 | String hexFilename = "";
|
---|
| 361 | for(int i = 0; i < filename.length(); i++) {
|
---|
| 362 | int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
|
---|
| 363 |
|
---|
| 364 | // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
|
---|
| 365 | // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
|
---|
[33737] | 366 | if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 || charCode == 36 || charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
|
---|
[33728] | 367 | hexFilename += filename.charAt(i);
|
---|
| 368 | } else {
|
---|
| 369 | hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
|
---|
| 370 | }
|
---|
| 371 | }
|
---|
| 372 |
|
---|
| 373 | return hexFilename;
|
---|
| 374 | }
|
---|
| 375 |
|
---|
[33737] | 376 |
|
---|
| 377 | // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
|
---|
[33738] | 378 | public static String UNUSED_filenameToURLEncoding(String filename) {
|
---|
[33737] | 379 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
| 380 | return filename;
|
---|
| 381 | }
|
---|
| 382 |
|
---|
| 383 | // Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20
|
---|
| 384 | String filename_url_encoded = filename.replace(" ", "%20");
|
---|
| 385 | //filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
|
---|
| 386 | //filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
|
---|
| 387 |
|
---|
| 388 | try {
|
---|
| 389 | URI filename_uri = new URI(filename_url_encoded);
|
---|
| 390 | // The trick:
|
---|
| 391 | // 1. toASCIIString() will %xx encode values > 127
|
---|
| 392 | // 2. Decode the result to "ISO-8859-1"
|
---|
| 393 | // 3. URL encode the bytes to string
|
---|
| 394 |
|
---|
| 395 | // Step 2 forces the string to be 8-bit values. It
|
---|
| 396 | // doesn't matter if the starting raw filename was *not*
|
---|
| 397 | // in the ISO-8859-1 encoding, the effect is to ensure
|
---|
| 398 | // we have an 8-bit byte string that (numerically)
|
---|
| 399 | // captures the right value. These numerical values are
|
---|
| 400 | // then used to determine how to URL encode it
|
---|
| 401 |
|
---|
| 402 | String filename_ascii = filename_uri.toASCIIString();
|
---|
| 403 | //filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
|
---|
| 404 | //filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
|
---|
| 405 | String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
|
---|
| 406 | filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
|
---|
| 407 |
|
---|
| 408 | // DEALING WITH & and + in filenames: NOT WORKING YET
|
---|
| 409 | //if(filename_url_encoded.contains("&")) {
|
---|
| 410 | // filename_url_encoded = filename_url_encoded.replace("&", "%36amp;");
|
---|
| 411 | //} else if(filename_url_encoded.contains("&")) {
|
---|
| 412 | // filename_url_encoded = filename_url_encoded.replace("&", "%36");
|
---|
| 413 | //}
|
---|
| 414 |
|
---|
| 415 | }
|
---|
| 416 | catch (Exception e) {
|
---|
| 417 | e.printStackTrace();
|
---|
| 418 | // Give up trying to convert
|
---|
| 419 | filename_url_encoded = filename;
|
---|
| 420 | }
|
---|
| 421 | return filename_url_encoded;
|
---|
| 422 | }
|
---|
| 423 |
|
---|
| 424 |
|
---|
| 425 | // follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
|
---|
[33738] | 426 | public static String filenameToURLEncoding(String filename) {
|
---|
[33737] | 427 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
|
---|
| 428 | return filename;
|
---|
| 429 | }
|
---|
| 430 |
|
---|
| 431 | File file = new File (filename);
|
---|
| 432 | return fileToURLEncoding(file);
|
---|
| 433 | }
|
---|
| 434 |
|
---|
| 435 |
|
---|
[23433] | 436 | // Dr Bainbridge's methods
|
---|
| 437 | /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
|
---|
| 438 | * we can work with URL-encoded filenames in Java. Java works with whatever
|
---|
| 439 | * encoding the filesystem uses. Unlike systems working with UTF-8, where Java
|
---|
| 440 | * interprets filenames as UTF-8 (a destructive process since characters invalid
|
---|
| 441 | * for UTF-8 are replaced with the invalid character, which means the original
|
---|
| 442 | * character's byte codes can not be regained), working with an ISO-8859-1
|
---|
| 443 | * system means the original byte codes of the characters are preserved,
|
---|
| 444 | * regardless of whether the characters represent ISO-8859-1 or not. Such byte
|
---|
| 445 | * codes are converted by the following method to the correct URL versions of
|
---|
| 446 | * the strings that the filenames represent (that is, the correct URL representations
|
---|
| 447 | * of the filenames in their original encodings). This is useful for interactions with
|
---|
| 448 | * Perl as Java and Perl can use URL-encoded filenames to talk about the same files
|
---|
| 449 | * on the file system, instead of having to work out what encoding they are in. */
|
---|
| 450 |
|
---|
| 451 | public static String fileToURLEncoding(File file) {
|
---|
| 452 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
| 453 | return file.getAbsolutePath();
|
---|
| 454 | }
|
---|
| 455 |
|
---|
| 456 | String filename_url_encoded = "";
|
---|
| 457 |
|
---|
| 458 | // The following test for whether the file exists or not is a problem
|
---|
| 459 | // when a File object--whose actual file is in the process of being moved
|
---|
| 460 | // and therefore temporarily does not 'exist' on the actual system--can't
|
---|
| 461 | // be URL encoded: the following would return "" when a file doesn't exist.
|
---|
| 462 | // So commenting out the test.
|
---|
| 463 | /*
|
---|
| 464 | if(!file.getName().equals("recycle")) {
|
---|
| 465 | if(!file.isFile() && !file.isDirectory()) {
|
---|
| 466 | System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
|
---|
| 467 | return "";
|
---|
| 468 | }
|
---|
| 469 |
|
---|
| 470 | if(!file.exists()) {
|
---|
| 471 | System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
|
---|
| 472 | return ""; //file.getName();
|
---|
| 473 | }
|
---|
| 474 | }
|
---|
| 475 | */
|
---|
| 476 |
|
---|
| 477 | URI filename_uri = file.toURI();
|
---|
| 478 | try {
|
---|
| 479 | // The trick:
|
---|
| 480 | // 1. toASCIIString() will %xx encode values > 127
|
---|
| 481 | // 2. Decode the result to "ISO-8859-1"
|
---|
| 482 | // 3. URL encode the bytes to string
|
---|
| 483 |
|
---|
| 484 | // Step 2 forces the string to be 8-bit values. It
|
---|
| 485 | // doesn't matter if the starting raw filename was *not*
|
---|
| 486 | // in the ISO-8859-1 encoding, the effect is to ensure
|
---|
| 487 | // we have an 8-bit byte string that (numerically)
|
---|
| 488 | // captures the right value. These numerical values are
|
---|
| 489 | // then used to determine how to URL encode it
|
---|
| 490 |
|
---|
| 491 | String filename_ascii = filename_uri.toASCIIString();
|
---|
| 492 | String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
|
---|
| 493 | filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
|
---|
[29793] | 494 |
|
---|
[23433] | 495 | }
|
---|
| 496 | catch (Exception e) {
|
---|
| 497 | e.printStackTrace();
|
---|
| 498 | // Give up trying to convert
|
---|
| 499 | filename_url_encoded = file.getAbsolutePath();
|
---|
| 500 | }
|
---|
| 501 | return filename_url_encoded;
|
---|
| 502 | }
|
---|
| 503 |
|
---|
| 504 | // For unicode codepoints see:
|
---|
| 505 | // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
|
---|
| 506 | // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
|
---|
| 507 | // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
|
---|
| 508 | // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
|
---|
| 509 | public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
|
---|
| 510 | throws Exception
|
---|
| 511 | {
|
---|
| 512 | String urlEncoded = "";
|
---|
[29815] | 513 |
|
---|
[23433] | 514 | try {
|
---|
| 515 | // By this point we have a UTF-8 encoded string that captures
|
---|
| 516 | // what the ISO-8859-1 (Latin-1) character is that corresponded to the
|
---|
| 517 | // 8-bit numeric value for that character in the filename
|
---|
| 518 | // on the file system
|
---|
| 519 |
|
---|
| 520 | // For example:
|
---|
| 521 | // File system char: <lower-case beta char in Latin-7> = %E2
|
---|
| 522 | // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
|
---|
| 523 | // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
|
---|
| 524 |
|
---|
| 525 | // Our task is to take the string the contains <C3><A2> and ensure that
|
---|
| 526 | // we "see" it as <E2>
|
---|
| 527 |
|
---|
| 528 | byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
|
---|
| 529 | String unicode_filename = new String(raw_bytes,"UTF-8");
|
---|
| 530 |
|
---|
| 531 | for(int i = 0; i < unicode_filename.length(); i++) {
|
---|
| 532 | char charVal = unicode_filename.charAt(i);
|
---|
[29793] | 533 | if ((int)charVal > 255) {
|
---|
| 534 | urlEncoded += String.format("&#x%02X;", (int)charVal);
|
---|
| 535 | }
|
---|
| 536 | else if((int)charVal > 127) {
|
---|
[23433] | 537 | urlEncoded += String.format("%%%02X", (int)charVal);
|
---|
| 538 | } else {
|
---|
| 539 | urlEncoded += String.format("%c", (char)charVal);
|
---|
| 540 | }
|
---|
| 541 | }
|
---|
| 542 | }
|
---|
| 543 | catch (Exception e) {
|
---|
| 544 | //e.printStackTrace();
|
---|
| 545 | throw(e);
|
---|
| 546 | }
|
---|
[29815] | 547 |
|
---|
[23433] | 548 | return urlEncoded;
|
---|
| 549 | }
|
---|
| 550 |
|
---|
| 551 | // unused for now
|
---|
| 552 | public static String raw_filename_to_url_encoded(String fileName)
|
---|
| 553 | throws Exception
|
---|
| 554 | {
|
---|
| 555 | String urlEncoded = "";
|
---|
| 556 | try {
|
---|
| 557 | byte[] bytes = fileName.getBytes();
|
---|
| 558 |
|
---|
| 559 | for(int i = 0; i < bytes.length; i++) {
|
---|
| 560 | // mask each byte (by applying & 0xFF) to make the signed
|
---|
| 561 | // byte (in the range -128 to 127) unsigned (in the range
|
---|
| 562 | // 0 to 255).
|
---|
| 563 |
|
---|
| 564 | int byteVal = (int)(bytes[i] & 0xFF);
|
---|
| 565 |
|
---|
| 566 | if(byteVal > 127) {
|
---|
| 567 | urlEncoded += String.format("%%%02X", (int)byteVal);
|
---|
| 568 | } else {
|
---|
| 569 | urlEncoded += String.format("%c",(char)byteVal);
|
---|
| 570 | }
|
---|
| 571 | }
|
---|
| 572 | }
|
---|
| 573 | catch (Exception e) {
|
---|
| 574 | //e.printStackTrace();
|
---|
| 575 | throw(e);
|
---|
| 576 | }
|
---|
| 577 |
|
---|
| 578 | return urlEncoded;
|
---|
| 579 | }
|
---|
| 580 |
|
---|
| 581 | }
|
---|