1 | /**
|
---|
2 | *############################################################################
|
---|
3 | * A component of the Greenstone Librarian Interface, part of the Greenstone
|
---|
4 | * digital library suite from the New Zealand Digital Library Project at the
|
---|
5 | * University of Waikato, New Zealand.
|
---|
6 | *
|
---|
7 | * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
|
---|
8 | *
|
---|
9 | * Copyright (C) 2010 Greenstone Digital Library Project
|
---|
10 | *
|
---|
11 | * This program is free software; you can redistribute it and/or modify
|
---|
12 | * it under the terms of the GNU General Public License as published by
|
---|
13 | * the Free Software Foundation; either version 2 of the License, or
|
---|
14 | * (at your option) any later version.
|
---|
15 | *
|
---|
16 | * This program is distributed in the hope that it will be useful,
|
---|
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | * GNU General Public License for more details.
|
---|
20 | *
|
---|
21 | * You should have received a copy of the GNU General Public License
|
---|
22 | * along with this program; if not, write to the Free Software
|
---|
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | *############################################################################
|
---|
25 | */
|
---|
26 |
|
---|
27 | package org.greenstone.gatherer.metadata;
|
---|
28 |
|
---|
29 | import java.io.File;
|
---|
30 | import java.net.*;
|
---|
31 | import java.nio.charset.*;
|
---|
32 | import java.util.*;
|
---|
33 | import org.greenstone.gatherer.collection.CollectionManager;
|
---|
34 | import org.greenstone.gatherer.DebugStream;
|
---|
35 |
|
---|
36 | import java.util.regex.Matcher;
|
---|
37 | import java.util.regex.Pattern;
|
---|
38 |
|
---|
39 |
|
---|
40 |
|
---|
41 | /** Static access class that contains many of the methods used to work with filename encodings.
|
---|
42 | * Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
|
---|
43 | * to maintain a map of URLEncodedFilenames to their filename encodings.
|
---|
44 | * The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
|
---|
45 | * FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
|
---|
46 |
|
---|
47 | public class FilenameEncoding {
|
---|
/** When debugging, filenames in the trees are displayed in URL encoding. */
public static boolean DEBUGGING = false;

/** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
 * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
 * converted so that the bytecodes in the filename are not preserved. */
public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;

/** Also set by Gatherer.
 * If the OS supports multiple filename encodings, we will be working with URL strings
 * and the applicable separators are always the forward slash ("/") not File.separator.
 * If multiple filename encodings are not supported, we're dealing with File.separator. */
public static String URL_FILE_SEPARATOR = File.separator;


/** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
 * with a file, but is to be applied in real-time on the file's name in the CollectionTree
 * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
 * its nodes are expanded and contracted, storing the filename encodings of each file along
 * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
 * the encoding at every stage by querying the metadataXML file, we store the encodings for
 * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
 * The current design of the map is to only store any active filename metadata assigned
 * directly at that file/folder's level, and if there is none discovered at that level, then
 * to store the empty string for it. Therefore, if the hashmap contains no entry for
 * a file, it means this still needs to be retrieved.
 * Keys and values are both Strings (the map is deliberately left as a raw type). */
public static Map map = new HashMap();
|
---|
75 |
|
---|
76 | /** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */
|
---|
77 | public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
|
---|
78 |
|
---|
79 |
|
---|
80 | //*********************** BUSY REFRESHING / REQUIRING REFRESH *********************
|
---|
81 |
|
---|
82 | /** Set to true if filename encoding metadata was changed. Called by the enter keyPress
|
---|
83 | * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
|
---|
84 | private static boolean refreshRequired = false;
|
---|
85 |
|
---|
86 | synchronized public static boolean isRefreshRequired() {
|
---|
87 | return refreshRequired;
|
---|
88 | }
|
---|
89 |
|
---|
90 | synchronized public static void setRefreshRequired(boolean state) {
|
---|
91 | if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
92 | refreshRequired = state;
|
---|
93 | } else {
|
---|
94 | refreshRequired = false;
|
---|
95 | }
|
---|
96 | }
|
---|
97 |
|
---|
98 | //************************** MAP RETRIEVAL METHODS ******************************
|
---|
99 |
|
---|
100 | /** Returns the cumulative gs.filenameEncoding metadata
|
---|
101 | * assigned to a file inside the collection. */
|
---|
102 | public static String findFilenameEncoding(
|
---|
103 | File file, String urlEncodedFilePath, boolean bruteForceLookup)
|
---|
104 | {
|
---|
105 | //if(bruteForceLookup) {
|
---|
106 | // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
|
---|
107 | //}
|
---|
108 |
|
---|
109 | String encoding = "";
|
---|
110 |
|
---|
111 | // Check any assigned encoding at this level, starting with the map first
|
---|
112 | // and else retrieving the filename encoding from the metadata file
|
---|
113 | if(!map.containsKey(urlEncodedFilePath)) {
|
---|
114 |
|
---|
115 | // Check for filename encoding metadata *directly* associated with the file
|
---|
116 | // Now don't need to get any inherited encoding metadata here, because of
|
---|
117 | // the way we're storing and retrieving encoding information from the map.
|
---|
118 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
|
---|
119 | if(!list.isEmpty()) {
|
---|
120 | MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
|
---|
121 | encoding = metavalue.getValue();
|
---|
122 | } // else no filename encoding set yet at this level
|
---|
123 |
|
---|
124 | // Now we've done a lookup at this level cache the result in the map,
|
---|
125 | // including empty strings, to indicate that we've done a full lookup
|
---|
126 | map.put(urlEncodedFilePath, encoding);
|
---|
127 | }
|
---|
128 | else { // an entry exists in the map, get it from there
|
---|
129 | encoding = (String)map.get(urlEncodedFilePath);
|
---|
130 | }
|
---|
131 |
|
---|
132 | // if no meta was specified at at the file level, look for any inherited metadata
|
---|
133 | if(encoding.equals("")) {
|
---|
134 | encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
|
---|
135 | }
|
---|
136 |
|
---|
137 | //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding);
|
---|
138 | return encoding; // found something in map, may still be "", but it's what was stored
|
---|
139 | }
|
---|
140 |
|
---|
141 | /** Checks the file-to-encoding map for all the superfolders of the given
|
---|
142 | * filename in sequence for an applicable encoding. Note that the file/folder
|
---|
143 | * at the level of urlFoldername (and dir) has already been inspected. */
|
---|
144 | static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
|
---|
145 | {
|
---|
146 | String encoding = "";
|
---|
147 | boolean done = false;
|
---|
148 |
|
---|
149 | // don't want to search past import folder which is as
|
---|
150 | // far as we need to go to determine inherited encodings
|
---|
151 | File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
|
---|
152 | if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
|
---|
153 | done = true;
|
---|
154 | }
|
---|
155 |
|
---|
156 | // For directories, first remove trailing file separator in order to start checking from higher level folders
|
---|
157 | int lastIndex = urlFoldername.length()-1;
|
---|
158 | char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
|
---|
159 | if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
|
---|
160 | urlFoldername = urlFoldername.substring(0, lastIndex);
|
---|
161 | }
|
---|
162 |
|
---|
163 | while(!done) {
|
---|
164 | // get the folder that's one level up
|
---|
165 | dir = dir.getParentFile();
|
---|
166 |
|
---|
167 | int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
|
---|
168 | if(index == -1) { // no more slashes
|
---|
169 | done = true;
|
---|
170 | } else {
|
---|
171 | urlFoldername = urlFoldername.substring(0, index);
|
---|
172 | }
|
---|
173 |
|
---|
174 | // now look in the map to see whether there's an encoding for this folder
|
---|
175 | String folder = urlFoldername + URL_FILE_SEPARATOR;
|
---|
176 | if(map.containsKey(folder)) {
|
---|
177 | encoding = (String)map.get(folder); // may be ""
|
---|
178 | } else { // no entry in map, so look in the metadata.xml at this folder level
|
---|
179 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
|
---|
180 | dir, true); // true: gets gs.filenameEncoding only
|
---|
181 | if(!list.isEmpty()) {
|
---|
182 | MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
|
---|
183 | encoding = metavalue.getValue();
|
---|
184 | }
|
---|
185 | map.put(folder, encoding); // may be ""
|
---|
186 | }
|
---|
187 |
|
---|
188 | if(!encoding.equals("")){
|
---|
189 | done = true;
|
---|
190 | } // else if "", loop to check next folder up
|
---|
191 | else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
|
---|
192 | done = true;
|
---|
193 | }
|
---|
194 | }
|
---|
195 |
|
---|
196 | return encoding;
|
---|
197 | }
|
---|
198 |
|
---|
199 | /** Called by GUIManager when a collection is closed. This then empties the
|
---|
200 | * file-to-encoding map which is applicable only on a per-collection basis */
|
---|
201 | static public void closeCollection() {
|
---|
202 | //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
|
---|
203 | map.clear();
|
---|
204 | }
|
---|
205 |
|
---|
206 | // Useful for debugging: prints contents of file-to-encoding map
|
---|
207 | static public void printFilenameMap(String heading) {
|
---|
208 | System.err.println("\n********************************************");
|
---|
209 | System.err.println(heading.toUpperCase());
|
---|
210 | Iterator entries = map.entrySet().iterator();
|
---|
211 | while(entries.hasNext()) {
|
---|
212 | Map.Entry entry = (Map.Entry)entries.next();
|
---|
213 | System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
|
---|
214 | }
|
---|
215 | System.err.println("********************************************\n");
|
---|
216 | }
|
---|
217 |
|
---|
218 | // UNUSED at present. Brute force version of the findFilenameEncoding() method
|
---|
219 | // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to
|
---|
220 | // work out the encoding applicable to a file/folder.
|
---|
221 | public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
|
---|
222 | boolean bruteForceLookup)
|
---|
223 | {
|
---|
224 | System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
|
---|
225 |
|
---|
226 |
|
---|
227 | String encoding = "";
|
---|
228 |
|
---|
229 | // Check for filename encoding metadata *directly* associated with the file
|
---|
230 | // Now don't need to get any inherited encoding metadata here, because of
|
---|
231 | // the way we're storing and retrieving encoding information from the map.
|
---|
232 |
|
---|
233 | ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
|
---|
234 | if(!list.isEmpty()) {
|
---|
235 | // try to get the filename encoding meta that was assigned last to this
|
---|
236 | // file, even though it makes no sense to have multiple values for it
|
---|
237 | MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
|
---|
238 | encoding = metavalue.getValue();
|
---|
239 |
|
---|
240 | if(encoding == null) { // unlikely ???
|
---|
241 | System.err.println("**** ERROR: encoding for "
|
---|
242 | + urlEncodedFilename + " is NULL!");
|
---|
243 | encoding = "";
|
---|
244 | }
|
---|
245 | } // else no filename encoding set yet, perhaps
|
---|
246 | //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
|
---|
247 | return encoding;
|
---|
248 | }
|
---|
249 |
|
---|
250 | //****************************** APPLYING ENCODINGS TO FILENAMES *****************************
|
---|
251 |
|
---|
252 | /** URL encoded version of the byte codes of the given file's name */
|
---|
253 | public static String calcURLEncodedFilePath(File file) {
|
---|
254 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
255 | return file.getAbsolutePath();
|
---|
256 | }
|
---|
257 | else {
|
---|
258 | String filename = fileToURLEncoding(file);
|
---|
259 | return filename;
|
---|
260 | }
|
---|
261 | }
|
---|
262 |
|
---|
263 | /** URL encoded version of the byte codes of this file's name */
|
---|
264 | public static String calcURLEncodedFileName(String urlfilepath) {
|
---|
265 | String filename = urlfilepath;
|
---|
266 | if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
|
---|
267 | filename = filename.substring(0, filename.length() - 1);
|
---|
268 | }
|
---|
269 |
|
---|
270 | // remove the directory prefix (if any) to get the filename
|
---|
271 | int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
|
---|
272 | if(index != -1) {
|
---|
273 | filename = filename.substring(index+1); // skip separator
|
---|
274 | }
|
---|
275 |
|
---|
276 | return filename;
|
---|
277 | }
|
---|
278 |
|
---|
279 | /** Given a string representing an alias to an official encoding (and unofficial ones
|
---|
280 | * starting with "Latin-"), attempts to work out what the canonical encoding for that is.
|
---|
281 | * If the given encoding is unrecognised, it is returned as is. */
|
---|
282 | public static String canonicalEncodingName(String encoding) {
|
---|
283 | String canonicalEncoding = encoding;
|
---|
284 | try {
|
---|
285 | // Latin-1 -> ISO-8859-1
|
---|
286 | String alias = canonicalEncoding.toLowerCase();
|
---|
287 | if(alias.startsWith("latin")){
|
---|
288 | canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
|
---|
289 | }
|
---|
290 |
|
---|
291 | // canonical encoding for official aliases
|
---|
292 | canonicalEncoding = Charset.forName(canonicalEncoding).name();
|
---|
293 | return canonicalEncoding;
|
---|
294 | } catch (Exception e) {
|
---|
295 | System.err.println("(Could not recognise encoding (alias): "
|
---|
296 | + encoding + ".)");
|
---|
297 | return encoding; // no alias could be found, return the original parameter
|
---|
298 | }
|
---|
299 | }
|
---|
300 |
|
---|
301 | //************************* GETTING THE URL ENCODING OF FILENAMES *********************************
|
---|
302 |
|
---|
303 | /**
|
---|
304 | * Given a String containing hexentities, will convert back into the unicode version of the String.
|
---|
305 | * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
|
---|
306 | * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
|
---|
307 | * and contains a copy of Utility.debugUnicodeString(String) with the following main function:
|
---|
308 | public static void main(String args[]) {
|
---|
309 | String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
|
---|
310 | System.err.println("About to decode hex string: " + str);
|
---|
311 | String result = decodeStringContainingHexEntities(str);
|
---|
312 | System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
|
---|
313 | }
|
---|
314 | */
|
---|
315 | public static String decodeStringContainingHexEntities(String str) {
|
---|
316 | String result = "";
|
---|
317 | Matcher matcher = HEX_PATTERN.matcher(str);
|
---|
318 |
|
---|
319 | int searchFromIndex = 0;
|
---|
320 | int endMatchIndex = -1;
|
---|
321 |
|
---|
322 | while(matcher.find(searchFromIndex)) {
|
---|
323 | String hexPart = matcher.group();
|
---|
324 | //System.err.println("Found hexpart match: " + hexPart);
|
---|
325 |
|
---|
326 | int startMatchIndex = matcher.start();
|
---|
327 | endMatchIndex = matcher.end();
|
---|
328 | result += str.substring(searchFromIndex, startMatchIndex);
|
---|
329 |
|
---|
330 | String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
|
---|
331 | // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
|
---|
332 | // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
|
---|
333 |
|
---|
334 | //System.err.println("hexNumberStr so far: " + hexNumberStr);
|
---|
335 | hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
|
---|
336 | //int hexNumber = Integer.parseInt(hexNumberStr);
|
---|
337 | int hexNumber = Integer.decode(hexNumberStr);
|
---|
338 | String hexNumberAsChar = Character.toString((char) hexNumber);
|
---|
339 | result += hexNumberAsChar;
|
---|
340 |
|
---|
341 | searchFromIndex = endMatchIndex;
|
---|
342 |
|
---|
343 | }
|
---|
344 |
|
---|
345 | if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
|
---|
346 | result += str.substring(endMatchIndex);
|
---|
347 | //System.err.println("suffix: " + str.substring(endMatchIndex));
|
---|
348 | }
|
---|
349 | else { // there were no hex codes to decode, return string as is
|
---|
350 | result = str;
|
---|
351 | }
|
---|
352 |
|
---|
353 | return result;
|
---|
354 | }
|
---|
355 |
|
---|
356 |
|
---|
357 | // Dr Bainbridge's methods
|
---|
358 | /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
|
---|
359 | * we can work with URL-encoded filenames in Java. Java works with whatever
|
---|
360 | * encoding the filesystem uses. Unlike systems working with UTF-8, where Java
|
---|
361 | * interprets filenames as UTF-8 (a destructive process since characters invalid
|
---|
362 | * for UTF-8 are replaced with the invalid character, which means the original
|
---|
363 | * character's byte codes can not be regained), working with an ISO-8859-1
|
---|
364 | * system means the original byte codes of the characters are preserved,
|
---|
365 | * regardless of whether the characters represent ISO-8859-1 or not. Such byte
|
---|
366 | * codes are converted by the following method to the correct URL versions of
|
---|
367 | * the strings that the filenames represent (that is, the correct URL representations
|
---|
368 | * of the filenames in their original encodings). This is useful for interactions with
|
---|
369 | * Perl as Java and Perl can use URL-encoded filenames to talk about the same files
|
---|
370 | * on the file system, instead of having to work out what encoding they are in. */
|
---|
371 |
|
---|
372 | public static String fileToURLEncoding(File file) {
|
---|
373 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
|
---|
374 | return file.getAbsolutePath();
|
---|
375 | }
|
---|
376 |
|
---|
377 | String filename_url_encoded = "";
|
---|
378 |
|
---|
379 | // The following test for whether the file exists or not is a problem
|
---|
380 | // when a File object--whose actual file is in the process of being moved
|
---|
381 | // and therefore temporarily does not 'exist' on the actual system--can't
|
---|
382 | // be URL encoded: the following would return "" when a file doesn't exist.
|
---|
383 | // So commenting out the test.
|
---|
384 | /*
|
---|
385 | if(!file.getName().equals("recycle")) {
|
---|
386 | if(!file.isFile() && !file.isDirectory()) {
|
---|
387 | System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
|
---|
388 | return "";
|
---|
389 | }
|
---|
390 |
|
---|
391 | if(!file.exists()) {
|
---|
392 | System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
|
---|
393 | return ""; //file.getName();
|
---|
394 | }
|
---|
395 | }
|
---|
396 | */
|
---|
397 |
|
---|
398 | URI filename_uri = file.toURI();
|
---|
399 | try {
|
---|
400 | // The trick:
|
---|
401 | // 1. toASCIIString() will %xx encode values > 127
|
---|
402 | // 2. Decode the result to "ISO-8859-1"
|
---|
403 | // 3. URL encode the bytes to string
|
---|
404 |
|
---|
405 | // Step 2 forces the string to be 8-bit values. It
|
---|
406 | // doesn't matter if the starting raw filename was *not*
|
---|
407 | // in the ISO-8859-1 encoding, the effect is to ensure
|
---|
408 | // we have an 8-bit byte string that (numerically)
|
---|
409 | // captures the right value. These numerical values are
|
---|
410 | // then used to determine how to URL encode it
|
---|
411 |
|
---|
412 | String filename_ascii = filename_uri.toASCIIString();
|
---|
413 |
|
---|
414 | // The URI.toASCIIString() call above only encodes values > 127.
|
---|
415 | // But we also need to protect + and & signs in filenames. Do this by URL encoding.
|
---|
416 | // But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter.
|
---|
417 | filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
|
---|
418 | filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex
|
---|
419 |
|
---|
420 | // Before proceeding, protect & in the filename too.
|
---|
421 | // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &
|
---|
422 | // But dangerous to do simple replace if there are &#x...; entities in the filename already!
|
---|
423 | // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
|
---|
424 | //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
|
---|
425 | //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
|
---|
426 | ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
|
---|
427 |
|
---|
428 | String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
|
---|
429 | filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
|
---|
430 |
|
---|
431 | // For chars that were protected by being URL encoded, now convert them to the correct version we want them in.
|
---|
432 | // For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex
|
---|
433 | // For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now.
|
---|
434 | //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
|
---|
435 | filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
|
---|
436 | filename_url_encoded = filename_url_encoded.replace("%26", "&"); // convert URL encoding for ampersand into hex entity for ampersand
|
---|
437 | }
|
---|
438 | catch (Exception e) {
|
---|
439 | e.printStackTrace();
|
---|
440 | // Give up trying to convert
|
---|
441 | filename_url_encoded = file.getAbsolutePath();
|
---|
442 | }
|
---|
443 | return filename_url_encoded;
|
---|
444 | }
|
---|
445 |
|
---|
446 | // For unicode codepoints see:
|
---|
447 | // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
|
---|
448 | // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
|
---|
449 | // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
|
---|
450 | // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
|
---|
451 | public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
|
---|
452 | throws Exception
|
---|
453 | {
|
---|
454 | String urlEncoded = "";
|
---|
455 |
|
---|
456 | try {
|
---|
457 | // By this point we have a UTF-8 encoded string that captures
|
---|
458 | // what the ISO-8859-1 (Latin-1) character is that corresponded to the
|
---|
459 | // 8-bit numeric value for that character in the filename
|
---|
460 | // on the file system
|
---|
461 |
|
---|
462 | // For example:
|
---|
463 | // File system char: <lower-case beta char in Latin-7> = %E2
|
---|
464 | // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
|
---|
465 | // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
|
---|
466 |
|
---|
467 | // Our task is to take the string the contains <C3><A2> and ensure that
|
---|
468 | // we "see" it as <E2>
|
---|
469 |
|
---|
470 | byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
|
---|
471 | String unicode_filename = new String(raw_bytes,"UTF-8");
|
---|
472 |
|
---|
473 | for(int i = 0; i < unicode_filename.length(); i++) {
|
---|
474 | char charVal = unicode_filename.charAt(i);
|
---|
475 | if ((int)charVal > 255) {
|
---|
476 | urlEncoded += String.format("&#x%02X;", (int)charVal);
|
---|
477 | }
|
---|
478 | else if((int)charVal > 127) {
|
---|
479 | urlEncoded += String.format("%%%02X", (int)charVal);
|
---|
480 | } else {
|
---|
481 | urlEncoded += String.format("%c", (char)charVal);
|
---|
482 | }
|
---|
483 | }
|
---|
484 | }
|
---|
485 | catch (Exception e) {
|
---|
486 | //e.printStackTrace();
|
---|
487 | throw(e);
|
---|
488 | }
|
---|
489 |
|
---|
490 | return urlEncoded;
|
---|
491 | }
|
---|
492 |
|
---|
493 | // unused for now
|
---|
494 | public static String raw_filename_to_url_encoded(String fileName)
|
---|
495 | throws Exception
|
---|
496 | {
|
---|
497 | String urlEncoded = "";
|
---|
498 | try {
|
---|
499 | byte[] bytes = fileName.getBytes();
|
---|
500 |
|
---|
501 | for(int i = 0; i < bytes.length; i++) {
|
---|
502 | // mask each byte (by applying & 0xFF) to make the signed
|
---|
503 | // byte (in the range -128 to 127) unsigned (in the range
|
---|
504 | // 0 to 255).
|
---|
505 |
|
---|
506 | int byteVal = (int)(bytes[i] & 0xFF);
|
---|
507 |
|
---|
508 | if(byteVal > 127) {
|
---|
509 | urlEncoded += String.format("%%%02X", (int)byteVal);
|
---|
510 | } else {
|
---|
511 | urlEncoded += String.format("%c",(char)byteVal);
|
---|
512 | }
|
---|
513 | }
|
---|
514 | }
|
---|
515 | catch (Exception e) {
|
---|
516 | //e.printStackTrace();
|
---|
517 | throw(e);
|
---|
518 | }
|
---|
519 |
|
---|
520 | return urlEncoded;
|
---|
521 | }
|
---|
522 |
|
---|
523 | // FURTHER HELPER METHODS
|
---|
524 |
|
---|
525 | /**
|
---|
526 | * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.
|
---|
527 | * If filename is relative, then the current directory (gli?) will be prefixed to what is returned
|
---|
528 | * and should be removed manually by the caller. Alternatively, for relative paths, call the variant
|
---|
529 | * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.
|
---|
530 | */
|
---|
531 | public static String fullFilepathToURLEncoding(String filename) {
|
---|
532 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
|
---|
533 | return filename;
|
---|
534 | }
|
---|
535 |
|
---|
536 | File file = new File (filename);
|
---|
537 | String filename_url_encoded = fileToURLEncoding(file);
|
---|
538 |
|
---|
539 | // if the current directory (".") was passed in as filename,
|
---|
540 | // then the filename_url_encoded looks like /full/path/./
|
---|
541 | // In that case, remove the ./ at the end
|
---|
542 | if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {
|
---|
543 | filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end
|
---|
544 | }
|
---|
545 |
|
---|
546 | return filename_url_encoded;
|
---|
547 | }
|
---|
548 |
|
---|
549 | /**
|
---|
550 | * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
|
---|
551 | * If filename is a relative path, call this method to get it specially URL encoded.
|
---|
552 | * This method will remove the current directory that is prefixed as an intermediary step.
|
---|
553 | */
|
---|
554 | public static String relativeFilenameToURLEncoding(String filename) {
|
---|
555 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
|
---|
556 | return filename;
|
---|
557 | }
|
---|
558 |
|
---|
559 | String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
|
---|
560 | return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
|
---|
561 | }
|
---|
562 |
|
---|
563 | /**
|
---|
564 | * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
|
---|
565 | * Convenience method that will return the specially URL encoded version of filename
|
---|
566 | * with the provided removeFilePathPrefix removed */
|
---|
567 | public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
|
---|
568 | if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
|
---|
569 | return filename;
|
---|
570 | }
|
---|
571 |
|
---|
572 | File file = new File (filename);
|
---|
573 | String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
|
---|
574 |
|
---|
575 | // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
|
---|
576 | filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());
|
---|
577 | // remove any remaining slash prefix
|
---|
578 | if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
|
---|
579 | filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
|
---|
580 | }
|
---|
581 |
|
---|
582 | return filename_url_encoded;
|
---|
583 | }
|
---|
584 |
|
---|
585 | // UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write.
|
---|
586 |
|
---|
587 | /**
|
---|
588 | * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
|
---|
589 | * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
|
---|
590 | */
|
---|
591 | public static String stringToHex(String str) {
|
---|
592 |
|
---|
593 | String hex_str = "";
|
---|
594 | for(int i = 0; i < str.length(); i++) {
|
---|
595 | int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
|
---|
596 |
|
---|
597 | // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
|
---|
598 | // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
|
---|
599 | if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
|
---|
600 | hex_str += str.charAt(i);
|
---|
601 | } else {
|
---|
602 | hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
|
---|
603 | }
|
---|
604 | }
|
---|
605 |
|
---|
606 | return hex_str;
|
---|
607 | }
|
---|
608 |
|
---|
609 | /** Takes a String containing a single char and returns the hex entity for it */
|
---|
610 | public static String hexEntityForChar(String char_as_string) {
|
---|
611 | int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code
|
---|
612 | String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
|
---|
613 | return hexCodeStr;
|
---|
614 | }
|
---|
615 |
|
---|
616 | /**
|
---|
617 | * Given a String containing 0 or more occurrences of CHARACTER,
|
---|
618 | * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
|
---|
619 | * Special care is taken where the CHARACTER to be replaced is &,
|
---|
620 | * as in that case, we don't want to replace any existing hex entities already present in the String.
|
---|
621 | */
|
---|
622 | public static String escapeAllCharWithHexEntity(String str, char CHARACTER) {
|
---|
623 |
|
---|
624 | if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
|
---|
625 | return str;
|
---|
626 | }
|
---|
627 |
|
---|
628 | String char_as_string = Character.toString(CHARACTER);
|
---|
629 | String hexCodeString = hexEntityForChar(char_as_string);
|
---|
630 |
|
---|
631 | Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
|
---|
632 |
|
---|
633 | // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
|
---|
634 | int searchIndex = 0;
|
---|
635 |
|
---|
636 | boolean finished = false;
|
---|
637 | while(!finished) {
|
---|
638 |
|
---|
639 | searchIndex = str.indexOf(CHARACTER, searchIndex);
|
---|
640 |
|
---|
641 | if(searchIndex == -1) {
|
---|
642 | finished = true;
|
---|
643 | }
|
---|
644 | else {
|
---|
645 |
|
---|
646 | // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
|
---|
647 | if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
|
---|
648 | searchIndex = hexPatternMatch.end();
|
---|
649 | } else {
|
---|
650 |
|
---|
651 | String tmp = str.substring(0, searchIndex) + hexCodeString;
|
---|
652 | searchIndex++;
|
---|
653 | if(str.length() > searchIndex) {
|
---|
654 | tmp += str.substring(searchIndex);
|
---|
655 | }
|
---|
656 | str = tmp;
|
---|
657 | searchIndex = searchIndex+ hexCodeString.length() - 1;
|
---|
658 |
|
---|
659 | // String has been modified, so have to update Matcher
|
---|
660 | hexPatternMatch = HEX_PATTERN.matcher(str);
|
---|
661 |
|
---|
662 | if(searchIndex >= str.length()) {
|
---|
663 | finished = true;
|
---|
664 | }
|
---|
665 | }
|
---|
666 | }
|
---|
667 | }
|
---|
668 |
|
---|
669 | return str;
|
---|
670 | }
|
---|
671 | }
|
---|