source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 34415

Last change on this file since 34415 was 34415, checked in by ak19, 4 years ago

Bugfix for slowdown when assigning meta to multiple gathered docs in GLI's Enrich pane. Tested on Windows. This is the simplest way I could think of to solve the problem: XMLParsing always resolves html entities (unless possibly when using the StAX parser, but that may not return the Document object as code expects). Entities start with ampersand and are resolved upon parsing, so too standalone ampersand signs. The earlier code, a bugfix for metadata not sticking to filenames/import folder structures containing non-ASCII or ampersands or plus signs, had caused the slow-down, as after each XML parse of the current metadata.xml file, the code would loop through each FileName element of the metadata.xml file and reintroduce the resolved html entities. The best and simplest solution that worked is simply to escape ampersands with %26 when writing out values for the FileName element and compare against filenames that have a similar substitution done. Still to test on Linux, but this reincorporates recent ideas for the bugfix that had worked on Linux (but then broke on Windows) so I feel somewhat confident that this commit is likely to largely work on Linux when I test it tomorrow.

File size: 30.1 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2010 Greenstone Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29import java.io.File;
30import java.net.*;
31import java.nio.charset.*;
32import java.util.*;
33import org.greenstone.gatherer.collection.CollectionManager;
34import org.greenstone.gatherer.DebugStream;
35
36import java.util.regex.Matcher;
37import java.util.regex.Pattern;
38
39
40
41/** Static access class that contains many of the methods used to work with filename encodings.
42* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
43* to maintain a map of URLEncodedFilenames to their filename encodings.
44* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
45* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
46
47public class FilenameEncoding {
48 /** Display of filenames in the trees are in URL encoding, if debugging */
49 public static boolean DEBUGGING = false;
50
51 /** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
52 * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
53 * converted so that the bytecodes in the filename are not preserved. */
54 public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
55
56 /** Also set by Gatherer.
57 * If the OS supports multiple filename encodings, we will be working with URL strings
58 * and the applicable separators are always the forward slash ("/") not File.separator.
59 * If multiple filename encodings are not supported, we're dealing with File.separator. */
60 public static String URL_FILE_SEPARATOR = File.separator;
61
62
63 /** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
64 * with a file, but is to be applied in real-time on the file's name in the CollectionTree
65 * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
66 * its nodes are expanded and contracted, storing the filename encodings of each file along
67 * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
68 * the encoding at every stage by querying the metadataXML file, we store the encodings for
69 * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
70 * The current design of the map is to only store any active filename metadata assigned
71 * directly at that file/folder's level, and if there is none discovered at that level, then
72 * storing the empty string for it. Therefore, if the hashmap contains no entry for
73 * a file, it means this still needs to be retrieved. */
74 public static Map map = new HashMap();
75
76 /** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */
77 public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
78
79 /** The hex version of the ampersand character: previously hex entity (&#x26) now hex url encoded (%26).
80 * We use this in place of the ampersand character in filenames in metadata.xml files to
81 * preserve the reference to the literal ampersand in the real file name on the file system.
82 */
83 public static final String HEX_AMPERSAND = "%26"; //= FilenameEncoding.hexEntityForChar("&"); //"&";
84
85
86//*********************** BUSY REFRESHING / REQUIRING REFRESH *********************
87
88 /** Set to true if filename encoding metadata was changed. Called by the enter keyPress
89 * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
90 private static boolean refreshRequired = false;
91
92 synchronized public static boolean isRefreshRequired() {
93 return refreshRequired;
94 }
95
96 synchronized public static void setRefreshRequired(boolean state) {
97 if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
98 refreshRequired = state;
99 } else {
100 refreshRequired = false;
101 }
102 }
103
104//************************** MAP RETRIEVAL METHODS ******************************
105
106 /** Returns the cumulative gs.filenameEncoding metadata
107 * assigned to a file inside the collection. */
108 public static String findFilenameEncoding(
109 File file, String urlEncodedFilePath, boolean bruteForceLookup)
110 {
111 //if(bruteForceLookup) {
112 // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
113 //}
114
115 String encoding = "";
116
117 // Check any assigned encoding at this level, starting with the map first
118 // and else retrieving the filename encoding from the metadata file
119 if(!map.containsKey(urlEncodedFilePath)) {
120
121 // Check for filename encoding metadata *directly* associated with the file
122 // Now don't need to get any inherited encoding metadata here, because of
123 // the way we're storing and retrieving encoding information from the map.
124 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
125 if(!list.isEmpty()) {
126 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
127 encoding = metavalue.getValue();
128 } // else no filename encoding set yet at this level
129
130 // Now we've done a lookup at this level cache the result in the map,
131 // including empty strings, to indicate that we've done a full lookup
132 map.put(urlEncodedFilePath, encoding);
133 }
134 else { // an entry exists in the map, get it from there
135 encoding = (String)map.get(urlEncodedFilePath);
136 }
137
138 // if no meta was specified at at the file level, look for any inherited metadata
139 if(encoding.equals("")) {
140 encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
141 }
142
143 //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding);
144 return encoding; // found something in map, may still be "", but it's what was stored
145 }
146
147 /** Checks the file-to-encoding map for all the superfolders of the given
148 * filename in sequence for an applicable encoding. Note that the file/folder
149 * at the level of urlFoldername (and dir) has already been inspected. */
150 static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
151 {
152 String encoding = "";
153 boolean done = false;
154
155 // don't want to search past import folder which is as
156 // far as we need to go to determine inherited encodings
157 File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
158 if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
159 done = true;
160 }
161
162 // For directories, first remove trailing file separator in order to start checking from higher level folders
163 int lastIndex = urlFoldername.length()-1;
164 char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
165 if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
166 urlFoldername = urlFoldername.substring(0, lastIndex);
167 }
168
169 while(!done) {
170 // get the folder that's one level up
171 dir = dir.getParentFile();
172
173 int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
174 if(index == -1) { // no more slashes
175 done = true;
176 } else {
177 urlFoldername = urlFoldername.substring(0, index);
178 }
179
180 // now look in the map to see whether there's an encoding for this folder
181 String folder = urlFoldername + URL_FILE_SEPARATOR;
182 if(map.containsKey(folder)) {
183 encoding = (String)map.get(folder); // may be ""
184 } else { // no entry in map, so look in the metadata.xml at this folder level
185 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
186 dir, true); // true: gets gs.filenameEncoding only
187 if(!list.isEmpty()) {
188 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
189 encoding = metavalue.getValue();
190 }
191 map.put(folder, encoding); // may be ""
192 }
193
194 if(!encoding.equals("")){
195 done = true;
196 } // else if "", loop to check next folder up
197 else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
198 done = true;
199 }
200 }
201
202 return encoding;
203 }
204
205 /** Called by GUIManager when a collection is closed. This then empties the
206 * file-to-encoding map which is applicable only on a per-collection basis */
207 static public void closeCollection() {
208 //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
209 map.clear();
210 }
211
212 // Useful for debugging: prints contents of file-to-encoding map
213 static public void printFilenameMap(String heading) {
214 System.err.println("\n********************************************");
215 System.err.println(heading.toUpperCase());
216 Iterator entries = map.entrySet().iterator();
217 while(entries.hasNext()) {
218 Map.Entry entry = (Map.Entry)entries.next();
219 System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
220 }
221 System.err.println("********************************************\n");
222 }
223
224 // UNUSED at present. Brute force version of the findFilenameEncoding() method
225 // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to
226 // work out the encoding applicable to a file/folder.
227 public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
228 boolean bruteForceLookup)
229 {
230 System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
231
232
233 String encoding = "";
234
235 // Check for filename encoding metadata *directly* associated with the file
236 // Now don't need to get any inherited encoding metadata here, because of
237 // the way we're storing and retrieving encoding information from the map.
238
239 ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
240 if(!list.isEmpty()) {
241 // try to get the filename encoding meta that was assigned last to this
242 // file, even though it makes no sense to have multiple values for it
243 MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
244 encoding = metavalue.getValue();
245
246 if(encoding == null) { // unlikely ???
247 System.err.println("**** ERROR: encoding for "
248 + urlEncodedFilename + " is NULL!");
249 encoding = "";
250 }
251 } // else no filename encoding set yet, perhaps
252 //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
253 return encoding;
254 }
255
256//****************************** APPLYING ENCODINGS TO FILENAMES *****************************
257
258 /** URL encoded version of the byte codes of the given file's name */
259 public static String calcURLEncodedFilePath(File file) {
260 return fileToURLEncoding(file);
261 }
262
263 /** URL encoded version of the byte codes of this file's name */
264 public static String calcURLEncodedFileName(String urlfilepath) {
265 String filename = urlfilepath;
266 if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
267 filename = filename.substring(0, filename.length() - 1);
268 }
269
270 // remove the directory prefix (if any) to get the filename
271 int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
272 if(index != -1) {
273 filename = filename.substring(index+1); // skip separator
274 }
275
276 return filename;
277 }
278
279 /** Given a string representing an alias to an official encoding (and unofficial ones
280 * starting with "Latin-"), attempts to work out what the canonical encoding for that is.
281 * If the given encoding is unrecognised, it is returned as is. */
282 public static String canonicalEncodingName(String encoding) {
283 String canonicalEncoding = encoding;
284 try {
285 // Latin-1 -> ISO-8859-1
286 String alias = canonicalEncoding.toLowerCase();
287 if(alias.startsWith("latin")){
288 canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
289 }
290
291 // canonical encoding for official aliases
292 canonicalEncoding = Charset.forName(canonicalEncoding).name();
293 return canonicalEncoding;
294 } catch (Exception e) {
295 System.err.println("(Could not recognise encoding (alias): "
296 + encoding + ".)");
297 return encoding; // no alias could be found, return the original parameter
298 }
299 }
300
301//************************* GETTING THE URL ENCODING OF FILENAMES *********************************
302
303 /**
304 * Given a String containing hexentities, will convert back into the unicode version of the String.
305 * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
306 * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
307 * and contains a copy of Utility.debugUnicodeString(String) with the following main function:
308 public static void main(String args[]) {
309 String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
310 System.err.println("About to decode hex string: " + str);
311 String result = decodeStringContainingHexEntities(str);
312 System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
313 }
314 */
315 public static String decodeStringContainingHexEntities(String str) {
316 String result = "";
317 Matcher matcher = HEX_PATTERN.matcher(str);
318
319 int searchFromIndex = 0;
320 int endMatchIndex = -1;
321
322 while(matcher.find(searchFromIndex)) {
323 String hexPart = matcher.group();
324 //System.err.println("Found hexpart match: " + hexPart);
325
326 int startMatchIndex = matcher.start();
327 endMatchIndex = matcher.end();
328 result += str.substring(searchFromIndex, startMatchIndex);
329
330 String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
331 // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
332 // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
333
334 //System.err.println("hexNumberStr so far: " + hexNumberStr);
335 hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
336 //int hexNumber = Integer.parseInt(hexNumberStr);
337 int hexNumber = Integer.decode(hexNumberStr);
338 String hexNumberAsChar = Character.toString((char) hexNumber);
339 result += hexNumberAsChar;
340
341 searchFromIndex = endMatchIndex;
342
343 }
344
345 if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
346 result += str.substring(endMatchIndex);
347 //System.err.println("suffix: " + str.substring(endMatchIndex));
348 }
349 else { // there were no hex codes to decode, return string as is
350 result = str;
351 }
352
353 return result;
354 }
355
356
357 // Dr Bainbridge's methods
358 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
359 * we can work with URL-encoded filenames in Java. Java works with whatever
360 * encoding the filesystem uses. Unlike systems working with UTF-8, where Java
361 * interprets filenames as UTF-8 (a destructive process since characters invalid
362 * for UTF-8 are replaced with the invalid character, which means the original
363 * character's byte codes can not be regained), working with an ISO-8859-1
364 * system means the original byte codes of the characters are preserved,
365 * regardless of whether the characters represent ISO-8859-1 or not. Such byte
366 * codes are converted by the following method to the correct URL versions of
367 * the strings that the filenames represent (that is, the correct URL representations
368 * of the filenames in their original encodings). This is useful for interactions with
369 * Perl as Java and Perl can use URL-encoded filenames to talk about the same files
370 * on the file system, instead of having to work out what encoding they are in. */
371
372 public static String fileToURLEncoding(File file) {
373 // on a UTF-8 file system, DO NOT do the stuff further below,
374 // just return input filename param, but with any & in the filename replaced with its hex entity
375 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
376 String filepath = file.getAbsolutePath();
377 return filepath;
378 }
379
380 String filename_url_encoded = "";
381
382 // The following test for whether the file exists or not is a problem
383 // when a File object--whose actual file is in the process of being moved
384 // and therefore temporarily does not 'exist' on the actual system--can't
385 // be URL encoded: the following would return "" when a file doesn't exist.
386 // So commenting out the test.
387 /*
388 if(!file.getName().equals("recycle")) {
389 if(!file.isFile() && !file.isDirectory()) {
390 System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
391 return "";
392 }
393
394 if(!file.exists()) {
395 System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
396 return ""; //file.getName();
397 }
398 }
399 */
400
401 URI filename_uri = file.toURI();
402 try {
403 // The trick:
404 // 1. toASCIIString() will %xx encode values > 127
405 // 2. Decode the result to "ISO-8859-1"
406 // 3. URL encode the bytes to string
407
408 // Step 2 forces the string to be 8-bit values. It
409 // doesn't matter if the starting raw filename was *not*
410 // in the ISO-8859-1 encoding, the effect is to ensure
411 // we have an 8-bit byte string that (numerically)
412 // captures the right value. These numerical values are
413 // then used to determine how to URL encode it
414
415 String filename_ascii = filename_uri.toASCIIString();
416
417 // The URI.toASCIIString() call above only encodes values > 127.
418 // But we also need to protect + and & signs in filenames. Do this by URL encoding.
419 // But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter.
420 filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
421 filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex
422
423 // Before proceeding, protect & in the filename too.
424 // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & (HEX_AMPERSAND)
425 // But dangerous to do simple replace if there are &#x...; entities in the filename already!
426 // That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
427 //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
428 //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
429 ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
430
431 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
432 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
433
434 // For chars that were protected by being URL encoded, now convert them to the correct version we want them in.
435 // For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex
436 // For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now.
437 //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
438 filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
439 filename_url_encoded = filename_url_encoded.replace("%26", "&"); // now putting back ampersands too, instead of replacing with HEX_ENTITY_AMPERSAND (&)
440 }
441 catch (Exception e) {
442 e.printStackTrace();
443 // Give up trying to convert
444 filename_url_encoded = file.getAbsolutePath();
445 }
446 return filename_url_encoded;
447 }
448
449 // For unicode codepoints see:
450 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
451 // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
452 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
453 // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
454 public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
455 throws Exception
456 {
457 String urlEncoded = "";
458
459 try {
460 // By this point we have a UTF-8 encoded string that captures
461 // what the ISO-8859-1 (Latin-1) character is that corresponded to the
462 // 8-bit numeric value for that character in the filename
463 // on the file system
464
465 // For example:
466 // File system char: <lower-case beta char in Latin-7> = %E2
467 // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
468 // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
469
470 // Our task is to take the string the contains <C3><A2> and ensure that
471 // we "see" it as <E2>
472
473 byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
474 String unicode_filename = new String(raw_bytes,"UTF-8");
475
476 for(int i = 0; i < unicode_filename.length(); i++) {
477 char charVal = unicode_filename.charAt(i);
478 if ((int)charVal > 255) {
479 urlEncoded += String.format("&#x%02X;", (int)charVal);
480 }
481 else if((int)charVal > 127) {
482 urlEncoded += String.format("%%%02X", (int)charVal);
483 } else {
484 urlEncoded += String.format("%c", (char)charVal);
485 }
486 }
487 }
488 catch (Exception e) {
489 //e.printStackTrace();
490 throw(e);
491 }
492
493 return urlEncoded;
494 }
495
496 // unused for now
497 public static String raw_filename_to_url_encoded(String fileName)
498 throws Exception
499 {
500 String urlEncoded = "";
501 try {
502 byte[] bytes = fileName.getBytes();
503
504 for(int i = 0; i < bytes.length; i++) {
505 // mask each byte (by applying & 0xFF) to make the signed
506 // byte (in the range -128 to 127) unsigned (in the range
507 // 0 to 255).
508
509 int byteVal = (int)(bytes[i] & 0xFF);
510
511 if(byteVal > 127) {
512 urlEncoded += String.format("%%%02X", (int)byteVal);
513 } else {
514 urlEncoded += String.format("%c",(char)byteVal);
515 }
516 }
517 }
518 catch (Exception e) {
519 //e.printStackTrace();
520 throw(e);
521 }
522
523 return urlEncoded;
524 }
525
526 // FURTHER HELPER METHODS
527
528 /**
529 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.
530 * If filename is relative, then the current directory (gli?) will be prefixed to what is returned
531 * and should be removed manually by the caller. Alternatively, for relative paths, call the variant
532 * relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.
533 */
534 public static String fullFilepathToURLEncoding(String filename) {
535 // on a UTF-8 file system, DO NOT do the stuff further below,
536 // just return input filename param, but with any & in the filename replaced with its hex entity
537 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
538 return filename; //return filename.replace("&", HEX_AMPERSAND);
539 }
540
541 File file = new File (filename);
542 String filename_url_encoded = fileToURLEncoding(file);
543
544 // if the current directory (".") was passed in as filename,
545 // then the filename_url_encoded looks like /full/path/./
546 // In that case, remove the ./ at the end
547 if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {
548 filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end
549 }
550
551 return filename_url_encoded;
552 }
553
554 /**
555 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
556 * If filename is a relative path, call this method to get it specially URL encoded.
557 * This method will remove the current directory that is prefixed as an intermediary step.
558 */
559 public static String relativeFilenameToURLEncoding(String filename) {
560 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
561 return filename; // return filename.replace("&", HEX_AMPERSAND);
562 }
563
564 String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
565 return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
566 }
567
568 /**
569 * Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
570 * Convenience method that will return the specially URL encoded version of filename
571 * with the provided removeFilePathPrefix removed */
572 public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
573 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
574 return filename; //return filename.replace("&", HEX_AMPERSAND);
575 }
576
577 File file = new File (filename);
578 String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
579
580 // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
581 filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());
582 // remove any remaining slash prefix
583 if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
584 filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
585 }
586
587 return filename_url_encoded;
588 }
589
590// UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write.
591
592 /**
593 * Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
594 * UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
595 */
596 public static String stringToHex(String str) {
597
598 String hex_str = "";
599 for(int i = 0; i < str.length(); i++) {
600 int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
601
602 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
603 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
604 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13 /*|| charCode == 36 || charCode == 43*/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
605 hex_str += str.charAt(i);
606 } else {
607 hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
608 }
609 }
610
611 return hex_str;
612 }
613
614 /** Takes a String containing a single char and returns the hex entity for it */
615 public static String hexEntityForChar(String char_as_string) {
616 int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code
617 String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
618 return hexCodeStr;
619 }
620
621 /**
622 * Given a String containing 0 or more occurrences of CHARACTER,
623 * this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
624 * Special care is taken where the CHARACTER to be replaced is &,
625 * as in that case, we don't want to replace any existing hex entities already present in the String.
626 */
627 public static String escapeAllCharWithHexEntity(String str, char CHARACTER) {
628
629 if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
630 return str;
631 }
632
633 String char_as_string = Character.toString(CHARACTER);
634 String hexCodeString = hexEntityForChar(char_as_string);
635
636 Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
637
638 // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
639 int searchIndex = 0;
640
641 boolean finished = false;
642 while(!finished) {
643
644 searchIndex = str.indexOf(CHARACTER, searchIndex);
645
646 if(searchIndex == -1) {
647 finished = true;
648 }
649 else {
650
651 // replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
652 if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
653 searchIndex = hexPatternMatch.end();
654 } else {
655
656 String tmp = str.substring(0, searchIndex) + hexCodeString;
657 searchIndex++;
658 if(str.length() > searchIndex) {
659 tmp += str.substring(searchIndex);
660 }
661 str = tmp;
662 searchIndex = searchIndex+ hexCodeString.length() - 1;
663
664 // String has been modified, so have to update Matcher
665 hexPatternMatch = HEX_PATTERN.matcher(str);
666
667 if(searchIndex >= str.length()) {
668 finished = true;
669 }
670 }
671 }
672 }
673
674 return str;
675 }
676}
Note: See TracBrowser for help on using the repository browser.