Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 33748

Last change on this file since 33748 was 33748, checked in by ak19, 4 years ago
Linux bugfixes to recent commits to do with getting file-level meta assigned to non-ascii filenames or filenames containing plus/ampersand signs to work. Cumulative past commits were sufficient for fixing these issues on Windows. All those changes plus the current ones get it all working on Linux too.
File size: 30.3 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2010 Greenstone Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29	import java.io.File;
30	import java.net.*;
31	import java.nio.charset.*;
32	import java.util.*;
33	import org.greenstone.gatherer.collection.CollectionManager;
34	import org.greenstone.gatherer.DebugStream;
35
36	import java.util.regex.Matcher;
37	import java.util.regex.Pattern;
38
39
40
41	/** Static access class that contains many of the methods used to work with filename encodings.
42	* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
43	* to maintain a map of URLEncodedFilenames to their filename encodings.
44	* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
45	* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
46
47	public class FilenameEncoding {
/** When true, filenames in the trees are displayed in their URL encoded form (debugging aid). */
public static boolean DEBUGGING = false;

/** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
 * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
 * converted so that the bytecodes in the filename are not preserved.
 * Most methods in this class consult this flag first and fall back to
 * plain file paths when it is false. */
public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;

/** Also set by Gatherer.
 * If the OS supports multiple filename encodings, we will be working with URL strings
 * and the applicable separators are always the forward slash ("/") not File.separator.
 * If multiple filename encodings are not supported, we're dealing with File.separator.
 * Defaults to File.separator until Gatherer overrides it. */
public static String URL_FILE_SEPARATOR = File.separator;


/** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
 * with a file, but is to be applied in real-time on the file's name in the CollectionTree
 * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
 * its nodes are expanded and contracted, storing the filename encodings of each file along
 * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
 * the encoding at every stage by querying the metadataXML file, we store the encodings for
 * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
 * The current design of the map is to only store any active filename metadata assigned
 * directly at that file/folder's level, and if there is none discovered at that level, then
 * storing the empty string for it. Therefore, if the hashmap contains no entry for
 * a file, it means this still needs to be retrieved.
 * Both keys and values are Strings (the lookup methods in this class cast them as such). */
public static Map map = new HashMap();
75
76	/** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */
77	public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
78
/** The hex entity version of the ampersand character.
 * We use this in place of the ampersand character in filenames in metadata.xml files to
 * preserve the reference to the literal ampersand in the real file name on the file system.
 * The value is computed by hexEntityForChar("&").
 */
public static final String HEX_ENTITY_AMPERSAND = FilenameEncoding.hexEntityForChar("&"); // i.e. "&#x26;"
84
85
86	//********************* BUSY REFRESHING / REQUIRING REFRESH *******************
87
88	/** Set to true if filename encoding metadata was changed. Called by the enter keyPress
89	* event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
90	private static boolean refreshRequired = false;
91
92	synchronized public static boolean isRefreshRequired() {
93	return refreshRequired;
94	}
95
96	synchronized public static void setRefreshRequired(boolean state) {
97	if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
98	refreshRequired = state;
99	} else {
100	refreshRequired = false;
101	}
102	}
103
104	//************************ MAP RETRIEVAL METHODS ****************************
105
106	/** Returns the cumulative gs.filenameEncoding metadata
107	* assigned to a file inside the collection. */
108	public static String findFilenameEncoding(
109	File file, String urlEncodedFilePath, boolean bruteForceLookup)
110	{
111	//if(bruteForceLookup) {
112	// return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
113	//}
114
115	String encoding = "";
116
117	// Check any assigned encoding at this level, starting with the map first
118	// and else retrieving the filename encoding from the metadata file
119	if(!map.containsKey(urlEncodedFilePath)) {
120
121	// Check for filename encoding metadata directly associated with the file
122	// Now don't need to get any inherited encoding metadata here, because of
123	// the way we're storing and retrieving encoding information from the map.
124	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
125	if(!list.isEmpty()) {
126	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
127	encoding = metavalue.getValue();
128	} // else no filename encoding set yet at this level
129
130	// Now we've done a lookup at this level cache the result in the map,
131	// including empty strings, to indicate that we've done a full lookup
132	map.put(urlEncodedFilePath, encoding);
133	}
134	else { // an entry exists in the map, get it from there
135	encoding = (String)map.get(urlEncodedFilePath);
136	}
137
138	// if no meta was specified at at the file level, look for any inherited metadata
139	if(encoding.equals("")) {
140	encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
141	}
142
143	//System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " \| found: " + encoding);
144	return encoding; // found something in map, may still be "", but it's what was stored
145	}
146
147	/** Checks the file-to-encoding map for all the superfolders of the given
148	* filename in sequence for an applicable encoding. Note that the file/folder
149	* at the level of urlFoldername (and dir) has already been inspected. */
150	static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
151	{
152	String encoding = "";
153	boolean done = false;
154
155	// don't want to search past import folder which is as
156	// far as we need to go to determine inherited encodings
157	File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
158	if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
159	done = true;
160	}
161
162	// For directories, first remove trailing file separator in order to start checking from higher level folders
163	int lastIndex = urlFoldername.length()-1;
164	char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
165	if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
166	urlFoldername = urlFoldername.substring(0, lastIndex);
167	}
168
169	while(!done) {
170	// get the folder that's one level up
171	dir = dir.getParentFile();
172
173	int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
174	if(index == -1) { // no more slashes
175	done = true;
176	} else {
177	urlFoldername = urlFoldername.substring(0, index);
178	}
179
180	// now look in the map to see whether there's an encoding for this folder
181	String folder = urlFoldername + URL_FILE_SEPARATOR;
182	if(map.containsKey(folder)) {
183	encoding = (String)map.get(folder); // may be ""
184	} else { // no entry in map, so look in the metadata.xml at this folder level
185	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
186	dir, true); // true: gets gs.filenameEncoding only
187	if(!list.isEmpty()) {
188	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
189	encoding = metavalue.getValue();
190	}
191	map.put(folder, encoding); // may be ""
192	}
193
194	if(!encoding.equals("")){
195	done = true;
196	} // else if "", loop to check next folder up
197	else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
198	done = true;
199	}
200	}
201
202	return encoding;
203	}
204
205	/** Called by GUIManager when a collection is closed. This then empties the
206	* file-to-encoding map which is applicable only on a per-collection basis */
207	static public void closeCollection() {
208	//printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
209	map.clear();
210	}
211
212	// Useful for debugging: prints contents of file-to-encoding map
213	static public void printFilenameMap(String heading) {
214	System.err.println("\n********************************************");
215	System.err.println(heading.toUpperCase());
216	Iterator entries = map.entrySet().iterator();
217	while(entries.hasNext()) {
218	Map.Entry entry = (Map.Entry)entries.next();
219	System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
220	}
221	System.err.println("********************************************\n");
222	}
223
224	// UNUSED at present. Brute force version of the findFilenameEncoding() method
225	// Doesn't use the map, but gets all the metadata assigned to a file/folder to
226	// work out the encoding applicable to a file/folder.
227	public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
228	boolean bruteForceLookup)
229	{
230	System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
231
232
233	String encoding = "";
234
235	// Check for filename encoding metadata directly associated with the file
236	// Now don't need to get any inherited encoding metadata here, because of
237	// the way we're storing and retrieving encoding information from the map.
238
239	ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
240	if(!list.isEmpty()) {
241	// try to get the filename encoding meta that was assigned last to this
242	// file, even though it makes no sense to have multiple values for it
243	MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
244	encoding = metavalue.getValue();
245
246	if(encoding == null) { // unlikely ???
247	System.err.println("**** ERROR: encoding for "
248	+ urlEncodedFilename + " is NULL!");
249	encoding = "";
250	}
251	} // else no filename encoding set yet, perhaps
252	//System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
253	return encoding;
254	}
255
256	//**************************** APPLYING ENCODINGS TO FILENAMES ***************************
257
258	/** URL encoded version of the byte codes of the given file's name */
259	public static String calcURLEncodedFilePath(File file) {
260	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
261	return file.getAbsolutePath();
262	}
263	else {
264	String filename = fileToURLEncoding(file);
265	return filename;
266	}
267	}
268
269	/** URL encoded version of the byte codes of this file's name */
270	public static String calcURLEncodedFileName(String urlfilepath) {
271	String filename = urlfilepath;
272	if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
273	filename = filename.substring(0, filename.length() - 1);
274	}
275
276	// remove the directory prefix (if any) to get the filename
277	int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
278	if(index != -1) {
279	filename = filename.substring(index+1); // skip separator
280	}
281
282	return filename;
283	}
284
285	/** Given a string representing an alias to an official encoding (and unofficial ones
286	* starting with "Latin-"), attempts to work out what the canonical encoding for that is.
287	* If the given encoding is unrecognised, it is returned as is. */
288	public static String canonicalEncodingName(String encoding) {
289	String canonicalEncoding = encoding;
290	try {
291	// Latin-1 -> ISO-8859-1
292	String alias = canonicalEncoding.toLowerCase();
293	if(alias.startsWith("latin")){
294	canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
295	}
296
297	// canonical encoding for official aliases
298	canonicalEncoding = Charset.forName(canonicalEncoding).name();
299	return canonicalEncoding;
300	} catch (Exception e) {
301	System.err.println("(Could not recognise encoding (alias): "
302	+ encoding + ".)");
303	return encoding; // no alias could be found, return the original parameter
304	}
305	}
306
307	//*********************** GETTING THE URL ENCODING OF FILENAMES *******************************
308
309	/**
310	* Given a String containing hexentities, will convert back into the unicode version of the String.
311	* e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
312	* I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
313	* and contains a copy of Utility.debugUnicodeString(String) with the following main function:
314	public static void main(String args[]) {
315	String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
316	System.err.println("About to decode hex string: " + str);
317	String result = decodeStringContainingHexEntities(str);
318	System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
319	}
320	*/
321	public static String decodeStringContainingHexEntities(String str) {
322	String result = "";
323	Matcher matcher = HEX_PATTERN.matcher(str);
324
325	int searchFromIndex = 0;
326	int endMatchIndex = -1;
327
328	while(matcher.find(searchFromIndex)) {
329	String hexPart = matcher.group();
330	//System.err.println("Found hexpart match: " + hexPart);
331
332	int startMatchIndex = matcher.start();
333	endMatchIndex = matcher.end();
334	result += str.substring(searchFromIndex, startMatchIndex);
335
336	String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
337	// https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
338	// https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
339
340	//System.err.println("hexNumberStr so far: " + hexNumberStr);
341	hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
342	//int hexNumber = Integer.parseInt(hexNumberStr);
343	int hexNumber = Integer.decode(hexNumberStr);
344	String hexNumberAsChar = Character.toString((char) hexNumber);
345	result += hexNumberAsChar;
346
347	searchFromIndex = endMatchIndex;
348
349	}
350
351	if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
352	result += str.substring(endMatchIndex);
353	//System.err.println("suffix: " + str.substring(endMatchIndex));
354	}
355	else { // there were no hex codes to decode, return string as is
356	result = str;
357	}
358
359	return result;
360	}
361
362
363	// Dr Bainbridge's methods
364	/* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
365	* we can work with URL-encoded filenames in Java. Java works with whatever
366	* encoding the filesystem uses. Unlike systems working with UTF-8, where Java
367	* interprets filenames as UTF-8 (a destructive process since characters invalid
368	* for UTF-8 are replaced with the invalid character, which means the original
369	* character's byte codes can not be regained), working with an ISO-8859-1
370	* system means the original byte codes of the characters are preserved,
371	* regardless of whether the characters represent ISO-8859-1 or not. Such byte
372	* codes are converted by the following method to the correct URL versions of
373	* the strings that the filenames represent (that is, the correct URL representations
374	* of the filenames in their original encodings). This is useful for interactions with
375	* Perl as Java and Perl can use URL-encoded filenames to talk about the same files
376	* on the file system, instead of having to work out what encoding they are in. */
377
378	public static String fileToURLEncoding(File file) {
379	// on a UTF-8 file system, DO NOT do the stuff further below,
380	// just return input filename param, but with any & in the filename replaced with its hex entity
381	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
382	// protect ampersands in filenames by converting it to its hex entity
383	String filepath = file.getAbsolutePath();
384	filepath = filepath.replace("&", HEX_ENTITY_AMPERSAND);
385	return filepath;
386	}
387
388	String filename_url_encoded = "";
389
390	// The following test for whether the file exists or not is a problem
391	// when a File object--whose actual file is in the process of being moved
392	// and therefore temporarily does not 'exist' on the actual system--can't
393	// be URL encoded: the following would return "" when a file doesn't exist.
394	// So commenting out the test.
395	/*
396	if(!file.getName().equals("recycle")) {
397	if(!file.isFile() && !file.isDirectory()) {
398	System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
399	return "";
400	}
401
402	if(!file.exists()) {
403	System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
404	return ""; //file.getName();
405	}
406	}
407	*/
408
409	URI filename_uri = file.toURI();
410	try {
411	// The trick:
412	// 1. toASCIIString() will %xx encode values > 127
413	// 2. Decode the result to "ISO-8859-1"
414	// 3. URL encode the bytes to string
415
416	// Step 2 forces the string to be 8-bit values. It
417	// doesn't matter if the starting raw filename was not
418	// in the ISO-8859-1 encoding, the effect is to ensure
419	// we have an 8-bit byte string that (numerically)
420	// captures the right value. These numerical values are
421	// then used to determine how to URL encode it
422
423	String filename_ascii = filename_uri.toASCIIString();
424
425	// The URI.toASCIIString() call above only encodes values > 127.
426	// But we also need to protect + and & signs in filenames. Do this by URL encoding.
427	// But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter.
428	filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
429	filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex
430
431	// Before proceeding, protect & in the filename too.
432	// &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & (HEX_ENTITY_AMPERSAND)
433	// But dangerous to do simple replace if there are &#x...; entities in the filename already!
434	// That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
435	//filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
436	//filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
437	///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
438
439	String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
440	filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
441
442	// For chars that were protected by being URL encoded, now convert them to the correct version we want them in.
443	// For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex
444	// For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now.
445	//filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
446	filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
447	filename_url_encoded = filename_url_encoded.replace("%26", HEX_ENTITY_AMPERSAND); // convert URL encoding for ampersand into hex entity for ampersand
448	}
449	catch (Exception e) {
450	e.printStackTrace();
451	// Give up trying to convert
452	filename_url_encoded = file.getAbsolutePath();
453	}
454	return filename_url_encoded;
455	}
456
457	// For unicode codepoints see:
458	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
459	// where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
460	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
461	// where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
462	public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
463	throws Exception
464	{
465	String urlEncoded = "";
466
467	try {
468	// By this point we have a UTF-8 encoded string that captures
469	// what the ISO-8859-1 (Latin-1) character is that corresponded to the
470	// 8-bit numeric value for that character in the filename
471	// on the file system
472
473	// For example:
474	// File system char: <lower-case beta char in Latin-7> = %E2
475	// Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
476	// Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
477
478	// Our task is to take the string the contains <C3><A2> and ensure that
479	// we "see" it as <E2>
480
481	byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
482	String unicode_filename = new String(raw_bytes,"UTF-8");
483
484	for(int i = 0; i < unicode_filename.length(); i++) {
485	char charVal = unicode_filename.charAt(i);
486	if ((int)charVal > 255) {
487	urlEncoded += String.format("&#x%02X;", (int)charVal);
488	}
489	else if((int)charVal > 127) {
490	urlEncoded += String.format("%%%02X", (int)charVal);
491	} else {
492	urlEncoded += String.format("%c", (char)charVal);
493	}
494	}
495	}
496	catch (Exception e) {
497	//e.printStackTrace();
498	throw(e);
499	}
500
501	return urlEncoded;
502	}
503
504	// unused for now
505	public static String raw_filename_to_url_encoded(String fileName)
506	throws Exception
507	{
508	String urlEncoded = "";
509	try {
510	byte[] bytes = fileName.getBytes();
511
512	for(int i = 0; i < bytes.length; i++) {
513	// mask each byte (by applying & 0xFF) to make the signed
514	// byte (in the range -128 to 127) unsigned (in the range
515	// 0 to 255).
516
517	int byteVal = (int)(bytes[i] & 0xFF);
518
519	if(byteVal > 127) {
520	urlEncoded += String.format("%%%02X", (int)byteVal);
521	} else {
522	urlEncoded += String.format("%c",(char)byteVal);
523	}
524	}
525	}
526	catch (Exception e) {
527	//e.printStackTrace();
528	throw(e);
529	}
530
531	return urlEncoded;
532	}
533
534	// FURTHER HELPER METHODS
535
536	/**
537	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.
538	* If filename is relative, then the current directory (gli?) will be prefixed to what is returned
539	* and should be removed manually by the caller. Alternatively, for relative paths, call the variant
540	* relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.
541	*/
542	public static String fullFilepathToURLEncoding(String filename) {
543	// on a UTF-8 file system, DO NOT do the stuff further below,
544	// just return input filename param, but with any & in the filename replaced with its hex entity
545	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
546	return filename.replace("&", HEX_ENTITY_AMPERSAND);
547	}
548
549	File file = new File (filename);
550	String filename_url_encoded = fileToURLEncoding(file);
551
552	// if the current directory (".") was passed in as filename,
553	// then the filename_url_encoded looks like /full/path/./
554	// In that case, remove the ./ at the end
555	if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {
556	filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end
557	}
558
559	return filename_url_encoded;
560	}
561
562	/**
563	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
564	* If filename is a relative path, call this method to get it specially URL encoded.
565	* This method will remove the current directory that is prefixed as an intermediary step.
566	*/
567	public static String relativeFilenameToURLEncoding(String filename) {
568	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
569	return filename.replace("&", HEX_ENTITY_AMPERSAND);
570	}
571
572	String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
573	return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
574	}
575
576	/**
577	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
578	* Convenience method that will return the specially URL encoded version of filename
579	* with the provided removeFilePathPrefix removed */
580	public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
581	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
582	return filename.replace("&", HEX_ENTITY_AMPERSAND);
583	}
584
585	File file = new File (filename);
586	String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
587
588	// now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
589	filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());
590	// remove any remaining slash prefix
591	if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
592	filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
593	}
594
595	return filename_url_encoded;
596	}
597
598	// UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write.
599
600	/**
601	* Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
602	* UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
603	*/
604	public static String stringToHex(String str) {
605
606	String hex_str = "";
607	for(int i = 0; i < str.length(); i++) {
608	int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
609
610	// ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
611	// If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
612	if((charCode >= 20 && charCode <= 126) \|\| charCode == 9 \|\| charCode == 10 \|\| charCode == 13 /\|\| charCode == 36 \|\| charCode == 43/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
613	hex_str += str.charAt(i);
614	} else {
615	hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
616	}
617	}
618
619	return hex_str;
620	}
621
622	/** Takes a String containing a single char and returns the hex entity for it */
623	public static String hexEntityForChar(String char_as_string) {
624	int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code
625	String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
626	return hexCodeStr;
627	}
628
629	/**
630	* Given a String containing 0 or more occurrences of CHARACTER,
631	* this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
632	* Special care is taken where the CHARACTER to be replaced is &,
633	* as in that case, we don't want to replace any existing hex entities already present in the String.
634	*/
635	public static String escapeAllCharWithHexEntity(String str, char CHARACTER) {
636
637	if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
638	return str;
639	}
640
641	String char_as_string = Character.toString(CHARACTER);
642	String hexCodeString = hexEntityForChar(char_as_string);
643
644	Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
645
646	// want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
647	int searchIndex = 0;
648
649	boolean finished = false;
650	while(!finished) {
651
652	searchIndex = str.indexOf(CHARACTER, searchIndex);
653
654	if(searchIndex == -1) {
655	finished = true;
656	}
657	else {
658
659	// replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
660	if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
661	searchIndex = hexPatternMatch.end();
662	} else {
663
664	String tmp = str.substring(0, searchIndex) + hexCodeString;
665	searchIndex++;
666	if(str.length() > searchIndex) {
667	tmp += str.substring(searchIndex);
668	}
669	str = tmp;
670	searchIndex = searchIndex+ hexCodeString.length() - 1;
671
672	// String has been modified, so have to update Matcher
673	hexPatternMatch = HEX_PATTERN.matcher(str);
674
675	if(searchIndex >= str.length()) {
676	finished = true;
677	}
678	}
679	}
680	}
681
682	return str;
683	}
684	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: