Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 33747

Last change on this file since 33747 was 33747, checked in by ak19, 4 years ago
Tidying up code some more and moving unused (but reusable and possibly useful) FilenameEncoding.java functions to end of file.
File size: 29.3 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2010 Greenstone Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29	import java.io.File;
30	import java.net.*;
31	import java.nio.charset.*;
32	import java.util.*;
33	import org.greenstone.gatherer.collection.CollectionManager;
34	import org.greenstone.gatherer.DebugStream;
35
36	import java.util.regex.Matcher;
37	import java.util.regex.Pattern;
38
39
40
41	/** Static access class that contains many of the methods used to work with filename encodings.
42	* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
43	* to maintain a map of URLEncodedFilenames to their filename encodings.
44	* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
45	* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
46
47	public class FilenameEncoding {
48	/** Display of filenames in the trees are in URL encoding, if debugging */
49	public static boolean DEBUGGING = false;
50
51	/** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
52	* such that non-UTF8 filename encodings on a UTF-8 locale are destructively
53	* converted so that the bytecodes in the filename are not preserved. */
54	public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
55
56	/** Also set by Gatherer.
57	* If the OS supports multiple filename encodings, we will be working with URL strings
58	* and the applicable separators are always the forward slash ("/") not File.separator.
59	* If multiple filename encodings are not supported, we're dealing with File.separator. */
60	public static String URL_FILE_SEPARATOR = File.separator;
61
62
63	/** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
64	* with a file, but is to be applied in real-time on the file's name in the CollectionTree
65	* display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
66	* its nodes are expanded and contracted, storing the filename encodings of each file along
67	* with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
68	* the encoding at every stage by querying the metadataXML file, we store the encodings for
69	* fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
70	* The current design of the map is to only store any active filename metadata assigned
71	* directly at that file/folder's level, and if there is none discovered at that level, then
72	* storing the empty string for it. Therefore, if the hashmap contains no entry for
73	* a file, it means this still needs to be retrieved. */
74	public static Map map = new HashMap();
75
76	/** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */
77	public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
78
79
80	//********************* BUSY REFRESHING / REQUIRING REFRESH *******************
81
82	/** Set to true if filename encoding metadata was changed. Called by the enter keyPress
83	* event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
84	private static boolean refreshRequired = false;
85
86	synchronized public static boolean isRefreshRequired() {
87	return refreshRequired;
88	}
89
90	synchronized public static void setRefreshRequired(boolean state) {
91	if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
92	refreshRequired = state;
93	} else {
94	refreshRequired = false;
95	}
96	}
97
98	//************************ MAP RETRIEVAL METHODS ****************************
99
100	/** Returns the cumulative gs.filenameEncoding metadata
101	* assigned to a file inside the collection. */
102	public static String findFilenameEncoding(
103	File file, String urlEncodedFilePath, boolean bruteForceLookup)
104	{
105	//if(bruteForceLookup) {
106	// return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
107	//}
108
109	String encoding = "";
110
111	// Check any assigned encoding at this level, starting with the map first
112	// and else retrieving the filename encoding from the metadata file
113	if(!map.containsKey(urlEncodedFilePath)) {
114
115	// Check for filename encoding metadata directly associated with the file
116	// Now don't need to get any inherited encoding metadata here, because of
117	// the way we're storing and retrieving encoding information from the map.
118	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
119	if(!list.isEmpty()) {
120	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
121	encoding = metavalue.getValue();
122	} // else no filename encoding set yet at this level
123
124	// Now we've done a lookup at this level cache the result in the map,
125	// including empty strings, to indicate that we've done a full lookup
126	map.put(urlEncodedFilePath, encoding);
127	}
128	else { // an entry exists in the map, get it from there
129	encoding = (String)map.get(urlEncodedFilePath);
130	}
131
132	// if no meta was specified at at the file level, look for any inherited metadata
133	if(encoding.equals("")) {
134	encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
135	}
136
137	//System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " \| found: " + encoding);
138	return encoding; // found something in map, may still be "", but it's what was stored
139	}
140
141	/** Checks the file-to-encoding map for all the superfolders of the given
142	* filename in sequence for an applicable encoding. Note that the file/folder
143	* at the level of urlFoldername (and dir) has already been inspected. */
144	static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
145	{
146	String encoding = "";
147	boolean done = false;
148
149	// don't want to search past import folder which is as
150	// far as we need to go to determine inherited encodings
151	File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
152	if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
153	done = true;
154	}
155
156	// For directories, first remove trailing file separator in order to start checking from higher level folders
157	int lastIndex = urlFoldername.length()-1;
158	char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
159	if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
160	urlFoldername = urlFoldername.substring(0, lastIndex);
161	}
162
163	while(!done) {
164	// get the folder that's one level up
165	dir = dir.getParentFile();
166
167	int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
168	if(index == -1) { // no more slashes
169	done = true;
170	} else {
171	urlFoldername = urlFoldername.substring(0, index);
172	}
173
174	// now look in the map to see whether there's an encoding for this folder
175	String folder = urlFoldername + URL_FILE_SEPARATOR;
176	if(map.containsKey(folder)) {
177	encoding = (String)map.get(folder); // may be ""
178	} else { // no entry in map, so look in the metadata.xml at this folder level
179	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
180	dir, true); // true: gets gs.filenameEncoding only
181	if(!list.isEmpty()) {
182	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
183	encoding = metavalue.getValue();
184	}
185	map.put(folder, encoding); // may be ""
186	}
187
188	if(!encoding.equals("")){
189	done = true;
190	} // else if "", loop to check next folder up
191	else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
192	done = true;
193	}
194	}
195
196	return encoding;
197	}
198
199	/** Called by GUIManager when a collection is closed. This then empties the
200	* file-to-encoding map which is applicable only on a per-collection basis */
201	static public void closeCollection() {
202	//printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
203	map.clear();
204	}
205
206	// Useful for debugging: prints contents of file-to-encoding map
207	static public void printFilenameMap(String heading) {
208	System.err.println("\n********************************************");
209	System.err.println(heading.toUpperCase());
210	Iterator entries = map.entrySet().iterator();
211	while(entries.hasNext()) {
212	Map.Entry entry = (Map.Entry)entries.next();
213	System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
214	}
215	System.err.println("********************************************\n");
216	}
217
218	// UNUSED at present. Brute force version of the findFilenameEncoding() method
219	// Doesn't use the map, but gets all the metadata assigned to a file/folder to
220	// work out the encoding applicable to a file/folder.
221	public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
222	boolean bruteForceLookup)
223	{
224	System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
225
226
227	String encoding = "";
228
229	// Check for filename encoding metadata directly associated with the file
230	// Now don't need to get any inherited encoding metadata here, because of
231	// the way we're storing and retrieving encoding information from the map.
232
233	ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
234	if(!list.isEmpty()) {
235	// try to get the filename encoding meta that was assigned last to this
236	// file, even though it makes no sense to have multiple values for it
237	MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
238	encoding = metavalue.getValue();
239
240	if(encoding == null) { // unlikely ???
241	System.err.println("**** ERROR: encoding for "
242	+ urlEncodedFilename + " is NULL!");
243	encoding = "";
244	}
245	} // else no filename encoding set yet, perhaps
246	//System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
247	return encoding;
248	}
249
250	//**************************** APPLYING ENCODINGS TO FILENAMES ***************************
251
252	/** URL encoded version of the byte codes of the given file's name */
253	public static String calcURLEncodedFilePath(File file) {
254	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
255	return file.getAbsolutePath();
256	}
257	else {
258	String filename = fileToURLEncoding(file);
259	return filename;
260	}
261	}
262
263	/** URL encoded version of the byte codes of this file's name */
264	public static String calcURLEncodedFileName(String urlfilepath) {
265	String filename = urlfilepath;
266	if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
267	filename = filename.substring(0, filename.length() - 1);
268	}
269
270	// remove the directory prefix (if any) to get the filename
271	int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
272	if(index != -1) {
273	filename = filename.substring(index+1); // skip separator
274	}
275
276	return filename;
277	}
278
279	/** Given a string representing an alias to an official encoding (and unofficial ones
280	* starting with "Latin-"), attempts to work out what the canonical encoding for that is.
281	* If the given encoding is unrecognised, it is returned as is. */
282	public static String canonicalEncodingName(String encoding) {
283	String canonicalEncoding = encoding;
284	try {
285	// Latin-1 -> ISO-8859-1
286	String alias = canonicalEncoding.toLowerCase();
287	if(alias.startsWith("latin")){
288	canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
289	}
290
291	// canonical encoding for official aliases
292	canonicalEncoding = Charset.forName(canonicalEncoding).name();
293	return canonicalEncoding;
294	} catch (Exception e) {
295	System.err.println("(Could not recognise encoding (alias): "
296	+ encoding + ".)");
297	return encoding; // no alias could be found, return the original parameter
298	}
299	}
300
301	//*********************** GETTING THE URL ENCODING OF FILENAMES *******************************
302
303	/**
304	* Given a String containing hexentities, will convert back into the unicode version of the String.
305	* e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
306	* I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
307	* and contains a copy of Utility.debugUnicodeString(String) with the following main function:
308	public static void main(String args[]) {
309	String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
310	System.err.println("About to decode hex string: " + str);
311	String result = decodeStringContainingHexEntities(str);
312	System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
313	}
314	*/
315	public static String decodeStringContainingHexEntities(String str) {
316	String result = "";
317	Matcher matcher = HEX_PATTERN.matcher(str);
318
319	int searchFromIndex = 0;
320	int endMatchIndex = -1;
321
322	while(matcher.find(searchFromIndex)) {
323	String hexPart = matcher.group();
324	//System.err.println("Found hexpart match: " + hexPart);
325
326	int startMatchIndex = matcher.start();
327	endMatchIndex = matcher.end();
328	result += str.substring(searchFromIndex, startMatchIndex);
329
330	String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
331	// https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
332	// https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
333
334	//System.err.println("hexNumberStr so far: " + hexNumberStr);
335	hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
336	//int hexNumber = Integer.parseInt(hexNumberStr);
337	int hexNumber = Integer.decode(hexNumberStr);
338	String hexNumberAsChar = Character.toString((char) hexNumber);
339	result += hexNumberAsChar;
340
341	searchFromIndex = endMatchIndex;
342
343	}
344
345	if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
346	result += str.substring(endMatchIndex);
347	//System.err.println("suffix: " + str.substring(endMatchIndex));
348	}
349	else { // there were no hex codes to decode, return string as is
350	result = str;
351	}
352
353	return result;
354	}
355
356
357	// Dr Bainbridge's methods
358	/* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
359	* we can work with URL-encoded filenames in Java. Java works with whatever
360	* encoding the filesystem uses. Unlike systems working with UTF-8, where Java
361	* interprets filenames as UTF-8 (a destructive process since characters invalid
362	* for UTF-8 are replaced with the invalid character, which means the original
363	* character's byte codes can not be regained), working with an ISO-8859-1
364	* system means the original byte codes of the characters are preserved,
365	* regardless of whether the characters represent ISO-8859-1 or not. Such byte
366	* codes are converted by the following method to the correct URL versions of
367	* the strings that the filenames represent (that is, the correct URL representations
368	* of the filenames in their original encodings). This is useful for interactions with
369	* Perl as Java and Perl can use URL-encoded filenames to talk about the same files
370	* on the file system, instead of having to work out what encoding they are in. */
371
372	public static String fileToURLEncoding(File file) {
373	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
374	return file.getAbsolutePath();
375	}
376
377	String filename_url_encoded = "";
378
379	// The following test for whether the file exists or not is a problem
380	// when a File object--whose actual file is in the process of being moved
381	// and therefore temporarily does not 'exist' on the actual system--can't
382	// be URL encoded: the following would return "" when a file doesn't exist.
383	// So commenting out the test.
384	/*
385	if(!file.getName().equals("recycle")) {
386	if(!file.isFile() && !file.isDirectory()) {
387	System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
388	return "";
389	}
390
391	if(!file.exists()) {
392	System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
393	return ""; //file.getName();
394	}
395	}
396	*/
397
398	URI filename_uri = file.toURI();
399	try {
400	// The trick:
401	// 1. toASCIIString() will %xx encode values > 127
402	// 2. Decode the result to "ISO-8859-1"
403	// 3. URL encode the bytes to string
404
405	// Step 2 forces the string to be 8-bit values. It
406	// doesn't matter if the starting raw filename was not
407	// in the ISO-8859-1 encoding, the effect is to ensure
408	// we have an 8-bit byte string that (numerically)
409	// captures the right value. These numerical values are
410	// then used to determine how to URL encode it
411
412	String filename_ascii = filename_uri.toASCIIString();
413
414	// The URI.toASCIIString() call above only encodes values > 127.
415	// But we also need to protect + and & signs in filenames. Do this by URL encoding.
416	// But need to double URL encode, else it will get decoded too early, in methods called shortly hereafter.
417	filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
418	filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex
419
420	// Before proceeding, protect & in the filename too.
421	// &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &
422	// But dangerous to do simple replace if there are &#x...; entities in the filename already!
423	// That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
424	//filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
425	//filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
426	///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
427
428	String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
429	filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
430
431	// For chars that were protected by being URL encoded, now convert them to the correct version we want them in.
432	// For +: this char is special in regex, so it needs to be converted from URL encoding back to + so it will get properly escaped for regex
433	// For &: this char is special in XML, so since the call to iso_8859_1_filename_to_url_encoded() is over, we can finally convert & to hex entity now.
434	//filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
435	filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
436	filename_url_encoded = filename_url_encoded.replace("%26", "&"); // convert URL encoding for ampersand into hex entity for ampersand
437	}
438	catch (Exception e) {
439	e.printStackTrace();
440	// Give up trying to convert
441	filename_url_encoded = file.getAbsolutePath();
442	}
443	return filename_url_encoded;
444	}
445
446	// For unicode codepoints see:
447	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
448	// where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
449	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
450	// where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
451	public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
452	throws Exception
453	{
454	String urlEncoded = "";
455
456	try {
457	// By this point we have a UTF-8 encoded string that captures
458	// what the ISO-8859-1 (Latin-1) character is that corresponded to the
459	// 8-bit numeric value for that character in the filename
460	// on the file system
461
462	// For example:
463	// File system char: <lower-case beta char in Latin-7> = %E2
464	// Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
465	// Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
466
467	// Our task is to take the string the contains <C3><A2> and ensure that
468	// we "see" it as <E2>
469
470	byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
471	String unicode_filename = new String(raw_bytes,"UTF-8");
472
473	for(int i = 0; i < unicode_filename.length(); i++) {
474	char charVal = unicode_filename.charAt(i);
475	if ((int)charVal > 255) {
476	urlEncoded += String.format("&#x%02X;", (int)charVal);
477	}
478	else if((int)charVal > 127) {
479	urlEncoded += String.format("%%%02X", (int)charVal);
480	} else {
481	urlEncoded += String.format("%c", (char)charVal);
482	}
483	}
484	}
485	catch (Exception e) {
486	//e.printStackTrace();
487	throw(e);
488	}
489
490	return urlEncoded;
491	}
492
493	// unused for now
494	public static String raw_filename_to_url_encoded(String fileName)
495	throws Exception
496	{
497	String urlEncoded = "";
498	try {
499	byte[] bytes = fileName.getBytes();
500
501	for(int i = 0; i < bytes.length; i++) {
502	// mask each byte (by applying & 0xFF) to make the signed
503	// byte (in the range -128 to 127) unsigned (in the range
504	// 0 to 255).
505
506	int byteVal = (int)(bytes[i] & 0xFF);
507
508	if(byteVal > 127) {
509	urlEncoded += String.format("%%%02X", (int)byteVal);
510	} else {
511	urlEncoded += String.format("%c",(char)byteVal);
512	}
513	}
514	}
515	catch (Exception e) {
516	//e.printStackTrace();
517	throw(e);
518	}
519
520	return urlEncoded;
521	}
522
523	// FURTHER HELPER METHODS
524
525	/**
526	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.
527	* If filename is relative, then the current directory (gli?) will be prefixed to what is returned
528	* and should be removed manually by the caller. Alternatively, for relative paths, call the variant
529	* relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.
530	*/
531	public static String fullFilepathToURLEncoding(String filename) {
532	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
533	return filename;
534	}
535
536	File file = new File (filename);
537	String filename_url_encoded = fileToURLEncoding(file);
538
539	// if the current directory (".") was passed in as filename,
540	// then the filename_url_encoded looks like /full/path/./
541	// In that case, remove the ./ at the end
542	if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {
543	filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end
544	}
545
546	return filename_url_encoded;
547	}
548
549	/**
550	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
551	* If filename is a relative path, call this method to get it specially URL encoded.
552	* This method will remove the current directory that is prefixed as an intermediary step.
553	*/
554	public static String relativeFilenameToURLEncoding(String filename) {
555	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
556	return filename;
557	}
558
559	String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
560	return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
561	}
562
563	/**
564	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
565	* Convenience method that will return the specially URL encoded version of filename
566	* with the provided removeFilePathPrefix removed */
567	public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
568	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
569	return filename;
570	}
571
572	File file = new File (filename);
573	String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
574
575	// now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
576	filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());
577	// remove any remaining slash prefix
578	if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
579	filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
580	}
581
582	return filename_url_encoded;
583	}
584
585	// UNUSED now, but useful functions and escapeAllCharWithHexEntity() took effort to write.
586
587	/**
588	* Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
589	* UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
590	*/
591	public static String stringToHex(String str) {
592
593	String hex_str = "";
594	for(int i = 0; i < str.length(); i++) {
595	int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
596
597	// ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
598	// If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
599	if((charCode >= 20 && charCode <= 126) \|\| charCode == 9 \|\| charCode == 10 \|\| charCode == 13 /\|\| charCode == 36 \|\| charCode == 43/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
600	hex_str += str.charAt(i);
601	} else {
602	hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
603	}
604	}
605
606	return hex_str;
607	}
608
609	/** Takes a String containing a single char and returns the hex entity for it */
610	public static String hexEntityForChar(String char_as_string) {
611	int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code
612	String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
613	return hexCodeStr;
614	}
615
616	/**
617	* Given a String containing 0 or more occurrences of CHARACTER,
618	* this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
619	* Special care is taken where the CHARACTER to be replaced is &,
620	* as in that case, we don't want to replace any existing hex entities already present in the String.
621	*/
622	public static String escapeAllCharWithHexEntity(String str, char CHARACTER) {
623
624	if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
625	return str;
626	}
627
628	String char_as_string = Character.toString(CHARACTER);
629	String hexCodeString = hexEntityForChar(char_as_string);
630
631	Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
632
633	// want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
634	int searchIndex = 0;
635
636	boolean finished = false;
637	while(!finished) {
638
639	searchIndex = str.indexOf(CHARACTER, searchIndex);
640
641	if(searchIndex == -1) {
642	finished = true;
643	}
644	else {
645
646	// replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
647	if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
648	searchIndex = hexPatternMatch.end();
649	} else {
650
651	String tmp = str.substring(0, searchIndex) + hexCodeString;
652	searchIndex++;
653	if(str.length() > searchIndex) {
654	tmp += str.substring(searchIndex);
655	}
656	str = tmp;
657	searchIndex = searchIndex+ hexCodeString.length() - 1;
658
659	// String has been modified, so have to update Matcher
660	hexPatternMatch = HEX_PATTERN.matcher(str);
661
662	if(searchIndex >= str.length()) {
663	finished = true;
664	}
665	}
666	}
667	}
668
669	return str;
670	}
671	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: