Context Navigation

FilenameEncoding.java@ 33746

Last change on this file since 33746 was 33746, checked in by ak19, 4 years ago

Bugfix for dealing with + in filenames: file-level metadata now sticks and also ends up in doc.xml on build, as should happen. 2. Better (more optimal) bugfix for & in filenames, to get metadata to still stick after yesterday's first bugfix for this. Sadly, the improved code no longer needs the new function I introduced yesterday (escapeAllCharWithHexEntity). Leaving the function in, in case it ever comes in handy or as an idea. 3. Refactoring some code. 4. Removed some debugging statements. But some things are still largely commented out. Will remove hereafter.

File size: 28.8 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2010 Greenstone Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29	import java.io.File;
30	import java.net.*;
31	import java.nio.charset.*;
32	import java.util.*;
33	import org.greenstone.gatherer.collection.CollectionManager;
34	import org.greenstone.gatherer.DebugStream;
35
36	import java.util.regex.Matcher;
37	import java.util.regex.Pattern;
38
39
40
41	/** Static access class that contains many of the methods used to work with filename encodings.
42	* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
43	* to maintain a map of URLEncodedFilenames to their filename encodings.
44	* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
45	* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
46
47	public class FilenameEncoding {
48	/** If debugging, filenames in the trees are displayed in their URL-encoded form */
49	public static boolean DEBUGGING = false;
50
51	/** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
52	* such that non-UTF8 filename encodings on a UTF-8 locale are destructively
53	* converted so that the bytecodes in the filename are not preserved. */
54	public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
55
56	/** Also set by Gatherer.
57	* If the OS supports multiple filename encodings, we will be working with URL strings
58	* and the applicable separators are always the forward slash ("/") not File.separator.
59	* If multiple filename encodings are not supported, we're dealing with File.separator. */
60	public static String URL_FILE_SEPARATOR = File.separator;
61
62
63	/** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
64	* with a file, but is to be applied in real-time on the file's name in the CollectionTree
65	* display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
66	* its nodes are expanded and contracted, storing the filename encodings of each file along
67	* with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
68	* the encoding at every stage by querying the metadataXML file, we store the encodings for
69	* fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
70	* The current design of the map is to only store any active filename metadata assigned
71	* directly at that file/folder's level, and if there is none discovered at that level, then
72	* storing the empty string for it. Therefore, if the hashmap contains no entry for
73	* a file, it means this still needs to be retrieved. */
74	public static Map map = new HashMap();
75
76	/** Compiled pattern for hex entities of characters. These are of the forn "&#x....;" with 1 to 4 digits */
77	public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
78
79
80	//********************* BUSY REFRESHING / REQUIRING REFRESH *******************
81
82	/** Set to true if filename encoding metadata was changed. Called by the enter keyPress
83	* event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
84	private static boolean refreshRequired = false;
85
86	synchronized public static boolean isRefreshRequired() {
87	return refreshRequired;
88	}
89
90	synchronized public static void setRefreshRequired(boolean state) {
91	if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
92	refreshRequired = state;
93	} else {
94	refreshRequired = false;
95	}
96	}
97
98	//************************ MAP RETRIEVAL METHODS ****************************
99
100	/** Returns the cumulative gs.filenameEncoding metadata
101	* assigned to a file inside the collection. */
102	public static String findFilenameEncoding(
103	File file, String urlEncodedFilePath, boolean bruteForceLookup)
104	{
105	//if(bruteForceLookup) {
106	// return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
107	//}
108
109	String encoding = "";
110
111	// Check any assigned encoding at this level, starting with the map first
112	// and else retrieving the filename encoding from the metadata file
113	if(!map.containsKey(urlEncodedFilePath)) {
114
115	// Check for filename encoding metadata directly associated with the file
116	// Now don't need to get any inherited encoding metadata here, because of
117	// the way we're storing and retrieving encoding information from the map.
118	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
119	if(!list.isEmpty()) {
120	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
121	encoding = metavalue.getValue();
122	} // else no filename encoding set yet at this level
123
124	// Now we've done a lookup at this level cache the result in the map,
125	// including empty strings, to indicate that we've done a full lookup
126	map.put(urlEncodedFilePath, encoding);
127	}
128	else { // an entry exists in the map, get it from there
129	encoding = (String)map.get(urlEncodedFilePath);
130	}
131
132	// if no meta was specified at at the file level, look for any inherited metadata
133	if(encoding.equals("")) {
134	encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
135	}
136
137	//System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " \| found: " + encoding);
138	return encoding; // found something in map, may still be "", but it's what was stored
139	}
140
141	/** Checks the file-to-encoding map for all the superfolders of the given
142	* filename in sequence for an applicable encoding. Note that the file/folder
143	* at the level of urlFoldername (and dir) has already been inspected. */
144	static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
145	{
146	String encoding = "";
147	boolean done = false;
148
149	// don't want to search past import folder which is as
150	// far as we need to go to determine inherited encodings
151	File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
152	if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
153	done = true;
154	}
155
156	// For directories, first remove trailing file separator in order to start checking from higher level folders
157	int lastIndex = urlFoldername.length()-1;
158	char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
159	if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
160	urlFoldername = urlFoldername.substring(0, lastIndex);
161	}
162
163	while(!done) {
164	// get the folder that's one level up
165	dir = dir.getParentFile();
166
167	int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
168	if(index == -1) { // no more slashes
169	done = true;
170	} else {
171	urlFoldername = urlFoldername.substring(0, index);
172	}
173
174	// now look in the map to see whether there's an encoding for this folder
175	String folder = urlFoldername + URL_FILE_SEPARATOR;
176	if(map.containsKey(folder)) {
177	encoding = (String)map.get(folder); // may be ""
178	} else { // no entry in map, so look in the metadata.xml at this folder level
179	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
180	dir, true); // true: gets gs.filenameEncoding only
181	if(!list.isEmpty()) {
182	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
183	encoding = metavalue.getValue();
184	}
185	map.put(folder, encoding); // may be ""
186	}
187
188	if(!encoding.equals("")){
189	done = true;
190	} // else if "", loop to check next folder up
191	else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
192	done = true;
193	}
194	}
195
196	return encoding;
197	}
198
199	/** Called by GUIManager when a collection is closed. This then empties the
200	* file-to-encoding map which is applicable only on a per-collection basis */
201	static public void closeCollection() {
202	//printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
203	map.clear();
204	}
205
206	// Useful for debugging: prints contents of file-to-encoding map
207	static public void printFilenameMap(String heading) {
208	System.err.println("\n********************************************");
209	System.err.println(heading.toUpperCase());
210	Iterator entries = map.entrySet().iterator();
211	while(entries.hasNext()) {
212	Map.Entry entry = (Map.Entry)entries.next();
213	System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
214	}
215	System.err.println("********************************************\n");
216	}
217
218	// UNUSED at present. Brute force version of the findFilenameEncoding() method
219	// Doesn't use the map, but gets all the metadata assigned to a file/folder to
220	// work out the encoding applicable to a file/folder.
221	public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
222	boolean bruteForceLookup)
223	{
224	System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
225
226
227	String encoding = "";
228
229	// Check for filename encoding metadata directly associated with the file
230	// Now don't need to get any inherited encoding metadata here, because of
231	// the way we're storing and retrieving encoding information from the map.
232
233	ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
234	if(!list.isEmpty()) {
235	// try to get the filename encoding meta that was assigned last to this
236	// file, even though it makes no sense to have multiple values for it
237	MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
238	encoding = metavalue.getValue();
239
240	if(encoding == null) { // unlikely ???
241	System.err.println("**** ERROR: encoding for "
242	+ urlEncodedFilename + " is NULL!");
243	encoding = "";
244	}
245	} // else no filename encoding set yet, perhaps
246	//System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
247	return encoding;
248	}
249
250	//**************************** APPLYING ENCODINGS TO FILENAMES ***************************
251
252	/** URL encoded version of the byte codes of the given file's name */
253	public static String calcURLEncodedFilePath(File file) {
254	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
255	return file.getAbsolutePath();
256	}
257	else {
258	String filename = fileToURLEncoding(file);
259	return filename;
260	}
261	}
262
263	/** URL encoded version of the byte codes of this file's name */
264	public static String calcURLEncodedFileName(String urlfilepath) {
265	String filename = urlfilepath;
266	if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
267	filename = filename.substring(0, filename.length() - 1);
268	}
269
270	// remove the directory prefix (if any) to get the filename
271	int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
272	if(index != -1) {
273	filename = filename.substring(index+1); // skip separator
274	}
275
276	return filename;
277	}
278
279	/** Given a string representing an alias to an official encoding (and unofficial ones
280	* starting with "Latin-"), attempts to work out what the canonical encoding for that is.
281	* If the given encoding is unrecognised, it is returned as is. */
282	public static String canonicalEncodingName(String encoding) {
283	String canonicalEncoding = encoding;
284	try {
285	// Latin-1 -> ISO-8859-1
286	String alias = canonicalEncoding.toLowerCase();
287	if(alias.startsWith("latin")){
288	canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
289	}
290
291	// canonical encoding for official aliases
292	canonicalEncoding = Charset.forName(canonicalEncoding).name();
293	return canonicalEncoding;
294	} catch (Exception e) {
295	System.err.println("(Could not recognise encoding (alias): "
296	+ encoding + ".)");
297	return encoding; // no alias could be found, return the original parameter
298	}
299	}
300
301	//*********************** GETTING THE URL ENCODING OF FILENAMES *******************************
302
303	/**
304	* Given a String containing hexentities, will convert back into the unicode version of the String.
305	* e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
306	* I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
307	* and contains a copy of Utility.debugUnicodeString(String) with the following main function:
308	public static void main(String args[]) {
309	String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
310	System.err.println("About to decode hex string: " + str);
311	String result = decodeStringContainingHexEntities(str);
312	System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
313	}
314	*/
315	public static String decodeStringContainingHexEntities(String str) {
316	String result = "";
317	Matcher matcher = HEX_PATTERN.matcher(str);
318
319	int searchFromIndex = 0;
320	int endMatchIndex = -1;
321
322	while(matcher.find(searchFromIndex)) {
323	String hexPart = matcher.group();
324	//System.err.println("Found hexpart match: " + hexPart);
325
326	int startMatchIndex = matcher.start();
327	endMatchIndex = matcher.end();
328	result += str.substring(searchFromIndex, startMatchIndex);
329
330	String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
331	// https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
332	// https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
333
334	//System.err.println("hexNumberStr so far: " + hexNumberStr);
335	hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
336	//int hexNumber = Integer.parseInt(hexNumberStr);
337	int hexNumber = Integer.decode(hexNumberStr);
338	String hexNumberAsChar = Character.toString((char) hexNumber);
339	result += hexNumberAsChar;
340
341	searchFromIndex = endMatchIndex;
342
343	}
344
345	if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
346	result += str.substring(endMatchIndex);
347	//System.err.println("suffix: " + str.substring(endMatchIndex));
348	}
349	else { // there were no hex codes to decode, return string as is
350	result = str;
351	}
352
353	return result;
354	}
355
356	/**
357	* Attempting to produce the equivalent method fileToURLEncoding(), but taking a String as input parameter
358	* UNUSED - REPLACED by filenameToURLEncoding(String str) which reuses existing fileToURLEncoding(File) method.
359	*/
360	public static String stringToHex(String str) {
361
362	String hex_str = "";
363	for(int i = 0; i < str.length(); i++) {
364	int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
365
366	// ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
367	// If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
368	if((charCode >= 20 && charCode <= 126) \|\| charCode == 9 \|\| charCode == 10 \|\| charCode == 13 /\|\| charCode == 36 \|\| charCode == 43/) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
369	hex_str += str.charAt(i);
370	} else {
371	hex_str += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
372	}
373	}
374
375	return hex_str;
376	}
377
378
379	/** Takes a String containing a single char and returns the hex entity for it */
380	public static String hexEntityForChar(String char_as_string) {
381	int charCode = char_as_string.codePointAt(0); // unicode codepoint / ASCII code
382	String hexCodeStr = "&#x" + String.format("%x", charCode).toUpperCase() + ";";
383	return hexCodeStr;
384	}
385
386	/**
387	* Given a String containing 0 or more occurrences of CHARACTER,
388	* this method will replace all occurrences of that CHARACTER with its hex entity variant, "&x....;"
389	* Special care is taken where the CHARACTER to be replaced is &,
390	* as in that case, we don't want to replace any existing hex entities already present in the String.
391	*/
392	public static String escapeAllCharWithHexEntity(String str, char CHARACTER/, String hexCodeString/) {
393
394	if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
395	return str;
396	}
397
398	String char_as_string = Character.toString(CHARACTER);
399	String hexCodeString = hexEntityForChar(char_as_string);
400
401	//System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString);
402
403	Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
404
405	// want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
406	int searchIndex = 0;
407
408	boolean finished = false;
409	while(!finished) {
410
411	searchIndex = str.indexOf(CHARACTER, searchIndex);
412
413	if(searchIndex == -1) {
414	finished = true;
415	}
416	else {
417
418	// replacing ampersands, &, is a special case: don't want to replace the & of (hex) entities in the string:
419	if(hexPatternMatch.find(searchIndex) && searchIndex == hexPatternMatch.start()) {
420	searchIndex = hexPatternMatch.end();
421	} else {
422
423	String tmp = str.substring(0, searchIndex) + hexCodeString;
424	searchIndex++;
425	if(str.length() > searchIndex) {
426	tmp += str.substring(searchIndex);
427	}
428	str = tmp;
429	searchIndex = searchIndex+ hexCodeString.length() - 1;
430
431	// String has been modified, so have to update Matcher
432	hexPatternMatch = HEX_PATTERN.matcher(str);
433
434	if(searchIndex >= str.length()) {
435	finished = true;
436	}
437	}
438	}
439	}
440
441	return str;
442	}
443
444
445	// Dr Bainbridge's methods
446	/* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
447	* we can work with URL-encoded filenames in Java. Java works with whatever
448	* encoding the filesystem uses. Unlike systems working with UTF-8, where Java
449	* interprets filenames as UTF-8 (a destructive process since characters invalid
450	* for UTF-8 are replaced with the invalid character, which means the original
451	* character's byte codes can not be regained), working with an ISO-8859-1
452	* system means the original byte codes of the characters are preserved,
453	* regardless of whether the characters represent ISO-8859-1 or not. Such byte
454	* codes are converted by the following method to the correct URL versions of
455	* the strings that the filenames represent (that is, the correct URL representations
456	* of the filenames in their original encodings). This is useful for interactions with
457	* Perl as Java and Perl can use URL-encoded filenames to talk about the same files
458	* on the file system, instead of having to work out what encoding they are in. */
459
460	public static String fileToURLEncoding(File file) {
461	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
462	return file.getAbsolutePath();
463	}
464
465	String filename_url_encoded = "";
466
467	// The following test for whether the file exists or not is a problem
468	// when a File object--whose actual file is in the process of being moved
469	// and therefore temporarily does not 'exist' on the actual system--can't
470	// be URL encoded: the following would return "" when a file doesn't exist.
471	// So commenting out the test.
472	/*
473	if(!file.getName().equals("recycle")) {
474	if(!file.isFile() && !file.isDirectory()) {
475	System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
476	return "";
477	}
478
479	if(!file.exists()) {
480	System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
481	return ""; //file.getName();
482	}
483	}
484	*/
485
486	URI filename_uri = file.toURI();
487	try {
488	// The trick:
489	// 1. toASCIIString() will %xx encode values > 127
490	// 2. Decode the result to "ISO-8859-1"
491	// 3. URL encode the bytes to string
492
493	// Step 2 forces the string to be 8-bit values. It
494	// doesn't matter if the starting raw filename was not
495	// in the ISO-8859-1 encoding, the effect is to ensure
496	// we have an 8-bit byte string that (numerically)
497	// captures the right value. These numerical values are
498	// then used to determine how to URL encode it
499
500	String filename_ascii = filename_uri.toASCIIString();
501
502	// The URI.toASCIIString() call above only encodes values > 127.
503	// But we also need to protect + and & signs in filenames
504	filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
505	filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 36 in decimal, and 26 in hex
506
507	// Before proceeding, protect & in the filename too.
508	// &'s ASCII code is 36 in decimal, and 26 in hex, so replace with &
509	// But dangerous to do simple replace if there are &#x...; entities in the filename already!
510	// That is, we'll want to protect & by replacing with &'s hex value, but we don't want to replace the & in "&#x....;" with the same!
511	//filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
512	//filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
513	///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
514
515	String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
516	filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
517
518	//filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
519	filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
520	filename_url_encoded = filename_url_encoded.replace("%26", "&");
521	}
522	catch (Exception e) {
523	e.printStackTrace();
524	// Give up trying to convert
525	filename_url_encoded = file.getAbsolutePath();
526	}
527	return filename_url_encoded;
528	}
529
530	// For unicode codepoints see:
531	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
532	// where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
533	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
534	// where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
535	public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
536	throws Exception
537	{
538	String urlEncoded = "";
539
540	try {
541	// By this point we have a UTF-8 encoded string that captures
542	// what the ISO-8859-1 (Latin-1) character is that corresponded to the
543	// 8-bit numeric value for that character in the filename
544	// on the file system
545
546	// For example:
547	// File system char: <lower-case beta char in Latin-7> = %E2
548	// Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
549	// Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
550
551	// Our task is to take the string the contains <C3><A2> and ensure that
552	// we "see" it as <E2>
553
554	byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
555	String unicode_filename = new String(raw_bytes,"UTF-8");
556
557	for(int i = 0; i < unicode_filename.length(); i++) {
558	char charVal = unicode_filename.charAt(i);
559	if ((int)charVal > 255) {
560	urlEncoded += String.format("&#x%02X;", (int)charVal);
561	}
562	else if((int)charVal > 127) {
563	urlEncoded += String.format("%%%02X", (int)charVal);
564	} else {
565	urlEncoded += String.format("%c", (char)charVal);
566	}
567	}
568	}
569	catch (Exception e) {
570	//e.printStackTrace();
571	throw(e);
572	}
573
574	return urlEncoded;
575	}
576
577	// unused for now
578	public static String raw_filename_to_url_encoded(String fileName)
579	throws Exception
580	{
581	String urlEncoded = "";
582	try {
583	byte[] bytes = fileName.getBytes();
584
585	for(int i = 0; i < bytes.length; i++) {
586	// mask each byte (by applying & 0xFF) to make the signed
587	// byte (in the range -128 to 127) unsigned (in the range
588	// 0 to 255).
589
590	int byteVal = (int)(bytes[i] & 0xFF);
591
592	if(byteVal > 127) {
593	urlEncoded += String.format("%%%02X", (int)byteVal);
594	} else {
595	urlEncoded += String.format("%c",(char)byteVal);
596	}
597	}
598	}
599	catch (Exception e) {
600	//e.printStackTrace();
601	throw(e);
602	}
603
604	return urlEncoded;
605	}
606
607	// FURTHER HELPER METHODS
608
609	/**
610	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter.
611	* If filename is relative, then the current directory (gli?) will be prefixed to what is returned
612	* and should be removed manually by the caller. Alternatively, for relative paths, call the variant
613	* relativeFilenameToURLEncoding(String), which will remove any added filepath prefix.
614	*/
615	public static String fullFilepathToURLEncoding(String filename) {
616	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
617	return filename;
618	}
619
620	File file = new File (filename);
621	String filename_url_encoded = fileToURLEncoding(file);
622
623	// if the current directory (".") was passed in as filename,
624	// then the filename_url_encoded looks like /full/path/./
625	// In that case, remove the ./ at the end
626	if (filename_url_encoded.endsWith(FilenameEncoding.URL_FILE_SEPARATOR+"."+FilenameEncoding.URL_FILE_SEPARATOR)) {
627	filename_url_encoded = filename_url_encoded.substring(0, filename_url_encoded.length()-2); // cut off /. at end
628	}
629
630	return filename_url_encoded;
631	}
632
633	/**
634	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
635	* If filename is a relative path, call this method to get it specially URL encoded.
636	* This method will remove the current directory that is prefixed as an intermediary step.
637	*/
638	public static String relativeFilenameToURLEncoding(String filename) {
639	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
640	return filename;
641	}
642
643	String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
644	return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
645	}
646
647	/**
648	* Produce the equivalent of method fileToURLEncoding(), but taking a String as input parameter
649	* Convenience method that will return the specially URL encoded version of filename
650	* with the provided removeFilePathPrefix removed */
651	public static String filenameToURLEncodingWithPrefixRemoved(String filename, String removeFilePathPrefix) {
652	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
653	return filename;
654	}
655
656	File file = new File (filename);
657	String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
658
659	// now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
660	filename_url_encoded = filename_url_encoded.substring(removeFilePathPrefix.length());
661	// remove any remaining slash prefix
662	if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
663	filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
664	}
665
666	return filename_url_encoded;
667	}
668	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 33746

Download in other formats: