Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 33738

Last change on this file since 33738 was 33738, checked in by ak19, 4 years ago
Got the filenameToURLEncoding(String) variant that reuses fileToURLEncoding(File) to work now. It just needed the current directory path (whatever . resolves to) to be removed from the String filepath returned, something Dr Bainbridge had anticipated could also happen with new URI() but that didn't happen there but does happen with file.toURI() as he had also expected.
File size: 24.6 KB

Rev	Line
[23433]	1	/**
	2	*############################################################################
	3	* A component of the Greenstone Librarian Interface, part of the Greenstone
	4	* digital library suite from the New Zealand Digital Library Project at the
	5	* University of Waikato, New Zealand.
	6	*
	7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
	8	*
	9	* Copyright (C) 2010 Greenstone Digital Library Project
	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of the GNU General Public License as published by
	13	* the Free Software Foundation; either version 2 of the License, or
	14	* (at your option) any later version.
	15	*
	16	* This program is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	* GNU General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU General Public License
	22	* along with this program; if not, write to the Free Software
	23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	*############################################################################
	25	*/
	26
	27	package org.greenstone.gatherer.metadata;
	28
	29	import java.io.File;
	30	import java.net.*;
	31	import java.nio.charset.*;
	32	import java.util.*;
	33	import org.greenstone.gatherer.collection.CollectionManager;
[29793]	34	import org.greenstone.gatherer.DebugStream;
[23433]	35
[33728]	36	import java.util.regex.Matcher;
	37	import java.util.regex.Pattern;
	38
	39
	40
[23433]	41	/** Static access class that contains many of the methods used to work with filename encodings.
	42	* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
	43	* to maintain a map of URLEncodedFilenames to their filename encodings.
	44	* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
	45	* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
	46
	47	public class FilenameEncoding {
	48	/** When true (debugging), filenames in the trees are displayed in their URL-encoded form. */
	49	public static boolean DEBUGGING = false;
	50
	51	/** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
	52	* such that non-UTF8 filename encodings on a UTF-8 locale are destructively
	53	* converted so that the bytecodes in the filename are not preserved. */
	54	public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
	55
	56	/** Also set by Gatherer.
	57	* If the OS supports multiple filename encodings, we will be working with URL strings
	58	* and the applicable separators are always the forward slash ("/") not File.separator.
	59	* If multiple filename encodings are not supported, we're dealing with File.separator. */
	60	public static String URL_FILE_SEPARATOR = File.separator;
	61
	62
	63	/** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
	64	* with a file, but is to be applied in real-time on the file's name in the CollectionTree
	65	* display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
	66	* its nodes are expanded and contracted, storing the filename encodings of each file along
	67	* with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
	68	* the encoding at every stage by querying the metadataXML file, we store the encodings for
	69	* fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
	70	* The current design of the map is to only store any active filename metadata assigned
	71	* directly at that file/folder's level, and if there is none discovered at that level, then
	72	* storing the empty string for it. Therefore, if the hashmap contains no entry for
	73	* a file, it means this still needs to be retrieved. */
	74	public static Map map = new HashMap();
	75
[23436]	76	//********************* BUSY REFRESHING / REQUIRING REFRESH *******************
	77
[23433]	78	/** Set to true if filename encoding metadata was changed. Called by the enter keyPress
	79	* event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
	80	private static boolean refreshRequired = false;
	81
[23436]	82	synchronized public static boolean isRefreshRequired() {
[23433]	83	return refreshRequired;
	84	}
	85
[23436]	86	synchronized public static void setRefreshRequired(boolean state) {
[23433]	87	if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
	88	refreshRequired = state;
	89	} else {
	90	refreshRequired = false;
	91	}
	92	}
	93
	94	//************************ MAP RETRIEVAL METHODS ****************************
	95
	96	/** Returns the cumulative gs.filenameEncoding metadata
	97	* assigned to a file inside the collection. */
	98	public static String findFilenameEncoding(
	99	File file, String urlEncodedFilePath, boolean bruteForceLookup)
	100	{
	101	//if(bruteForceLookup) {
	102	// return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
	103	//}
	104
	105	String encoding = "";
	106
	107	// Check any assigned encoding at this level, starting with the map first
	108	// and else retrieving the filename encoding from the metadata file
	109	if(!map.containsKey(urlEncodedFilePath)) {
	110
	111	// Check for filename encoding metadata directly associated with the file
	112	// Now don't need to get any inherited encoding metadata here, because of
	113	// the way we're storing and retrieving encoding information from the map.
	114	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
	115	if(!list.isEmpty()) {
	116	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
	117	encoding = metavalue.getValue();
	118	} // else no filename encoding set yet at this level
	119
	120	// Now we've done a lookup at this level cache the result in the map,
	121	// including empty strings, to indicate that we've done a full lookup
	122	map.put(urlEncodedFilePath, encoding);
	123	}
	124	else { // an entry exists in the map, get it from there
	125	encoding = (String)map.get(urlEncodedFilePath);
	126	}
	127
	128	// if no meta was specified at at the file level, look for any inherited metadata
	129	if(encoding.equals("")) {
	130	encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
	131	}
	132
	133	//System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " \| found: " + encoding);
	134	return encoding; // found something in map, may still be "", but it's what was stored
	135	}
	136
	137	/** Checks the file-to-encoding map for all the superfolders of the given
	138	* filename in sequence for an applicable encoding. Note that the file/folder
	139	* at the level of urlFoldername (and dir) has already been inspected. */
	140	static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
	141	{
	142	String encoding = "";
	143	boolean done = false;
	144
	145	// don't want to search past import folder which is as
	146	// far as we need to go to determine inherited encodings
	147	File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
	148	if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
	149	done = true;
	150	}
	151
	152	// For directories, first remove trailing file separator in order to start checking from higher level folders
	153	int lastIndex = urlFoldername.length()-1;
	154	char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
	155	if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
	156	urlFoldername = urlFoldername.substring(0, lastIndex);
	157	}
	158
	159	while(!done) {
	160	// get the folder that's one level up
	161	dir = dir.getParentFile();
	162
	163	int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
	164	if(index == -1) { // no more slashes
	165	done = true;
	166	} else {
	167	urlFoldername = urlFoldername.substring(0, index);
	168	}
	169
	170	// now look in the map to see whether there's an encoding for this folder
	171	String folder = urlFoldername + URL_FILE_SEPARATOR;
	172	if(map.containsKey(folder)) {
	173	encoding = (String)map.get(folder); // may be ""
	174	} else { // no entry in map, so look in the metadata.xml at this folder level
	175	ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
	176	dir, true); // true: gets gs.filenameEncoding only
	177	if(!list.isEmpty()) {
	178	MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
	179	encoding = metavalue.getValue();
	180	}
	181	map.put(folder, encoding); // may be ""
	182	}
	183
	184	if(!encoding.equals("")){
	185	done = true;
	186	} // else if "", loop to check next folder up
	187	else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
	188	done = true;
	189	}
	190	}
	191
	192	return encoding;
	193	}
	194
	195	/** Called by GUIManager when a collection is closed. This then empties the
	196	* file-to-encoding map which is applicable only on a per-collection basis */
	197	static public void closeCollection() {
	198	//printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
	199	map.clear();
	200	}
	201
	202	// Useful for debugging: prints contents of file-to-encoding map
	203	static public void printFilenameMap(String heading) {
	204	System.err.println("\n********************************************");
	205	System.err.println(heading.toUpperCase());
	206	Iterator entries = map.entrySet().iterator();
	207	while(entries.hasNext()) {
	208	Map.Entry entry = (Map.Entry)entries.next();
	209	System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
	210	}
	211	System.err.println("********************************************\n");
	212	}
	213
	214	// UNUSED at present. Brute force version of the findFilenameEncoding() method
	215	// Doesn't use the map, but gets all the metadata assigned to a file/folder to
	216	// work out the encoding applicable to a file/folder.
	217	public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
	218	boolean bruteForceLookup)
	219	{
	220	System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
	221
	222
	223	String encoding = "";
	224
	225	// Check for filename encoding metadata directly associated with the file
	226	// Now don't need to get any inherited encoding metadata here, because of
	227	// the way we're storing and retrieving encoding information from the map.
	228
	229	ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
	230	if(!list.isEmpty()) {
	231	// try to get the filename encoding meta that was assigned last to this
	232	// file, even though it makes no sense to have multiple values for it
	233	MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
	234	encoding = metavalue.getValue();
	235
	236	if(encoding == null) { // unlikely ???
	237	System.err.println("**** ERROR: encoding for "
	238	+ urlEncodedFilename + " is NULL!");
	239	encoding = "";
	240	}
	241	} // else no filename encoding set yet, perhaps
	242	//System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
	243	return encoding;
	244	}
	245
	246	//**************************** APPLYING ENCODINGS TO FILENAMES ***************************
	247
	248	/** URL encoded version of the byte codes of the given file's name */
	249	public static String calcURLEncodedFilePath(File file) {
	250	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
	251	return file.getAbsolutePath();
	252	}
	253	else {
	254	String filename = fileToURLEncoding(file);
	255	return filename;
	256	}
	257	}
	258
	259	/** URL encoded version of the byte codes of this file's name */
	260	public static String calcURLEncodedFileName(String urlfilepath) {
	261	String filename = urlfilepath;
	262	if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
	263	filename = filename.substring(0, filename.length() - 1);
	264	}
	265
	266	// remove the directory prefix (if any) to get the filename
	267	int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
	268	if(index != -1) {
	269	filename = filename.substring(index+1); // skip separator
	270	}
	271
	272	return filename;
	273	}
	274
	275	/** Given a string representing an alias to an official encoding (and unofficial ones
	276	* starting with "Latin-"), attempts to work out what the canonical encoding for that is.
	277	* If the given encoding is unrecognised, it is returned as is. */
	278	public static String canonicalEncodingName(String encoding) {
	279	String canonicalEncoding = encoding;
	280	try {
	281	// Latin-1 -> ISO-8859-1
	282	String alias = canonicalEncoding.toLowerCase();
	283	if(alias.startsWith("latin")){
	284	canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
	285	}
	286
	287	// canonical encoding for official aliases
	288	canonicalEncoding = Charset.forName(canonicalEncoding).name();
	289	return canonicalEncoding;
	290	} catch (Exception e) {
	291	System.err.println("(Could not recognise encoding (alias): "
	292	+ encoding + ".)");
	293	return encoding; // no alias could be found, return the original parameter
	294	}
	295	}
	296
	297	//*********************** GETTING THE URL ENCODING OF FILENAMES *******************************
[33728]	298
	299	/**
	300	* Given a String containing hexentities, will convert back into the unicode version of the String.
	301	* e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
	302	* I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
	303	* and contains a copy of Utility.debugUnicodeString(String) with the following main function:
	304	public static void main(String args[]) {
	305	String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";
	306	System.err.println("About to decode hex string: " + str);
	307	String result = decodeStringContainingHexEntities(str);
	308	System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
	309	}
	310	*/
	311	public static String decodeStringContainingHexEntities(String str) {
	312	String result = "";
	313	Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
	314	Matcher matcher = hexPattern.matcher(str);
	315
	316	int searchFromIndex = 0;
	317	int endMatchIndex = -1;
	318
	319	while(matcher.find(searchFromIndex)) {
	320	String hexPart = matcher.group();
	321	//System.err.println("Found hexpart match: " + hexPart);
	322
	323	int startMatchIndex = matcher.start();
	324	endMatchIndex = matcher.end();
	325	result += str.substring(searchFromIndex, startMatchIndex);
	326
	327	String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
	328	// https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
	329	// https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
	330
	331	//System.err.println("hexNumberStr so far: " + hexNumberStr);
	332	int tmpDigit = Integer.parseInt(hexNumberStr);
	333	//System.err.println("As digit: " + tmpDigit);
	334	hexNumberStr = String.format("%04d", tmpDigit);
	335	//System.err.println("2 hexNumberStr so far: " + hexNumberStr);
	336	hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
	337	//int hexNumber = Integer.parseInt(hexNumberStr);
	338	int hexNumber = Integer.decode(hexNumberStr);
	339	String hexNumberAsChar = Character.toString((char) hexNumber);
	340	result += hexNumberAsChar;
	341
	342	searchFromIndex = endMatchIndex;
	343
	344	}
	345
[33730]	346	if(endMatchIndex != -1) { // attach any suffix once we finished processing all the hex codes
[33728]	347	result += str.substring(endMatchIndex);
	348	//System.err.println("suffix: " + str.substring(endMatchIndex));
	349	}
[33730]	350	else { // there were no hex codes to decode, return string as is
	351	result = str;
	352	}
[33728]	353
	354	return result;
	355	}
	356
	357	/** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
	358	public static String fileNameToHex(String filename) {
[33730]	359
[33728]	360	String hexFilename = "";
	361	for(int i = 0; i < filename.length(); i++) {
	362	int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
	363
	364	// ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
	365	// If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
[33737]	366	if((charCode >= 20 && charCode <= 126) \|\| charCode == 9 \|\| charCode == 10 \|\| charCode == 13 \|\| charCode == 36 \|\| charCode == 43) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing. And spaces and plus signs (ASCII codes 36 and 43) need to be converted to hex too
[33728]	367	hexFilename += filename.charAt(i);
	368	} else {
	369	hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
	370	}
	371	}
	372
	373	return hexFilename;
	374	}
	375
[33737]	376
	377	// follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
[33738]	378	public static String UNUSED_filenameToURLEncoding(String filename) {
[33737]	379	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
	380	return filename;
	381	}
	382
	383	// Can't create a URI out of a filename containing spaces. Spaces must be encoded as %20
	384	String filename_url_encoded = filename.replace(" ", "%20");
	385	//filename_url_encoded = filename_url_encoded.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
	386	//filename_url_encoded = filename_url_encoded.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
	387
	388	try {
	389	URI filename_uri = new URI(filename_url_encoded);
	390	// The trick:
	391	// 1. toASCIIString() will %xx encode values > 127
	392	// 2. Decode the result to "ISO-8859-1"
	393	// 3. URL encode the bytes to string
	394
	395	// Step 2 forces the string to be 8-bit values. It
	396	// doesn't matter if the starting raw filename was not
	397	// in the ISO-8859-1 encoding, the effect is to ensure
	398	// we have an 8-bit byte string that (numerically)
	399	// captures the right value. These numerical values are
	400	// then used to determine how to URL encode it
	401
	402	String filename_ascii = filename_uri.toASCIIString();
	403	//filename_ascii = filename_ascii.replace("&", "%26"); // &'s ASCII code is 36 in decimal, and 26 in hex
	404	//filename_ascii = filename_ascii.replace("+", "%2B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
	405	String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
	406	filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
	407
	408	// DEALING WITH & and + in filenames: NOT WORKING YET
	409	//if(filename_url_encoded.contains("&")) {
	410	// filename_url_encoded = filename_url_encoded.replace("&", "%36amp;");
	411	//} else if(filename_url_encoded.contains("&")) {
	412	// filename_url_encoded = filename_url_encoded.replace("&", "%36");
	413	//}
	414
	415	}
	416	catch (Exception e) {
	417	e.printStackTrace();
	418	// Give up trying to convert
	419	filename_url_encoded = filename;
	420	}
	421	return filename_url_encoded;
	422	}
	423
	424
	425	// follows Dr Bainbridge's method below, but with a String parameter instead of a file parameter
[33738]	426	public static String filenameToURLEncoding(String filename) {
[33737]	427	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) { // on a UTF-8 file system, DO NOT do the stuff below, just return input param
	428	return filename;
	429	}
	430
	431	File file = new File (filename);
	432	return fileToURLEncoding(file);
	433	}
	434
	435
[23433]	436	// Dr Bainbridge's methods
	437	/* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
	438	* we can work with URL-encoded filenames in Java. Java works with whatever
	439	* encoding the filesystem uses. Unlike systems working with UTF-8, where Java
	440	* interprets filenames as UTF-8 (a destructive process since characters invalid
	441	* for UTF-8 are replaced with the invalid character, which means the original
	442	* character's byte codes can not be regained), working with an ISO-8859-1
	443	* system means the original byte codes of the characters are preserved,
	444	* regardless of whether the characters represent ISO-8859-1 or not. Such byte
	445	* codes are converted by the following method to the correct URL versions of
	446	* the strings that the filenames represent (that is, the correct URL representations
	447	* of the filenames in their original encodings). This is useful for interactions with
	448	* Perl as Java and Perl can use URL-encoded filenames to talk about the same files
	449	* on the file system, instead of having to work out what encoding they are in. */
	450
	451	public static String fileToURLEncoding(File file) {
	452	if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
	453	return file.getAbsolutePath();
	454	}
	455
	456	String filename_url_encoded = "";
	457
	458	// The following test for whether the file exists or not is a problem
	459	// when a File object--whose actual file is in the process of being moved
	460	// and therefore temporarily does not 'exist' on the actual system--can't
	461	// be URL encoded: the following would return "" when a file doesn't exist.
	462	// So commenting out the test.
	463	/*
	464	if(!file.getName().equals("recycle")) {
	465	if(!file.isFile() && !file.isDirectory()) {
	466	System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
	467	return "";
	468	}
	469
	470	if(!file.exists()) {
	471	System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
	472	return ""; //file.getName();
	473	}
	474	}
	475	*/
	476
	477	URI filename_uri = file.toURI();
	478	try {
	479	// The trick:
	480	// 1. toASCIIString() will %xx encode values > 127
	481	// 2. Decode the result to "ISO-8859-1"
	482	// 3. URL encode the bytes to string
	483
	484	// Step 2 forces the string to be 8-bit values. It
	485	// doesn't matter if the starting raw filename was not
	486	// in the ISO-8859-1 encoding, the effect is to ensure
	487	// we have an 8-bit byte string that (numerically)
	488	// captures the right value. These numerical values are
	489	// then used to determine how to URL encode it
	490
	491	String filename_ascii = filename_uri.toASCIIString();
	492	String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
	493	filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
[29793]	494
[23433]	495	}
	496	catch (Exception e) {
	497	e.printStackTrace();
	498	// Give up trying to convert
	499	filename_url_encoded = file.getAbsolutePath();
	500	}
	501	return filename_url_encoded;
	502	}
	503
	504	// For unicode codepoints see:
	505	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
	506	// where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
	507	// http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
	508	// where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
	509	public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
	510	throws Exception
	511	{
	512	String urlEncoded = "";
[29815]	513
[23433]	514	try {
	515	// By this point we have a UTF-8 encoded string that captures
	516	// what the ISO-8859-1 (Latin-1) character is that corresponded to the
	517	// 8-bit numeric value for that character in the filename
	518	// on the file system
	519
	520	// For example:
	521	// File system char: <lower-case beta char in Latin-7> = %E2
	522	// Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
	523	// Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
	524
	525	// Our task is to take the string the contains <C3><A2> and ensure that
	526	// we "see" it as <E2>
	527
	528	byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
	529	String unicode_filename = new String(raw_bytes,"UTF-8");
	530
	531	for(int i = 0; i < unicode_filename.length(); i++) {
	532	char charVal = unicode_filename.charAt(i);
[29793]	533	if ((int)charVal > 255) {
	534	urlEncoded += String.format("&#x%02X;", (int)charVal);
	535	}
	536	else if((int)charVal > 127) {
[23433]	537	urlEncoded += String.format("%%%02X", (int)charVal);
	538	} else {
	539	urlEncoded += String.format("%c", (char)charVal);
	540	}
	541	}
	542	}
	543	catch (Exception e) {
	544	//e.printStackTrace();
	545	throw(e);
	546	}
[29815]	547
[23433]	548	return urlEncoded;
	549	}
	550
	551	// unused for now
	552	public static String raw_filename_to_url_encoded(String fileName)
	553	throws Exception
	554	{
	555	String urlEncoded = "";
	556	try {
	557	byte[] bytes = fileName.getBytes();
	558
	559	for(int i = 0; i < bytes.length; i++) {
	560	// mask each byte (by applying & 0xFF) to make the signed
	561	// byte (in the range -128 to 127) unsigned (in the range
	562	// 0 to 255).
	563
	564	int byteVal = (int)(bytes[i] & 0xFF);
	565
	566	if(byteVal > 127) {
	567	urlEncoded += String.format("%%%02X", (int)byteVal);
	568	} else {
	569	urlEncoded += String.format("%c",(char)byteVal);
	570	}
	571	}
	572	}
	573	catch (Exception e) {
	574	//e.printStackTrace();
	575	throw(e);
	576	}
	577
	578	return urlEncoded;
	579	}
	580
	581	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: