source: other-projects/FileTransfer-WebSocketPair/testGXTWithGreenstone/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 33053

Last change on this file since 33053 was 33053, checked in by ak19, 5 years ago

I still had some stuff of Nathan Kelly's (FileTransfer-WebSocketPair) sitting on my USB. Had already committed the Themes folder at the time, 2 years back. Not sure if he wanted this additional folder committed. But I didn't want to delete it and decided it will be better off on SVN. When we use his project, if we find we didn't need this test folder, we can remove it from svn then.

File size: 18.3 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2010 Greenstone Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29import java.io.File;
30import java.net.*;
31import java.nio.charset.*;
32import java.util.*;
33import org.greenstone.gatherer.collection.CollectionManager;
34import org.greenstone.gatherer.DebugStream;
35
36/** Static access class that contains many of the methods used to work with filename encodings.
37* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
38* to maintain a map of URLEncodedFilenames to their filename encodings.
39* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
40* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
41
42public class FilenameEncoding {
43 /** Display of filenames in the trees are in URL encoding, if debugging */
44 public static boolean DEBUGGING = false;
45
46 /** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
47 * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
48 * converted so that the bytecodes in the filename are not preserved. */
49 public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
50
51 /** Also set by Gatherer.
52 * If the OS supports multiple filename encodings, we will be working with URL strings
53 * and the applicable separators are always the forward slash ("/") not File.separator.
54 * If multiple filename encodings are not supported, we're dealing with File.separator. */
55 public static String URL_FILE_SEPARATOR = File.separator;
56
57
58 /** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
59 * with a file, but is to be applied in real-time on the file's name in the CollectionTree
60 * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
61 * its nodes are expanded and contracted, storing the filename encodings of each file along
62 * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
63 * the encoding at every stage by querying the metadataXML file, we store the encodings for
64 * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
65 * The current design of the map is to only store any active filename metadata assigned
66 * directly at that file/folder's level, and if there is none discovered at that level, then
67 * storing the empty string for it. Therefore, if the hashmap contains no entry for
68 * a file, it means this still needs to be retrieved. */
69 public static Map map = new HashMap();
70
71//*********************** BUSY REFRESHING / REQUIRING REFRESH *********************
72
73 /** Set to true if filename encoding metadata was changed. Called by the enter keyPress
74 * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
75 private static boolean refreshRequired = false;
76
77 synchronized public static boolean isRefreshRequired() {
78 return refreshRequired;
79 }
80
81 synchronized public static void setRefreshRequired(boolean state) {
82 if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
83 refreshRequired = state;
84 } else {
85 refreshRequired = false;
86 }
87 }
88
89//************************** MAP RETRIEVAL METHODS ******************************
90
91 /** Returns the cumulative gs.filenameEncoding metadata
92 * assigned to a file inside the collection. */
93 public static String findFilenameEncoding(
94 File file, String urlEncodedFilePath, boolean bruteForceLookup)
95 {
96 //if(bruteForceLookup) {
97 // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
98 //}
99
100 String encoding = "";
101
102 // Check any assigned encoding at this level, starting with the map first
103 // and else retrieving the filename encoding from the metadata file
104 if(!map.containsKey(urlEncodedFilePath)) {
105
106 // Check for filename encoding metadata *directly* associated with the file
107 // Now don't need to get any inherited encoding metadata here, because of
108 // the way we're storing and retrieving encoding information from the map.
109 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
110 if(!list.isEmpty()) {
111 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
112 encoding = metavalue.getValue();
113 } // else no filename encoding set yet at this level
114
115 // Now we've done a lookup at this level cache the result in the map,
116 // including empty strings, to indicate that we've done a full lookup
117 map.put(urlEncodedFilePath, encoding);
118 }
119 else { // an entry exists in the map, get it from there
120 encoding = (String)map.get(urlEncodedFilePath);
121 }
122
123 // if no meta was specified at at the file level, look for any inherited metadata
124 if(encoding.equals("")) {
125 encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
126 }
127
128 //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding);
129 return encoding; // found something in map, may still be "", but it's what was stored
130 }
131
132 /** Checks the file-to-encoding map for all the superfolders of the given
133 * filename in sequence for an applicable encoding. Note that the file/folder
134 * at the level of urlFoldername (and dir) has already been inspected. */
135 static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
136 {
137 String encoding = "";
138 boolean done = false;
139
140 // don't want to search past import folder which is as
141 // far as we need to go to determine inherited encodings
142 File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
143 if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
144 done = true;
145 }
146
147 // For directories, first remove trailing file separator in order to start checking from higher level folders
148 int lastIndex = urlFoldername.length()-1;
149 char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
150 if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
151 urlFoldername = urlFoldername.substring(0, lastIndex);
152 }
153
154 while(!done) {
155 // get the folder that's one level up
156 dir = dir.getParentFile();
157
158 int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
159 if(index == -1) { // no more slashes
160 done = true;
161 } else {
162 urlFoldername = urlFoldername.substring(0, index);
163 }
164
165 // now look in the map to see whether there's an encoding for this folder
166 String folder = urlFoldername + URL_FILE_SEPARATOR;
167 if(map.containsKey(folder)) {
168 encoding = (String)map.get(folder); // may be ""
169 } else { // no entry in map, so look in the metadata.xml at this folder level
170 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
171 dir, true); // true: gets gs.filenameEncoding only
172 if(!list.isEmpty()) {
173 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
174 encoding = metavalue.getValue();
175 }
176 map.put(folder, encoding); // may be ""
177 }
178
179 if(!encoding.equals("")){
180 done = true;
181 } // else if "", loop to check next folder up
182 else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
183 done = true;
184 }
185 }
186
187 return encoding;
188 }
189
190 /** Called by GUIManager when a collection is closed. This then empties the
191 * file-to-encoding map which is applicable only on a per-collection basis */
192 static public void closeCollection() {
193 //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
194 map.clear();
195 }
196
197 // Useful for debugging: prints contents of file-to-encoding map
198 static public void printFilenameMap(String heading) {
199 System.err.println("\n********************************************");
200 System.err.println(heading.toUpperCase());
201 Iterator entries = map.entrySet().iterator();
202 while(entries.hasNext()) {
203 Map.Entry entry = (Map.Entry)entries.next();
204 System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
205 }
206 System.err.println("********************************************\n");
207 }
208
209 // UNUSED at present. Brute force version of the findFilenameEncoding() method
210 // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to
211 // work out the encoding applicable to a file/folder.
212 public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
213 boolean bruteForceLookup)
214 {
215 System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
216
217
218 String encoding = "";
219
220 // Check for filename encoding metadata *directly* associated with the file
221 // Now don't need to get any inherited encoding metadata here, because of
222 // the way we're storing and retrieving encoding information from the map.
223
224 ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
225 if(!list.isEmpty()) {
226 // try to get the filename encoding meta that was assigned last to this
227 // file, even though it makes no sense to have multiple values for it
228 MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
229 encoding = metavalue.getValue();
230
231 if(encoding == null) { // unlikely ???
232 System.err.println("**** ERROR: encoding for "
233 + urlEncodedFilename + " is NULL!");
234 encoding = "";
235 }
236 } // else no filename encoding set yet, perhaps
237 //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
238 return encoding;
239 }
240
241//****************************** APPLYING ENCODINGS TO FILENAMES *****************************
242
243 /** URL encoded version of the byte codes of the given file's name */
244 public static String calcURLEncodedFilePath(File file) {
245 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
246 return file.getAbsolutePath();
247 }
248 else {
249 String filename = fileToURLEncoding(file);
250 return filename;
251 }
252 }
253
254 /** URL encoded version of the byte codes of this file's name */
255 public static String calcURLEncodedFileName(String urlfilepath) {
256 String filename = urlfilepath;
257 if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
258 filename = filename.substring(0, filename.length() - 1);
259 }
260
261 // remove the directory prefix (if any) to get the filename
262 int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
263 if(index != -1) {
264 filename = filename.substring(index+1); // skip separator
265 }
266
267 return filename;
268 }
269
270 /** Given a string representing an alias to an official encoding (and unofficial ones
271 * starting with "Latin-"), attempts to work out what the canonical encoding for that is.
272 * If the given encoding is unrecognised, it is returned as is. */
273 public static String canonicalEncodingName(String encoding) {
274 String canonicalEncoding = encoding;
275 try {
276 // Latin-1 -> ISO-8859-1
277 String alias = canonicalEncoding.toLowerCase();
278 if(alias.startsWith("latin")){
279 canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
280 }
281
282 // canonical encoding for official aliases
283 canonicalEncoding = Charset.forName(canonicalEncoding).name();
284 return canonicalEncoding;
285 } catch (Exception e) {
286 System.err.println("(Could not recognise encoding (alias): "
287 + encoding + ".)");
288 return encoding; // no alias could be found, return the original parameter
289 }
290 }
291
292//************************* GETTING THE URL ENCODING OF FILENAMES *********************************
293 // Dr Bainbridge's methods
294 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
295 * we can work with URL-encoded filenames in Java. Java works with whatever
296 * encoding the filesystem uses. Unlike systems working with UTF-8, where Java
297 * interprets filenames as UTF-8 (a destructive process since characters invalid
298 * for UTF-8 are replaced with the invalid character, which means the original
299 * character's byte codes can not be regained), working with an ISO-8859-1
300 * system means the original byte codes of the characters are preserved,
301 * regardless of whether the characters represent ISO-8859-1 or not. Such byte
302 * codes are converted by the following method to the correct URL versions of
303 * the strings that the filenames represent (that is, the correct URL representations
304 * of the filenames in their original encodings). This is useful for interactions with
305 * Perl as Java and Perl can use URL-encoded filenames to talk about the same files
306 * on the file system, instead of having to work out what encoding they are in. */
307
308 public static String fileToURLEncoding(File file) {
309 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
310 return file.getAbsolutePath();
311 }
312
313 String filename_url_encoded = "";
314
315 // The following test for whether the file exists or not is a problem
316 // when a File object--whose actual file is in the process of being moved
317 // and therefore temporarily does not 'exist' on the actual system--can't
318 // be URL encoded: the following would return "" when a file doesn't exist.
319 // So commenting out the test.
320 /*
321 if(!file.getName().equals("recycle")) {
322 if(!file.isFile() && !file.isDirectory()) {
323 System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
324 return "";
325 }
326
327 if(!file.exists()) {
328 System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
329 return ""; //file.getName();
330 }
331 }
332 */
333
334 URI filename_uri = file.toURI();
335 try {
336 // The trick:
337 // 1. toASCIIString() will %xx encode values > 127
338 // 2. Decode the result to "ISO-8859-1"
339 // 3. URL encode the bytes to string
340
341 // Step 2 forces the string to be 8-bit values. It
342 // doesn't matter if the starting raw filename was *not*
343 // in the ISO-8859-1 encoding, the effect is to ensure
344 // we have an 8-bit byte string that (numerically)
345 // captures the right value. These numerical values are
346 // then used to determine how to URL encode it
347
348 String filename_ascii = filename_uri.toASCIIString();
349 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
350 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
351
352 }
353 catch (Exception e) {
354 e.printStackTrace();
355 // Give up trying to convert
356 filename_url_encoded = file.getAbsolutePath();
357 }
358 return filename_url_encoded;
359 }
360
361
362 // For unicode codepoints see:
363 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
364 // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
365 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
366 // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
367 public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
368 throws Exception
369 {
370 String urlEncoded = "";
371
372 try {
373 // By this point we have a UTF-8 encoded string that captures
374 // what the ISO-8859-1 (Latin-1) character is that corresponded to the
375 // 8-bit numeric value for that character in the filename
376 // on the file system
377
378 // For example:
379 // File system char: <lower-case beta char in Latin-7> = %E2
380 // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
381 // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
382
383 // Our task is to take the string the contains <C3><A2> and ensure that
384 // we "see" it as <E2>
385
386 byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
387 String unicode_filename = new String(raw_bytes,"UTF-8");
388
389 for(int i = 0; i < unicode_filename.length(); i++) {
390 char charVal = unicode_filename.charAt(i);
391 if ((int)charVal > 255) {
392 urlEncoded += String.format("&#x%02X;", (int)charVal);
393 }
394 else if((int)charVal > 127) {
395 urlEncoded += String.format("%%%02X", (int)charVal);
396 } else {
397 urlEncoded += String.format("%c", (char)charVal);
398 }
399 }
400 }
401 catch (Exception e) {
402 //e.printStackTrace();
403 throw(e);
404 }
405
406 return urlEncoded;
407 }
408
409 // unused for now
410 public static String raw_filename_to_url_encoded(String fileName)
411 throws Exception
412 {
413 String urlEncoded = "";
414 try {
415 byte[] bytes = fileName.getBytes();
416
417 for(int i = 0; i < bytes.length; i++) {
418 // mask each byte (by applying & 0xFF) to make the signed
419 // byte (in the range -128 to 127) unsigned (in the range
420 // 0 to 255).
421
422 int byteVal = (int)(bytes[i] & 0xFF);
423
424 if(byteVal > 127) {
425 urlEncoded += String.format("%%%02X", (int)byteVal);
426 } else {
427 urlEncoded += String.format("%c",(char)byteVal);
428 }
429 }
430 }
431 catch (Exception e) {
432 //e.printStackTrace();
433 throw(e);
434 }
435
436 return urlEncoded;
437 }
438
439}
Note: See TracBrowser for help on using the repository browser.