source: main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java@ 23436

Last change on this file since 23436 was 23436, checked in by ak19, 13 years ago

Synchronising a couple of methods in FilenameEncoding to ensure proper lock on state variable (refreshRequired) used in conditions

File size: 18.1 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2010 Greenstone Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29import java.io.File;
30import java.net.*;
31import java.nio.charset.*;
32import java.util.*;
33import org.greenstone.gatherer.collection.CollectionManager;
34
35
36/** Static access class that contains many of the methods used to work with filename encodings.
37* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
38* to maintain a map of URLEncodedFilenames to their filename encodings.
39* The process of filename encoding further affects the CollectionManager which refreshes its CollectionTree,
40* FileManager (move, delete, rename actions), MetadataValueTableModel, EnrichPane. */
41
42public class FilenameEncoding {
43 /** Display of filenames in the trees are in URL encoding, if debugging */
44 public static boolean DEBUGGING = false;
45
46 /** Set to false by Gatherer if the locale is UTF-8, as Java's handling is
47 * such that non-UTF8 filename encodings on a UTF-8 locale are destructively
48 * converted so that the bytecodes in the filename are not preserved. */
49 public static boolean MULTIPLE_FILENAME_ENCODINGS_SUPPORTED = false;
50
51 /** Also set by Gatherer.
52 * If the OS supports multiple filename encodings, we will be working with URL strings
53 * and the applicable separators are always the forward slash ("/") not File.separator.
54 * If multiple filename encodings are not supported, we're dealing with File.separator. */
55 public static String URL_FILE_SEPARATOR = File.separator;
56
57
58 /** gs.filenameEncoding is a special sort of metadata that is not merely to be stored along
59 * with a file, but is to be applied in real-time on the file's name in the CollectionTree
60 * display. Since FileNodes are constantly destroyed and reconstructed by that Tree when
61 * its nodes are expanded and contracted, storing the filename encodings of each file along
62 * with the file in a FileNode doesn't help because it doesn't last. Instead of rediscovering
63 * the encoding at every stage by querying the metadataXML file, we store the encodings for
64 * fast access: in a map of (URLEncodedFilePath, filename-encoding) pairs.
65 * The current design of the map is to only store any active filename metadata assigned
66 * directly at that file/folder's level, and if there is none discovered at that level, then
67 * storing the empty string for it. Therefore, if the hashmap contains no entry for
68 * a file, it means this still needs to be retrieved. */
69 public static Map map = new HashMap();
70
71//*********************** BUSY REFRESHING / REQUIRING REFRESH *********************
72
73 /** Set to true if filename encoding metadata was changed. Called by the enter keyPress
74 * event in gui.EnrichPane and when the gs.FilenameEncoding field loses focus. */
75 private static boolean refreshRequired = false;
76
77 synchronized public static boolean isRefreshRequired() {
78 return refreshRequired;
79 }
80
81 synchronized public static void setRefreshRequired(boolean state) {
82 if(MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
83 refreshRequired = state;
84 } else {
85 refreshRequired = false;
86 }
87 }
88
89//************************** MAP RETRIEVAL METHODS ******************************
90
91 /** Returns the cumulative gs.filenameEncoding metadata
92 * assigned to a file inside the collection. */
93 public static String findFilenameEncoding(
94 File file, String urlEncodedFilePath, boolean bruteForceLookup)
95 {
96 //if(bruteForceLookup) {
97 // return findFilenameEncodingBruteForce(file, urlEncodedFilePath, bruteForceLookup);
98 //}
99
100 String encoding = "";
101
102 // Check any assigned encoding at this level, starting with the map first
103 // and else retrieving the filename encoding from the metadata file
104 if(!map.containsKey(urlEncodedFilePath)) {
105
106 // Check for filename encoding metadata *directly* associated with the file
107 // Now don't need to get any inherited encoding metadata here, because of
108 // the way we're storing and retrieving encoding information from the map.
109 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(file, true); // true: gets gs.filenameEncoding only
110 if(!list.isEmpty()) {
111 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
112 encoding = metavalue.getValue();
113 } // else no filename encoding set yet at this level
114
115 // Now we've done a lookup at this level cache the result in the map,
116 // including empty strings, to indicate that we've done a full lookup
117 map.put(urlEncodedFilePath, encoding);
118 }
119 else { // an entry exists in the map, get it from there
120 encoding = (String)map.get(urlEncodedFilePath);
121 }
122
123 // if no meta was specified at at the file level, look for any inherited metadata
124 if(encoding.equals("")) {
125 encoding = getInheritedFilenameEncoding(urlEncodedFilePath, file);
126 }
127
128 //System.err.println("\n@@@@Looked for: " + urlEncodedFilePath + " | found: " + encoding);
129 return encoding; // found something in map, may still be "", but it's what was stored
130 }
131
132 /** Checks the file-to-encoding map for all the superfolders of the given
133 * filename in sequence for an applicable encoding. Note that the file/folder
134 * at the level of urlFoldername (and dir) has already been inspected. */
135 static public String getInheritedFilenameEncoding(String urlFoldername, File dir)
136 {
137 String encoding = "";
138 boolean done = false;
139
140 // don't want to search past import folder which is as
141 // far as we need to go to determine inherited encodings
142 File importDir = new File(CollectionManager.getLoadedCollectionImportDirectoryPath());
143 if(dir.equals(importDir)) { // if the top-level dir was already checked, we're done
144 done = true;
145 }
146
147 // For directories, first remove trailing file separator in order to start checking from higher level folders
148 int lastIndex = urlFoldername.length()-1;
149 char urlFileSeparatorChar = URL_FILE_SEPARATOR.charAt(0);
150 if(urlFoldername.charAt(lastIndex) == urlFileSeparatorChar) {
151 urlFoldername = urlFoldername.substring(0, lastIndex);
152 }
153
154 while(!done) {
155 // get the folder that's one level up
156 dir = dir.getParentFile();
157
158 int index = urlFoldername.lastIndexOf(URL_FILE_SEPARATOR);
159 if(index == -1) { // no more slashes
160 done = true;
161 } else {
162 urlFoldername = urlFoldername.substring(0, index);
163 }
164
165 // now look in the map to see whether there's an encoding for this folder
166 String folder = urlFoldername + URL_FILE_SEPARATOR;
167 if(map.containsKey(folder)) {
168 encoding = (String)map.get(folder); // may be ""
169 } else { // no entry in map, so look in the metadata.xml at this folder level
170 ArrayList list = MetadataXMLFileManager.getMetadataAssignedDirectlyToFile(
171 dir, true); // true: gets gs.filenameEncoding only
172 if(!list.isEmpty()) {
173 MetadataValue metavalue = (MetadataValue)list.get(0); // get(list.size()-1);
174 encoding = metavalue.getValue();
175 }
176 map.put(folder, encoding); // may be ""
177 }
178
179 if(!encoding.equals("")){
180 done = true;
181 } // else if "", loop to check next folder up
182 else if(dir.equals(importDir)) { // don't iterate past the import folder, which we've now checked
183 done = true;
184 }
185 }
186
187 return encoding;
188 }
189
190 /** Called by GUIManager when a collection is closed. This then empties the
191 * file-to-encoding map which is applicable only on a per-collection basis */
192 static public void closeCollection() {
193 //printFilenameMap("Closing collection. Clearing file-to-encoding map of entries:");
194 map.clear();
195 }
196
197 // Useful for debugging: prints contents of file-to-encoding map
198 static public void printFilenameMap(String heading) {
199 System.err.println("\n********************************************");
200 System.err.println(heading.toUpperCase());
201 Iterator entries = map.entrySet().iterator();
202 while(entries.hasNext()) {
203 Map.Entry entry = (Map.Entry)entries.next();
204 System.err.println("+ " + (String)entry.getKey() + ": " + (String)entry.getValue());
205 }
206 System.err.println("********************************************\n");
207 }
208
209 // UNUSED at present. Brute force version of the findFilenameEncoding() method
210 // Doesn't use the map, but gets *all* the metadata assigned to a file/folder to
211 // work out the encoding applicable to a file/folder.
212 public static String findFilenameEncodingBruteForce(File file, String urlEncodedFilename,
213 boolean bruteForceLookup)
214 {
215 System.err.println("\n***** BRUTE FORCE getFilenameEncoding() called\n");
216
217
218 String encoding = "";
219
220 // Check for filename encoding metadata *directly* associated with the file
221 // Now don't need to get any inherited encoding metadata here, because of
222 // the way we're storing and retrieving encoding information from the map.
223
224 ArrayList list = MetadataXMLFileManager.getMetadataAssignedToFile(file, true); // true: gets gs.filenameEncoding only
225 if(!list.isEmpty()) {
226 // try to get the filename encoding meta that was assigned last to this
227 // file, even though it makes no sense to have multiple values for it
228 MetadataValue metavalue = (MetadataValue)list.get(list.size()-1);
229 encoding = metavalue.getValue();
230
231 if(encoding == null) { // unlikely ???
232 System.err.println("**** ERROR: encoding for "
233 + urlEncodedFilename + " is NULL!");
234 encoding = "";
235 }
236 } // else no filename encoding set yet, perhaps
237 //System.err.println("**** Found encoding for " + urlEncodedFilename + " " + encoding);
238 return encoding;
239 }
240
241//****************************** APPLYING ENCODINGS TO FILENAMES *****************************
242
243 /** URL encoded version of the byte codes of the given file's name */
244 public static String calcURLEncodedFilePath(File file) {
245 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
246 return file.getAbsolutePath();
247 }
248 else {
249 String filename = fileToURLEncoding(file);
250 return filename;
251 }
252 }
253
254 /** URL encoded version of the byte codes of this file's name */
255 public static String calcURLEncodedFileName(String urlfilepath) {
256 String filename = urlfilepath;
257 if(filename.endsWith(URL_FILE_SEPARATOR)) { // directory, remove trailing slash
258 filename = filename.substring(0, filename.length() - 1);
259 }
260
261 // remove the directory prefix (if any) to get the filename
262 int index = filename.lastIndexOf(URL_FILE_SEPARATOR);
263 if(index != -1) {
264 filename = filename.substring(index+1); // skip separator
265 }
266
267 return filename;
268 }
269
270 /** Given a string representing an alias to an official encoding (and unofficial ones
271 * starting with "Latin-"), attempts to work out what the canonical encoding for that is.
272 * If the given encoding is unrecognised, it is returned as is. */
273 public static String canonicalEncodingName(String encoding) {
274 String canonicalEncoding = encoding;
275 try {
276 // Latin-1 -> ISO-8859-1
277 String alias = canonicalEncoding.toLowerCase();
278 if(alias.startsWith("latin")){
279 canonicalEncoding = "ISO-8859" + alias.substring("latin".length());
280 }
281
282 // canonical encoding for official aliases
283 canonicalEncoding = Charset.forName(canonicalEncoding).name();
284 return canonicalEncoding;
285 } catch (Exception e) {
286 System.err.println("(Could not recognise encoding (alias): "
287 + encoding + ".)");
288 return encoding; // no alias could be found, return the original parameter
289 }
290 }
291
292//************************* GETTING THE URL ENCODING OF FILENAMES *********************************
293 // Dr Bainbridge's methods
294 /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
295 * we can work with URL-encoded filenames in Java. Java works with whatever
296 * encoding the filesystem uses. Unlike systems working with UTF-8, where Java
297 * interprets filenames as UTF-8 (a destructive process since characters invalid
298 * for UTF-8 are replaced with the invalid character, which means the original
299 * character's byte codes can not be regained), working with an ISO-8859-1
300 * system means the original byte codes of the characters are preserved,
301 * regardless of whether the characters represent ISO-8859-1 or not. Such byte
302 * codes are converted by the following method to the correct URL versions of
303 * the strings that the filenames represent (that is, the correct URL representations
304 * of the filenames in their original encodings). This is useful for interactions with
305 * Perl as Java and Perl can use URL-encoded filenames to talk about the same files
306 * on the file system, instead of having to work out what encoding they are in. */
307
308 public static String fileToURLEncoding(File file) {
309 if(!MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
310 return file.getAbsolutePath();
311 }
312
313 String filename_url_encoded = "";
314
315 // The following test for whether the file exists or not is a problem
316 // when a File object--whose actual file is in the process of being moved
317 // and therefore temporarily does not 'exist' on the actual system--can't
318 // be URL encoded: the following would return "" when a file doesn't exist.
319 // So commenting out the test.
320 /*
321 if(!file.getName().equals("recycle")) {
322 if(!file.isFile() && !file.isDirectory()) {
323 System.err.println("*** ERROR. Java can't see file: " + file.getAbsolutePath());
324 return "";
325 }
326
327 if(!file.exists()) {
328 System.err.println("*** NOTE: File doesn't exist: " + file.getAbsolutePath());
329 return ""; //file.getName();
330 }
331 }
332 */
333
334 URI filename_uri = file.toURI();
335 try {
336 // The trick:
337 // 1. toASCIIString() will %xx encode values > 127
338 // 2. Decode the result to "ISO-8859-1"
339 // 3. URL encode the bytes to string
340
341 // Step 2 forces the string to be 8-bit values. It
342 // doesn't matter if the starting raw filename was *not*
343 // in the ISO-8859-1 encoding, the effect is to ensure
344 // we have an 8-bit byte string that (numerically)
345 // captures the right value. These numerical values are
346 // then used to determine how to URL encode it
347
348 String filename_ascii = filename_uri.toASCIIString();
349 String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
350 filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
351 }
352 catch (Exception e) {
353 e.printStackTrace();
354 // Give up trying to convert
355 filename_url_encoded = file.getAbsolutePath();
356 }
357 return filename_url_encoded;
358 }
359
360
361 // For unicode codepoints see:
362 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
363 // where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
364 // http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
365 // where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"
366 public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
367 throws Exception
368 {
369 String urlEncoded = "";
370
371 try {
372 // By this point we have a UTF-8 encoded string that captures
373 // what the ISO-8859-1 (Latin-1) character is that corresponded to the
374 // 8-bit numeric value for that character in the filename
375 // on the file system
376
377 // For example:
378 // File system char: <lower-case beta char in Latin-7> = %E2
379 // Equivalent Latin 1 char: <lower-case a with circumflex> = %E2
380 // Mapped to UTF-8: <lower-case a with circumflex> = <C3><A2>
381
382 // Our task is to take the string the contains <C3><A2> and ensure that
383 // we "see" it as <E2>
384
385 byte [] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
386 String unicode_filename = new String(raw_bytes,"UTF-8");
387
388 for(int i = 0; i < unicode_filename.length(); i++) {
389 char charVal = unicode_filename.charAt(i);
390 if((int)charVal > 127) {
391 urlEncoded += String.format("%%%02X", (int)charVal);
392 } else {
393 urlEncoded += String.format("%c", (char)charVal);
394 }
395 }
396 }
397 catch (Exception e) {
398 //e.printStackTrace();
399 throw(e);
400 }
401
402 return urlEncoded;
403 }
404
405 // unused for now
406 public static String raw_filename_to_url_encoded(String fileName)
407 throws Exception
408 {
409 String urlEncoded = "";
410 try {
411 byte[] bytes = fileName.getBytes();
412
413 for(int i = 0; i < bytes.length; i++) {
414 // mask each byte (by applying & 0xFF) to make the signed
415 // byte (in the range -128 to 127) unsigned (in the range
416 // 0 to 255).
417
418 int byteVal = (int)(bytes[i] & 0xFF);
419
420 if(byteVal > 127) {
421 urlEncoded += String.format("%%%02X", (int)byteVal);
422 } else {
423 urlEncoded += String.format("%c",(char)byteVal);
424 }
425 }
426 }
427 catch (Exception e) {
428 //e.printStackTrace();
429 throw(e);
430 }
431
432 return urlEncoded;
433 }
434
435}
Note: See TracBrowser for help on using the repository browser.