source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34394

Last change on this file since 34394 was 34394, checked in by ak19, 4 years ago

Bugfix 1 for GLI metadata slowdown: selecting multiple gathered files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non-basic-ASCII filenames in GLI, which was making an expensive Windows operating system function call for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and Linux. Had to treat Windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename, which GLI now uses when the OS is Windows and compares against the hex-encoded name of a selected file. But on Linux the hex-encoded value generated by perl is not the same as that which Java generates and, after trying repeatedly, I've not been able to get it to work. So the code behaves as before for Linux.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.3 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39/** This class represents one doc.xml file */
40
41public abstract class DocXMLFile extends File
42{
/** Result of Utility.isWindows(), evaluated once when this class is loaded. */
static boolean isWin = Utility.isWindows();
// For Linux, we continue using gsdlsourcefilename as key to the metadata mapping
// For Windows, we use the hex encoded long file paths as key
static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename";

/**
 * Filled in by skimFile(). Each key is a String: the value of the GSDL_SOURCE_FILE_METANAME
 * metadata with everything up to and including "import" plus the following separator removed.
 * Each value is an ArrayList of Integer line numbers (0-based), one per line on which a
 * MetadataWrap element belonging to that source file starts.
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Tag name of the element wrapping a group of metadata
 *  (mets:xmlData in METS parlance, Description in GreenstoneArchive format). */
protected final String MetadataWrap;
/** Tag name of a single metadata element inside a MetadataWrap element. */
protected final String MetadataItem;

// Recognised values of the gsdlsourcefilerenamemethod metadata. These are true constants,
// so they are shared by all instances rather than being stored once per doc.xml file.
protected static final String FILE_RENAME_METHOD_NONE = "none";
protected static final String FILE_RENAME_METHOD_URL = "url";
protected static final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * @param doc_xml_file_path path of the file on disk, handed to the File constructor
 * @param metaWrap tag name (without angle brackets) of the element that wraps metadata
 * @param metaItem tag name (without angle brackets) of an individual metadata element
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    this.MetadataWrap = metaWrap;
    this.MetadataItem = metaItem;
}
63
64 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
65 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
66 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
67 {
68 // Build up a list of metadata extracted from this file
69 ArrayList metadata_values = new ArrayList();
70
71 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
72 /// System.err.println("\n@@@ relFilename: " + relFilename);
73 ///}
74
75 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
76 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
77 if (description_elements_list == null) {
78 // ...it doesn't
79 ///System.err.println("Unable to find meta for file path form " + file_relative_path);
80 return metadata_values; // we're done
81 }
82
83 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
84
85 // Parse the file
86 DebugStream.println("Applicable file: " + this);
87 try {
88 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
89
90 int description_element_num = 0;
91 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
92 boolean in_relevant_description_element = false;
93
94 String line = null;
95 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
96 // Check if this line contains the start of a relevant "Description" element
97 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
98 if (line_num == next_description_element_start) {
99 in_relevant_description_element = true;
100 continue;
101 }
102
103 // If we're not in a relevant Description element we don't care about anything
104 if (in_relevant_description_element == false) {
105 continue;
106 }
107
108 // Check if this line contains the end of the relevant Description element
109 if (line.indexOf("</"+MetadataWrap+">") != -1) {
110 description_element_num++;
111 if (description_element_num == description_elements_list.size()) {
112 break;
113 }
114
115 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
116 in_relevant_description_element = false;
117 continue;
118 }
119
120 // If this line doesn't contain a complete Metadata element, we're not interested
121 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
122 continue;
123 }
124
125 // Extract the metadata element name
126 int name_index = line.indexOf(" name=\"") + " name=\"".length();
127 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
128
129 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
130 // Actually, if it is ex. then we are interested
131 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
132
133 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
134 continue;
135 }
136
137 // Extracted metadata!
138 // do it like this just in case we have ex.
139 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
140
141 // We completely ignore bibliographic data
142 if (metadata_element_name.equals("SourceSegment")) {
143 buffered_reader.close();
144 return new ArrayList();
145 }
146
147 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
148 if (metadata_element_name.startsWith("gsdl")) {
149 continue;
150 }
151
152 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
153
154 // Value trees are not stored for extracted metadata, so create a new value tree node now
155 int value_index = line.indexOf(">", name_index) + ">".length();
156 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
157
158 metadata_element.addMetadataValue(metadata_element_value);
159 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
160
161 // Add the new metadata value to the list
162 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
163 metadata_values.add(metadata_value);
164 }
165
166 buffered_reader.close();
167 }
168 catch (FileNotFoundException exception) {
169 DebugStream.printStackTrace(exception);
170 }
171 catch (IOException exception) {
172 DebugStream.printStackTrace(exception);
173 }
174
175 return metadata_values;
176 }
177
178
179
180
181 /**
182 * Every file must be skimmed when a collection is opened, for two reasons:
183 * - To build a mapping from source file to its corresponding doc.xml file
184 * - To get a complete list of all extracted metadata elements
185 */
186 public void skimFile()
187 {
188 String fileRenameMethod = null;
189 String gsdlsourcefilename_value = null;
190 boolean is_unix_path = false;
191
192 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
193
194 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
195 DebugStream.println("Skimming " + this + "...");
196 try {
197 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
198 int description_element_start = -1;
199
200 String line = null;
201 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
202 // This line contains the start of a "MetadataWrap" element
203 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
204 if (line.indexOf("<"+MetadataWrap+">") != -1) {
205 if (description_element_start != -1) {
206 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
207 }
208 description_element_start = line_num;
209 continue;
210 }
211
212 // This line contains the end of a "MetadataWrap" element
213 if (line.indexOf("</"+MetadataWrap+">") != -1) {
214 if (description_element_start == -1) {
215 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
216 }
217 description_element_start = -1;
218 continue;
219 }
220
221 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
222 if (description_element_start == -1) {
223 continue;
224 }
225
226 // This line doesn't contain a Metadata element, so we're not interested
227 if (line.indexOf("<"+MetadataItem+" ") == -1) {
228 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
229 continue;
230 }
231
232 // Extract the metadata element name
233 int name_index = line.indexOf(" name=\"") + " name=\"".length();
234 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
235
236 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
237 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
238 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
239 continue;
240 }
241
242 // Extracted metadata! May have ex. so make sure we remove that
243 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
244 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
245 // Extract the element value
246 int value_index = line.indexOf(">", name_index) + ">".length();
247 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
248 }
249
250 // Note which file this is for
251 //else if (metadata_element_name.equals("gsdlsourcefilename")) {
252 else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) {
253 // On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field
254 // which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding)
255 // On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that
256 // will be hex encoded for non-ASCII chars
257
258 // Extract the gsdlsourcefilename element value
259 int value_index = line.indexOf(">", name_index) + ">".length();
260 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
261
262 // We're only interested in the path relative to the import folder
263 int import_index = gsdlsourcefilename_value.indexOf("import");
264 if (import_index != -1) {
265 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
266
267 is_unix_path = gsdlsourcefilename_value.startsWith("/");
268 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
269
270 // (Will decode gsdlsourcefilename at end of this method, once we know
271 // for certain the fileRenameMethod that was used to encode it.)
272
273 // Make sure the path matches the OS that is running
274 if (is_unix_path && isWin) {
275 // Convert path from Unix to Windows
276 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
277 }
278 else if (!is_unix_path && !isWin) {
279 // Convert path from Windows to Unix
280 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
281 }
282
283 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
284 // Remember this for quick access later
285 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
286 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
287 }
288
289 // Would be better to store hex src file name decoded? But how do we know what encoding the filename is in
290 // https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string
291
292
293 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
294 }
295
296 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
297 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
298 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
299 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
300 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
301 && !gsdlsourcefilename_value.endsWith("collect.cfg")
302 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
303 // We don't really know what is going on...
304 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
305 }
306 }
307
308 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
309 if (metadata_element_name.startsWith("gsdl")) {
310 continue;
311 }
312
313 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
314 if (metadata_element == null) {
315 // This element isn't defined in ex.mds, so create it for this session
316 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
317 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
318 }
319 }
320
321 buffered_reader.close();
322
323 // ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename,
324 // so needn't bother decoding gsdlsourcefilename as it's unused.
325 // On UNIX, continue decoding gsdlsourcefilename as before
326 if(!isWin) {
327 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
328 // based on whatever fileRenameMethod was used to encode it, so that we can
329 // at last properly compare properly against filenames on the file system
330 // in order to load the correct ex.meta for the file.
331 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
332 // we can finally perform the decoding of gsdlsourcefilename.
333 if(fileRenameMethod == null) {
334 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
335 }
336
337 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
338 // filename, decode it and add it back into map using its decoded filename.
339 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
340 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
341 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
342 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
343 }
344 }
345
346 }
347 catch (FileNotFoundException exception) {
348 DebugStream.printStackTrace(exception);
349 }
350 catch (IOException exception) {
351 DebugStream.printStackTrace(exception);
352 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
353 DebugStream.printStackTrace(exception);
354 }
355 }
356
357 protected String decodeSourceFilename(String relative_sourcefile_path,
358 String encodingMethod, boolean is_unix_path)
359 throws Exception
360 {
361
362 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
363
364 // First get the file extension. Both in Base64 and URL encoded strings,
365 // the full-stop character (.) doesn't get encoded.
366 // That means getting the file extension is straightforward.
367
368 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
369 // 26 lowercase characters, 26 uppercase characters as well as the
370 // Plus sign (+) and the Forward Slash (/).
371 int fullstop = relative_sourcefile_path.indexOf(".");
372 String file_ext = "";
373 if(fullstop != -1) {
374 file_ext = relative_sourcefile_path.substring(fullstop);
375 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
376 }
377
378 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
379
380 String decoded_gsdlsourcefilename = "";
381
382 String separator = is_unix_path ? "/" : "\\";
383 for(int i = 0; i < importFilePathParts.length; i++) {
384 String decoded_filePathPart = "";
385 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
386 // URL decode each part of gsdlsourcefilename.
387 // Need to set the decoder to use the default system encoding
388 // This is stored in the System's file.encoding property.
389 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
390 }
391 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
392 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
393 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
394 // Using org.apache.commons.codec.binary.Base64 instead
395 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
396 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
397 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
398 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
399 ///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
400 // Using system file.encoding to interpret the resulting bytestring as a String,
401 // just as we always did with URL decoding method
402 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
403 }
404
405 if(i == 0) {
406 decoded_gsdlsourcefilename = decoded_filePathPart;
407 } else {
408 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
409 }
410 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
411 }
412
413 // add the file extension back in
414 decoded_gsdlsourcefilename += file_ext;
415
416 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
417
418 return decoded_gsdlsourcefilename;
419 }
420
421 /**
422 * Given a filepath, returns the parts between each file separator as an array.
423 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
424 */
425 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
426 StringTokenizer tok;
427 if(is_unix_path) {
428 tok = new StringTokenizer(filepath, "/");
429 } else {
430 tok = new StringTokenizer(filepath, "\\");
431 }
432 String[] parts;
433 int count = tok.countTokens();
434 if(count <= 0) {
435 parts = new String[]{filepath};
436 } else {
437 int i = 0;
438 parts = new String[count];
439 while(tok.hasMoreTokens()) {
440 parts[i] = tok.nextToken();
441 //System.err.println("Next part: " + parts[i]);
442 i++;
443 }
444 }
445 return parts;
446 }
447
448 /*
449 public ArrayList getMetadataExtractedFromFile(File file)
450 {
451 // Build up a list of metadata extracted from this file
452 ArrayList metadata_values = new ArrayList();
453
454 String file_relative_path = file.getAbsolutePath();
455 int import_index = file_relative_path.indexOf("import");
456 if (import_index != -1) {
457 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
458 }
459
460 // Check whether this doc.xml file contains extracted metadata for the specified file
461 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
462 if (description_elements_list == null) {
463 // ...it doesn't
464 return metadata_values;
465 }
466
467 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
468
469 // Parse the doc.xml file
470 DebugStream.println("Applicable doc.xml file: " + this);
471 try {
472 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
473
474 int description_element_num = 0;
475 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
476 boolean in_relevant_description_element = false;
477
478 String line = null;
479 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
480 // Check if this line contains the start of a relevant Description element
481 if (line_num == next_description_element_start) {
482 in_relevant_description_element = true;
483 continue;
484 }
485
486 // If we're not in a relevant Description element we don't care about anything
487 if (in_relevant_description_element == false) {
488 continue;
489 }
490
491 // Check if this line contains the end of the relevant Description element
492 if (line.indexOf("</Description>") != -1) {
493 description_element_num++;
494 if (description_element_num == description_elements_list.size()) {
495 break;
496 }
497
498 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
499 in_relevant_description_element = false;
500 continue;
501 }
502
503 // If this line doesn't contain a complete Metadata element, we're not interested
504 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
505 continue;
506 }
507
508 // Extract the metadata element name
509 int name_index = line.indexOf(" name=\"") + " name=\"".length();
510 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
511
512 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
513 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
514 if (!metadata_set_namespace.equals("")) {
515 continue;
516 }
517
518 // Extracted metadata!
519 String metadata_element_name = metadata_element_name_full;
520
521 // We completely ignore bibliographic data
522 if (metadata_element_name.equals("SourceSegment")) {
523 buffered_reader.close();
524 return new ArrayList();
525 }
526
527 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
528 if (metadata_element_name.startsWith("gsdl")) {
529 continue;
530 }
531
532 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
533
534 // Value trees are not stored for extracted metadata, so create a new value tree node now
535 int value_index = line.indexOf(">", name_index) + ">".length();
536 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
537
538 metadata_element.addMetadataValue(metadata_element_value);
539 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
540
541 // Add the new metadata value to the list
542 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
543 metadata_values.add(metadata_value);
544 }
545
546 buffered_reader.close();
547 }
548 catch (FileNotFoundException exception) {
549 DebugStream.printStackTrace(exception);
550 }
551 catch (IOException exception) {
552 DebugStream.printStackTrace(exception);
553 }
554
555 return metadata_values;
556 }
557
558 */
559
560 /**
561 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
562 * - To build a mapping from source file to its corresponding doc.xml file
563 * - To get a complete list of all extracted metadata elements
564 */
565 /*
566 public void skimFile()
567 {
568 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
569
570 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
571 DebugStream.println("Skimming " + this + "...");
572 try {
573 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
574 int description_element_start = -1;
575
576 String line = null;
577 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
578 // This line contains the start of a Description element
579 if (line.indexOf("<Description>") != -1) {
580 if (description_element_start != -1) {
581 System.err.println("Parse error: previous Description element unfinished!");
582 }
583 description_element_start = line_num;
584 continue;
585 }
586
587 // This line contains the end of a Description element
588 if (line.indexOf("</Description>") != -1) {
589 if (description_element_start == -1) {
590 System.err.println("Parse error: Description element unstarted!");
591 }
592 description_element_start = -1;
593 continue;
594 }
595
596 // If we're not in a Description element there shouldn't be any Metadata elements
597 if (description_element_start == -1) {
598 continue;
599 }
600
601 // This line doesn't contain a Metadata element, so we're not interested
602 if (line.indexOf("<Metadata ") == -1) {
603 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
604 continue;
605 }
606
607 // Extract the metadata element name
608 int name_index = line.indexOf(" name=\"") + " name=\"".length();
609 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
610
611 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
612 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
613 if (!metadata_set_namespace.equals("")) {
614 continue;
615 }
616
617 // Extracted metadata!
618 String metadata_element_name = metadata_element_name_full;
619
620 // Note which file this doc.xml is for
621 if (metadata_element_name.equals("gsdlsourcefilename")) {
622 // Extract the gsdlsourcefilename element value
623 int value_index = line.indexOf(">", name_index) + ">".length();
624 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
625
626 // We're only interested in the path relative to the import folder
627 int import_index = gsdlsourcefilename_value.indexOf("import");
628 if (import_index != -1) {
629 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
630
631 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
632 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
633
634 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
635 // This is stored in the System's file.encoding property.
636 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
637
638 // Make sure the path matches the OS that is running
639 if (is_unix_path && isWin) {
640 // Convert path from Unix to Windows
641 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
642 }
643 else if (!is_unix_path && !isWin) {
644 // Convert path from Windows to Unix
645 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
646 }
647
648 // Remember this for quick access later
649 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
650 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
651 }
652
653 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
654 }
655
656 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
657 // This is true when the source files come from a zip file processed by ZIPPlug, for example
658 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
659 // We don't really know what is going on...
660 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
661 }
662 }
663
664 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
665 if (metadata_element_name.startsWith("gsdl")) {
666 continue;
667 }
668
669 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
670 if (metadata_element == null) {
671 // This element isn't defined in ex.mds, so create it for this session
672 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
673 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
674 }
675 }
676
677 buffered_reader.close();
678 }
679 catch (FileNotFoundException exception) {
680 DebugStream.printStackTrace(exception);
681 }
682 catch (IOException exception) {
683 DebugStream.printStackTrace(exception);
684 }
685 }
686 */
687
688}
Note: See TracBrowser for help on using the repository browser.