source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34509

Last change on this file since 34509 was 34509, checked in by ak19, 4 years ago

Related to previous commits 34506-34508. Storing both the Win 8.3 Short filename of gsdlsourcefilename and its long filename version.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.Gatherer;
35import org.greenstone.gatherer.util.Utility;
36
37//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
38import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
39
40/** This class represents one doc.xml file */
41
42public abstract class DocXMLFile extends File
43{
/** True when Utility.isWindows() reported a Windows platform; used for path separator and 8.3 short filename handling. */
static boolean isWin = Utility.isWindows();

/**
 * Maps a source file path (relative to the import folder) to an ArrayList of Integer line
 * numbers, each being the line in this file on which a relevant MetadataWrap element starts.
 * Populated by skimFile() and read by getMetadataExtractedFromFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the element wrapping a group of metadata items (mets:xmlData in METS parlance, Description in GreenstoneArchive format). */
protected final String MetadataWrap;
/** Name of the element holding a single metadata item inside a MetadataWrap element. */
protected final String MetadataItem;

// Recognised values of the gsdlsourcefilerenamemethod metadata, i.e. how gsdlsourcefilename was encoded.
// These are true constants, so they are shared by all instances rather than stored per file object.
protected static final String FILE_RENAME_METHOD_NONE = "none";
protected static final String FILE_RENAME_METHOD_URL = "url";
protected static final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * Creates the representation of one doc.xml style file.
 *
 * @param doc_xml_file_path path of the file on disk, passed on to java.io.File
 * @param metaWrap name of the element that wraps a group of metadata items
 * @param metaItem name of the element that holds a single metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    this.MetadataWrap = metaWrap;
    this.MetadataItem = metaItem;
}
61
62 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
63 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
64 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
65 {
66 // Build up a list of metadata extracted from this file
67 ArrayList metadata_values = new ArrayList();
68
69 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
70 /// System.err.println("\n@@@ relFilename: " + relFilename);
71 ///}
72
73 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
74 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
75 if (description_elements_list == null) {
76 // ...it doesn't
77 ///System.err.println("Unable to find meta for file path form " + file_relative_path);
78 return metadata_values; // we're done
79 } ///else { System.err.println("@@@ file rel path: " + file_relative_path + " matched" ); }
80
81 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
82
83 // Parse the file
84 DebugStream.println("Applicable file: " + this);
85 try {
86 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
87
88 int description_element_num = 0;
89 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
90 boolean in_relevant_description_element = false;
91
92 String line = null;
93 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
94 // Check if this line contains the start of a relevant "Description" element
95 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
96 if (line_num == next_description_element_start) {
97 in_relevant_description_element = true;
98 continue;
99 }
100
101 // If we're not in a relevant Description element we don't care about anything
102 if (in_relevant_description_element == false) {
103 continue;
104 }
105
106 // Check if this line contains the end of the relevant Description element
107 if (line.indexOf("</"+MetadataWrap+">") != -1) {
108 description_element_num++;
109 if (description_element_num == description_elements_list.size()) {
110 break;
111 }
112
113 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
114 in_relevant_description_element = false;
115 continue;
116 }
117
118 // If this line doesn't contain a complete Metadata element, we're not interested
119 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
120 continue;
121 }
122
123 // Extract the metadata element name
124 int name_index = line.indexOf(" name=\"") + " name=\"".length();
125 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
126
127 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
128 // Actually, if it is ex. then we are interested
129 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
130
131 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
132 continue;
133 }
134
135 // Extracted metadata!
136 // do it like this just in case we have ex.
137 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
138
139 // We completely ignore bibliographic data
140 if (metadata_element_name.equals("SourceSegment")) {
141 buffered_reader.close();
142 return new ArrayList();
143 }
144
145 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
146 if (metadata_element_name.startsWith("gsdl")) {
147 continue;
148 }
149
150 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
151
152 // Value trees are not stored for extracted metadata, so create a new value tree node now
153 int value_index = line.indexOf(">", name_index) + ">".length();
154 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
155
156 metadata_element.addMetadataValue(metadata_element_value);
157 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
158
159 // Add the new metadata value to the list
160 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
161 metadata_values.add(metadata_value);
162 }
163
164 buffered_reader.close();
165 }
166 catch (FileNotFoundException exception) {
167 DebugStream.printStackTrace(exception);
168 }
169 catch (IOException exception) {
170 DebugStream.printStackTrace(exception);
171 }
172
173 return metadata_values;
174 }
175
176
177
178
179 /**
180 * Every file must be skimmed when a collection is opened, for two reasons:
181 * - To build a mapping from source file to its corresponding doc.xml file
182 * - To get a complete list of all extracted metadata elements
183 */
184 public void skimFile()
185 {
186 String fileRenameMethod = null;
187 String gsdlsourcefilename_value = null;
188 boolean is_unix_path = false;
189 int description_element_start_gsdlsourcefilename_value = -1;
190
191 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
192
193 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
194 DebugStream.println("Skimming " + this + "...");
195 try {
196 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
197 int description_element_start = -1;
198
199 String line = null;
200 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
201 // This line contains the start of a "MetadataWrap" element
202 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
203 if (line.indexOf("<"+MetadataWrap+">") != -1) {
204 if (description_element_start != -1) {
205 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
206 }
207 description_element_start = line_num;
208 continue;
209 }
210
211 // This line contains the end of a "MetadataWrap" element
212 if (line.indexOf("</"+MetadataWrap+">") != -1) {
213 if (description_element_start == -1) {
214 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
215 }
216 description_element_start = -1;
217 continue;
218 }
219
220 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
221 if (description_element_start == -1) {
222 continue;
223 }
224
225 // This line doesn't contain a Metadata element, so we're not interested
226 if (line.indexOf("<"+MetadataItem+" ") == -1) {
227 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
228 continue;
229 }
230
231 // Extract the metadata element name
232 int name_index = line.indexOf(" name=\"") + " name=\"".length();
233 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
234
235 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
236 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
237 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
238 continue;
239 }
240
241 // Extracted metadata! May have ex. so make sure we remove that
242 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
243 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
244 // Extract the element value
245 int value_index = line.indexOf(">", name_index) + ">".length();
246 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
247 }
248
249 // Note which file this is for
250 else if (metadata_element_name.equals("gsdlsourcefilename")) {
251 // the gsdlsourcefilename metadata field may be encoded by the encoding denoted
252 // in fileRenameMethod (and will need decoding)
253
254 // Extract the gsdlsourcefilename element value
255 int value_index = line.indexOf(">", name_index) + ">".length();
256 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
257
258 // We're only interested in the path relative to the import folder
259 int import_index = gsdlsourcefilename_value.indexOf("import");
260 if (import_index != -1) {
261
262 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
263 ///System.err.println("@@@@ Found description_element_start_gsdlsourcefilename_value: " + description_element_start);
264 description_element_start_gsdlsourcefilename_value = description_element_start;
265
266 }
267
268 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
269 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
270 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
271 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
272 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
273 && !gsdlsourcefilename_value.endsWith("collect.cfg")
274 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
275 // We don't really know what is going on...
276 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
277 }
278 }
279
280 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
281 if (metadata_element_name.startsWith("gsdl")) {
282 continue;
283 }
284
285 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
286 if (metadata_element == null) {
287 // This element isn't defined in ex.mds, so create it for this session
288 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
289 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
290 }
291 }
292
293 buffered_reader.close();
294
295 // Work out if is_unix_path
296 int import_index = gsdlsourcefilename_value.indexOf("import");
297 if (import_index != -1) {
298 String tempStr = gsdlsourcefilename_value.substring(import_index + "import".length());
299 is_unix_path = tempStr.startsWith("/");
300 }
301 // We're only interested in the path relative to the import folder
302 // Lop off "import" folder prefix
303 gsdlsourcefilename_value = adjustForRelativeToImportDir(gsdlsourcefilename_value);
304
305 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
306 // based on whatever fileRenameMethod was used to encode it, so that we can
307 // at last properly compare against filenames on the file system
308 // in order to load the correct ex.meta for the file.
309 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
310 // we can finally perform the decoding of gsdlsourcefilename.
311 if(fileRenameMethod == null) {
312 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
313 }
314 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
315 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
316 }
317
318 // Now we can finally put the gsdlsourcefilename path relative to import dir into the hashmap
319 ///System.err.println("@@@ into map: " + gsdlsourcefilename_value);
320 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
321 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
322 }
323 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start_gsdlsourcefilename_value));
324
325 // Next, if Windows, check if dealing with Win 8.3 Short Filename
326 // In that case, convert short file name to long filename - works only if the file exists
327 if(isWin && gsdlsourcefilename_value.indexOf("~") != -1) {
328
329 String long_gsdlsourcefilename = gsdlsourcefilename_value;
330
331 // gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext
332 // This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir
333 // and if resulting file exists, getCanonicalPath() which produces Win Long filename.
334 File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
335 File f = new File(currentCollectionFolder, "import" + File.separator + gsdlsourcefilename_value); // should work even if linux style slashes in gsdlsourcefilename_value
336 ///System.err.println("### file: " + f.getAbsolutePath());
337
338 if(f.exists()) {
339 long_gsdlsourcefilename = f.getCanonicalPath();
340 ///System.err.println("### canon: " + long_gsdlsourcefilename);
341 } // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is
342
343 // Again, we're only interested in the path relative to the import folder
344 long_gsdlsourcefilename = adjustForRelativeToImportDir(long_gsdlsourcefilename);
345 if(!gsdlsourcefilename_value.equals(long_gsdlsourcefilename)) { // truly distinct Win long and short file names
346 // Put a copy of the ref to gsdlsourcefilename's metadata list under the long filename as well
347 ///System.err.println("@@@ long filename into map: " + long_gsdlsourcefilename);
348 Object arrList = source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value);
349 source_file_name_to_description_elements_mapping.put(long_gsdlsourcefilename, arrList);
350 }
351 }
352
353 }
354 catch (FileNotFoundException exception) {
355 DebugStream.printStackTrace(exception);
356 }
357 catch (IOException exception) {
358 DebugStream.printStackTrace(exception);
359 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
360 DebugStream.printStackTrace(exception);
361 }
362 }
363
364 private String adjustForRelativeToImportDir(String gsdlsourcefilename_value) {
365 int import_index = gsdlsourcefilename_value.indexOf("import");
366 if (import_index != -1) {
367 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
368
369 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
370 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
371
372 // (Will decode gsdlsourcefilename at end of this method, once we know
373 // for certain the fileRenameMethod that was used to encode it.)
374
375 // Make sure the path matches the OS that is running
376 if (is_unix_path && isWin) {
377 // Convert path from Unix to Windows
378 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
379 }
380 else if (!is_unix_path && !isWin) {
381 // Convert path from Windows to Unix
382 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
383 }
384 }
385 return gsdlsourcefilename_value;
386 }
387
388 protected String decodeSourceFilename(String relative_sourcefile_path,
389 String encodingMethod, boolean is_unix_path)
390 throws Exception
391 {
392
393 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
394
395 // First get the file extension. Both in Base64 and URL encoded strings,
396 // the full-stop character (.) doesn't get encoded.
397 // That means getting the file extension is straightforward.
398
399 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
400 // 26 lowercase characters, 26 uppercase characters as well as the
401 // Plus sign (+) and the Forward Slash (/).
402 int fullstop = relative_sourcefile_path.indexOf(".");
403 String file_ext = "";
404 if(fullstop != -1) {
405 file_ext = relative_sourcefile_path.substring(fullstop);
406 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
407 }
408
409 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
410
411 String decoded_gsdlsourcefilename = "";
412
413 String separator = is_unix_path ? "/" : "\\";
414 for(int i = 0; i < importFilePathParts.length; i++) {
415 String decoded_filePathPart = "";
416 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
417 // URL decode each part of gsdlsourcefilename.
418 // Need to set the decoder to use the default system encoding
419 // This is stored in the System's file.encoding property.
420 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
421 }
422 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
423 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
424 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
425 // Using org.apache.commons.codec.binary.Base64 instead
426 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
427 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
428 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
429 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
430 ///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
431 // Using system file.encoding to interpret the resulting bytestring as a String,
432 // just as we always did with URL decoding method
433 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
434 }
435
436 if(i == 0) {
437 decoded_gsdlsourcefilename = decoded_filePathPart;
438 } else {
439 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
440 }
441 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
442 }
443
444 // add the file extension back in
445 decoded_gsdlsourcefilename += file_ext;
446
447 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
448
449 return decoded_gsdlsourcefilename;
450 }
451
452 /**
453 * Given a filepath, returns the parts between each file separator as an array.
454 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
455 */
456 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
457 StringTokenizer tok;
458 if(is_unix_path) {
459 tok = new StringTokenizer(filepath, "/");
460 } else {
461 tok = new StringTokenizer(filepath, "\\");
462 }
463 String[] parts;
464 int count = tok.countTokens();
465 if(count <= 0) {
466 parts = new String[]{filepath};
467 } else {
468 int i = 0;
469 parts = new String[count];
470 while(tok.hasMoreTokens()) {
471 parts[i] = tok.nextToken();
472 //System.err.println("Next part: " + parts[i]);
473 i++;
474 }
475 }
476 return parts;
477 }
478
479 /*
480 public ArrayList getMetadataExtractedFromFile(File file)
481 {
482 // Build up a list of metadata extracted from this file
483 ArrayList metadata_values = new ArrayList();
484
485 String file_relative_path = file.getAbsolutePath();
486 int import_index = file_relative_path.indexOf("import");
487 if (import_index != -1) {
488 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
489 }
490
491 // Check whether this doc.xml file contains extracted metadata for the specified file
492 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
493 if (description_elements_list == null) {
494 // ...it doesn't
495 return metadata_values;
496 }
497
498 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
499
500 // Parse the doc.xml file
501 DebugStream.println("Applicable doc.xml file: " + this);
502 try {
503 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
504
505 int description_element_num = 0;
506 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
507 boolean in_relevant_description_element = false;
508
509 String line = null;
510 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
511 // Check if this line contains the start of a relevant Description element
512 if (line_num == next_description_element_start) {
513 in_relevant_description_element = true;
514 continue;
515 }
516
517 // If we're not in a relevant Description element we don't care about anything
518 if (in_relevant_description_element == false) {
519 continue;
520 }
521
522 // Check if this line contains the end of the relevant Description element
523 if (line.indexOf("</Description>") != -1) {
524 description_element_num++;
525 if (description_element_num == description_elements_list.size()) {
526 break;
527 }
528
529 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
530 in_relevant_description_element = false;
531 continue;
532 }
533
534 // If this line doesn't contain a complete Metadata element, we're not interested
535 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
536 continue;
537 }
538
539 // Extract the metadata element name
540 int name_index = line.indexOf(" name=\"") + " name=\"".length();
541 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
542
543 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
544 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
545 if (!metadata_set_namespace.equals("")) {
546 continue;
547 }
548
549 // Extracted metadata!
550 String metadata_element_name = metadata_element_name_full;
551
552 // We completely ignore bibliographic data
553 if (metadata_element_name.equals("SourceSegment")) {
554 buffered_reader.close();
555 return new ArrayList();
556 }
557
558 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
559 if (metadata_element_name.startsWith("gsdl")) {
560 continue;
561 }
562
563 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
564
565 // Value trees are not stored for extracted metadata, so create a new value tree node now
566 int value_index = line.indexOf(">", name_index) + ">".length();
567 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
568
569 metadata_element.addMetadataValue(metadata_element_value);
570 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
571
572 // Add the new metadata value to the list
573 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
574 metadata_values.add(metadata_value);
575 }
576
577 buffered_reader.close();
578 }
579 catch (FileNotFoundException exception) {
580 DebugStream.printStackTrace(exception);
581 }
582 catch (IOException exception) {
583 DebugStream.printStackTrace(exception);
584 }
585
586 return metadata_values;
587 }
588
589 */
590
591 /**
592 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
593 * - To build a mapping from source file to its corresponding doc.xml file
594 * - To get a complete list of all extracted metadata elements
595 */
596 /*
597 public void skimFile()
598 {
599 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
600
601 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
602 DebugStream.println("Skimming " + this + "...");
603 try {
604 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
605 int description_element_start = -1;
606
607 String line = null;
608 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
609 // This line contains the start of a Description element
610 if (line.indexOf("<Description>") != -1) {
611 if (description_element_start != -1) {
612 System.err.println("Parse error: previous Description element unfinished!");
613 }
614 description_element_start = line_num;
615 continue;
616 }
617
618 // This line contains the end of a Description element
619 if (line.indexOf("</Description>") != -1) {
620 if (description_element_start == -1) {
621 System.err.println("Parse error: Description element unstarted!");
622 }
623 description_element_start = -1;
624 continue;
625 }
626
627 // If we're not in a Description element there shouldn't be any Metadata elements
628 if (description_element_start == -1) {
629 continue;
630 }
631
632 // This line doesn't contain a Metadata element, so we're not interested
633 if (line.indexOf("<Metadata ") == -1) {
634 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
635 continue;
636 }
637
638 // Extract the metadata element name
639 int name_index = line.indexOf(" name=\"") + " name=\"".length();
640 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
641
642 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
643 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
644 if (!metadata_set_namespace.equals("")) {
645 continue;
646 }
647
648 // Extracted metadata!
649 String metadata_element_name = metadata_element_name_full;
650
651 // Note which file this doc.xml is for
652 if (metadata_element_name.equals("gsdlsourcefilename")) {
653 // Extract the gsdlsourcefilename element value
654 int value_index = line.indexOf(">", name_index) + ">".length();
655 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
656
657 // We're only interested in the path relative to the import folder
658 int import_index = gsdlsourcefilename_value.indexOf("import");
659 if (import_index != -1) {
660 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
661
662 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
663 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
664
665 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
666 // This is stored in the System's file.encoding property.
667 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
668
669 // Make sure the path matches the OS that is running
670 if (is_unix_path && isWin) {
671 // Convert path from Unix to Windows
672 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
673 }
674 else if (!is_unix_path && !isWin) {
675 // Convert path from Windows to Unix
676 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
677 }
678
679 // Remember this for quick access later
680 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
681 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
682 }
683
684 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
685 }
686
687 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
688 // This is true when the source files come from a zip file processed by ZIPPlug, for example
689 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
690 // We don't really know what is going on...
691 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
692 }
693 }
694
695 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
696 if (metadata_element_name.startsWith("gsdl")) {
697 continue;
698 }
699
700 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
701 if (metadata_element == null) {
702 // This element isn't defined in ex.mds, so create it for this session
703 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
704 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
705 }
706 }
707
708 buffered_reader.close();
709 }
710 catch (FileNotFoundException exception) {
711 DebugStream.printStackTrace(exception);
712 }
713 catch (IOException exception) {
714 DebugStream.printStackTrace(exception);
715 }
716 }
717 */
718
719}
Note: See TracBrowser for help on using the repository browser.