source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34510

Last change on this file since 34510 was 34510, checked in by ak19, 4 years ago

Dr Bainbridge didn't want a heuristic test on the tilde character for checking whether a gsdlsourcefilename stored in doc.xml was a Win 8.3 short filename or not. The idea was to set a flag on the Perl end to record whether the stored gsdlsourcefilename was a short filename. Unfortunately, the Perl code (DirectoryPlugin::read, variable subfile) ends up with short filenames as a result of doing a readdir, not by any algorithm. So it is always potentially a short filename on Windows, and only the presence of a tilde would indicate this. Whether we test for this in Perl and set a flag, or test for it in Java, therefore doesn't matter. So I have taken out the test for the presence of a tilde, though this means the Java code will always try to work out the long filename of a gsdlsourcefilename on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.Gatherer;
35import org.greenstone.gatherer.util.Utility;
36
37//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
38import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
39
40/** This class represents one doc.xml file */
41
42public abstract class DocXMLFile extends File
43{
44 static boolean isWin = Utility.isWindows();
45
46 protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
47
48 protected final String MetadataWrap;
49 protected final String MetadataItem;
50
51 protected final String FILE_RENAME_METHOD_NONE = "none";
52 protected final String FILE_RENAME_METHOD_URL = "url";
53 protected final String FILE_RENAME_METHOD_BASE64 = "base64";
54
/**
 * Creates a handle on one doc.xml-style file without reading it.
 *
 * @param doc_xml_file_path path of the file; handed unchanged to the File constructor
 * @param metaWrap name of the XML element that wraps a group of metadata items
 * @param metaItem name of the XML element that holds one metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);

    // The two tag names are fixed for the lifetime of this object
    MetadataItem = metaItem;
    MetadataWrap = metaWrap;
}
61
62 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
63 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
64 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
65 {
66 // Build up a list of metadata extracted from this file
67 ArrayList metadata_values = new ArrayList();
68
69 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
70 /// System.err.println("\n@@@ relFilename: " + relFilename);
71 ///}
72
73 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
74 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
75 if (description_elements_list == null) {
76 // ...it doesn't
77 ///System.err.println("Unable to find meta for file path form " + file_relative_path);
78 return metadata_values; // we're done
79 } ///else { System.err.println("@@@ file rel path: " + file_relative_path + " matched" ); }
80
81 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
82
83 // Parse the file
84 DebugStream.println("Applicable file: " + this);
85 try {
86 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
87
88 int description_element_num = 0;
89 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
90 boolean in_relevant_description_element = false;
91
92 String line = null;
93 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
94 // Check if this line contains the start of a relevant "Description" element
95 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
96 if (line_num == next_description_element_start) {
97 in_relevant_description_element = true;
98 continue;
99 }
100
101 // If we're not in a relevant Description element we don't care about anything
102 if (in_relevant_description_element == false) {
103 continue;
104 }
105
106 // Check if this line contains the end of the relevant Description element
107 if (line.indexOf("</"+MetadataWrap+">") != -1) {
108 description_element_num++;
109 if (description_element_num == description_elements_list.size()) {
110 break;
111 }
112
113 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
114 in_relevant_description_element = false;
115 continue;
116 }
117
118 // If this line doesn't contain a complete Metadata element, we're not interested
119 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
120 continue;
121 }
122
123 // Extract the metadata element name
124 int name_index = line.indexOf(" name=\"") + " name=\"".length();
125 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
126
127 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
128 // Actually, if it is ex. then we are interested
129 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
130
131 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
132 continue;
133 }
134
135 // Extracted metadata!
136 // do it like this just in case we have ex.
137 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
138
139 // We completely ignore bibliographic data
140 if (metadata_element_name.equals("SourceSegment")) {
141 buffered_reader.close();
142 return new ArrayList();
143 }
144
145 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
146 if (metadata_element_name.startsWith("gsdl")) {
147 continue;
148 }
149
150 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
151
152 // Value trees are not stored for extracted metadata, so create a new value tree node now
153 int value_index = line.indexOf(">", name_index) + ">".length();
154 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
155
156 metadata_element.addMetadataValue(metadata_element_value);
157 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
158
159 // Add the new metadata value to the list
160 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
161 metadata_values.add(metadata_value);
162 }
163
164 buffered_reader.close();
165 }
166 catch (FileNotFoundException exception) {
167 DebugStream.printStackTrace(exception);
168 }
169 catch (IOException exception) {
170 DebugStream.printStackTrace(exception);
171 }
172
173 return metadata_values;
174 }
175
176
177
178
179 /**
180 * Every file must be skimmed when a collection is opened, for two reasons:
181 * - To build a mapping from source file to its corresponding doc.xml file
182 * - To get a complete list of all extracted metadata elements
183 */
184 public void skimFile()
185 {
186 String fileRenameMethod = null;
187 String gsdlsourcefilename_value = null;
188 boolean is_unix_path = false;
189 int description_element_start_gsdlsourcefilename_value = -1;
190
191 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
192
193 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
194 DebugStream.println("Skimming " + this + "...");
195 try {
196 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
197 int description_element_start = -1;
198
199 String line = null;
200 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
201 // This line contains the start of a "MetadataWrap" element
202 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
203 if (line.indexOf("<"+MetadataWrap+">") != -1) {
204 if (description_element_start != -1) {
205 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
206 }
207 description_element_start = line_num;
208 continue;
209 }
210
211 // This line contains the end of a "MetadataWrap" element
212 if (line.indexOf("</"+MetadataWrap+">") != -1) {
213 if (description_element_start == -1) {
214 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
215 }
216 description_element_start = -1;
217 continue;
218 }
219
220 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
221 if (description_element_start == -1) {
222 continue;
223 }
224
225 // This line doesn't contain a Metadata element, so we're not interested
226 if (line.indexOf("<"+MetadataItem+" ") == -1) {
227 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
228 continue;
229 }
230
231 // Extract the metadata element name
232 int name_index = line.indexOf(" name=\"") + " name=\"".length();
233 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
234
235 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
236 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
237 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
238 continue;
239 }
240
241 // Extracted metadata! May have ex. so make sure we remove that
242 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
243 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
244 // Extract the element value
245 int value_index = line.indexOf(">", name_index) + ">".length();
246 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
247 }
248
249 // Note which file this is for
250 else if (metadata_element_name.equals("gsdlsourcefilename")) {
251 // the gsdlsourcefilename metadata field may be encoded by the encoding denoted
252 // in fileRenameMethod (and will need decoding)
253
254 // Extract the gsdlsourcefilename element value
255 int value_index = line.indexOf(">", name_index) + ">".length();
256 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
257
258 // We're only interested in the path relative to the import folder
259 int import_index = gsdlsourcefilename_value.indexOf("import");
260 if (import_index != -1) {
261
262 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
263 ///System.err.println("@@@@ Found description_element_start_gsdlsourcefilename_value: " + description_element_start);
264 description_element_start_gsdlsourcefilename_value = description_element_start;
265
266 }
267
268 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
269 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
270 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
271 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
272 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
273 && !gsdlsourcefilename_value.endsWith("collect.cfg")
274 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
275 // We don't really know what is going on...
276 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
277 }
278 }
279
280 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
281 if (metadata_element_name.startsWith("gsdl")) {
282 continue;
283 }
284
285 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
286 if (metadata_element == null) {
287 // This element isn't defined in ex.mds, so create it for this session
288 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
289 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
290 }
291 }
292
293 buffered_reader.close();
294
295 // Work out if is_unix_path
296 int import_index = gsdlsourcefilename_value.indexOf("import");
297 if (import_index != -1) {
298 String tempStr = gsdlsourcefilename_value.substring(import_index + "import".length());
299 is_unix_path = tempStr.startsWith("/");
300 }
301 // We're only interested in the path relative to the import folder
302 // Lop off "import" folder prefix
303 gsdlsourcefilename_value = adjustForRelativeToImportDir(gsdlsourcefilename_value);
304
305 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
306 // based on whatever fileRenameMethod was used to encode it, so that we can
307 // at last properly compare against filenames on the file system
308 // in order to load the correct ex.meta for the file.
309 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
310 // we can finally perform the decoding of gsdlsourcefilename.
311 if(fileRenameMethod == null) {
312 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
313 }
314 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
315 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
316 }
317
318 // Now we can finally put the gsdlsourcefilename path relative to import dir into the hashmap
319 ///System.err.println("@@@ into map: " + gsdlsourcefilename_value);
320 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
321 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
322 }
323 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start_gsdlsourcefilename_value));
324
325 // Next, if Windows, check if dealing with Win 8.3 Short Filename
326 // In that case, convert short file name to long filename - works only if the file exists
327 if(isWin /*&& gsdlsourcefilename_value.indexOf("~") != -1*/) {
328
329 String long_gsdlsourcefilename = gsdlsourcefilename_value;
330
331 // gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext
332 // This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir
333 // and if resulting file exists, getCanonicalPath() which produces Win Long filename.
334 File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
335 File f = new File(currentCollectionFolder, "import" + File.separator + gsdlsourcefilename_value); // should work even if linux style slashes in gsdlsourcefilename_value
336 ///System.err.println("### file: " + f.getAbsolutePath());
337
338 if(f.exists()) {
339 long_gsdlsourcefilename = f.getCanonicalPath();
340 ///System.err.println("### canon: " + long_gsdlsourcefilename);
341 } // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is
342
343 // Again, we're only interested in the path relative to the import folder
344 long_gsdlsourcefilename = adjustForRelativeToImportDir(long_gsdlsourcefilename);
345 if(!gsdlsourcefilename_value.equals(long_gsdlsourcefilename)) { // truly distinct Win long and short file names
346 // Put a copy of the ref to gsdlsourcefilename's metadata list under the long filename as well
347 ///System.err.println("@@@ long filename into map: " + long_gsdlsourcefilename);
348 Object arrList = source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value);
349 source_file_name_to_description_elements_mapping.put(long_gsdlsourcefilename, arrList);
350 }
351 }
352
353 }
354 catch (FileNotFoundException exception) {
355 DebugStream.printStackTrace(exception);
356 }
357 catch (IOException exception) {
358 DebugStream.printStackTrace(exception);
359 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
360 DebugStream.printStackTrace(exception);
361 }
362 }
363
364 private String adjustForRelativeToImportDir(String gsdlsourcefilename_value) {
365 int import_index = gsdlsourcefilename_value.indexOf("import");
366 if (import_index != -1) {
367 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
368
369 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
370 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
371
372 // (Will decode gsdlsourcefilename at end of this method, once we know
373 // for certain the fileRenameMethod that was used to encode it.)
374
375 // Make sure the path matches the OS that is running
376 if (is_unix_path && isWin) {
377 // Convert path from Unix to Windows
378 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
379 }
380 else if (!is_unix_path && !isWin) {
381 // Convert path from Windows to Unix
382 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
383 }
384 }
385 return gsdlsourcefilename_value;
386 }
387
388 protected String decodeSourceFilename(String relative_sourcefile_path,
389 String encodingMethod, boolean is_unix_path)
390 throws Exception
391 {
392
393 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
394
395 // First get the file extension. Both in Base64 and URL encoded strings,
396 // the full-stop character (.) doesn't get encoded.
397 // That means getting the file extension is straightforward.
398
399 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
400 // 26 lowercase characters, 26 uppercase characters as well as the
401 // Plus sign (+) and the Forward Slash (/).
402 int fullstop = relative_sourcefile_path.indexOf(".");
403 String file_ext = "";
404 if(fullstop != -1) {
405 file_ext = relative_sourcefile_path.substring(fullstop);
406 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
407 }
408
409 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
410
411 String decoded_gsdlsourcefilename = "";
412
413 String separator = is_unix_path ? "/" : "\\";
414 for(int i = 0; i < importFilePathParts.length; i++) {
415 String decoded_filePathPart = "";
416 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
417 // URL decode each part of gsdlsourcefilename.
418 // Need to set the decoder to use the default system encoding
419 // This is stored in the System's file.encoding property.
420 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
421 }
422 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
423 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
424 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
425 // Using org.apache.commons.codec.binary.Base64 instead
426 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
427 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
428 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
429 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
430 ///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
431 // Using system file.encoding to interpret the resulting bytestring as a String,
432 // just as we always did with URL decoding method
433 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
434 }
435
436 if(i == 0) {
437 decoded_gsdlsourcefilename = decoded_filePathPart;
438 } else {
439 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
440 }
441 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
442 }
443
444 // add the file extension back in
445 decoded_gsdlsourcefilename += file_ext;
446
447 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
448
449 return decoded_gsdlsourcefilename;
450 }
451
452 /**
453 * Given a filepath, returns the parts between each file separator as an array.
454 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
455 */
456 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
457 StringTokenizer tok;
458 if(is_unix_path) {
459 tok = new StringTokenizer(filepath, "/");
460 } else {
461 tok = new StringTokenizer(filepath, "\\");
462 }
463 String[] parts;
464 int count = tok.countTokens();
465 if(count <= 0) {
466 parts = new String[]{filepath};
467 } else {
468 int i = 0;
469 parts = new String[count];
470 while(tok.hasMoreTokens()) {
471 parts[i] = tok.nextToken();
472 //System.err.println("Next part: " + parts[i]);
473 i++;
474 }
475 }
476 return parts;
477 }
478
479 /*
480 public ArrayList getMetadataExtractedFromFile(File file)
481 {
482 // Build up a list of metadata extracted from this file
483 ArrayList metadata_values = new ArrayList();
484
485 String file_relative_path = file.getAbsolutePath();
486 int import_index = file_relative_path.indexOf("import");
487 if (import_index != -1) {
488 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
489 }
490
491 // Check whether this doc.xml file contains extracted metadata for the specified file
492 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
493 if (description_elements_list == null) {
494 // ...it doesn't
495 return metadata_values;
496 }
497
498 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
499
500 // Parse the doc.xml file
501 DebugStream.println("Applicable doc.xml file: " + this);
502 try {
503 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
504
505 int description_element_num = 0;
506 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
507 boolean in_relevant_description_element = false;
508
509 String line = null;
510 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
511 // Check if this line contains the start of a relevant Description element
512 if (line_num == next_description_element_start) {
513 in_relevant_description_element = true;
514 continue;
515 }
516
517 // If we're not in a relevant Description element we don't care about anything
518 if (in_relevant_description_element == false) {
519 continue;
520 }
521
522 // Check if this line contains the end of the relevant Description element
523 if (line.indexOf("</Description>") != -1) {
524 description_element_num++;
525 if (description_element_num == description_elements_list.size()) {
526 break;
527 }
528
529 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
530 in_relevant_description_element = false;
531 continue;
532 }
533
534 // If this line doesn't contain a complete Metadata element, we're not interested
535 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
536 continue;
537 }
538
539 // Extract the metadata element name
540 int name_index = line.indexOf(" name=\"") + " name=\"".length();
541 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
542
543 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
544 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
545 if (!metadata_set_namespace.equals("")) {
546 continue;
547 }
548
549 // Extracted metadata!
550 String metadata_element_name = metadata_element_name_full;
551
552 // We completely ignore bibliographic data
553 if (metadata_element_name.equals("SourceSegment")) {
554 buffered_reader.close();
555 return new ArrayList();
556 }
557
558 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
559 if (metadata_element_name.startsWith("gsdl")) {
560 continue;
561 }
562
563 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
564
565 // Value trees are not stored for extracted metadata, so create a new value tree node now
566 int value_index = line.indexOf(">", name_index) + ">".length();
567 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
568
569 metadata_element.addMetadataValue(metadata_element_value);
570 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
571
572 // Add the new metadata value to the list
573 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
574 metadata_values.add(metadata_value);
575 }
576
577 buffered_reader.close();
578 }
579 catch (FileNotFoundException exception) {
580 DebugStream.printStackTrace(exception);
581 }
582 catch (IOException exception) {
583 DebugStream.printStackTrace(exception);
584 }
585
586 return metadata_values;
587 }
588
589 */
590
591 /**
592 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
593 * - To build a mapping from source file to its corresponding doc.xml file
594 * - To get a complete list of all extracted metadata elements
595 */
596 /*
597 public void skimFile()
598 {
599 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
600
601 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
602 DebugStream.println("Skimming " + this + "...");
603 try {
604 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
605 int description_element_start = -1;
606
607 String line = null;
608 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
609 // This line contains the start of a Description element
610 if (line.indexOf("<Description>") != -1) {
611 if (description_element_start != -1) {
612 System.err.println("Parse error: previous Description element unfinished!");
613 }
614 description_element_start = line_num;
615 continue;
616 }
617
618 // This line contains the end of a Description element
619 if (line.indexOf("</Description>") != -1) {
620 if (description_element_start == -1) {
621 System.err.println("Parse error: Description element unstarted!");
622 }
623 description_element_start = -1;
624 continue;
625 }
626
627 // If we're not in a Description element there shouldn't be any Metadata elements
628 if (description_element_start == -1) {
629 continue;
630 }
631
632 // This line doesn't contain a Metadata element, so we're not interested
633 if (line.indexOf("<Metadata ") == -1) {
634 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
635 continue;
636 }
637
638 // Extract the metadata element name
639 int name_index = line.indexOf(" name=\"") + " name=\"".length();
640 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
641
642 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
643 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
644 if (!metadata_set_namespace.equals("")) {
645 continue;
646 }
647
648 // Extracted metadata!
649 String metadata_element_name = metadata_element_name_full;
650
651 // Note which file this doc.xml is for
652 if (metadata_element_name.equals("gsdlsourcefilename")) {
653 // Extract the gsdlsourcefilename element value
654 int value_index = line.indexOf(">", name_index) + ">".length();
655 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
656
657 // We're only interested in the path relative to the import folder
658 int import_index = gsdlsourcefilename_value.indexOf("import");
659 if (import_index != -1) {
660 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
661
662 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
663 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
664
665 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
666 // This is stored in the System's file.encoding property.
667 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
668
669 // Make sure the path matches the OS that is running
670 if (is_unix_path && isWin) {
671 // Convert path from Unix to Windows
672 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
673 }
674 else if (!is_unix_path && !isWin) {
675 // Convert path from Windows to Unix
676 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
677 }
678
679 // Remember this for quick access later
680 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
681 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
682 }
683
684 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
685 }
686
687 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
688 // This is true when the source files come from a zip file processed by ZIPPlug, for example
689 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
690 // We don't really know what is going on...
691 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
692 }
693 }
694
695 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
696 if (metadata_element_name.startsWith("gsdl")) {
697 continue;
698 }
699
700 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
701 if (metadata_element == null) {
702 // This element isn't defined in ex.mds, so create it for this session
703 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
704 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
705 }
706 }
707
708 buffered_reader.close();
709 }
710 catch (FileNotFoundException exception) {
711 DebugStream.printStackTrace(exception);
712 }
713 catch (IOException exception) {
714 DebugStream.printStackTrace(exception);
715 }
716 }
717 */
718
719}
Note: See TracBrowser for help on using the repository browser.