source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33758

Last change on this file since 33758 was 33758, checked in by ak19, 4 years ago

Removed debugging and last bit of cleanup.

  • Property svn:keywords set to Author Date Id Revision
File size: 31.6 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39/** This class represents one doc.xml file */
40
41public abstract class DocXMLFile extends File
42{
/**
 * Maps a source file path (relative to the import folder, as recorded in the
 * gsdlsourcefilename metadata) to an ArrayList of Integer line numbers. Each line
 * number is the line on which a metadata-wrapper element describing that source
 * file starts in this file. Populated by skimFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the XML element that wraps a group of metadata items (matched as &lt;MetadataWrap&gt;). */
protected final String MetadataWrap;
/** Name of the XML element that holds one metadata item (matched as &lt;MetadataItem ...&gt;). */
protected final String MetadataItem;

// Recognised values of the gsdlsourcefilerenamemethod metadata.
// These are true constants, so they are shared by all instances rather than copied into each one.
protected static final String FILE_RENAME_METHOD_NONE = "none";
protected static final String FILE_RENAME_METHOD_URL = "url";
protected static final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * Creates a handle on one doc.xml-style file. The file is not read here;
 * call skimFile() to index it.
 *
 * @param doc_xml_file_path path of the file on disk
 * @param metaWrap name of the element wrapping a block of metadata items
 * @param metaItem name of the element holding a single metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    this.MetadataWrap = metaWrap;
    this.MetadataItem = metaItem;
}
58
59 /**
60 * Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
61 * occur in the source_file_name_to_description_elements_mapping map
62 */
63 private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
64 ArrayList description_elements_list = null;
65
66 ///System.err.println("Looking for key " + file_relative_path);
67 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
68 if(description_elements_list != null) {
69 ///System.err.println(" Found key matching REGULAR filepath: " + file_relative_path);
70 return description_elements_list;
71 }
72 else if(!Utility.isWindows()) { // couldn't find a matching key, we're done
73 ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
74 return null;
75 }
76
77 // Now we can try windows short filename as map key
78
79 String win_short_file_relative_path = "";
80 try{
81 win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());
82 //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
83 } catch(Exception e) { // we're done trying to find a matching key
84 System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);
85 return null;
86 }
87
88 // Got a windows short file name, lop off import folder again
89 int import_index = win_short_file_relative_path.indexOf("import");
90 if (import_index != -1) {
91 win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1);
92 }
93
94 ///System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path + "| in map of sourcefilenames to doc.xml's ex meta.");
95 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
96 if (description_elements_list != null) {
97 ///System.err.println(" Found key matching FULL win shortfile path: " + win_short_file_relative_path);
98 return description_elements_list; // found
99 }
100
101 // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
102 // - windows shortfilename's rel-dir-path with regular tailname
103 // - and regular rel-dir-path with windows shortfilename's tailname
104
105 String shortFileTailName = win_short_file_relative_path;
106 String shortFileRelDirPath = "";
107 int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
108 if(lastSep != -1) {
109 shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
110 shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
111 }
112
113 String fileTailName = file_relative_path;
114 String fileRelDirPath = "";
115 lastSep = file_relative_path.lastIndexOf(File.separator);
116 if(lastSep != -1) {
117 fileTailName = file_relative_path.substring(lastSep+1);
118 fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
119 }
120
121 String path = shortFileRelDirPath + fileTailName;
122 ///System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta.");
123 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
124
125 if(description_elements_list != null) {
126 ///System.err.println(" Found key matching MIX of win shortfile path and regular path: " + path);
127 return description_elements_list; // found
128 }
129
130 // try the other combination
131 path = fileRelDirPath + shortFileTailName;
132 ///System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta.");
133 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
134
135 if(description_elements_list != null) {
136 ///System.err.println(" Found key matching MIX of regular path and win shortfile path: " + path);
137 return description_elements_list; // found
138 }
139
140 // could not find gsdlsourcefilename in map
141 ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
142 ///System.err.println(" Or for windows shortFile path form, or for combinations with regular file path form");
143
144 return description_elements_list; // returns null at this point
145 }
146
147
148 public ArrayList getMetadataExtractedFromFile(File file)
149 {
150 // Build up a list of metadata extracted from this file
151 ArrayList metadata_values = new ArrayList();
152
153 String file_relative_path = file.getAbsolutePath();
154 int import_index = file_relative_path.indexOf("import");
155 if (import_index != -1) {
156 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
157 }
158
159 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
160 /// System.err.println("\n@@@ relFilename: " + relFilename);
161 ///}
162
163 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
164 //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
165 ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
166 if (description_elements_list == null) {
167 // ...it doesn't
168 return metadata_values; // we're done
169 }
170
171 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
172
173 // Parse the file
174 DebugStream.println("Applicable file: " + this);
175 try {
176 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
177
178 int description_element_num = 0;
179 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
180 boolean in_relevant_description_element = false;
181
182 String line = null;
183 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184 // Check if this line contains the start of a relevant "Description" element
185 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186 if (line_num == next_description_element_start) {
187 in_relevant_description_element = true;
188 continue;
189 }
190
191 // If we're not in a relevant Description element we don't care about anything
192 if (in_relevant_description_element == false) {
193 continue;
194 }
195
196 // Check if this line contains the end of the relevant Description element
197 if (line.indexOf("</"+MetadataWrap+">") != -1) {
198 description_element_num++;
199 if (description_element_num == description_elements_list.size()) {
200 break;
201 }
202
203 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
204 in_relevant_description_element = false;
205 continue;
206 }
207
208 // If this line doesn't contain a complete Metadata element, we're not interested
209 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
210 continue;
211 }
212
213 // Extract the metadata element name
214 int name_index = line.indexOf(" name=\"") + " name=\"".length();
215 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
216
217 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
218 // Actually, if it is ex. then we are interested
219 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220
221 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
222 continue;
223 }
224
225 // Extracted metadata!
226 // do it like this just in case we have ex.
227 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
228
229 // We completely ignore bibliographic data
230 if (metadata_element_name.equals("SourceSegment")) {
231 buffered_reader.close();
232 return new ArrayList();
233 }
234
235 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
236 if (metadata_element_name.startsWith("gsdl")) {
237 continue;
238 }
239
240 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
241
242 // Value trees are not stored for extracted metadata, so create a new value tree node now
243 int value_index = line.indexOf(">", name_index) + ">".length();
244 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
245
246 metadata_element.addMetadataValue(metadata_element_value);
247 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
248
249 // Add the new metadata value to the list
250 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
251 metadata_values.add(metadata_value);
252 }
253
254 buffered_reader.close();
255 }
256 catch (FileNotFoundException exception) {
257 DebugStream.printStackTrace(exception);
258 }
259 catch (IOException exception) {
260 DebugStream.printStackTrace(exception);
261 }
262
263 return metadata_values;
264 }
265
266
267
268
269 /**
270 * Every file must be skimmed when a collection is opened, for two reasons:
271 * - To build a mapping from source file to its corresponding doc.xml file
272 * - To get a complete list of all extracted metadata elements
273 */
274 public void skimFile()
275 {
276 String fileRenameMethod = null;
277 String gsdlsourcefilename_value = null;
278 boolean is_unix_path = false;
279
280 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
281
282 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
283 DebugStream.println("Skimming " + this + "...");
284 try {
285 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
286 int description_element_start = -1;
287
288 String line = null;
289 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
290 // This line contains the start of a "MetadataWrap" element
291 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
292 if (line.indexOf("<"+MetadataWrap+">") != -1) {
293 if (description_element_start != -1) {
294 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
295 }
296 description_element_start = line_num;
297 continue;
298 }
299
300 // This line contains the end of a "MetadataWrap" element
301 if (line.indexOf("</"+MetadataWrap+">") != -1) {
302 if (description_element_start == -1) {
303 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
304 }
305 description_element_start = -1;
306 continue;
307 }
308
309 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
310 if (description_element_start == -1) {
311 continue;
312 }
313
314 // This line doesn't contain a Metadata element, so we're not interested
315 if (line.indexOf("<"+MetadataItem+" ") == -1) {
316 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
317 continue;
318 }
319
320 // Extract the metadata element name
321 int name_index = line.indexOf(" name=\"") + " name=\"".length();
322 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
323
324 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
325 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
326 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
327 continue;
328 }
329
330 // Extracted metadata! May have ex. so make sure we remove that
331 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
332 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
333 // Extract the element value
334 int value_index = line.indexOf(">", name_index) + ">".length();
335 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
336 }
337
338 // Note which file this is for
339 else if (metadata_element_name.equals("gsdlsourcefilename")) {
340 // Extract the gsdlsourcefilename element value
341 int value_index = line.indexOf(">", name_index) + ">".length();
342 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
343
344 // We're only interested in the path relative to the import folder
345 int import_index = gsdlsourcefilename_value.indexOf("import");
346 if (import_index != -1) {
347 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
348
349 is_unix_path = gsdlsourcefilename_value.startsWith("/");
350 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
351
352 // (Will decode gsdlsourcefilename at end of this method, once we know
353 // for certain the fileRenameMethod that was used to encode it.)
354
355 // Make sure the path matches the OS that is running
356 if (is_unix_path && Utility.isWindows()) {
357 // Convert path from Unix to Windows
358 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
359 }
360 else if (!is_unix_path && !Utility.isWindows()) {
361 // Convert path from Windows to Unix
362 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
363 }
364
365 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
366 // Remember this for quick access later
367 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
368 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
369 }
370
371 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
372 }
373
374 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
375 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
376 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
377 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
378 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
379 && !gsdlsourcefilename_value.endsWith("collect.cfg")
380 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
381 // We don't really know what is going on...
382 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
383 }
384 }
385
386 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
387 if (metadata_element_name.startsWith("gsdl")) {
388 continue;
389 }
390
391 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
392 if (metadata_element == null) {
393 // This element isn't defined in ex.mds, so create it for this session
394 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
395 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
396 }
397 }
398
399 buffered_reader.close();
400
401 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
402 // based on whatever fileRenameMethod was used to encode it, so that we can
403 // at last properly compare properly against filenames on the file system
404 // in order to load the correct ex.meta for the file.
405 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
406 // we can finally perform the decoding of gsdlsourcefilename.
407 if(fileRenameMethod == null) {
408 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
409 }
410 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
411 // filename, decode it and add it back into map using its decoded filename.
412 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
413 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
414 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
415 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
416 }
417 }
418 catch (FileNotFoundException exception) {
419 DebugStream.printStackTrace(exception);
420 }
421 catch (IOException exception) {
422 DebugStream.printStackTrace(exception);
423 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
424 DebugStream.printStackTrace(exception);
425 }
426 }
427
428 protected String decodeSourceFilename(String relative_sourcefile_path,
429 String encodingMethod, boolean is_unix_path)
430 throws Exception
431 {
432
433 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
434
435 // First get the file extension. Both in Base64 and URL encoded strings,
436 // the full-stop character (.) doesn't get encoded.
437 // That means getting the file extension is straightforward.
438
439 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
440 // 26 lowercase characters, 26 uppercase characters as well as the
441 // Plus sign (+) and the Forward Slash (/).
442 int fullstop = relative_sourcefile_path.indexOf(".");
443 String file_ext = "";
444 if(fullstop != -1) {
445 file_ext = relative_sourcefile_path.substring(fullstop);
446 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
447 }
448
449 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
450
451 String decoded_gsdlsourcefilename = "";
452
453 String separator = is_unix_path ? "/" : "\\";
454 for(int i = 0; i < importFilePathParts.length; i++) {
455 String decoded_filePathPart = "";
456 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
457 // URL decode each part of gsdlsourcefilename.
458 // Need to set the decoder to use the default system encoding
459 // This is stored in the System's file.encoding property.
460 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
461 }
462 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
463 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
464 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
465 // Using org.apache.commons.codec.binary.Base64 instead
466 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
467 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
468 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
469 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
470 ///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
471 // Using system file.encoding to interpret the resulting bytestring as a String,
472 // just as we always did with URL decoding method
473 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
474 }
475
476 if(i == 0) {
477 decoded_gsdlsourcefilename = decoded_filePathPart;
478 } else {
479 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
480 }
481 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
482 }
483
484 // add the file extension back in
485 decoded_gsdlsourcefilename += file_ext;
486
487 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
488
489 return decoded_gsdlsourcefilename;
490 }
491
492 /**
493 * Given a filepath, returns the parts between each file separator as an array.
494 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
495 */
496 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
497 StringTokenizer tok;
498 if(is_unix_path) {
499 tok = new StringTokenizer(filepath, "/");
500 } else {
501 tok = new StringTokenizer(filepath, "\\");
502 }
503 String[] parts;
504 int count = tok.countTokens();
505 if(count <= 0) {
506 parts = new String[]{filepath};
507 } else {
508 int i = 0;
509 parts = new String[count];
510 while(tok.hasMoreTokens()) {
511 parts[i] = tok.nextToken();
512 //System.err.println("Next part: " + parts[i]);
513 i++;
514 }
515 }
516 return parts;
517 }
518
519 /*
520 public ArrayList getMetadataExtractedFromFile(File file)
521 {
522 // Build up a list of metadata extracted from this file
523 ArrayList metadata_values = new ArrayList();
524
525 String file_relative_path = file.getAbsolutePath();
526 int import_index = file_relative_path.indexOf("import");
527 if (import_index != -1) {
528 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
529 }
530
531 // Check whether this doc.xml file contains extracted metadata for the specified file
532 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
533 if (description_elements_list == null) {
534 // ...it doesn't
535 return metadata_values;
536 }
537
538 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
539
540 // Parse the doc.xml file
541 DebugStream.println("Applicable doc.xml file: " + this);
542 try {
543 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
544
545 int description_element_num = 0;
546 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
547 boolean in_relevant_description_element = false;
548
549 String line = null;
550 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
551 // Check if this line contains the start of a relevant Description element
552 if (line_num == next_description_element_start) {
553 in_relevant_description_element = true;
554 continue;
555 }
556
557 // If we're not in a relevant Description element we don't care about anything
558 if (in_relevant_description_element == false) {
559 continue;
560 }
561
562 // Check if this line contains the end of the relevant Description element
563 if (line.indexOf("</Description>") != -1) {
564 description_element_num++;
565 if (description_element_num == description_elements_list.size()) {
566 break;
567 }
568
569 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
570 in_relevant_description_element = false;
571 continue;
572 }
573
574 // If this line doesn't contain a complete Metadata element, we're not interested
575 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
576 continue;
577 }
578
579 // Extract the metadata element name
580 int name_index = line.indexOf(" name=\"") + " name=\"".length();
581 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
582
583 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
584 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
585 if (!metadata_set_namespace.equals("")) {
586 continue;
587 }
588
589 // Extracted metadata!
590 String metadata_element_name = metadata_element_name_full;
591
592 // We completely ignore bibliographic data
593 if (metadata_element_name.equals("SourceSegment")) {
594 buffered_reader.close();
595 return new ArrayList();
596 }
597
598 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
599 if (metadata_element_name.startsWith("gsdl")) {
600 continue;
601 }
602
603 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
604
605 // Value trees are not stored for extracted metadata, so create a new value tree node now
606 int value_index = line.indexOf(">", name_index) + ">".length();
607 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
608
609 metadata_element.addMetadataValue(metadata_element_value);
610 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
611
612 // Add the new metadata value to the list
613 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
614 metadata_values.add(metadata_value);
615 }
616
617 buffered_reader.close();
618 }
619 catch (FileNotFoundException exception) {
620 DebugStream.printStackTrace(exception);
621 }
622 catch (IOException exception) {
623 DebugStream.printStackTrace(exception);
624 }
625
626 return metadata_values;
627 }
628
629 */
630
631 /**
632 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
633 * - To build a mapping from source file to its corresponding doc.xml file
634 * - To get a complete list of all extracted metadata elements
635 */
636 /*
637 public void skimFile()
638 {
639 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
640
641 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
642 DebugStream.println("Skimming " + this + "...");
643 try {
644 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
645 int description_element_start = -1;
646
647 String line = null;
648 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
649 // This line contains the start of a Description element
650 if (line.indexOf("<Description>") != -1) {
651 if (description_element_start != -1) {
652 System.err.println("Parse error: previous Description element unfinished!");
653 }
654 description_element_start = line_num;
655 continue;
656 }
657
658 // This line contains the end of a Description element
659 if (line.indexOf("</Description>") != -1) {
660 if (description_element_start == -1) {
661 System.err.println("Parse error: Description element unstarted!");
662 }
663 description_element_start = -1;
664 continue;
665 }
666
667 // If we're not in a Description element there shouldn't be any Metadata elements
668 if (description_element_start == -1) {
669 continue;
670 }
671
672 // This line doesn't contain a Metadata element, so we're not interested
673 if (line.indexOf("<Metadata ") == -1) {
674 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
675 continue;
676 }
677
678 // Extract the metadata element name
679 int name_index = line.indexOf(" name=\"") + " name=\"".length();
680 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
681
682 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
683 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
684 if (!metadata_set_namespace.equals("")) {
685 continue;
686 }
687
688 // Extracted metadata!
689 String metadata_element_name = metadata_element_name_full;
690
691 // Note which file this doc.xml is for
692 if (metadata_element_name.equals("gsdlsourcefilename")) {
693 // Extract the gsdlsourcefilename element value
694 int value_index = line.indexOf(">", name_index) + ">".length();
695 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
696
697 // We're only interested in the path relative to the import folder
698 int import_index = gsdlsourcefilename_value.indexOf("import");
699 if (import_index != -1) {
700 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
701
702 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
703 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
704
705 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
706 // This is stored in the System's file.encoding property.
707 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
708
709 // Make sure the path matches the OS that is running
710 if (is_unix_path && Utility.isWindows()) {
711 // Convert path from Unix to Windows
712 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
713 }
714 else if (!is_unix_path && !Utility.isWindows()) {
715 // Convert path from Windows to Unix
716 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
717 }
718
719 // Remember this for quick access later
720 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
721 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
722 }
723
724 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
725 }
726
727 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
728 // This is true when the source files come from a zip file processed by ZIPPlug, for example
729 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
730 // We don't really know what is going on...
731 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
732 }
733 }
734
735 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
736 if (metadata_element_name.startsWith("gsdl")) {
737 continue;
738 }
739
740 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
741 if (metadata_element == null) {
742 // This element isn't defined in ex.mds, so create it for this session
743 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
744 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
745 }
746 }
747
748 buffered_reader.close();
749 }
750 catch (FileNotFoundException exception) {
751 DebugStream.printStackTrace(exception);
752 }
753 catch (IOException exception) {
754 DebugStream.printStackTrace(exception);
755 }
756 }
757 */
758
759}
Note: See TracBrowser for help on using the repository browser.