source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 17095

Last change on this file since 17095 was 17095, checked in by ak19, 16 years ago

FLI exports a separate docMETS xml file representing a collection object for ingestion into Fedora. Though the gsdlsourcefilename of this collection object does not refer to the import directory, there's no need to print a warning about this as it is desired behaviour.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.0 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36
/** This class represents one doc.xml file (GreenstoneArchive format) or, through a subclass, one docmets.xml file (METS format) */
38
39public abstract class DocXMLFile extends File
40{
/**
 * Maps a source file path (relative to the import folder) to an ArrayList of Integer
 * line numbers, each being the line in this file on which a MetadataWrap element for
 * that source file starts. Filled in by skimFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the element that wraps a group of metadata items (searched for as "<name>" and "</name>"). */
protected final String MetadataWrap;
/** Name of the element that holds a single metadata item (searched for as "<name " and "</name>"). */
protected final String MetadataItem;

/**
 * Creates a representation of the file at the given path.
 *
 * @param doc_xml_file_path path of the file, passed straight to the File constructor
 * @param metaWrap name of the element wrapping a group of metadata items
 * @param metaItem name of the element holding one metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    MetadataWrap = metaWrap;
    MetadataItem = metaItem;
}
52
53
54 public ArrayList getMetadataExtractedFromFile(File file)
55 {
56 // Build up a list of metadata extracted from this file
57 ArrayList metadata_values = new ArrayList();
58
59 String file_relative_path = file.getAbsolutePath();
60 int import_index = file_relative_path.indexOf("import");
61 if (import_index != -1) {
62 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
63 }
64
65 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
66 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
67 if (description_elements_list == null) {
68 // ...it doesn't
69 return metadata_values;
70 }
71
72 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
73
74 // Parse the file
75 DebugStream.println("Applicable file: " + this);
76 try {
77 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
78
79 int description_element_num = 0;
80 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
81 boolean in_relevant_description_element = false;
82
83 String line = null;
84 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
85 // Check if this line contains the start of a relevant "Description" element
86 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
87 if (line_num == next_description_element_start) {
88 in_relevant_description_element = true;
89 continue;
90 }
91
92 // If we're not in a relevant Description element we don't care about anything
93 if (in_relevant_description_element == false) {
94 continue;
95 }
96
97 // Check if this line contains the end of the relevant Description element
98 if (line.indexOf("</"+MetadataWrap+">") != -1) {
99 description_element_num++;
100 if (description_element_num == description_elements_list.size()) {
101 break;
102 }
103
104 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
105 in_relevant_description_element = false;
106 continue;
107 }
108
109 // If this line doesn't contain a complete Metadata element, we're not interested
110 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
111 continue;
112 }
113
114 // Extract the metadata element name
115 int name_index = line.indexOf(" name=\"") + " name=\"".length();
116 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
117
118 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
119 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
120 if (!metadata_set_namespace.equals("")) {
121 continue;
122 }
123
124 // Extracted metadata!
125 String metadata_element_name = metadata_element_name_full;
126
127 // We completely ignore bibliographic data
128 if (metadata_element_name.equals("SourceSegment")) {
129 buffered_reader.close();
130 return new ArrayList();
131 }
132
133 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
134 if (metadata_element_name.startsWith("gsdl")) {
135 continue;
136 }
137
138 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
139
140 // Value trees are not stored for extracted metadata, so create a new value tree node now
141 int value_index = line.indexOf(">", name_index) + ">".length();
142 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
143
144 metadata_element.addMetadataValue(metadata_element_value);
145 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
146
147 // Add the new metadata value to the list
148 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
149 metadata_values.add(metadata_value);
150 }
151
152 buffered_reader.close();
153 }
154 catch (FileNotFoundException exception) {
155 DebugStream.printStackTrace(exception);
156 }
157 catch (IOException exception) {
158 DebugStream.printStackTrace(exception);
159 }
160
161 return metadata_values;
162 }
163
164
165
166
167 /**
168 * Every file must be skimmed when a collection is opened, for two reasons:
169 * - To build a mapping from source file to its corresponding doc.xml file
170 * - To get a complete list of all extracted metadata elements
171 */
172 public void skimFile()
173 {
174 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
175
176 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
177 DebugStream.println("Skimming " + this + "...");
178 try {
179 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
180 int description_element_start = -1;
181
182 String line = null;
183 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184 // This line contains the start of a "MetadataWrap" element
185 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186 if (line.indexOf("<"+MetadataWrap+">") != -1) {
187 if (description_element_start != -1) {
188 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
189 }
190 description_element_start = line_num;
191 continue;
192 }
193
194 // This line contains the end of a "MetadataWrap" element
195 if (line.indexOf("</"+MetadataWrap+">") != -1) {
196 if (description_element_start == -1) {
197 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
198 }
199 description_element_start = -1;
200 continue;
201 }
202
203 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
204 if (description_element_start == -1) {
205 continue;
206 }
207
208 // This line doesn't contain a Metadata element, so we're not interested
209 if (line.indexOf("<"+MetadataItem+" ") == -1) {
210 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
211 continue;
212 }
213
214 // Extract the metadata element name
215 int name_index = line.indexOf(" name=\"") + " name=\"".length();
216 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
217
218 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
219 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220 if (!metadata_set_namespace.equals("")) {
221 continue;
222 }
223
224 // Extracted metadata!
225 String metadata_element_name = metadata_element_name_full;
226
227 // Note which file this is for
228 if (metadata_element_name.equals("gsdlsourcefilename")) {
229 // Extract the gsdlsourcefilename element value
230 int value_index = line.indexOf(">", name_index) + ">".length();
231 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
232
233 // We're only interested in the path relative to the import folder
234 int import_index = gsdlsourcefilename_value.indexOf("import");
235 if (import_index != -1) {
236 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
237
238 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
239 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
240
241 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
242 // This is stored in the System's file.encoding property.
243 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
244
245 // Make sure the path matches the OS that is running
246 if (is_unix_path && Utility.isWindows()) {
247 // Convert path from Unix to Windows
248 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
249 }
250 else if (!is_unix_path && !Utility.isWindows()) {
251 // Convert path from Windows to Unix
252 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
253 }
254
255 // Remember this for quick access later
256 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
257 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
258 }
259
260 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
261 }
262
263 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
264 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
265 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
266 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
267 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
268 && !gsdlsourcefilename_value.endsWith("collect.cfg")
269 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
270 // We don't really know what is going on...
271 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
272 }
273 }
274
275 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
276 if (metadata_element_name.startsWith("gsdl")) {
277 continue;
278 }
279
280 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
281 if (metadata_element == null) {
282 // This element isn't defined in ex.mds, so create it for this session
283 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
284 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
285 }
286 }
287
288 buffered_reader.close();
289 }
290 catch (FileNotFoundException exception) {
291 DebugStream.printStackTrace(exception);
292 }
293 catch (IOException exception) {
294 DebugStream.printStackTrace(exception);
295 }
296 }
297
298
299 /*
300 public ArrayList getMetadataExtractedFromFile(File file)
301 {
302 // Build up a list of metadata extracted from this file
303 ArrayList metadata_values = new ArrayList();
304
305 String file_relative_path = file.getAbsolutePath();
306 int import_index = file_relative_path.indexOf("import");
307 if (import_index != -1) {
308 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
309 }
310
311 // Check whether this doc.xml file contains extracted metadata for the specified file
312 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
313 if (description_elements_list == null) {
314 // ...it doesn't
315 return metadata_values;
316 }
317
318 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
319
320 // Parse the doc.xml file
321 DebugStream.println("Applicable doc.xml file: " + this);
322 try {
323 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
324
325 int description_element_num = 0;
326 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
327 boolean in_relevant_description_element = false;
328
329 String line = null;
330 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
331 // Check if this line contains the start of a relevant Description element
332 if (line_num == next_description_element_start) {
333 in_relevant_description_element = true;
334 continue;
335 }
336
337 // If we're not in a relevant Description element we don't care about anything
338 if (in_relevant_description_element == false) {
339 continue;
340 }
341
342 // Check if this line contains the end of the relevant Description element
343 if (line.indexOf("</Description>") != -1) {
344 description_element_num++;
345 if (description_element_num == description_elements_list.size()) {
346 break;
347 }
348
349 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
350 in_relevant_description_element = false;
351 continue;
352 }
353
354 // If this line doesn't contain a complete Metadata element, we're not interested
355 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
356 continue;
357 }
358
359 // Extract the metadata element name
360 int name_index = line.indexOf(" name=\"") + " name=\"".length();
361 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
362
363 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
364 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
365 if (!metadata_set_namespace.equals("")) {
366 continue;
367 }
368
369 // Extracted metadata!
370 String metadata_element_name = metadata_element_name_full;
371
372 // We completely ignore bibliographic data
373 if (metadata_element_name.equals("SourceSegment")) {
374 buffered_reader.close();
375 return new ArrayList();
376 }
377
378 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
379 if (metadata_element_name.startsWith("gsdl")) {
380 continue;
381 }
382
383 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
384
385 // Value trees are not stored for extracted metadata, so create a new value tree node now
386 int value_index = line.indexOf(">", name_index) + ">".length();
387 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
388
389 metadata_element.addMetadataValue(metadata_element_value);
390 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
391
392 // Add the new metadata value to the list
393 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
394 metadata_values.add(metadata_value);
395 }
396
397 buffered_reader.close();
398 }
399 catch (FileNotFoundException exception) {
400 DebugStream.printStackTrace(exception);
401 }
402 catch (IOException exception) {
403 DebugStream.printStackTrace(exception);
404 }
405
406 return metadata_values;
407 }
408
409 */
410
411 /**
412 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
413 * - To build a mapping from source file to its corresponding doc.xml file
414 * - To get a complete list of all extracted metadata elements
415 */
416 /*
417 public void skimFile()
418 {
419 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
420
421 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
422 DebugStream.println("Skimming " + this + "...");
423 try {
424 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
425 int description_element_start = -1;
426
427 String line = null;
428 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
429 // This line contains the start of a Description element
430 if (line.indexOf("<Description>") != -1) {
431 if (description_element_start != -1) {
432 System.err.println("Parse error: previous Description element unfinished!");
433 }
434 description_element_start = line_num;
435 continue;
436 }
437
438 // This line contains the end of a Description element
439 if (line.indexOf("</Description>") != -1) {
440 if (description_element_start == -1) {
441 System.err.println("Parse error: Description element unstarted!");
442 }
443 description_element_start = -1;
444 continue;
445 }
446
447 // If we're not in a Description element there shouldn't be any Metadata elements
448 if (description_element_start == -1) {
449 continue;
450 }
451
452 // This line doesn't contain a Metadata element, so we're not interested
453 if (line.indexOf("<Metadata ") == -1) {
454 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
455 continue;
456 }
457
458 // Extract the metadata element name
459 int name_index = line.indexOf(" name=\"") + " name=\"".length();
460 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
461
462 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
463 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
464 if (!metadata_set_namespace.equals("")) {
465 continue;
466 }
467
468 // Extracted metadata!
469 String metadata_element_name = metadata_element_name_full;
470
471 // Note which file this doc.xml is for
472 if (metadata_element_name.equals("gsdlsourcefilename")) {
473 // Extract the gsdlsourcefilename element value
474 int value_index = line.indexOf(">", name_index) + ">".length();
475 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
476
477 // We're only interested in the path relative to the import folder
478 int import_index = gsdlsourcefilename_value.indexOf("import");
479 if (import_index != -1) {
480 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
481
482 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
483 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
484
485 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
486 // This is stored in the System's file.encoding property.
487 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
488
489 // Make sure the path matches the OS that is running
490 if (is_unix_path && Utility.isWindows()) {
491 // Convert path from Unix to Windows
492 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
493 }
494 else if (!is_unix_path && !Utility.isWindows()) {
495 // Convert path from Windows to Unix
496 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
497 }
498
499 // Remember this for quick access later
500 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
501 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
502 }
503
504 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
505 }
506
507 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
508 // This is true when the source files come from a zip file processed by ZIPPlug, for example
509 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
510 // We don't really know what is going on...
511 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
512 }
513 }
514
515 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
516 if (metadata_element_name.startsWith("gsdl")) {
517 continue;
518 }
519
520 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
521 if (metadata_element == null) {
522 // This element isn't defined in ex.mds, so create it for this session
523 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
524 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
525 }
526 }
527
528 buffered_reader.close();
529 }
530 catch (FileNotFoundException exception) {
531 DebugStream.printStackTrace(exception);
532 }
533 catch (IOException exception) {
534 DebugStream.printStackTrace(exception);
535 }
536 }
537 */
538
539}
Note: See TracBrowser for help on using the repository browser.