source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 17014

Last change on this file since 17014 was 17014, checked in by ak19, 16 years ago
  1. Made MetadataWrap and MetadataItem members final rather than static, now they are passed by subclasses to the superclass constructor (DocXMLFile). 2. Skip warning on gsdlsourcefilename etc/collect.cfg since this occurs when working with FLI and is not an error.
  • Property svn:keywords set to Author Date Id Revision
File size: 20.8 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36
37/** This class represents one doc.xml file */
38
39public abstract class DocXMLFile extends File
40{
41 protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
42
43 protected final String MetadataWrap;
44 protected final String MetadataItem;
45
46 public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
47 {
48 super(doc_xml_file_path);
49 this.MetadataWrap = metaWrap;
50 this.MetadataItem = metaItem;
51 }
52
53
54 public ArrayList getMetadataExtractedFromFile(File file)
55 {
56 // Build up a list of metadata extracted from this file
57 ArrayList metadata_values = new ArrayList();
58
59 String file_relative_path = file.getAbsolutePath();
60 int import_index = file_relative_path.indexOf("import");
61 if (import_index != -1) {
62 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
63 }
64
65 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
66 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
67 if (description_elements_list == null) {
68 // ...it doesn't
69 return metadata_values;
70 }
71
72 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
73
74 // Parse the file
75 DebugStream.println("Applicable file: " + this);
76 try {
77 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
78
79 int description_element_num = 0;
80 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
81 boolean in_relevant_description_element = false;
82
83 String line = null;
84 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
85 // Check if this line contains the start of a relevant "Description" element
86 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
87 if (line_num == next_description_element_start) {
88 in_relevant_description_element = true;
89 continue;
90 }
91
92 // If we're not in a relevant Description element we don't care about anything
93 if (in_relevant_description_element == false) {
94 continue;
95 }
96
97 // Check if this line contains the end of the relevant Description element
98 if (line.indexOf("</"+MetadataWrap+">") != -1) {
99 description_element_num++;
100 if (description_element_num == description_elements_list.size()) {
101 break;
102 }
103
104 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
105 in_relevant_description_element = false;
106 continue;
107 }
108
109 // If this line doesn't contain a complete Metadata element, we're not interested
110 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
111 continue;
112 }
113
114 // Extract the metadata element name
115 int name_index = line.indexOf(" name=\"") + " name=\"".length();
116 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
117
118 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
119 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
120 if (!metadata_set_namespace.equals("")) {
121 continue;
122 }
123
124 // Extracted metadata!
125 String metadata_element_name = metadata_element_name_full;
126
127 // We completely ignore bibliographic data
128 if (metadata_element_name.equals("SourceSegment")) {
129 buffered_reader.close();
130 return new ArrayList();
131 }
132
133 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
134 if (metadata_element_name.startsWith("gsdl")) {
135 continue;
136 }
137
138 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
139
140 // Value trees are not stored for extracted metadata, so create a new value tree node now
141 int value_index = line.indexOf(">", name_index) + ">".length();
142 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
143
144 metadata_element.addMetadataValue(metadata_element_value);
145 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
146
147 // Add the new metadata value to the list
148 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
149 metadata_values.add(metadata_value);
150 }
151
152 buffered_reader.close();
153 }
154 catch (FileNotFoundException exception) {
155 DebugStream.printStackTrace(exception);
156 }
157 catch (IOException exception) {
158 DebugStream.printStackTrace(exception);
159 }
160
161 return metadata_values;
162 }
163
164
165
166
167 /**
168 * Every file must be skimmed when a collection is opened, for two reasons:
169 * - To build a mapping from source file to its corresponding doc.xml file
170 * - To get a complete list of all extracted metadata elements
171 */
172 public void skimFile()
173 {
174 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
175
176 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
177 DebugStream.println("Skimming " + this + "...");
178 try {
179 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
180 int description_element_start = -1;
181
182 String line = null;
183 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184 // This line contains the start of a "MetadataWrap" element
185 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186 if (line.indexOf("<"+MetadataWrap+">") != -1) {
187 if (description_element_start != -1) {
188 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
189 }
190 description_element_start = line_num;
191 continue;
192 }
193
194 // This line contains the end of a "MetadataWrap" element
195 if (line.indexOf("</"+MetadataWrap+">") != -1) {
196 if (description_element_start == -1) {
197 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
198 }
199 description_element_start = -1;
200 continue;
201 }
202
203 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
204 if (description_element_start == -1) {
205 continue;
206 }
207
208 // This line doesn't contain a Metadata element, so we're not interested
209 if (line.indexOf("<"+MetadataItem+" ") == -1) {
210 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
211 continue;
212 }
213
214 // Extract the metadata element name
215 int name_index = line.indexOf(" name=\"") + " name=\"".length();
216 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
217
218 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
219 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220 if (!metadata_set_namespace.equals("")) {
221 continue;
222 }
223
224 // Extracted metadata!
225 String metadata_element_name = metadata_element_name_full;
226
227 // Note which file this is for
228 if (metadata_element_name.equals("gsdlsourcefilename")) {
229 // Extract the gsdlsourcefilename element value
230 int value_index = line.indexOf(">", name_index) + ">".length();
231 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
232
233 // We're only interested in the path relative to the import folder
234 int import_index = gsdlsourcefilename_value.indexOf("import");
235 if (import_index != -1) {
236 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
237
238 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
239 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
240
241 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
242 // This is stored in the System's file.encoding property.
243 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
244
245 // Make sure the path matches the OS that is running
246 if (is_unix_path && Utility.isWindows()) {
247 // Convert path from Unix to Windows
248 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
249 }
250 else if (!is_unix_path && !Utility.isWindows()) {
251 // Convert path from Windows to Unix
252 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
253 }
254
255 // Remember this for quick access later
256 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
257 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
258 }
259
260 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
261 }
262
263 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
264 // or (as in the case of using FLI) if it is the etc/collect.cfg file
265 // This is true when the source files come from a zip file processed by ZIPPlug, for example
266 else if (gsdlsourcefilename_value.indexOf("tmp") == -1 && !gsdlsourcefilename_value.endsWith("collect.cfg")) {
267 // We don't really know what is going on...
268 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
269 }
270 }
271
272 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
273 if (metadata_element_name.startsWith("gsdl")) {
274 continue;
275 }
276
277 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
278 if (metadata_element == null) {
279 // This element isn't defined in ex.mds, so create it for this session
280 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
281 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
282 }
283 }
284
285 buffered_reader.close();
286 }
287 catch (FileNotFoundException exception) {
288 DebugStream.printStackTrace(exception);
289 }
290 catch (IOException exception) {
291 DebugStream.printStackTrace(exception);
292 }
293 }
294
295
296 /*
297 public ArrayList getMetadataExtractedFromFile(File file)
298 {
299 // Build up a list of metadata extracted from this file
300 ArrayList metadata_values = new ArrayList();
301
302 String file_relative_path = file.getAbsolutePath();
303 int import_index = file_relative_path.indexOf("import");
304 if (import_index != -1) {
305 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
306 }
307
308 // Check whether this doc.xml file contains extracted metadata for the specified file
309 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
310 if (description_elements_list == null) {
311 // ...it doesn't
312 return metadata_values;
313 }
314
315 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
316
317 // Parse the doc.xml file
318 DebugStream.println("Applicable doc.xml file: " + this);
319 try {
320 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
321
322 int description_element_num = 0;
323 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
324 boolean in_relevant_description_element = false;
325
326 String line = null;
327 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
328 // Check if this line contains the start of a relevant Description element
329 if (line_num == next_description_element_start) {
330 in_relevant_description_element = true;
331 continue;
332 }
333
334 // If we're not in a relevant Description element we don't care about anything
335 if (in_relevant_description_element == false) {
336 continue;
337 }
338
339 // Check if this line contains the end of the relevant Description element
340 if (line.indexOf("</Description>") != -1) {
341 description_element_num++;
342 if (description_element_num == description_elements_list.size()) {
343 break;
344 }
345
346 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
347 in_relevant_description_element = false;
348 continue;
349 }
350
351 // If this line doesn't contain a complete Metadata element, we're not interested
352 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
353 continue;
354 }
355
356 // Extract the metadata element name
357 int name_index = line.indexOf(" name=\"") + " name=\"".length();
358 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
359
360 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
361 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
362 if (!metadata_set_namespace.equals("")) {
363 continue;
364 }
365
366 // Extracted metadata!
367 String metadata_element_name = metadata_element_name_full;
368
369 // We completely ignore bibliographic data
370 if (metadata_element_name.equals("SourceSegment")) {
371 buffered_reader.close();
372 return new ArrayList();
373 }
374
375 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
376 if (metadata_element_name.startsWith("gsdl")) {
377 continue;
378 }
379
380 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
381
382 // Value trees are not stored for extracted metadata, so create a new value tree node now
383 int value_index = line.indexOf(">", name_index) + ">".length();
384 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
385
386 metadata_element.addMetadataValue(metadata_element_value);
387 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
388
389 // Add the new metadata value to the list
390 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
391 metadata_values.add(metadata_value);
392 }
393
394 buffered_reader.close();
395 }
396 catch (FileNotFoundException exception) {
397 DebugStream.printStackTrace(exception);
398 }
399 catch (IOException exception) {
400 DebugStream.printStackTrace(exception);
401 }
402
403 return metadata_values;
404 }
405
406 */
407
408 /**
409 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
410 * - To build a mapping from source file to its corresponding doc.xml file
411 * - To get a complete list of all extracted metadata elements
412 */
413 /*
414 public void skimFile()
415 {
416 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
417
418 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
419 DebugStream.println("Skimming " + this + "...");
420 try {
421 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
422 int description_element_start = -1;
423
424 String line = null;
425 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
426 // This line contains the start of a Description element
427 if (line.indexOf("<Description>") != -1) {
428 if (description_element_start != -1) {
429 System.err.println("Parse error: previous Description element unfinished!");
430 }
431 description_element_start = line_num;
432 continue;
433 }
434
435 // This line contains the end of a Description element
436 if (line.indexOf("</Description>") != -1) {
437 if (description_element_start == -1) {
438 System.err.println("Parse error: Description element unstarted!");
439 }
440 description_element_start = -1;
441 continue;
442 }
443
444 // If we're not in a Description element there shouldn't be any Metadata elements
445 if (description_element_start == -1) {
446 continue;
447 }
448
449 // This line doesn't contain a Metadata element, so we're not interested
450 if (line.indexOf("<Metadata ") == -1) {
451 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
452 continue;
453 }
454
455 // Extract the metadata element name
456 int name_index = line.indexOf(" name=\"") + " name=\"".length();
457 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
458
459 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
460 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
461 if (!metadata_set_namespace.equals("")) {
462 continue;
463 }
464
465 // Extracted metadata!
466 String metadata_element_name = metadata_element_name_full;
467
468 // Note which file this doc.xml is for
469 if (metadata_element_name.equals("gsdlsourcefilename")) {
470 // Extract the gsdlsourcefilename element value
471 int value_index = line.indexOf(">", name_index) + ">".length();
472 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
473
474 // We're only interested in the path relative to the import folder
475 int import_index = gsdlsourcefilename_value.indexOf("import");
476 if (import_index != -1) {
477 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
478
479 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
480 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
481
482 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
483 // This is stored in the System's file.encoding property.
484 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
485
486 // Make sure the path matches the OS that is running
487 if (is_unix_path && Utility.isWindows()) {
488 // Convert path from Unix to Windows
489 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
490 }
491 else if (!is_unix_path && !Utility.isWindows()) {
492 // Convert path from Windows to Unix
493 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
494 }
495
496 // Remember this for quick access later
497 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
498 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
499 }
500
501 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
502 }
503
504 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
505 // This is true when the source files come from a zip file processed by ZIPPlug, for example
506 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
507 // We don't really know what is going on...
508 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
509 }
510 }
511
512 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
513 if (metadata_element_name.startsWith("gsdl")) {
514 continue;
515 }
516
517 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
518 if (metadata_element == null) {
519 // This element isn't defined in ex.mds, so create it for this session
520 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
521 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
522 }
523 }
524
525 buffered_reader.close();
526 }
527 catch (FileNotFoundException exception) {
528 DebugStream.printStackTrace(exception);
529 }
530 catch (IOException exception) {
531 DebugStream.printStackTrace(exception);
532 }
533 }
534 */
535
536}
Note: See TracBrowser for help on using the repository browser.