source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 22314

Last change on this file since 22314 was 22314, checked in by kjdon, 14 years ago

We are now extracting some metadata with the ex. prefix present, e.g. File.FileType from EmbeddedMetadataPlugin. Store this as ex.File.FileType so that GLI can pick it up as extracted metadata. Modified the routines so that we look for both no namespace and the ex. namespace when getting extracted metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.3 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36
37/** This class represents one doc.xml file */
38
39public abstract class DocXMLFile extends File
40{
/**
 * Maps a source file path (relative to the import folder) to an ArrayList of Integer
 * line numbers, each being the line of this file on which a metadata-wrapping element
 * for that source file starts. Populated by skimFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the element that wraps a group of metadata items (mets:xmlData in METS parlance, Description in GreenstoneArchive format). */
protected final String MetadataWrap;
/** Name of the element that carries a single metadata item (it has a name attribute and a text value). */
protected final String MetadataItem;

/**
 * Creates a representation of one doc.xml-style file.
 *
 * @param doc_xml_file_path the path of the file on disk
 * @param metaWrap the name of the element wrapping each group of metadata items
 * @param metaItem the name of the element holding one metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    MetadataWrap = metaWrap;
    MetadataItem = metaItem;
}
52
53
54 public ArrayList getMetadataExtractedFromFile(File file)
55 {
56 // Build up a list of metadata extracted from this file
57 ArrayList metadata_values = new ArrayList();
58
59 String file_relative_path = file.getAbsolutePath();
60 int import_index = file_relative_path.indexOf("import");
61 if (import_index != -1) {
62 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
63 }
64
65 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
66 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
67 if (description_elements_list == null) {
68 // ...it doesn't
69 return metadata_values;
70 }
71
72 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
73
74 // Parse the file
75 DebugStream.println("Applicable file: " + this);
76 try {
77 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
78
79 int description_element_num = 0;
80 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
81 boolean in_relevant_description_element = false;
82
83 String line = null;
84 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
85 // Check if this line contains the start of a relevant "Description" element
86 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
87 if (line_num == next_description_element_start) {
88 in_relevant_description_element = true;
89 continue;
90 }
91
92 // If we're not in a relevant Description element we don't care about anything
93 if (in_relevant_description_element == false) {
94 continue;
95 }
96
97 // Check if this line contains the end of the relevant Description element
98 if (line.indexOf("</"+MetadataWrap+">") != -1) {
99 description_element_num++;
100 if (description_element_num == description_elements_list.size()) {
101 break;
102 }
103
104 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
105 in_relevant_description_element = false;
106 continue;
107 }
108
109 // If this line doesn't contain a complete Metadata element, we're not interested
110 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
111 continue;
112 }
113
114 // Extract the metadata element name
115 int name_index = line.indexOf(" name=\"") + " name=\"".length();
116 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
117
118 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
119 // Actually, if it is ex. then we are interested
120 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
121
122 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
123 continue;
124 }
125
126 // Extracted metadata!
127 // do it like this just in case we have ex.
128 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
129
130 // We completely ignore bibliographic data
131 if (metadata_element_name.equals("SourceSegment")) {
132 buffered_reader.close();
133 return new ArrayList();
134 }
135
136 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
137 if (metadata_element_name.startsWith("gsdl")) {
138 continue;
139 }
140
141 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
142
143 // Value trees are not stored for extracted metadata, so create a new value tree node now
144 int value_index = line.indexOf(">", name_index) + ">".length();
145 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
146
147 metadata_element.addMetadataValue(metadata_element_value);
148 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
149
150 // Add the new metadata value to the list
151 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
152 metadata_values.add(metadata_value);
153 }
154
155 buffered_reader.close();
156 }
157 catch (FileNotFoundException exception) {
158 DebugStream.printStackTrace(exception);
159 }
160 catch (IOException exception) {
161 DebugStream.printStackTrace(exception);
162 }
163
164 return metadata_values;
165 }
166
167
168
169
170 /**
171 * Every file must be skimmed when a collection is opened, for two reasons:
172 * - To build a mapping from source file to its corresponding doc.xml file
173 * - To get a complete list of all extracted metadata elements
174 */
175 public void skimFile()
176 {
177 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
178
179 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
180 DebugStream.println("Skimming " + this + "...");
181 try {
182 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
183 int description_element_start = -1;
184
185 String line = null;
186 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
187 // This line contains the start of a "MetadataWrap" element
188 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
189 if (line.indexOf("<"+MetadataWrap+">") != -1) {
190 if (description_element_start != -1) {
191 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
192 }
193 description_element_start = line_num;
194 continue;
195 }
196
197 // This line contains the end of a "MetadataWrap" element
198 if (line.indexOf("</"+MetadataWrap+">") != -1) {
199 if (description_element_start == -1) {
200 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
201 }
202 description_element_start = -1;
203 continue;
204 }
205
206 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
207 if (description_element_start == -1) {
208 continue;
209 }
210
211 // This line doesn't contain a Metadata element, so we're not interested
212 if (line.indexOf("<"+MetadataItem+" ") == -1) {
213 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
214 continue;
215 }
216
217 // Extract the metadata element name
218 int name_index = line.indexOf(" name=\"") + " name=\"".length();
219 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
220
221 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
222 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
223 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
224 continue;
225 }
226
227 // Extracted metadata! May have ex. so make sure we remove that
228 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
229 // Note which file this is for
230 if (metadata_element_name.equals("gsdlsourcefilename")) {
231 // Extract the gsdlsourcefilename element value
232 int value_index = line.indexOf(">", name_index) + ">".length();
233 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
234
235 // We're only interested in the path relative to the import folder
236 int import_index = gsdlsourcefilename_value.indexOf("import");
237 if (import_index != -1) {
238 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
239
240 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
241 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
242
243 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
244 // This is stored in the System's file.encoding property.
245 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
246
247 // Make sure the path matches the OS that is running
248 if (is_unix_path && Utility.isWindows()) {
249 // Convert path from Unix to Windows
250 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
251 }
252 else if (!is_unix_path && !Utility.isWindows()) {
253 // Convert path from Windows to Unix
254 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
255 }
256
257 // Remember this for quick access later
258 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
259 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
260 }
261
262 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
263 }
264
265 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
266 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
267 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
268 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
269 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
270 && !gsdlsourcefilename_value.endsWith("collect.cfg")
271 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
272 // We don't really know what is going on...
273 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
274 }
275 }
276
277 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
278 if (metadata_element_name.startsWith("gsdl")) {
279 continue;
280 }
281
282 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
283 if (metadata_element == null) {
284 // This element isn't defined in ex.mds, so create it for this session
285 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
286 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
287 }
288 }
289
290 buffered_reader.close();
291 }
292 catch (FileNotFoundException exception) {
293 DebugStream.printStackTrace(exception);
294 }
295 catch (IOException exception) {
296 DebugStream.printStackTrace(exception);
297 }
298 }
299
300
301 /*
302 public ArrayList getMetadataExtractedFromFile(File file)
303 {
304 // Build up a list of metadata extracted from this file
305 ArrayList metadata_values = new ArrayList();
306
307 String file_relative_path = file.getAbsolutePath();
308 int import_index = file_relative_path.indexOf("import");
309 if (import_index != -1) {
310 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
311 }
312
313 // Check whether this doc.xml file contains extracted metadata for the specified file
314 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
315 if (description_elements_list == null) {
316 // ...it doesn't
317 return metadata_values;
318 }
319
320 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
321
322 // Parse the doc.xml file
323 DebugStream.println("Applicable doc.xml file: " + this);
324 try {
325 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
326
327 int description_element_num = 0;
328 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
329 boolean in_relevant_description_element = false;
330
331 String line = null;
332 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
333 // Check if this line contains the start of a relevant Description element
334 if (line_num == next_description_element_start) {
335 in_relevant_description_element = true;
336 continue;
337 }
338
339 // If we're not in a relevant Description element we don't care about anything
340 if (in_relevant_description_element == false) {
341 continue;
342 }
343
344 // Check if this line contains the end of the relevant Description element
345 if (line.indexOf("</Description>") != -1) {
346 description_element_num++;
347 if (description_element_num == description_elements_list.size()) {
348 break;
349 }
350
351 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
352 in_relevant_description_element = false;
353 continue;
354 }
355
356 // If this line doesn't contain a complete Metadata element, we're not interested
357 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
358 continue;
359 }
360
361 // Extract the metadata element name
362 int name_index = line.indexOf(" name=\"") + " name=\"".length();
363 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
364
365 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
366 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
367 if (!metadata_set_namespace.equals("")) {
368 continue;
369 }
370
371 // Extracted metadata!
372 String metadata_element_name = metadata_element_name_full;
373
374 // We completely ignore bibliographic data
375 if (metadata_element_name.equals("SourceSegment")) {
376 buffered_reader.close();
377 return new ArrayList();
378 }
379
380 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
381 if (metadata_element_name.startsWith("gsdl")) {
382 continue;
383 }
384
385 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
386
387 // Value trees are not stored for extracted metadata, so create a new value tree node now
388 int value_index = line.indexOf(">", name_index) + ">".length();
389 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
390
391 metadata_element.addMetadataValue(metadata_element_value);
392 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
393
394 // Add the new metadata value to the list
395 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
396 metadata_values.add(metadata_value);
397 }
398
399 buffered_reader.close();
400 }
401 catch (FileNotFoundException exception) {
402 DebugStream.printStackTrace(exception);
403 }
404 catch (IOException exception) {
405 DebugStream.printStackTrace(exception);
406 }
407
408 return metadata_values;
409 }
410
411 */
412
413 /**
414 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
415 * - To build a mapping from source file to its corresponding doc.xml file
416 * - To get a complete list of all extracted metadata elements
417 */
418 /*
419 public void skimFile()
420 {
421 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
422
423 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
424 DebugStream.println("Skimming " + this + "...");
425 try {
426 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
427 int description_element_start = -1;
428
429 String line = null;
430 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
431 // This line contains the start of a Description element
432 if (line.indexOf("<Description>") != -1) {
433 if (description_element_start != -1) {
434 System.err.println("Parse error: previous Description element unfinished!");
435 }
436 description_element_start = line_num;
437 continue;
438 }
439
440 // This line contains the end of a Description element
441 if (line.indexOf("</Description>") != -1) {
442 if (description_element_start == -1) {
443 System.err.println("Parse error: Description element unstarted!");
444 }
445 description_element_start = -1;
446 continue;
447 }
448
449 // If we're not in a Description element there shouldn't be any Metadata elements
450 if (description_element_start == -1) {
451 continue;
452 }
453
454 // This line doesn't contain a Metadata element, so we're not interested
455 if (line.indexOf("<Metadata ") == -1) {
456 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
457 continue;
458 }
459
460 // Extract the metadata element name
461 int name_index = line.indexOf(" name=\"") + " name=\"".length();
462 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
463
464 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
465 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
466 if (!metadata_set_namespace.equals("")) {
467 continue;
468 }
469
470 // Extracted metadata!
471 String metadata_element_name = metadata_element_name_full;
472
473 // Note which file this doc.xml is for
474 if (metadata_element_name.equals("gsdlsourcefilename")) {
475 // Extract the gsdlsourcefilename element value
476 int value_index = line.indexOf(">", name_index) + ">".length();
477 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
478
479 // We're only interested in the path relative to the import folder
480 int import_index = gsdlsourcefilename_value.indexOf("import");
481 if (import_index != -1) {
482 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
483
484 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
485 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
486
487 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
488 // This is stored in the System's file.encoding property.
489 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
490
491 // Make sure the path matches the OS that is running
492 if (is_unix_path && Utility.isWindows()) {
493 // Convert path from Unix to Windows
494 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
495 }
496 else if (!is_unix_path && !Utility.isWindows()) {
497 // Convert path from Windows to Unix
498 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
499 }
500
501 // Remember this for quick access later
502 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
503 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
504 }
505
506 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
507 }
508
509 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
510 // This is true when the source files come from a zip file processed by ZIPPlug, for example
511 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
512 // We don't really know what is going on...
513 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
514 }
515 }
516
517 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
518 if (metadata_element_name.startsWith("gsdl")) {
519 continue;
520 }
521
522 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
523 if (metadata_element == null) {
524 // This element isn't defined in ex.mds, so create it for this session
525 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
526 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
527 }
528 }
529
530 buffered_reader.close();
531 }
532 catch (FileNotFoundException exception) {
533 DebugStream.printStackTrace(exception);
534 }
535 catch (IOException exception) {
536 DebugStream.printStackTrace(exception);
537 }
538 }
539 */
540
541}
Note: See TracBrowser for help on using the repository browser.