source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 23763

Last change on this file since 23763 was 23763, checked in by sjm84, 13 years ago

Fixed a visual issue in GLI that was preventing some characters from displaying properly in the Enrich metadata table, despite being correct in the file

  • Property svn:keywords set to Author Date Id Revision
File size: 21.4 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36
37/** This class represents one archive metadata file: a doc.xml file or (via a subclass) a docmets.xml file. */
38
39public abstract class DocXMLFile extends File
40{
/**
 * Quick-lookup table filled in by skimFile(): the key is a source file path
 * relative to the import folder, the value is an ArrayList of Integer line
 * numbers at which the wrapper elements for that source file start.
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Tag name of the element that wraps a group of metadata items. */
protected final String MetadataWrap;

/** Tag name of a single metadata item element. */
protected final String MetadataItem;

/**
 * Creates a handle on the file at the given path and records which tag
 * names to look for when scanning it.
 *
 * @param doc_xml_file_path path of the file, passed straight to the File constructor
 * @param metaWrap tag name of the wrapper element
 * @param metaItem tag name of the metadata item element
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    MetadataWrap = metaWrap;
    MetadataItem = metaItem;
}
52
53
54 public ArrayList getMetadataExtractedFromFile(File file)
55 {
56 // Build up a list of metadata extracted from this file
57 ArrayList metadata_values = new ArrayList();
58
59 String file_relative_path = file.getAbsolutePath();
60 int import_index = file_relative_path.indexOf("import");
61 if (import_index != -1) {
62 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
63 }
64
65 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
66 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
67 if (description_elements_list == null) {
68 // ...it doesn't
69 return metadata_values;
70 }
71
72 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
73
74 // Parse the file
75 DebugStream.println("Applicable file: " + this);
76 try {
77 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
78
79 int description_element_num = 0;
80 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
81 boolean in_relevant_description_element = false;
82
83 String line = null;
84 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
85 // Check if this line contains the start of a relevant "Description" element
86 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
87 if (line_num == next_description_element_start) {
88 in_relevant_description_element = true;
89 continue;
90 }
91
92 // If we're not in a relevant Description element we don't care about anything
93 if (in_relevant_description_element == false) {
94 continue;
95 }
96
97 // Check if this line contains the end of the relevant Description element
98 if (line.indexOf("</"+MetadataWrap+">") != -1) {
99 description_element_num++;
100 if (description_element_num == description_elements_list.size()) {
101 break;
102 }
103
104 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
105 in_relevant_description_element = false;
106 continue;
107 }
108
109 // If this line doesn't contain a complete Metadata element, we're not interested
110 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
111 continue;
112 }
113
114 // Extract the metadata element name
115 int name_index = line.indexOf(" name=\"") + " name=\"".length();
116 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
117
118 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
119 // Actually, if it is ex. then we are interested
120 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
121
122 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
123 continue;
124 }
125
126 // Extracted metadata!
127 // do it like this just in case we have ex.
128 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
129
130 // We completely ignore bibliographic data
131 if (metadata_element_name.equals("SourceSegment")) {
132 buffered_reader.close();
133 return new ArrayList();
134 }
135
136 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
137 if (metadata_element_name.startsWith("gsdl")) {
138 continue;
139 }
140
141 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
142
143 // Value trees are not stored for extracted metadata, so create a new value tree node now
144 int value_index = line.indexOf(">", name_index) + ">".length();
145 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
146
147 metadata_element.addMetadataValue(metadata_element_value);
148 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
149
150 // Add the new metadata value to the list
151 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
152 metadata_values.add(metadata_value);
153 }
154
155 buffered_reader.close();
156 }
157 catch (FileNotFoundException exception) {
158 DebugStream.printStackTrace(exception);
159 }
160 catch (IOException exception) {
161 DebugStream.printStackTrace(exception);
162 }
163
164 return metadata_values;
165 }
166
167
168
169
170 /**
171 * Every file must be skimmed when a collection is opened, for two reasons:
172 * - To build a mapping from source file to its corresponding doc.xml file
173 * - To get a complete list of all extracted metadata elements
174 */
175 public void skimFile()
176 {
177 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
178
179 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
180 DebugStream.println("Skimming " + this + "...");
181 try {
182 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
183 int description_element_start = -1;
184
185 String line = null;
186 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
187 // This line contains the start of a "MetadataWrap" element
188 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
189 if (line.indexOf("<"+MetadataWrap+">") != -1) {
190 if (description_element_start != -1) {
191 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
192 }
193 description_element_start = line_num;
194 continue;
195 }
196
197 // This line contains the end of a "MetadataWrap" element
198 if (line.indexOf("</"+MetadataWrap+">") != -1) {
199 if (description_element_start == -1) {
200 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
201 }
202 description_element_start = -1;
203 continue;
204 }
205
206 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
207 if (description_element_start == -1) {
208 continue;
209 }
210
211 // This line doesn't contain a Metadata element, so we're not interested
212 if (line.indexOf("<"+MetadataItem+" ") == -1) {
213 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
214 continue;
215 }
216
217 // Extract the metadata element name
218 int name_index = line.indexOf(" name=\"") + " name=\"".length();
219 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
220
221 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
222 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
223 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
224 continue;
225 }
226
227 // Extracted metadata! May have ex. so make sure we remove that
228 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
229 // Note which file this is for
230 if (metadata_element_name.equals("gsdlsourcefilename")) {
231 // Extract the gsdlsourcefilename element value
232 int value_index = line.indexOf(">", name_index) + ">".length();
233 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
234
235 // We're only interested in the path relative to the import folder
236 int import_index = gsdlsourcefilename_value.indexOf("import");
237 if (import_index != -1) {
238 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
239
240 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
241 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
242
243 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
244 // This is stored in the System's file.encoding property.
245 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
246
247 // Make sure the path matches the OS that is running
248 if (is_unix_path && Utility.isWindows()) {
249 // Convert path from Unix to Windows
250 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
251 }
252 else if (!is_unix_path && !Utility.isWindows()) {
253 // Convert path from Windows to Unix
254 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
255 }
256
257 // Remember this for quick access later
258 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
259 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
260 }
261
262 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
263 }
264
265 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
266 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
267 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
268 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
269 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
270 && !gsdlsourcefilename_value.endsWith("collect.cfg")
271 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
272 // We don't really know what is going on...
273 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
274 }
275 }
276
277 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
278 if (metadata_element_name.startsWith("gsdl")) {
279 continue;
280 }
281
282 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
283 if (metadata_element == null) {
284 // This element isn't defined in ex.mds, so create it for this session
285 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
286 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
287 }
288 }
289
290 buffered_reader.close();
291 }
292 catch (FileNotFoundException exception) {
293 DebugStream.printStackTrace(exception);
294 }
295 catch (IOException exception) {
296 DebugStream.printStackTrace(exception);
297 }
298 }
299
300
301 /*
302 public ArrayList getMetadataExtractedFromFile(File file)
303 {
304 // Build up a list of metadata extracted from this file
305 ArrayList metadata_values = new ArrayList();
306
307 String file_relative_path = file.getAbsolutePath();
308 int import_index = file_relative_path.indexOf("import");
309 if (import_index != -1) {
310 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
311 }
312
313 // Check whether this doc.xml file contains extracted metadata for the specified file
314 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
315 if (description_elements_list == null) {
316 // ...it doesn't
317 return metadata_values;
318 }
319
320 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
321
322 // Parse the doc.xml file
323 DebugStream.println("Applicable doc.xml file: " + this);
324 try {
325 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
326
327 int description_element_num = 0;
328 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
329 boolean in_relevant_description_element = false;
330
331 String line = null;
332 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
333 // Check if this line contains the start of a relevant Description element
334 if (line_num == next_description_element_start) {
335 in_relevant_description_element = true;
336 continue;
337 }
338
339 // If we're not in a relevant Description element we don't care about anything
340 if (in_relevant_description_element == false) {
341 continue;
342 }
343
344 // Check if this line contains the end of the relevant Description element
345 if (line.indexOf("</Description>") != -1) {
346 description_element_num++;
347 if (description_element_num == description_elements_list.size()) {
348 break;
349 }
350
351 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
352 in_relevant_description_element = false;
353 continue;
354 }
355
356 // If this line doesn't contain a complete Metadata element, we're not interested
357 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
358 continue;
359 }
360
361 // Extract the metadata element name
362 int name_index = line.indexOf(" name=\"") + " name=\"".length();
363 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
364
365 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
366 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
367 if (!metadata_set_namespace.equals("")) {
368 continue;
369 }
370
371 // Extracted metadata!
372 String metadata_element_name = metadata_element_name_full;
373
374 // We completely ignore bibliographic data
375 if (metadata_element_name.equals("SourceSegment")) {
376 buffered_reader.close();
377 return new ArrayList();
378 }
379
380 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
381 if (metadata_element_name.startsWith("gsdl")) {
382 continue;
383 }
384
385 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
386
387 // Value trees are not stored for extracted metadata, so create a new value tree node now
388 int value_index = line.indexOf(">", name_index) + ">".length();
389 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
390
391 metadata_element.addMetadataValue(metadata_element_value);
392 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
393
394 // Add the new metadata value to the list
395 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
396 metadata_values.add(metadata_value);
397 }
398
399 buffered_reader.close();
400 }
401 catch (FileNotFoundException exception) {
402 DebugStream.printStackTrace(exception);
403 }
404 catch (IOException exception) {
405 DebugStream.printStackTrace(exception);
406 }
407
408 return metadata_values;
409 }
410
411 */
412
413 /**
414 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
415 * - To build a mapping from source file to its corresponding doc.xml file
416 * - To get a complete list of all extracted metadata elements
417 */
418 /*
419 public void skimFile()
420 {
421 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
422
423 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
424 DebugStream.println("Skimming " + this + "...");
425 try {
426 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
427 int description_element_start = -1;
428
429 String line = null;
430 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
431 // This line contains the start of a Description element
432 if (line.indexOf("<Description>") != -1) {
433 if (description_element_start != -1) {
434 System.err.println("Parse error: previous Description element unfinished!");
435 }
436 description_element_start = line_num;
437 continue;
438 }
439
440 // This line contains the end of a Description element
441 if (line.indexOf("</Description>") != -1) {
442 if (description_element_start == -1) {
443 System.err.println("Parse error: Description element unstarted!");
444 }
445 description_element_start = -1;
446 continue;
447 }
448
449 // If we're not in a Description element there shouldn't be any Metadata elements
450 if (description_element_start == -1) {
451 continue;
452 }
453
454 // This line doesn't contain a Metadata element, so we're not interested
455 if (line.indexOf("<Metadata ") == -1) {
456 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
457 continue;
458 }
459
460 // Extract the metadata element name
461 int name_index = line.indexOf(" name=\"") + " name=\"".length();
462 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
463
464 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
465 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
466 if (!metadata_set_namespace.equals("")) {
467 continue;
468 }
469
470 // Extracted metadata!
471 String metadata_element_name = metadata_element_name_full;
472
473 // Note which file this doc.xml is for
474 if (metadata_element_name.equals("gsdlsourcefilename")) {
475 // Extract the gsdlsourcefilename element value
476 int value_index = line.indexOf(">", name_index) + ">".length();
477 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
478
479 // We're only interested in the path relative to the import folder
480 int import_index = gsdlsourcefilename_value.indexOf("import");
481 if (import_index != -1) {
482 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
483
484 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
485 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
486
487 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
488 // This is stored in the System's file.encoding property.
489 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
490
491 // Make sure the path matches the OS that is running
492 if (is_unix_path && Utility.isWindows()) {
493 // Convert path from Unix to Windows
494 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
495 }
496 else if (!is_unix_path && !Utility.isWindows()) {
497 // Convert path from Windows to Unix
498 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
499 }
500
501 // Remember this for quick access later
502 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
503 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
504 }
505
506 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
507 }
508
509 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
510 // This is true when the source files come from a zip file processed by ZIPPlug, for example
511 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
512 // We don't really know what is going on...
513 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
514 }
515 }
516
517 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
518 if (metadata_element_name.startsWith("gsdl")) {
519 continue;
520 }
521
522 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
523 if (metadata_element == null) {
524 // This element isn't defined in ex.mds, so create it for this session
525 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
526 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
527 }
528 }
529
530 buffered_reader.close();
531 }
532 catch (FileNotFoundException exception) {
533 DebugStream.printStackTrace(exception);
534 }
535 catch (IOException exception) {
536 DebugStream.printStackTrace(exception);
537 }
538 }
539 */
540
541}
Note: See TracBrowser for help on using the repository browser.