Context Navigation

source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 17014

Last change on this file since 17014 was 17014, checked in by ak19, 16 years ago
Made MetadataWrap and MetadataItem members final rather than static, now they are passed by subclasses to the superclass constructor (DocXMLFile). 2. Skip warning on gsdlsourcefilename etc/collect.cfg since this occurs when working with FLI and is not an error.
Property svn:keywords set to `Author Date Id Revision`
File size: 20.8 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.util.Utility;
35
36
37	/** This class represents one doc.xml file */
38
39	public abstract class DocXMLFile extends File
40	{
41	protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
42
43	protected final String MetadataWrap;
44	protected final String MetadataItem;
45
46	public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
47	{
48	super(doc_xml_file_path);
49	this.MetadataWrap = metaWrap;
50	this.MetadataItem = metaItem;
51	}
52
53
54	public ArrayList getMetadataExtractedFromFile(File file)
55	{
56	// Build up a list of metadata extracted from this file
57	ArrayList metadata_values = new ArrayList();
58
59	String file_relative_path = file.getAbsolutePath();
60	int import_index = file_relative_path.indexOf("import");
61	if (import_index != -1) {
62	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
63	}
64
65	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
66	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
67	if (description_elements_list == null) {
68	// ...it doesn't
69	return metadata_values;
70	}
71
72	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
73
74	// Parse the file
75	DebugStream.println("Applicable file: " + this);
76	try {
77	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
78
79	int description_element_num = 0;
80	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
81	boolean in_relevant_description_element = false;
82
83	String line = null;
84	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
85	// Check if this line contains the start of a relevant "Description" element
86	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
87	if (line_num == next_description_element_start) {
88	in_relevant_description_element = true;
89	continue;
90	}
91
92	// If we're not in a relevant Description element we don't care about anything
93	if (in_relevant_description_element == false) {
94	continue;
95	}
96
97	// Check if this line contains the end of the relevant Description element
98	if (line.indexOf("</"+MetadataWrap+">") != -1) {
99	description_element_num++;
100	if (description_element_num == description_elements_list.size()) {
101	break;
102	}
103
104	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
105	in_relevant_description_element = false;
106	continue;
107	}
108
109	// If this line doesn't contain a complete Metadata element, we're not interested
110	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
111	continue;
112	}
113
114	// Extract the metadata element name
115	int name_index = line.indexOf(" name=\"") + " name=\"".length();
116	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
117
118	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
119	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
120	if (!metadata_set_namespace.equals("")) {
121	continue;
122	}
123
124	// Extracted metadata!
125	String metadata_element_name = metadata_element_name_full;
126
127	// We completely ignore bibliographic data
128	if (metadata_element_name.equals("SourceSegment")) {
129	buffered_reader.close();
130	return new ArrayList();
131	}
132
133	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
134	if (metadata_element_name.startsWith("gsdl")) {
135	continue;
136	}
137
138	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
139
140	// Value trees are not stored for extracted metadata, so create a new value tree node now
141	int value_index = line.indexOf(">", name_index) + ">".length();
142	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
143
144	metadata_element.addMetadataValue(metadata_element_value);
145	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
146
147	// Add the new metadata value to the list
148	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
149	metadata_values.add(metadata_value);
150	}
151
152	buffered_reader.close();
153	}
154	catch (FileNotFoundException exception) {
155	DebugStream.printStackTrace(exception);
156	}
157	catch (IOException exception) {
158	DebugStream.printStackTrace(exception);
159	}
160
161	return metadata_values;
162	}
163
164
165
166
167	/**
168	* Every file must be skimmed when a collection is opened, for two reasons:
169	* - To build a mapping from source file to its corresponding doc.xml file
170	* - To get a complete list of all extracted metadata elements
171	*/
172	public void skimFile()
173	{
174	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
175
176	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
177	DebugStream.println("Skimming " + this + "...");
178	try {
179	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
180	int description_element_start = -1;
181
182	String line = null;
183	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184	// This line contains the start of a "MetadataWrap" element
185	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186	if (line.indexOf("<"+MetadataWrap+">") != -1) {
187	if (description_element_start != -1) {
188	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
189	}
190	description_element_start = line_num;
191	continue;
192	}
193
194	// This line contains the end of a "MetadataWrap" element
195	if (line.indexOf("</"+MetadataWrap+">") != -1) {
196	if (description_element_start == -1) {
197	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
198	}
199	description_element_start = -1;
200	continue;
201	}
202
203	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
204	if (description_element_start == -1) {
205	continue;
206	}
207
208	// This line doesn't contain a Metadata element, so we're not interested
209	if (line.indexOf("<"+MetadataItem+" ") == -1) {
210	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
211	continue;
212	}
213
214	// Extract the metadata element name
215	int name_index = line.indexOf(" name=\"") + " name=\"".length();
216	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
217
218	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
219	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220	if (!metadata_set_namespace.equals("")) {
221	continue;
222	}
223
224	// Extracted metadata!
225	String metadata_element_name = metadata_element_name_full;
226
227	// Note which file this is for
228	if (metadata_element_name.equals("gsdlsourcefilename")) {
229	// Extract the gsdlsourcefilename element value
230	int value_index = line.indexOf(">", name_index) + ">".length();
231	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
232
233	// We're only interested in the path relative to the import folder
234	int import_index = gsdlsourcefilename_value.indexOf("import");
235	if (import_index != -1) {
236	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
237
238	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
239	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
240
241	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
242	// This is stored in the System's file.encoding property.
243	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
244
245	// Make sure the path matches the OS that is running
246	if (is_unix_path && Utility.isWindows()) {
247	// Convert path from Unix to Windows
248	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
249	}
250	else if (!is_unix_path && !Utility.isWindows()) {
251	// Convert path from Windows to Unix
252	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
253	}
254
255	// Remember this for quick access later
256	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
257	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
258	}
259
260	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
261	}
262
263	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
264	// or (as in the case of using FLI) if it is the etc/collect.cfg file
265	// This is true when the source files come from a zip file processed by ZIPPlug, for example
266	else if (gsdlsourcefilename_value.indexOf("tmp") == -1 && !gsdlsourcefilename_value.endsWith("collect.cfg")) {
267	// We don't really know what is going on...
268	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
269	}
270	}
271
272	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
273	if (metadata_element_name.startsWith("gsdl")) {
274	continue;
275	}
276
277	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
278	if (metadata_element == null) {
279	// This element isn't defined in ex.mds, so create it for this session
280	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
281	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
282	}
283	}
284
285	buffered_reader.close();
286	}
287	catch (FileNotFoundException exception) {
288	DebugStream.printStackTrace(exception);
289	}
290	catch (IOException exception) {
291	DebugStream.printStackTrace(exception);
292	}
293	}
294
295
296	/*
297	public ArrayList getMetadataExtractedFromFile(File file)
298	{
299	// Build up a list of metadata extracted from this file
300	ArrayList metadata_values = new ArrayList();
301
302	String file_relative_path = file.getAbsolutePath();
303	int import_index = file_relative_path.indexOf("import");
304	if (import_index != -1) {
305	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
306	}
307
308	// Check whether this doc.xml file contains extracted metadata for the specified file
309	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
310	if (description_elements_list == null) {
311	// ...it doesn't
312	return metadata_values;
313	}
314
315	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
316
317	// Parse the doc.xml file
318	DebugStream.println("Applicable doc.xml file: " + this);
319	try {
320	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
321
322	int description_element_num = 0;
323	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
324	boolean in_relevant_description_element = false;
325
326	String line = null;
327	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
328	// Check if this line contains the start of a relevant Description element
329	if (line_num == next_description_element_start) {
330	in_relevant_description_element = true;
331	continue;
332	}
333
334	// If we're not in a relevant Description element we don't care about anything
335	if (in_relevant_description_element == false) {
336	continue;
337	}
338
339	// Check if this line contains the end of the relevant Description element
340	if (line.indexOf("</Description>") != -1) {
341	description_element_num++;
342	if (description_element_num == description_elements_list.size()) {
343	break;
344	}
345
346	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
347	in_relevant_description_element = false;
348	continue;
349	}
350
351	// If this line doesn't contain a complete Metadata element, we're not interested
352	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
353	continue;
354	}
355
356	// Extract the metadata element name
357	int name_index = line.indexOf(" name=\"") + " name=\"".length();
358	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
359
360	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
361	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
362	if (!metadata_set_namespace.equals("")) {
363	continue;
364	}
365
366	// Extracted metadata!
367	String metadata_element_name = metadata_element_name_full;
368
369	// We completely ignore bibliographic data
370	if (metadata_element_name.equals("SourceSegment")) {
371	buffered_reader.close();
372	return new ArrayList();
373	}
374
375	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
376	if (metadata_element_name.startsWith("gsdl")) {
377	continue;
378	}
379
380	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
381
382	// Value trees are not stored for extracted metadata, so create a new value tree node now
383	int value_index = line.indexOf(">", name_index) + ">".length();
384	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
385
386	metadata_element.addMetadataValue(metadata_element_value);
387	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
388
389	// Add the new metadata value to the list
390	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
391	metadata_values.add(metadata_value);
392	}
393
394	buffered_reader.close();
395	}
396	catch (FileNotFoundException exception) {
397	DebugStream.printStackTrace(exception);
398	}
399	catch (IOException exception) {
400	DebugStream.printStackTrace(exception);
401	}
402
403	return metadata_values;
404	}
405
406	*/
407
408	/**
409	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
410	* - To build a mapping from source file to its corresponding doc.xml file
411	* - To get a complete list of all extracted metadata elements
412	*/
413	/*
414	public void skimFile()
415	{
416	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
417
418	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
419	DebugStream.println("Skimming " + this + "...");
420	try {
421	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
422	int description_element_start = -1;
423
424	String line = null;
425	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
426	// This line contains the start of a Description element
427	if (line.indexOf("<Description>") != -1) {
428	if (description_element_start != -1) {
429	System.err.println("Parse error: previous Description element unfinished!");
430	}
431	description_element_start = line_num;
432	continue;
433	}
434
435	// This line contains the end of a Description element
436	if (line.indexOf("</Description>") != -1) {
437	if (description_element_start == -1) {
438	System.err.println("Parse error: Description element unstarted!");
439	}
440	description_element_start = -1;
441	continue;
442	}
443
444	// If we're not in a Description element there shouldn't be any Metadata elements
445	if (description_element_start == -1) {
446	continue;
447	}
448
449	// This line doesn't contain a Metadata element, so we're not interested
450	if (line.indexOf("<Metadata ") == -1) {
451	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
452	continue;
453	}
454
455	// Extract the metadata element name
456	int name_index = line.indexOf(" name=\"") + " name=\"".length();
457	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
458
459	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
460	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
461	if (!metadata_set_namespace.equals("")) {
462	continue;
463	}
464
465	// Extracted metadata!
466	String metadata_element_name = metadata_element_name_full;
467
468	// Note which file this doc.xml is for
469	if (metadata_element_name.equals("gsdlsourcefilename")) {
470	// Extract the gsdlsourcefilename element value
471	int value_index = line.indexOf(">", name_index) + ">".length();
472	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
473
474	// We're only interested in the path relative to the import folder
475	int import_index = gsdlsourcefilename_value.indexOf("import");
476	if (import_index != -1) {
477	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
478
479	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
480	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
481
482	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
483	// This is stored in the System's file.encoding property.
484	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
485
486	// Make sure the path matches the OS that is running
487	if (is_unix_path && Utility.isWindows()) {
488	// Convert path from Unix to Windows
489	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
490	}
491	else if (!is_unix_path && !Utility.isWindows()) {
492	// Convert path from Windows to Unix
493	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
494	}
495
496	// Remember this for quick access later
497	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
498	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
499	}
500
501	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
502	}
503
504	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
505	// This is true when the source files come from a zip file processed by ZIPPlug, for example
506	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
507	// We don't really know what is going on...
508	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
509	}
510	}
511
512	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
513	if (metadata_element_name.startsWith("gsdl")) {
514	continue;
515	}
516
517	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
518	if (metadata_element == null) {
519	// This element isn't defined in ex.mds, so create it for this session
520	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
521	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
522	}
523	}
524
525	buffered_reader.close();
526	}
527	catch (FileNotFoundException exception) {
528	DebugStream.printStackTrace(exception);
529	}
530	catch (IOException exception) {
531	DebugStream.printStackTrace(exception);
532	}
533	}
534	*/
535
536	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: