Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 6639

Last change on this file since 6639 was 6639, checked in by jmt12, 20 years ago
Prevent hand entered section metadata being extracted in the same way that the top level metadata is
Property svn:keywords set to `Author Date Id Revision`
File size: 10.6 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.msm;
38
39	import java.io.*;
40	import java.net.*;
41	import java.util.*;
42	import org.greenstone.gatherer.Configuration;
43	import org.greenstone.gatherer.Dictionary;
44	import org.greenstone.gatherer.Gatherer;
45	import org.greenstone.gatherer.collection.Collection;
46	import org.greenstone.gatherer.collection.CollectionManager;
47	import org.greenstone.gatherer.file.FileNode;
48	import org.greenstone.gatherer.msm.ElementWrapper;
49	import org.greenstone.gatherer.msm.MetadataSet;
50	import org.greenstone.gatherer.msm.MetadataSetManager;
51	import org.greenstone.gatherer.msm.MSMUtils;
52	import org.greenstone.gatherer.shell.GShell;
53	import org.greenstone.gatherer.shell.GShellProgressMonitor;
54	import org.greenstone.gatherer.util.StaticStrings;
55	import org.greenstone.gatherer.util.Utility;
56	import org.greenstone.gatherer.valuetree.GValueModel;
57	import org.greenstone.gatherer.valuetree.GValueNode;
58	import org.w3c.dom.*;
59
60	public class GreenstoneArchiveParser {
61
62	private GShell shell;
63
64	static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source",
65
66	public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
67	// We can only extract metadata if an extracted metadata set exists in our collection.
68	if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
69	this.shell = shell;
70	// The very firstist thing we do is remove any existing extracted metadata
71	if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
72	Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
73	}
74
75	// Determine the collection archive directory.
76	File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
77	// For each of the hash coded directories within.
78	File document_directories[] = archive_directory.listFiles();
79	for(int i = 0; i < document_directories.length; i++) {
80	// Find the doc.xml file within
81	if(document_directories[i].isDirectory()) {
82	File document_file = new File(document_directories[i], "doc.xml");
83	// Then extract the metadata from it.
84	if(document_file.exists()) {
85	int count = extractMetadata(document_file);
86	// Display a pretty progress message.
87	String[] args = new String[2];
88	args[0] = document_directories[i].getName();
89	args[1] = String.valueOf(count);
90	shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
91	args = null;
92	progress.increment();
93	}
94	}
95	}
96	}
97	// All done. Outta here like a bald man.
98	}
99
100	private int extractMetadata(File file) {
101	int count = 0;
102	// Retrieve the DOM of the file.
103	Document document = Utility.parse(file, false);
104
105	Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
106	// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
107	if(document != null) {
108	String file_path = null;
109	Element archive_element = document.getDocumentElement();
110	// Retrieve the initial Section element
111	NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
112	// It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements.
113	if(section_elements.getLength() < 1) {
114	return count;
115	}
116	Element section_element = (Element) section_elements.item(0);
117	section_elements = null;
118	// Retrieve all of the Metadata sections.
119	NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
120	section_element = null;
121	// We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!)
122	for(int i = 0; i < metadata_elements.getLength(); i++) {
123	Element metadata_element = (Element) metadata_elements.item(i);
124	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
125	if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
126	return 0;
127	}
128	}
129	// Now for each Metadata entry retrieved...
130	for(int i = 0; i < metadata_elements.getLength(); i++) {
131	Element metadata_element = (Element) metadata_elements.item(i);
132	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
133	// There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
134	if(name.equals("gsdlsourcefilename")) {
135	file_path = MSMUtils.getValue(metadata_element);
136	}
137	else {
138	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
139	boolean ignore = false;
140	for(int j = 0; !ignore && j < ignore_list.length; j++) {
141	ignore = name.startsWith(ignore_list[j]);
142	}
143	// Otherwise ensure the metadata is present in our collection.
144	if(!ignore && file_path != null) {
145	// If we successfully retrieved a record we can continue.
146	if(file_path != null) {
147	// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
148	ElementWrapper element = Gatherer.c_man.msm.getElement(name);
149	if(element == null) {
150	MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
151	if(extracted_mds != null) {
152	element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
153	}
154	}
155	// If we successfully retrieved an element (and we should have) we can continue.
156	// WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy.
157	if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") \|\| element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
158	// Retrieve the metadata for the current file
159	File target_file = new File(file_path);
160	String value = "";
161	try {
162	value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
163	}
164	catch(UnsupportedEncodingException error) {
165	Gatherer.printStackTrace(error);
166	}
167	// If we successfully retrieved a value we can continue.
168	if(value != null) {
169	// Create a new metadata object.
170	GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
171	GValueNode value_node = null;
172	if(value_tree != null) {
173	value_node = value_tree.getValue(value);
174	}
175	else {
176	value_node = new GValueNode(element.toString(), value);
177	}
178	Metadata metadata = new Metadata(element, value_node);
179	element.inc();
180	///ystem.err.println("Adding extracted metadata: " + metadata);
181	Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
182	count++;
183	// All done. On to next metadata.
184	}
185	value = null;
186	target_file = null;
187	}
188	else {
189	Gatherer.println("Cannot retrieve metadata element " + name);
190	}
191	}
192	}
193	}
194	}
195	}
196	return count;
197	}
198
199	static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
200
201	static public ArrayList extractMetadataElements(File archive_directory) {
202	ArrayList extracted_metadata_elements = new ArrayList();
203	File document_directories[] = archive_directory.listFiles();
204	for(int i = 0; i < document_directories.length; i++) {
205	// Find the doc.xml file within
206	if(document_directories[i].isDirectory()) {
207	File document_file = new File(document_directories[i], "doc.xml");
208	// Then extract the metadata from it.
209	if(document_file.exists()) {
210	try {
211	Document document = Utility.parse(document_file, false);
212	// Retrieve all of the Metadata sections.
213	Element archive_element = document.getDocumentElement();
214	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
215	// Now for each Metadata entry retrieved...
216	for(int j = 0; j < metadata_elements.getLength(); j++) {
217	Element metadata_element = (Element) metadata_elements.item(j);
218	String name = metadata_element.getAttribute("name");
219	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
220	boolean ignore = false;
221	for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
222	ignore = name.startsWith(metadata_ignore_list[k]);
223	}
224	if(!ignore && !extracted_metadata_elements.contains(name)) {
225	extracted_metadata_elements.add(name);
226	}
227	name = null;
228	metadata_element = null;
229	}
230	metadata_elements = null;
231	archive_element = null;
232	document = null;
233	}
234	catch (Exception error) {
235	Gatherer.printStackTrace(error);
236	}
237	}
238	document_file = null;
239	}
240	}
241	document_directories = null;
242	return extracted_metadata_elements;
243	}
244	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: