Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 7292

Last change on this file since 7292 was 7292, checked in by kjdon, 20 years ago
Made the directory parser recursive. It no longer looks at any namespaced metadata; it only tries to extract non-namespaced (i.e. Greenstone-extracted) metadata.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.3 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.msm;
38
39	import java.io.*;
40	import java.net.*;
41	import java.util.*;
42	import org.greenstone.gatherer.Configuration;
43	import org.greenstone.gatherer.Dictionary;
44	import org.greenstone.gatherer.Gatherer;
45	import org.greenstone.gatherer.collection.Collection;
46	import org.greenstone.gatherer.collection.CollectionManager;
47	import org.greenstone.gatherer.file.FileNode;
48	import org.greenstone.gatherer.msm.ElementWrapper;
49	import org.greenstone.gatherer.msm.MetadataSet;
50	import org.greenstone.gatherer.msm.MetadataSetManager;
51	import org.greenstone.gatherer.msm.MSMUtils;
52	import org.greenstone.gatherer.shell.GShell;
53	import org.greenstone.gatherer.shell.GShellProgressMonitor;
54	import org.greenstone.gatherer.util.StaticStrings;
55	import org.greenstone.gatherer.util.Utility;
56	import org.greenstone.gatherer.valuetree.GValueModel;
57	import org.greenstone.gatherer.valuetree.GValueNode;
58	import org.w3c.dom.*;
59
60	public class GreenstoneArchiveParser {
61
62	private GShell shell;
63	private String file_path;
64
65	static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
/**
 * Removes any previously extracted metadata from the current collection and then
 * re-extracts it from the doc.xml files found beneath the collection archive directory.
 * Does nothing when there is no collection manager, or when the collection has no
 * extracted metadata set.
 *
 * @param progress monitor that is incremented once for every doc.xml processed
 * @param shell the shell through which progress messages are fired
 */
public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
    // Check the collection manager before anything dereferences it.
    if (Gatherer.c_man == null) {
        return;
    }
    // We can only extract metadata if an extracted metadata set exists in our collection.
    if (Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) == null) {
        return;
    }
    this.shell = shell;

    // The very first thing we do is remove any existing extracted metadata.
    if (Gatherer.c_man.getCollection() != null) {
        Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
    }

    // Determine the collection archive directory.
    File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
    // Start the extraction process.
    processDirectory(archive_directory, progress);
}
83
84	private void processDirectory(File directory, GShellProgressMonitor progress) {
85	// look for a doc.xml file here
86	File document_file = new File(directory, "doc.xml");
87	// Then extract the metadata from it.
88	if(document_file.exists()) {
89	int count = extractMetadata(document_file);
90	// Display a pretty progress message.
91	String[] args = new String[2];
92	args[0] = directory.getName();
93	args[1] = String.valueOf(count);
94	this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95	args = null;
96	progress.increment();
97	}
98
99	// for each directory here, process it
100	File sub_directories[] = directory.listFiles();
101	for(int i = 0; i < sub_directories.length; i++) {
102	// Find the doc.xml file within
103	if(sub_directories[i].isDirectory()) {
104	processDirectory(sub_directories[i], progress);
105	}
106	}
107	}
108
109	private int extractMetadata(File file) {
110	int count = 0;
111	// Retrieve the DOM of the file.
112	Document document = Utility.parse(file, false);
113
114	// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115	if (document != null) {
116	file_path = null;
117	Element archive_element = document.getDocumentElement();
118	// Retrieve the initial Section element
119	NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120	// It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121	if (section_elements.getLength() < 1) {
122	return count;
123	}
124	Element section_element = (Element) section_elements.item(0);
125	section_elements = null;
126
127	// Retrieve all of the Metadata sections
128	NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129	section_element = null;
130
131	// Zip through the retrieved metadata checking for SourceSegment elements
132	// These are a good sign of bibliographic files, which we must handle specially
133	boolean ignore_values = false;
134	for (int i = 0; i < metadata_elements.getLength(); i++) {
135	Element metadata_element = (Element) metadata_elements.item(i);
136	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137	if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138	ignore_values = true;
139	break;
140	}
141	}
142
143	// Now for each Metadata entry retrieved...
144	for (int i = 0; i < metadata_elements.getLength(); i++) {
145	Element metadata_element = (Element) metadata_elements.item(i);
146	if (processMetadataElement(metadata_element, ignore_values) == true) {
147	count++;
148	}
149	}
150	}
151
152	return count;
153	}
154
155
156	private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157	{
158	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159	// There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160	if (name.equals("gsdlsourcefilename")) {
161	file_path = MSMUtils.getValue(metadata_element);
162	return false;
163	}
164
165	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
166	for (int j = 0; j < ignore_list.length; j++) {
167	if (name.startsWith(ignore_list[j])) {
168	return false;
169	}
170	}
171
172	// namespaced metadata, we don't extract at the moment
173	if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
174	return false;
175	}
176
177	if (file_path == null) {
178	return false;
179	}
180
181	// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
182	ElementWrapper element = Gatherer.c_man.msm.getElement(name);
183	if (element == null) {
184	MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
185	if (extracted_mds != null) {
186	element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
187	}
188	}
189
190	// If ignore_values is set (bibliographic records) we don't care about the values
191	if (ignore_values == true) {
192	return false;
193	}
194
195	// Retrieve the metadata for the current file
196	File target_file = new File(file_path);
197	String value = "";
198	try {
199	value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
200	}
201	catch (IllegalArgumentException error) { // ****
202	value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
203	}
204	catch (UnsupportedEncodingException error) {
205	Gatherer.printStackTrace(error);
206	}
207
208	if (value == null) {
209	return false;
210	}
211	// Create a new metadata object.
212	GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
213	GValueNode value_node = null;
214	if (value_tree != null) {
215	value_node = value_tree.getValue(value);
216	}
217	else {
218	value_node = new GValueNode(element.toString(), value);
219	}
220
221	Metadata metadata = new Metadata(element, value_node);
222	element.inc();
223	///ystem.err.println("Adding extracted metadata: " + metadata);
224	Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
225
226	return true;
227	}
228
229
230	static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
231
232	static public ArrayList extractMetadataElements(File archive_directory) {
233	ArrayList extracted_metadata_elements = new ArrayList();
234	File document_directories[] = archive_directory.listFiles();
235	for(int i = 0; i < document_directories.length; i++) {
236	// Find the doc.xml file within
237	if(document_directories[i].isDirectory()) {
238	File document_file = new File(document_directories[i], "doc.xml");
239	// Then extract the metadata from it.
240	if(document_file.exists()) {
241	try {
242	Document document = Utility.parse(document_file, false);
243	// Retrieve all of the Metadata sections.
244	Element archive_element = document.getDocumentElement();
245	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
246	// Now for each Metadata entry retrieved...
247	for(int j = 0; j < metadata_elements.getLength(); j++) {
248	Element metadata_element = (Element) metadata_elements.item(j);
249	String name = metadata_element.getAttribute("name");
250	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
251	boolean ignore = false;
252	for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
253	ignore = name.startsWith(metadata_ignore_list[k]);
254	}
255	if(!ignore && !extracted_metadata_elements.contains(name)) {
256	extracted_metadata_elements.add(name);
257	}
258	name = null;
259	metadata_element = null;
260	}
261	metadata_elements = null;
262	archive_element = null;
263	document = null;
264	}
265	catch (Exception error) {
266	Gatherer.printStackTrace(error);
267	}
268	}
269	document_file = null;
270	}
271	}
272	document_directories = null;
273	return extracted_metadata_elements;
274	}
275	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: