Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 8231

Last change on this file since 8231 was 8231, checked in by mdewsnip, 20 years ago
Replaced all "Gatherer.config" with "Configuration".
Property svn:keywords set to `Author Date Id Revision`
File size: 10.4 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.msm;
38
39	import java.io.*;
40	import java.net.*;
41	import java.util.*;
42	import org.greenstone.gatherer.Configuration;
43	import org.greenstone.gatherer.Dictionary;
44	import org.greenstone.gatherer.Gatherer;
45	import org.greenstone.gatherer.collection.Collection;
46	import org.greenstone.gatherer.collection.CollectionManager;
47	import org.greenstone.gatherer.file.FileNode;
48	import org.greenstone.gatherer.msm.ElementWrapper;
49	import org.greenstone.gatherer.msm.MetadataSet;
50	import org.greenstone.gatherer.msm.MetadataSetManager;
51	import org.greenstone.gatherer.msm.MSMUtils;
52	import org.greenstone.gatherer.shell.GShell;
53	import org.greenstone.gatherer.shell.GShellProgressMonitor;
54	import org.greenstone.gatherer.util.StaticStrings;
55	import org.greenstone.gatherer.util.Utility;
56	import org.greenstone.gatherer.valuetree.GValueModel;
57	import org.greenstone.gatherer.valuetree.GValueNode;
58	import org.w3c.dom.*;
59
60	public class GreenstoneArchiveParser {
61
62	private GShell shell;
63	private String file_path;
64
65	static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
/**
 * Walks the archive directory of the currently loaded collection and extracts
 * the metadata found in every doc.xml file beneath it.
 * <p>
 * Nothing is done when there is no collection manager, no metadata set manager,
 * or no extracted metadata set to receive the values.
 *
 * @param progress the monitor incremented once for each doc.xml file processed
 * @param shell    the shell used to fire a progress message for each document
 */
public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
    // Guard before the first dereference; previously c_man was only null-checked after it had already been used.
    if (Gatherer.c_man == null || Gatherer.c_man.msm == null) {
        return;
    }
    // We can only extract metadata if an extracted metadata set exists in our collection.
    if (Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) == null) {
        return;
    }
    this.shell = shell;

    // The very first thing we do is remove any existing extracted metadata.
    if (Gatherer.c_man.getCollection() != null) {
        Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
    }

    // Determine the collection archive directory and start the extraction process.
    File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
    processDirectory(archive_directory, progress);
}
83
84	private void processDirectory(File directory, GShellProgressMonitor progress) {
85	// look for a doc.xml file here
86	File document_file = new File(directory, "doc.xml");
87	// Then extract the metadata from it.
88	if(document_file.exists()) {
89	int count = extractMetadata(document_file);
90	// Display a pretty progress message.
91	String[] args = new String[2];
92	args[0] = directory.getName();
93	args[1] = String.valueOf(count);
94	this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95	args = null;
96	progress.increment();
97	}
98
99	// for each directory here, process it
100	File sub_directories[] = directory.listFiles();
101	for(int i = 0; i < sub_directories.length; i++) {
102	// Find the doc.xml file within
103	if(sub_directories[i].isDirectory()) {
104	processDirectory(sub_directories[i], progress);
105	}
106	}
107	}
108
109	private int extractMetadata(File file) {
110	int count = 0;
111	// Retrieve the DOM of the file.
112	Document document = Utility.parse(file, false);
113
114	// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115	if (document != null) {
116	file_path = null;
117	Element archive_element = document.getDocumentElement();
118	// Retrieve the initial Section element
119	NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120	// It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121	if (section_elements.getLength() < 1) {
122	return count;
123	}
124	Element section_element = (Element) section_elements.item(0);
125	section_elements = null;
126
127	// Retrieve all of the Metadata sections
128	NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129	section_element = null;
130
131	// Zip through the retrieved metadata checking for SourceSegment elements
132	// These are a good sign of bibliographic files, which we must handle specially
133	boolean ignore_values = false;
134	for (int i = 0; i < metadata_elements.getLength(); i++) {
135	Element metadata_element = (Element) metadata_elements.item(i);
136	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137	if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138	ignore_values = true;
139	break;
140	}
141	}
142
143	// Now for each Metadata entry retrieved...
144	for (int i = 0; i < metadata_elements.getLength(); i++) {
145	Element metadata_element = (Element) metadata_elements.item(i);
146	if (processMetadataElement(metadata_element, ignore_values) == true) {
147	count++;
148	}
149	}
150	}
151
152	return count;
153	}
154
155
156	private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157	{
158	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159	// There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160	if (name.equals("gsdlsourcefilename")) {
161	file_path = MSMUtils.getValue(metadata_element);
162	if (!file_path.startsWith(File.separator)) {
163	String collection_dir = Utility.getCollectionDir();
164	file_path = collection_dir + file_path;
165	}
166
167	return false;
168	}
169
170	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
171	for (int j = 0; j < ignore_list.length; j++) {
172	if (name.startsWith(ignore_list[j])) {
173	return false;
174	}
175	}
176
177	// namespaced metadata, we don't extract at the moment
178	if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
179	return false;
180	}
181
182	if (file_path == null) {
183	return false;
184	}
185
186	// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
187	ElementWrapper element = Gatherer.c_man.msm.getElement(name);
188	if (element == null) {
189	MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
190	if (extracted_mds != null) {
191	element = extracted_mds.addElement(name, Configuration.getLanguage());
192	}
193	}
194
195	// If ignore_values is set (bibliographic records) we don't care about the values
196	if (ignore_values == true) {
197	return false;
198	}
199
200	// Retrieve the metadata for the current file
201	File target_file = new File(file_path);
202	String value = "";
203	try {
204	value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
205	}
206	catch (IllegalArgumentException error) { // ****
207	value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
208	}
209	catch (UnsupportedEncodingException error) {
210	Gatherer.printStackTrace(error);
211	}
212
213	if (value == null) {
214	return false;
215	}
216	// Create a new metadata object.
217	GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
218	GValueNode value_node = null;
219	if (value_tree != null) {
220	value_node = value_tree.getValue(value);
221	}
222	else {
223	value_node = new GValueNode(element.toString(), value);
224	}
225
226	Metadata metadata = new Metadata(element, value_node);
227	element.inc();
228	///ystem.err.println("Adding extracted metadata: " + metadata);
229	Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
230
231	return true;
232	}
233
234
235	static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
236
237	static public ArrayList extractMetadataElements(File archive_directory) {
238	ArrayList extracted_metadata_elements = new ArrayList();
239	File document_directories[] = archive_directory.listFiles();
240	for(int i = 0; i < document_directories.length; i++) {
241	// Find the doc.xml file within
242	if(document_directories[i].isDirectory()) {
243	File document_file = new File(document_directories[i], "doc.xml");
244	// Then extract the metadata from it.
245	if(document_file.exists()) {
246	try {
247	Document document = Utility.parse(document_file, false);
248	// Retrieve all of the Metadata sections.
249	Element archive_element = document.getDocumentElement();
250	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
251	// Now for each Metadata entry retrieved...
252	for(int j = 0; j < metadata_elements.getLength(); j++) {
253	Element metadata_element = (Element) metadata_elements.item(j);
254	String name = metadata_element.getAttribute("name");
255	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
256	boolean ignore = false;
257	for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
258	ignore = name.startsWith(metadata_ignore_list[k]);
259	}
260	if(!ignore && !extracted_metadata_elements.contains(name)) {
261	extracted_metadata_elements.add(name);
262	}
263	name = null;
264	metadata_element = null;
265	}
266	metadata_elements = null;
267	archive_element = null;
268	document = null;
269	}
270	catch (Exception error) {
271	Gatherer.printStackTrace(error);
272	}
273	}
274	document_file = null;
275	}
276	}
277	document_directories = null;
278	return extracted_metadata_elements;
279	}
280	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: