Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 5581

Last change on this file since 5581 was 5564, checked in by mdewsnip, 21 years ago
Many more small improvements and tooltips added. Still more to come!
Property svn:keywords set to `Author Date Id Revision`
File size: 9.1 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.msm;
38
39	import java.io.*;
40	import java.net.*;
41	import java.util.*;
42	import org.greenstone.gatherer.Configuration;
43	import org.greenstone.gatherer.Dictionary;
44	import org.greenstone.gatherer.Gatherer;
45	import org.greenstone.gatherer.collection.Collection;
46	import org.greenstone.gatherer.collection.CollectionManager;
47	import org.greenstone.gatherer.file.FileNode;
48	import org.greenstone.gatherer.msm.ElementWrapper;
49	import org.greenstone.gatherer.msm.MetadataSet;
50	import org.greenstone.gatherer.msm.MetadataSetManager;
51	import org.greenstone.gatherer.msm.MSMUtils;
52	import org.greenstone.gatherer.shell.GShell;
53	import org.greenstone.gatherer.shell.GShellProgressMonitor;
54	import org.greenstone.gatherer.util.Utility;
55	import org.greenstone.gatherer.valuetree.GValueModel;
56	import org.greenstone.gatherer.valuetree.GValueNode;
57	import org.w3c.dom.*;
58
59	public class GreenstoneArchiveParser {
60
61	private GShell shell;
62
63	static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"}; //"Source",
64
65	public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
66	// We can only extract metadata if an extracted metadata set exists in our collection.
67	if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
68	this.shell = shell;
69	// Determine the collection archive directory.
70	File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
71	// For each of the hash coded directories within.
72	File document_directories[] = archive_directory.listFiles();
73	for(int i = 0; i < document_directories.length; i++) {
74	// Find the doc.xml file within
75	if(document_directories[i].isDirectory()) {
76	File document_file = new File(document_directories[i], "doc.xml");
77	// Then extract the metadata from it.
78	if(document_file.exists()) {
79	int count = extractMetadata(document_file);
80	// Display a pretty progress message.
81	String[] args = new String[2];
82	args[0] = document_directories[i].getName();
83	args[1] = String.valueOf(count);
84	shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.newget("GShell.Extracted", args), GShell.OK);
85	args = null;
86	progress.increment();
87	}
88	}
89	}
90	}
91	// All done. Outta here like a bald man.
92	}
93
94	private int extractMetadata(File file) {
95	int count = 0;
96	// Retrieve the DOM of the file.
97	Document document = Utility.parse(file, false);
98	// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
99	if(document != null) {
100	String file_path = null;
101	Element archive_element = document.getDocumentElement();
102	// Retrieve all of the Metadata sections.
103	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
104	// Now for each Metadata entry retrieved...
105	for(int i = 0; i < metadata_elements.getLength(); i++) {
106	Element metadata_element = (Element) metadata_elements.item(i);
107	String name = metadata_element.getAttribute("name");
108	// There is a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
109	if(name.equals("gsdlsourcefilename")) {
110	file_path = MSMUtils.getValue(metadata_element);
111	}
112	else {
113	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
114	boolean ignore = false;
115	for(int j = 0; !ignore && j < ignore_list.length; j++) {
116	ignore = name.startsWith(ignore_list[j]);
117	}
118	// Otherwise ensure the metadata is present in our collection.
119	if(!ignore && file_path != null) {
120	// If we successfully retrieved a record we can continue.
121	if(file_path != null) {
122	// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
123	ElementWrapper element = Gatherer.c_man.msm.getElement(name);
124	if(element == null) {
125	MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
126	if(extracted_mds != null) {
127	element = extracted_mds.addElement(name, Gatherer.config.interface_language);
128	}
129	}
130	// If we successfully retrieved an element (and we should have) we can continue.
131	if(element != null) {
132	// Retrieve the metadata for the current file
133	File target_file = new File(file_path);
134	ArrayList metadatum = Gatherer.c_man.getCollection().gdm.getMetadata(target_file);
135	// If no metadata exists for the current element, add it
136	boolean found = false;
137	for(int k = 0; !found && k < metadatum.size(); k++) {
138	Metadata sibling = (Metadata) metadatum.get(k);
139	found = element.equals(sibling.getElement());
140	}
141	metadatum = null;
142	if(!found) {
143	String value = "";
144	try {
145	value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
146	}
147	catch(UnsupportedEncodingException error) {
148	Gatherer.printStackTrace(error);
149	}
150	// If we successfully retrieved a value we can continue.
151	if(value != null) {
152	// Create a new metadata object.
153	GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
154	GValueNode value_node = null;
155	if(value_tree != null) {
156	value_node = value_tree.getValue(value);
157	}
158	else {
159	value_node = new GValueNode(element.toString(), value);
160	}
161	Metadata metadata = new Metadata(element, value_node);
162	Gatherer.c_man.getCollection().gdm.metadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
163	count++;
164	// All done. On to next metadata.
165	}
166	}
167	target_file = null;
168	}
169	else {
170	Gatherer.println("Cannot retrieve metadata element " + name);
171	}
172	}
173	}
174	}
175	}
176	}
177	return count;
178	}
179
180	static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
181
182	static public ArrayList extractMetadataElements(File archive_directory) {
183	ArrayList extracted_metadata_elements = new ArrayList();
184	File document_directories[] = archive_directory.listFiles();
185	for(int i = 0; i < document_directories.length; i++) {
186	// Find the doc.xml file within
187	if(document_directories[i].isDirectory()) {
188	File document_file = new File(document_directories[i], "doc.xml");
189	// Then extract the metadata from it.
190	if(document_file.exists()) {
191	try {
192	Document document = Utility.parse(document_file, false);
193	// Retrieve all of the Metadata sections.
194	Element archive_element = document.getDocumentElement();
195	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
196	// Now for each Metadata entry retrieved...
197	for(int j = 0; j < metadata_elements.getLength(); j++) {
198	Element metadata_element = (Element) metadata_elements.item(j);
199	String name = metadata_element.getAttribute("name");
200	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
201	boolean ignore = false;
202	for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
203	ignore = name.startsWith(metadata_ignore_list[k]);
204	}
205	if(!ignore && !extracted_metadata_elements.contains(name)) {
206	extracted_metadata_elements.add(name);
207	}
208	name = null;
209	metadata_element = null;
210	}
211	metadata_elements = null;
212	archive_element = null;
213	document = null;
214	}
215	catch (Exception error) {
216	Gatherer.printStackTrace(error);
217	}
218	}
219	document_file = null;
220	}
221	}
222	document_directories = null;
223	return extracted_metadata_elements;
224	}
225	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: