Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 7743

Last change on this file since 7743 was 7743, checked in by davidb, 20 years ago
GA format changed so that the full pathname is no longer used, but rather a name local to the collection. This aids portability between Greenstone sites (as needed when GLI is run as an applet). Parsing of GA files, therefore, now needs to add the prefix back in to reconstruct the full pathname for this installation of Greenstone.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.4 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.msm;
38
39	import java.io.*;
40	import java.net.*;
41	import java.util.*;
42	import org.greenstone.gatherer.Configuration;
43	import org.greenstone.gatherer.Dictionary;
44	import org.greenstone.gatherer.Gatherer;
45	import org.greenstone.gatherer.collection.Collection;
46	import org.greenstone.gatherer.collection.CollectionManager;
47	import org.greenstone.gatherer.file.FileNode;
48	import org.greenstone.gatherer.msm.ElementWrapper;
49	import org.greenstone.gatherer.msm.MetadataSet;
50	import org.greenstone.gatherer.msm.MetadataSetManager;
51	import org.greenstone.gatherer.msm.MSMUtils;
52	import org.greenstone.gatherer.shell.GShell;
53	import org.greenstone.gatherer.shell.GShellProgressMonitor;
54	import org.greenstone.gatherer.util.StaticStrings;
55	import org.greenstone.gatherer.util.Utility;
56	import org.greenstone.gatherer.valuetree.GValueModel;
57	import org.greenstone.gatherer.valuetree.GValueNode;
58	import org.w3c.dom.*;
59
60	public class GreenstoneArchiveParser {
61
62	private GShell shell;
63	private String file_path;
64
65	static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
67	public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
68	// We can only extract metadata if an extracted metadata set exists in our collection.
69	if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
70	this.shell = shell;
71	// The very firstist thing we do is remove any existing extracted metadata
72	if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
73	Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
74	}
75
76	// Determine the collection archive directory.
77	File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
78	// Start the extraction process
79	processDirectory(archive_directory, progress);
80	}
81	// All done. Outta here like a bald man.
82	}
83
84	private void processDirectory(File directory, GShellProgressMonitor progress) {
85	// look for a doc.xml file here
86	File document_file = new File(directory, "doc.xml");
87	// Then extract the metadata from it.
88	if(document_file.exists()) {
89	int count = extractMetadata(document_file);
90	// Display a pretty progress message.
91	String[] args = new String[2];
92	args[0] = directory.getName();
93	args[1] = String.valueOf(count);
94	this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95	args = null;
96	progress.increment();
97	}
98
99	// for each directory here, process it
100	File sub_directories[] = directory.listFiles();
101	for(int i = 0; i < sub_directories.length; i++) {
102	// Find the doc.xml file within
103	if(sub_directories[i].isDirectory()) {
104	processDirectory(sub_directories[i], progress);
105	}
106	}
107	}
108
109	private int extractMetadata(File file) {
110	int count = 0;
111	// Retrieve the DOM of the file.
112	Document document = Utility.parse(file, false);
113
114	// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115	if (document != null) {
116	file_path = null;
117	Element archive_element = document.getDocumentElement();
118	// Retrieve the initial Section element
119	NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120	// It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121	if (section_elements.getLength() < 1) {
122	return count;
123	}
124	Element section_element = (Element) section_elements.item(0);
125	section_elements = null;
126
127	// Retrieve all of the Metadata sections
128	NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129	section_element = null;
130
131	// Zip through the retrieved metadata checking for SourceSegment elements
132	// These are a good sign of bibliographic files, which we must handle specially
133	boolean ignore_values = false;
134	for (int i = 0; i < metadata_elements.getLength(); i++) {
135	Element metadata_element = (Element) metadata_elements.item(i);
136	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137	if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138	ignore_values = true;
139	break;
140	}
141	}
142
143	// Now for each Metadata entry retrieved...
144	for (int i = 0; i < metadata_elements.getLength(); i++) {
145	Element metadata_element = (Element) metadata_elements.item(i);
146	if (processMetadataElement(metadata_element, ignore_values) == true) {
147	count++;
148	}
149	}
150	}
151
152	return count;
153	}
154
155
156	private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157	{
158	String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159	// There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160	if (name.equals("gsdlsourcefilename")) {
161	file_path = MSMUtils.getValue(metadata_element);
162	if (!file_path.startsWith(File.separator)) {
163	String collection_dir = Utility.getCollectionDir();
164	file_path = collection_dir + file_path;
165	}
166
167	return false;
168	}
169
170	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
171	for (int j = 0; j < ignore_list.length; j++) {
172	if (name.startsWith(ignore_list[j])) {
173	return false;
174	}
175	}
176
177	// namespaced metadata, we don't extract at the moment
178	if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
179	return false;
180	}
181
182	if (file_path == null) {
183	return false;
184	}
185
186	// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
187	ElementWrapper element = Gatherer.c_man.msm.getElement(name);
188	if (element == null) {
189	MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
190	if (extracted_mds != null) {
191	element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
192	}
193	}
194
195	// If ignore_values is set (bibliographic records) we don't care about the values
196	if (ignore_values == true) {
197	return false;
198	}
199
200	// Retrieve the metadata for the current file
201	File target_file = new File(file_path);
202	String value = "";
203	try {
204	value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
205	}
206	catch (IllegalArgumentException error) { // ****
207	value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
208	}
209	catch (UnsupportedEncodingException error) {
210	Gatherer.printStackTrace(error);
211	}
212
213	if (value == null) {
214	return false;
215	}
216	// Create a new metadata object.
217	GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
218	GValueNode value_node = null;
219	if (value_tree != null) {
220	value_node = value_tree.getValue(value);
221	}
222	else {
223	value_node = new GValueNode(element.toString(), value);
224	}
225
226	Metadata metadata = new Metadata(element, value_node);
227	element.inc();
228	///ystem.err.println("Adding extracted metadata: " + metadata);
229	Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
230
231	return true;
232	}
233
234
235	static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
236
237	static public ArrayList extractMetadataElements(File archive_directory) {
238	ArrayList extracted_metadata_elements = new ArrayList();
239	File document_directories[] = archive_directory.listFiles();
240	for(int i = 0; i < document_directories.length; i++) {
241	// Find the doc.xml file within
242	if(document_directories[i].isDirectory()) {
243	File document_file = new File(document_directories[i], "doc.xml");
244	// Then extract the metadata from it.
245	if(document_file.exists()) {
246	try {
247	Document document = Utility.parse(document_file, false);
248	// Retrieve all of the Metadata sections.
249	Element archive_element = document.getDocumentElement();
250	NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
251	// Now for each Metadata entry retrieved...
252	for(int j = 0; j < metadata_elements.getLength(); j++) {
253	Element metadata_element = (Element) metadata_elements.item(j);
254	String name = metadata_element.getAttribute("name");
255	// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
256	boolean ignore = false;
257	for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
258	ignore = name.startsWith(metadata_ignore_list[k]);
259	}
260	if(!ignore && !extracted_metadata_elements.contains(name)) {
261	extracted_metadata_elements.add(name);
262	}
263	name = null;
264	metadata_element = null;
265	}
266	metadata_elements = null;
267	archive_element = null;
268	document = null;
269	}
270	catch (Exception error) {
271	Gatherer.printStackTrace(error);
272	}
273	}
274	document_file = null;
275	}
276	}
277	document_directories = null;
278	return extracted_metadata_elements;
279	}
280	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: