source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 8231

Last change on this file since 8231 was 8231, checked in by mdewsnip, 20 years ago

Replaced all "Gatherer.config" with "Configuration".

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.StaticStrings;
55import org.greenstone.gatherer.util.Utility;
56import org.greenstone.gatherer.valuetree.GValueModel;
57import org.greenstone.gatherer.valuetree.GValueNode;
58import org.w3c.dom.*;
59
60public class GreenstoneArchiveParser {
61
62 private GShell shell;
63 private String file_path;
64
65 static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
67 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
68 // We can only extract metadata if an extracted metadata set exists in our collection.
69 if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
70 this.shell = shell;
71 // The very firstist thing we do is remove any existing extracted metadata
72 if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
73 Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
74 }
75
76 // Determine the collection archive directory.
77 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
78 // Start the extraction process
79 processDirectory(archive_directory, progress);
80 }
81 // All done. Outta here like a bald man.
82 }
83
84 private void processDirectory(File directory, GShellProgressMonitor progress) {
85 // look for a doc.xml file here
86 File document_file = new File(directory, "doc.xml");
87 // Then extract the metadata from it.
88 if(document_file.exists()) {
89 int count = extractMetadata(document_file);
90 // Display a pretty progress message.
91 String[] args = new String[2];
92 args[0] = directory.getName();
93 args[1] = String.valueOf(count);
94 this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95 args = null;
96 progress.increment();
97 }
98
99 // for each directory here, process it
100 File sub_directories[] = directory.listFiles();
101 for(int i = 0; i < sub_directories.length; i++) {
102 // Find the doc.xml file within
103 if(sub_directories[i].isDirectory()) {
104 processDirectory(sub_directories[i], progress);
105 }
106 }
107 }
108
109 private int extractMetadata(File file) {
110 int count = 0;
111 // Retrieve the DOM of the file.
112 Document document = Utility.parse(file, false);
113
114 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115 if (document != null) {
116 file_path = null;
117 Element archive_element = document.getDocumentElement();
118 // Retrieve the initial Section element
119 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121 if (section_elements.getLength() < 1) {
122 return count;
123 }
124 Element section_element = (Element) section_elements.item(0);
125 section_elements = null;
126
127 // Retrieve all of the Metadata sections
128 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129 section_element = null;
130
131 // Zip through the retrieved metadata checking for SourceSegment elements
132 // These are a good sign of bibliographic files, which we must handle specially
133 boolean ignore_values = false;
134 for (int i = 0; i < metadata_elements.getLength(); i++) {
135 Element metadata_element = (Element) metadata_elements.item(i);
136 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137 if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138 ignore_values = true;
139 break;
140 }
141 }
142
143 // Now for each Metadata entry retrieved...
144 for (int i = 0; i < metadata_elements.getLength(); i++) {
145 Element metadata_element = (Element) metadata_elements.item(i);
146 if (processMetadataElement(metadata_element, ignore_values) == true) {
147 count++;
148 }
149 }
150 }
151
152 return count;
153 }
154
155
156 private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157 {
158 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160 if (name.equals("gsdlsourcefilename")) {
161 file_path = MSMUtils.getValue(metadata_element);
162 if (!file_path.startsWith(File.separator)) {
163 String collection_dir = Utility.getCollectionDir();
164 file_path = collection_dir + file_path;
165 }
166
167 return false;
168 }
169
170 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
171 for (int j = 0; j < ignore_list.length; j++) {
172 if (name.startsWith(ignore_list[j])) {
173 return false;
174 }
175 }
176
177 // namespaced metadata, we don't extract at the moment
178 if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
179 return false;
180 }
181
182 if (file_path == null) {
183 return false;
184 }
185
186 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
187 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
188 if (element == null) {
189 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
190 if (extracted_mds != null) {
191 element = extracted_mds.addElement(name, Configuration.getLanguage());
192 }
193 }
194
195 // If ignore_values is set (bibliographic records) we don't care about the values
196 if (ignore_values == true) {
197 return false;
198 }
199
200 // Retrieve the metadata for the current file
201 File target_file = new File(file_path);
202 String value = "";
203 try {
204 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
205 }
206 catch (IllegalArgumentException error) { // ****
207 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
208 }
209 catch (UnsupportedEncodingException error) {
210 Gatherer.printStackTrace(error);
211 }
212
213 if (value == null) {
214 return false;
215 }
216 // Create a new metadata object.
217 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
218 GValueNode value_node = null;
219 if (value_tree != null) {
220 value_node = value_tree.getValue(value);
221 }
222 else {
223 value_node = new GValueNode(element.toString(), value);
224 }
225
226 Metadata metadata = new Metadata(element, value_node);
227 element.inc();
228 ///ystem.err.println("Adding extracted metadata: " + metadata);
229 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
230
231 return true;
232 }
233
234
235 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
236
237 static public ArrayList extractMetadataElements(File archive_directory) {
238 ArrayList extracted_metadata_elements = new ArrayList();
239 File document_directories[] = archive_directory.listFiles();
240 for(int i = 0; i < document_directories.length; i++) {
241 // Find the doc.xml file within
242 if(document_directories[i].isDirectory()) {
243 File document_file = new File(document_directories[i], "doc.xml");
244 // Then extract the metadata from it.
245 if(document_file.exists()) {
246 try {
247 Document document = Utility.parse(document_file, false);
248 // Retrieve all of the Metadata sections.
249 Element archive_element = document.getDocumentElement();
250 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
251 // Now for each Metadata entry retrieved...
252 for(int j = 0; j < metadata_elements.getLength(); j++) {
253 Element metadata_element = (Element) metadata_elements.item(j);
254 String name = metadata_element.getAttribute("name");
255 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
256 boolean ignore = false;
257 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
258 ignore = name.startsWith(metadata_ignore_list[k]);
259 }
260 if(!ignore && !extracted_metadata_elements.contains(name)) {
261 extracted_metadata_elements.add(name);
262 }
263 name = null;
264 metadata_element = null;
265 }
266 metadata_elements = null;
267 archive_element = null;
268 document = null;
269 }
270 catch (Exception error) {
271 Gatherer.printStackTrace(error);
272 }
273 }
274 document_file = null;
275 }
276 }
277 document_directories = null;
278 return extracted_metadata_elements;
279 }
280}
Note: See TracBrowser for help on using the repository browser.