source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 7292

Last change on this file since 7292 was 7292, checked in by kjdon, 20 years ago

made the directory parser recursive, and now it doesn't look at any namespaced metadata - only tries to extract non-namespaced, ie greenstone extracted, metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.StaticStrings;
55import org.greenstone.gatherer.util.Utility;
56import org.greenstone.gatherer.valuetree.GValueModel;
57import org.greenstone.gatherer.valuetree.GValueNode;
58import org.w3c.dom.*;
59
60public class GreenstoneArchiveParser {
61
62 private GShell shell;
63 private String file_path;
64
65 static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
67 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
68 // We can only extract metadata if an extracted metadata set exists in our collection.
69 if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
70 this.shell = shell;
71 // The very firstist thing we do is remove any existing extracted metadata
72 if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
73 Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
74 }
75
76 // Determine the collection archive directory.
77 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
78 // Start the extraction process
79 processDirectory(archive_directory, progress);
80 }
81 // All done. Outta here like a bald man.
82 }
83
84 private void processDirectory(File directory, GShellProgressMonitor progress) {
85 // look for a doc.xml file here
86 File document_file = new File(directory, "doc.xml");
87 // Then extract the metadata from it.
88 if(document_file.exists()) {
89 int count = extractMetadata(document_file);
90 // Display a pretty progress message.
91 String[] args = new String[2];
92 args[0] = directory.getName();
93 args[1] = String.valueOf(count);
94 this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95 args = null;
96 progress.increment();
97 }
98
99 // for each directory here, process it
100 File sub_directories[] = directory.listFiles();
101 for(int i = 0; i < sub_directories.length; i++) {
102 // Find the doc.xml file within
103 if(sub_directories[i].isDirectory()) {
104 processDirectory(sub_directories[i], progress);
105 }
106 }
107 }
108
109 private int extractMetadata(File file) {
110 int count = 0;
111 // Retrieve the DOM of the file.
112 Document document = Utility.parse(file, false);
113
114 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115 if (document != null) {
116 file_path = null;
117 Element archive_element = document.getDocumentElement();
118 // Retrieve the initial Section element
119 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121 if (section_elements.getLength() < 1) {
122 return count;
123 }
124 Element section_element = (Element) section_elements.item(0);
125 section_elements = null;
126
127 // Retrieve all of the Metadata sections
128 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129 section_element = null;
130
131 // Zip through the retrieved metadata checking for SourceSegment elements
132 // These are a good sign of bibliographic files, which we must handle specially
133 boolean ignore_values = false;
134 for (int i = 0; i < metadata_elements.getLength(); i++) {
135 Element metadata_element = (Element) metadata_elements.item(i);
136 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137 if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138 ignore_values = true;
139 break;
140 }
141 }
142
143 // Now for each Metadata entry retrieved...
144 for (int i = 0; i < metadata_elements.getLength(); i++) {
145 Element metadata_element = (Element) metadata_elements.item(i);
146 if (processMetadataElement(metadata_element, ignore_values) == true) {
147 count++;
148 }
149 }
150 }
151
152 return count;
153 }
154
155
156 private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157 {
158 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160 if (name.equals("gsdlsourcefilename")) {
161 file_path = MSMUtils.getValue(metadata_element);
162 return false;
163 }
164
165 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
166 for (int j = 0; j < ignore_list.length; j++) {
167 if (name.startsWith(ignore_list[j])) {
168 return false;
169 }
170 }
171
172 // namespaced metadata, we don't extract at the moment
173 if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
174 return false;
175 }
176
177 if (file_path == null) {
178 return false;
179 }
180
181 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
182 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
183 if (element == null) {
184 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
185 if (extracted_mds != null) {
186 element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
187 }
188 }
189
190 // If ignore_values is set (bibliographic records) we don't care about the values
191 if (ignore_values == true) {
192 return false;
193 }
194
195 // Retrieve the metadata for the current file
196 File target_file = new File(file_path);
197 String value = "";
198 try {
199 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
200 }
201 catch (IllegalArgumentException error) { // ****
202 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
203 }
204 catch (UnsupportedEncodingException error) {
205 Gatherer.printStackTrace(error);
206 }
207
208 if (value == null) {
209 return false;
210 }
211 // Create a new metadata object.
212 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
213 GValueNode value_node = null;
214 if (value_tree != null) {
215 value_node = value_tree.getValue(value);
216 }
217 else {
218 value_node = new GValueNode(element.toString(), value);
219 }
220
221 Metadata metadata = new Metadata(element, value_node);
222 element.inc();
223 ///ystem.err.println("Adding extracted metadata: " + metadata);
224 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
225
226 return true;
227 }
228
229
230 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
231
232 static public ArrayList extractMetadataElements(File archive_directory) {
233 ArrayList extracted_metadata_elements = new ArrayList();
234 File document_directories[] = archive_directory.listFiles();
235 for(int i = 0; i < document_directories.length; i++) {
236 // Find the doc.xml file within
237 if(document_directories[i].isDirectory()) {
238 File document_file = new File(document_directories[i], "doc.xml");
239 // Then extract the metadata from it.
240 if(document_file.exists()) {
241 try {
242 Document document = Utility.parse(document_file, false);
243 // Retrieve all of the Metadata sections.
244 Element archive_element = document.getDocumentElement();
245 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
246 // Now for each Metadata entry retrieved...
247 for(int j = 0; j < metadata_elements.getLength(); j++) {
248 Element metadata_element = (Element) metadata_elements.item(j);
249 String name = metadata_element.getAttribute("name");
250 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
251 boolean ignore = false;
252 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
253 ignore = name.startsWith(metadata_ignore_list[k]);
254 }
255 if(!ignore && !extracted_metadata_elements.contains(name)) {
256 extracted_metadata_elements.add(name);
257 }
258 name = null;
259 metadata_element = null;
260 }
261 metadata_elements = null;
262 archive_element = null;
263 document = null;
264 }
265 catch (Exception error) {
266 Gatherer.printStackTrace(error);
267 }
268 }
269 document_file = null;
270 }
271 }
272 document_directories = null;
273 return extracted_metadata_elements;
274 }
275}
Note: See TracBrowser for help on using the repository browser.