source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 7743

Last change on this file since 7743 was 7743, checked in by davidb, 20 years ago

The GA format has changed so that the full pathname is no longer stored;
a name local to the collection is used instead. This aids portability
between Greenstone sites (as needed when GLI is run as an applet). Parsing
of GA files therefore now needs to add the prefix back in to reconstruct
the full pathname for this installation of Greenstone.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.StaticStrings;
55import org.greenstone.gatherer.util.Utility;
56import org.greenstone.gatherer.valuetree.GValueModel;
57import org.greenstone.gatherer.valuetree.GValueNode;
58import org.w3c.dom.*;
59
60public class GreenstoneArchiveParser {
61
62 private GShell shell;
63 private String file_path;
64
65 static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
66
/**
 * Discards any previously extracted metadata held by the loaded collection and
 * then re-extracts it from the doc.xml files found beneath the collection's
 * archive directory.
 * <p>
 * Nothing is done when there is no collection manager, or when the collection
 * has no extracted metadata set to receive the values.
 *
 * @param progress monitor that is incremented once for each doc.xml processed
 * @param shell    shell through which progress messages are fired
 */
public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
    // The collection manager has to be checked BEFORE it is dereferenced;
    // testing it only after reading c_man.msm is too late to prevent an NPE.
    if (Gatherer.c_man == null || Gatherer.c_man.msm == null) {
        return;
    }

    // We can only extract metadata if an extracted metadata set exists in our collection.
    if (Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) == null) {
        return;
    }

    this.shell = shell;

    // The very first thing we do is remove any existing extracted metadata.
    if (Gatherer.c_man.getCollection() != null) {
        Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
    }

    // Determine the collection archive directory and start the extraction process.
    File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
    processDirectory(archive_directory, progress);
}
83
84 private void processDirectory(File directory, GShellProgressMonitor progress) {
85 // look for a doc.xml file here
86 File document_file = new File(directory, "doc.xml");
87 // Then extract the metadata from it.
88 if(document_file.exists()) {
89 int count = extractMetadata(document_file);
90 // Display a pretty progress message.
91 String[] args = new String[2];
92 args[0] = directory.getName();
93 args[1] = String.valueOf(count);
94 this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
95 args = null;
96 progress.increment();
97 }
98
99 // for each directory here, process it
100 File sub_directories[] = directory.listFiles();
101 for(int i = 0; i < sub_directories.length; i++) {
102 // Find the doc.xml file within
103 if(sub_directories[i].isDirectory()) {
104 processDirectory(sub_directories[i], progress);
105 }
106 }
107 }
108
109 private int extractMetadata(File file) {
110 int count = 0;
111 // Retrieve the DOM of the file.
112 Document document = Utility.parse(file, false);
113
114 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
115 if (document != null) {
116 file_path = null;
117 Element archive_element = document.getDocumentElement();
118 // Retrieve the initial Section element
119 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
120 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
121 if (section_elements.getLength() < 1) {
122 return count;
123 }
124 Element section_element = (Element) section_elements.item(0);
125 section_elements = null;
126
127 // Retrieve all of the Metadata sections
128 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
129 section_element = null;
130
131 // Zip through the retrieved metadata checking for SourceSegment elements
132 // These are a good sign of bibliographic files, which we must handle specially
133 boolean ignore_values = false;
134 for (int i = 0; i < metadata_elements.getLength(); i++) {
135 Element metadata_element = (Element) metadata_elements.item(i);
136 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
137 if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
138 ignore_values = true;
139 break;
140 }
141 }
142
143 // Now for each Metadata entry retrieved...
144 for (int i = 0; i < metadata_elements.getLength(); i++) {
145 Element metadata_element = (Element) metadata_elements.item(i);
146 if (processMetadataElement(metadata_element, ignore_values) == true) {
147 count++;
148 }
149 }
150 }
151
152 return count;
153 }
154
155
156 private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
157 {
158 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
159 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
160 if (name.equals("gsdlsourcefilename")) {
161 file_path = MSMUtils.getValue(metadata_element);
162 if (!file_path.startsWith(File.separator)) {
163 String collection_dir = Utility.getCollectionDir();
164 file_path = collection_dir + file_path;
165 }
166
167 return false;
168 }
169
170 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
171 for (int j = 0; j < ignore_list.length; j++) {
172 if (name.startsWith(ignore_list[j])) {
173 return false;
174 }
175 }
176
177 // namespaced metadata, we don't extract at the moment
178 if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
179 return false;
180 }
181
182 if (file_path == null) {
183 return false;
184 }
185
186 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
187 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
188 if (element == null) {
189 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
190 if (extracted_mds != null) {
191 element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
192 }
193 }
194
195 // If ignore_values is set (bibliographic records) we don't care about the values
196 if (ignore_values == true) {
197 return false;
198 }
199
200 // Retrieve the metadata for the current file
201 File target_file = new File(file_path);
202 String value = "";
203 try {
204 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
205 }
206 catch (IllegalArgumentException error) { // ****
207 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
208 }
209 catch (UnsupportedEncodingException error) {
210 Gatherer.printStackTrace(error);
211 }
212
213 if (value == null) {
214 return false;
215 }
216 // Create a new metadata object.
217 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
218 GValueNode value_node = null;
219 if (value_tree != null) {
220 value_node = value_tree.getValue(value);
221 }
222 else {
223 value_node = new GValueNode(element.toString(), value);
224 }
225
226 Metadata metadata = new Metadata(element, value_node);
227 element.inc();
228 ///ystem.err.println("Adding extracted metadata: " + metadata);
229 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
230
231 return true;
232 }
233
234
235 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
236
237 static public ArrayList extractMetadataElements(File archive_directory) {
238 ArrayList extracted_metadata_elements = new ArrayList();
239 File document_directories[] = archive_directory.listFiles();
240 for(int i = 0; i < document_directories.length; i++) {
241 // Find the doc.xml file within
242 if(document_directories[i].isDirectory()) {
243 File document_file = new File(document_directories[i], "doc.xml");
244 // Then extract the metadata from it.
245 if(document_file.exists()) {
246 try {
247 Document document = Utility.parse(document_file, false);
248 // Retrieve all of the Metadata sections.
249 Element archive_element = document.getDocumentElement();
250 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
251 // Now for each Metadata entry retrieved...
252 for(int j = 0; j < metadata_elements.getLength(); j++) {
253 Element metadata_element = (Element) metadata_elements.item(j);
254 String name = metadata_element.getAttribute("name");
255 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
256 boolean ignore = false;
257 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
258 ignore = name.startsWith(metadata_ignore_list[k]);
259 }
260 if(!ignore && !extracted_metadata_elements.contains(name)) {
261 extracted_metadata_elements.add(name);
262 }
263 name = null;
264 metadata_element = null;
265 }
266 metadata_elements = null;
267 archive_element = null;
268 document = null;
269 }
270 catch (Exception error) {
271 Gatherer.printStackTrace(error);
272 }
273 }
274 document_file = null;
275 }
276 }
277 document_directories = null;
278 return extracted_metadata_elements;
279 }
280}
Note: See TracBrowser for help on using the repository browser.