source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 6877

Last change on this file since 6877 was 6859, checked in by kjdon, 20 years ago

Added in David's fix for something which I can't remember - now catches another exception type.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.StaticStrings;
55import org.greenstone.gatherer.util.Utility;
56import org.greenstone.gatherer.valuetree.GValueModel;
57import org.greenstone.gatherer.valuetree.GValueNode;
58import org.w3c.dom.*;
59
60public class GreenstoneArchiveParser {
61
62 private GShell shell;
63
64 static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source",
65
66 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
67 // We can only extract metadata if an extracted metadata set exists in our collection.
68 if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
69 this.shell = shell;
70 // The very firstist thing we do is remove any existing extracted metadata
71 if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
72 Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
73 }
74
75 // Determine the collection archive directory.
76 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
77 // For each of the hash coded directories within.
78 File document_directories[] = archive_directory.listFiles();
79 for(int i = 0; i < document_directories.length; i++) {
80 // Find the doc.xml file within
81 if(document_directories[i].isDirectory()) {
82 File document_file = new File(document_directories[i], "doc.xml");
83 // Then extract the metadata from it.
84 if(document_file.exists()) {
85 int count = extractMetadata(document_file);
86 // Display a pretty progress message.
87 String[] args = new String[2];
88 args[0] = document_directories[i].getName();
89 args[1] = String.valueOf(count);
90 shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
91 args = null;
92 progress.increment();
93 }
94 }
95 }
96 }
97 // All done. Outta here like a bald man.
98 }
99
100 private int extractMetadata(File file) {
101 int count = 0;
102 // Retrieve the DOM of the file.
103 Document document = Utility.parse(file, false);
104
105 Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
106 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
107 if(document != null) {
108 String file_path = null;
109 Element archive_element = document.getDocumentElement();
110 // Retrieve the initial Section element
111 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
112 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements.
113 if(section_elements.getLength() < 1) {
114 return count;
115 }
116 Element section_element = (Element) section_elements.item(0);
117 section_elements = null;
118 // Retrieve all of the Metadata sections.
119 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
120 section_element = null;
121 // We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!)
122 for(int i = 0; i < metadata_elements.getLength(); i++) {
123 Element metadata_element = (Element) metadata_elements.item(i);
124 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
125 if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
126 return 0;
127 }
128 }
129 // Now for each Metadata entry retrieved...
130 for(int i = 0; i < metadata_elements.getLength(); i++) {
131 Element metadata_element = (Element) metadata_elements.item(i);
132 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
133 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
134 if(name.equals("gsdlsourcefilename")) {
135 file_path = MSMUtils.getValue(metadata_element);
136 }
137 else {
138 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
139 boolean ignore = false;
140 for(int j = 0; !ignore && j < ignore_list.length; j++) {
141 ignore = name.startsWith(ignore_list[j]);
142 }
143 // Otherwise ensure the metadata is present in our collection.
144 if(!ignore && file_path != null) {
145 // If we successfully retrieved a record we can continue.
146 if(file_path != null) {
147 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
148 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
149 if(element == null) {
150 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
151 if(extracted_mds != null) {
152 element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
153 }
154 }
155 // If we successfully retrieved an element (and we should have) we can continue.
156 // WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy.
157 if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
158 // Retrieve the metadata for the current file
159 File target_file = new File(file_path);
160 String value = "";
161 try {
162 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
163 }
164 catch(IllegalArgumentException error) { // ****
165 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
166 }
167 catch(UnsupportedEncodingException error) {
168 Gatherer.printStackTrace(error);
169 }
170 // If we successfully retrieved a value we can continue.
171 if(value != null) {
172 // Create a new metadata object.
173 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
174 GValueNode value_node = null;
175 if(value_tree != null) {
176 value_node = value_tree.getValue(value);
177 }
178 else {
179 value_node = new GValueNode(element.toString(), value);
180 }
181 Metadata metadata = new Metadata(element, value_node);
182 element.inc();
183 ///ystem.err.println("Adding extracted metadata: " + metadata);
184 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
185 count++;
186 // All done. On to next metadata.
187 }
188 value = null;
189 target_file = null;
190 }
191 else {
192 Gatherer.println("Cannot retrieve metadata element " + name);
193 }
194 }
195 }
196 }
197 }
198 }
199 return count;
200 }
201
202 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
203
204 static public ArrayList extractMetadataElements(File archive_directory) {
205 ArrayList extracted_metadata_elements = new ArrayList();
206 File document_directories[] = archive_directory.listFiles();
207 for(int i = 0; i < document_directories.length; i++) {
208 // Find the doc.xml file within
209 if(document_directories[i].isDirectory()) {
210 File document_file = new File(document_directories[i], "doc.xml");
211 // Then extract the metadata from it.
212 if(document_file.exists()) {
213 try {
214 Document document = Utility.parse(document_file, false);
215 // Retrieve all of the Metadata sections.
216 Element archive_element = document.getDocumentElement();
217 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
218 // Now for each Metadata entry retrieved...
219 for(int j = 0; j < metadata_elements.getLength(); j++) {
220 Element metadata_element = (Element) metadata_elements.item(j);
221 String name = metadata_element.getAttribute("name");
222 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
223 boolean ignore = false;
224 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
225 ignore = name.startsWith(metadata_ignore_list[k]);
226 }
227 if(!ignore && !extracted_metadata_elements.contains(name)) {
228 extracted_metadata_elements.add(name);
229 }
230 name = null;
231 metadata_element = null;
232 }
233 metadata_elements = null;
234 archive_element = null;
235 document = null;
236 }
237 catch (Exception error) {
238 Gatherer.printStackTrace(error);
239 }
240 }
241 document_file = null;
242 }
243 }
244 document_directories = null;
245 return extracted_metadata_elements;
246 }
247}
Note: See TracBrowser for help on using the repository browser.