1 | /**
|
---|
2 | *#########################################################################
|
---|
3 | *
|
---|
4 | * A component of the Gatherer application, part of the Greenstone digital
|
---|
5 | * library suite from the New Zealand Digital Library Project at the
|
---|
6 | * University of Waikato, New Zealand.
|
---|
7 | *
|
---|
8 | * <BR><BR>
|
---|
9 | *
|
---|
10 | * Author: John Thompson, Greenstone Digital Library, University of Waikato
|
---|
11 | *
|
---|
12 | * <BR><BR>
|
---|
13 | *
|
---|
14 | * Copyright (C) 1999 New Zealand Digital Library Project
|
---|
15 | *
|
---|
16 | * <BR><BR>
|
---|
17 | *
|
---|
18 | * This program is free software; you can redistribute it and/or modify
|
---|
19 | * it under the terms of the GNU General Public License as published by
|
---|
20 | * the Free Software Foundation; either version 2 of the License, or
|
---|
21 | * (at your option) any later version.
|
---|
22 | *
|
---|
23 | * <BR><BR>
|
---|
24 | *
|
---|
25 | * This program is distributed in the hope that it will be useful,
|
---|
26 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
27 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
28 | * GNU General Public License for more details.
|
---|
29 | *
|
---|
30 | * <BR><BR>
|
---|
31 | *
|
---|
32 | * You should have received a copy of the GNU General Public License
|
---|
33 | * along with this program; if not, write to the Free Software
|
---|
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
35 | *########################################################################
|
---|
36 | */
|
---|
37 | package org.greenstone.gatherer.msm;
|
---|
38 |
|
---|
39 | import java.io.*;
|
---|
40 | import java.net.*;
|
---|
41 | import java.util.*;
|
---|
42 | import org.greenstone.gatherer.Configuration;
|
---|
43 | import org.greenstone.gatherer.Dictionary;
|
---|
44 | import org.greenstone.gatherer.Gatherer;
|
---|
45 | import org.greenstone.gatherer.collection.Collection;
|
---|
46 | import org.greenstone.gatherer.collection.CollectionManager;
|
---|
47 | import org.greenstone.gatherer.file.FileNode;
|
---|
48 | import org.greenstone.gatherer.msm.ElementWrapper;
|
---|
49 | import org.greenstone.gatherer.msm.MetadataSet;
|
---|
50 | import org.greenstone.gatherer.msm.MetadataSetManager;
|
---|
51 | import org.greenstone.gatherer.msm.MSMUtils;
|
---|
52 | import org.greenstone.gatherer.shell.GShell;
|
---|
53 | import org.greenstone.gatherer.shell.GShellProgressMonitor;
|
---|
54 | import org.greenstone.gatherer.util.StaticStrings;
|
---|
55 | import org.greenstone.gatherer.util.Utility;
|
---|
56 | import org.greenstone.gatherer.valuetree.GValueModel;
|
---|
57 | import org.greenstone.gatherer.valuetree.GValueNode;
|
---|
58 | import org.w3c.dom.*;
|
---|
59 |
|
---|
60 | public class GreenstoneArchiveParser {
|
---|
61 |
|
---|
62 | private GShell shell;
|
---|
63 | private String file_path;
|
---|
64 |
|
---|
65 | static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" };
|
---|
66 |
|
---|
67 | public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
|
---|
68 | // We can only extract metadata if an extracted metadata set exists in our collection.
|
---|
69 | if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
|
---|
70 | this.shell = shell;
|
---|
71 | // The very firstist thing we do is remove any existing extracted metadata
|
---|
72 | if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
|
---|
73 | Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
|
---|
74 | }
|
---|
75 |
|
---|
76 | // Determine the collection archive directory.
|
---|
77 | File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
|
---|
78 | // Start the extraction process
|
---|
79 | processDirectory(archive_directory, progress);
|
---|
80 | }
|
---|
81 | // All done. Outta here like a bald man.
|
---|
82 | }
|
---|
83 |
|
---|
84 | private void processDirectory(File directory, GShellProgressMonitor progress) {
|
---|
85 | // look for a doc.xml file here
|
---|
86 | File document_file = new File(directory, "doc.xml");
|
---|
87 | // Then extract the metadata from it.
|
---|
88 | if(document_file.exists()) {
|
---|
89 | int count = extractMetadata(document_file);
|
---|
90 | // Display a pretty progress message.
|
---|
91 | String[] args = new String[2];
|
---|
92 | args[0] = directory.getName();
|
---|
93 | args[1] = String.valueOf(count);
|
---|
94 | this.shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
|
---|
95 | args = null;
|
---|
96 | progress.increment();
|
---|
97 | }
|
---|
98 |
|
---|
99 | // for each directory here, process it
|
---|
100 | File sub_directories[] = directory.listFiles();
|
---|
101 | for(int i = 0; i < sub_directories.length; i++) {
|
---|
102 | // Find the doc.xml file within
|
---|
103 | if(sub_directories[i].isDirectory()) {
|
---|
104 | processDirectory(sub_directories[i], progress);
|
---|
105 | }
|
---|
106 | }
|
---|
107 | }
|
---|
108 |
|
---|
109 | private int extractMetadata(File file) {
|
---|
110 | int count = 0;
|
---|
111 | // Retrieve the DOM of the file.
|
---|
112 | Document document = Utility.parse(file, false);
|
---|
113 |
|
---|
114 | // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
|
---|
115 | if (document != null) {
|
---|
116 | file_path = null;
|
---|
117 | Element archive_element = document.getDocumentElement();
|
---|
118 | // Retrieve the initial Section element
|
---|
119 | NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
|
---|
120 | // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements
|
---|
121 | if (section_elements.getLength() < 1) {
|
---|
122 | return count;
|
---|
123 | }
|
---|
124 | Element section_element = (Element) section_elements.item(0);
|
---|
125 | section_elements = null;
|
---|
126 |
|
---|
127 | // Retrieve all of the Metadata sections
|
---|
128 | NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
|
---|
129 | section_element = null;
|
---|
130 |
|
---|
131 | // Zip through the retrieved metadata checking for SourceSegment elements
|
---|
132 | // These are a good sign of bibliographic files, which we must handle specially
|
---|
133 | boolean ignore_values = false;
|
---|
134 | for (int i = 0; i < metadata_elements.getLength(); i++) {
|
---|
135 | Element metadata_element = (Element) metadata_elements.item(i);
|
---|
136 | String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
|
---|
137 | if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
|
---|
138 | ignore_values = true;
|
---|
139 | break;
|
---|
140 | }
|
---|
141 | }
|
---|
142 |
|
---|
143 | // Now for each Metadata entry retrieved...
|
---|
144 | for (int i = 0; i < metadata_elements.getLength(); i++) {
|
---|
145 | Element metadata_element = (Element) metadata_elements.item(i);
|
---|
146 | if (processMetadataElement(metadata_element, ignore_values) == true) {
|
---|
147 | count++;
|
---|
148 | }
|
---|
149 | }
|
---|
150 | }
|
---|
151 |
|
---|
152 | return count;
|
---|
153 | }
|
---|
154 |
|
---|
155 |
|
---|
156 | private boolean processMetadataElement(Element metadata_element, boolean ignore_values)
|
---|
157 | {
|
---|
158 | String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
|
---|
159 | // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
|
---|
160 | if (name.equals("gsdlsourcefilename")) {
|
---|
161 | file_path = MSMUtils.getValue(metadata_element);
|
---|
162 | return false;
|
---|
163 | }
|
---|
164 |
|
---|
165 | // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata
|
---|
166 | for (int j = 0; j < ignore_list.length; j++) {
|
---|
167 | if (name.startsWith(ignore_list[j])) {
|
---|
168 | return false;
|
---|
169 | }
|
---|
170 | }
|
---|
171 |
|
---|
172 | // namespaced metadata, we don't extract at the moment
|
---|
173 | if (name.indexOf(MSMUtils.NS_SEP)!=-1) {
|
---|
174 | return false;
|
---|
175 | }
|
---|
176 |
|
---|
177 | if (file_path == null) {
|
---|
178 | return false;
|
---|
179 | }
|
---|
180 |
|
---|
181 | // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
|
---|
182 | ElementWrapper element = Gatherer.c_man.msm.getElement(name);
|
---|
183 | if (element == null) {
|
---|
184 | MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
|
---|
185 | if (extracted_mds != null) {
|
---|
186 | element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
|
---|
187 | }
|
---|
188 | }
|
---|
189 |
|
---|
190 | // If ignore_values is set (bibliographic records) we don't care about the values
|
---|
191 | if (ignore_values == true) {
|
---|
192 | return false;
|
---|
193 | }
|
---|
194 |
|
---|
195 | // Retrieve the metadata for the current file
|
---|
196 | File target_file = new File(file_path);
|
---|
197 | String value = "";
|
---|
198 | try {
|
---|
199 | value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
|
---|
200 | }
|
---|
201 | catch (IllegalArgumentException error) { // ****
|
---|
202 | value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element));
|
---|
203 | }
|
---|
204 | catch (UnsupportedEncodingException error) {
|
---|
205 | Gatherer.printStackTrace(error);
|
---|
206 | }
|
---|
207 |
|
---|
208 | if (value == null) {
|
---|
209 | return false;
|
---|
210 | }
|
---|
211 | // Create a new metadata object.
|
---|
212 | GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
|
---|
213 | GValueNode value_node = null;
|
---|
214 | if (value_tree != null) {
|
---|
215 | value_node = value_tree.getValue(value);
|
---|
216 | }
|
---|
217 | else {
|
---|
218 | value_node = new GValueNode(element.toString(), value);
|
---|
219 | }
|
---|
220 |
|
---|
221 | Metadata metadata = new Metadata(element, value_node);
|
---|
222 | element.inc();
|
---|
223 | ///ystem.err.println("Adding extracted metadata: " + metadata);
|
---|
224 | Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
|
---|
225 |
|
---|
226 | return true;
|
---|
227 | }
|
---|
228 |
|
---|
229 |
|
---|
230 | static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
|
---|
231 |
|
---|
232 | static public ArrayList extractMetadataElements(File archive_directory) {
|
---|
233 | ArrayList extracted_metadata_elements = new ArrayList();
|
---|
234 | File document_directories[] = archive_directory.listFiles();
|
---|
235 | for(int i = 0; i < document_directories.length; i++) {
|
---|
236 | // Find the doc.xml file within
|
---|
237 | if(document_directories[i].isDirectory()) {
|
---|
238 | File document_file = new File(document_directories[i], "doc.xml");
|
---|
239 | // Then extract the metadata from it.
|
---|
240 | if(document_file.exists()) {
|
---|
241 | try {
|
---|
242 | Document document = Utility.parse(document_file, false);
|
---|
243 | // Retrieve all of the Metadata sections.
|
---|
244 | Element archive_element = document.getDocumentElement();
|
---|
245 | NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
|
---|
246 | // Now for each Metadata entry retrieved...
|
---|
247 | for(int j = 0; j < metadata_elements.getLength(); j++) {
|
---|
248 | Element metadata_element = (Element) metadata_elements.item(j);
|
---|
249 | String name = metadata_element.getAttribute("name");
|
---|
250 | // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
|
---|
251 | boolean ignore = false;
|
---|
252 | for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
|
---|
253 | ignore = name.startsWith(metadata_ignore_list[k]);
|
---|
254 | }
|
---|
255 | if(!ignore && !extracted_metadata_elements.contains(name)) {
|
---|
256 | extracted_metadata_elements.add(name);
|
---|
257 | }
|
---|
258 | name = null;
|
---|
259 | metadata_element = null;
|
---|
260 | }
|
---|
261 | metadata_elements = null;
|
---|
262 | archive_element = null;
|
---|
263 | document = null;
|
---|
264 | }
|
---|
265 | catch (Exception error) {
|
---|
266 | Gatherer.printStackTrace(error);
|
---|
267 | }
|
---|
268 | }
|
---|
269 | document_file = null;
|
---|
270 | }
|
---|
271 | }
|
---|
272 | document_directories = null;
|
---|
273 | return extracted_metadata_elements;
|
---|
274 | }
|
---|
275 | }
|
---|