source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 5581

Last change on this file since 5581 was 5564, checked in by mdewsnip, 21 years ago

Many more small improvements and tooltips added. Still more to come!

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.Utility;
55import org.greenstone.gatherer.valuetree.GValueModel;
56import org.greenstone.gatherer.valuetree.GValueNode;
57import org.w3c.dom.*;
58
59public class GreenstoneArchiveParser {
60
61 private GShell shell;
62
63 static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"}; //"Source",
64
65 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
66 // We can only extract metadata if an extracted metadata set exists in our collection.
67 if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
68 this.shell = shell;
69 // Determine the collection archive directory.
70 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
71 // For each of the hash coded directories within.
72 File document_directories[] = archive_directory.listFiles();
73 for(int i = 0; i < document_directories.length; i++) {
74 // Find the doc.xml file within
75 if(document_directories[i].isDirectory()) {
76 File document_file = new File(document_directories[i], "doc.xml");
77 // Then extract the metadata from it.
78 if(document_file.exists()) {
79 int count = extractMetadata(document_file);
80 // Display a pretty progress message.
81 String[] args = new String[2];
82 args[0] = document_directories[i].getName();
83 args[1] = String.valueOf(count);
84 shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.newget("GShell.Extracted", args), GShell.OK);
85 args = null;
86 progress.increment();
87 }
88 }
89 }
90 }
91 // All done. Outta here like a bald man.
92 }
93
94 private int extractMetadata(File file) {
95 int count = 0;
96 // Retrieve the DOM of the file.
97 Document document = Utility.parse(file, false);
98 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
99 if(document != null) {
100 String file_path = null;
101 Element archive_element = document.getDocumentElement();
102 // Retrieve all of the Metadata sections.
103 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
104 // Now for each Metadata entry retrieved...
105 for(int i = 0; i < metadata_elements.getLength(); i++) {
106 Element metadata_element = (Element) metadata_elements.item(i);
107 String name = metadata_element.getAttribute("name");
108 // There is a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
109 if(name.equals("gsdlsourcefilename")) {
110 file_path = MSMUtils.getValue(metadata_element);
111 }
112 else {
113 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
114 boolean ignore = false;
115 for(int j = 0; !ignore && j < ignore_list.length; j++) {
116 ignore = name.startsWith(ignore_list[j]);
117 }
118 // Otherwise ensure the metadata is present in our collection.
119 if(!ignore && file_path != null) {
120 // If we successfully retrieved a record we can continue.
121 if(file_path != null) {
122 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
123 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
124 if(element == null) {
125 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
126 if(extracted_mds != null) {
127 element = extracted_mds.addElement(name, Gatherer.config.interface_language);
128 }
129 }
130 // If we successfully retrieved an element (and we should have) we can continue.
131 if(element != null) {
132 // Retrieve the metadata for the current file
133 File target_file = new File(file_path);
134 ArrayList metadatum = Gatherer.c_man.getCollection().gdm.getMetadata(target_file);
135 // If no metadata exists for the current element, add it
136 boolean found = false;
137 for(int k = 0; !found && k < metadatum.size(); k++) {
138 Metadata sibling = (Metadata) metadatum.get(k);
139 found = element.equals(sibling.getElement());
140 }
141 metadatum = null;
142 if(!found) {
143 String value = "";
144 try {
145 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
146 }
147 catch(UnsupportedEncodingException error) {
148 Gatherer.printStackTrace(error);
149 }
150 // If we successfully retrieved a value we can continue.
151 if(value != null) {
152 // Create a new metadata object.
153 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
154 GValueNode value_node = null;
155 if(value_tree != null) {
156 value_node = value_tree.getValue(value);
157 }
158 else {
159 value_node = new GValueNode(element.toString(), value);
160 }
161 Metadata metadata = new Metadata(element, value_node);
162 Gatherer.c_man.getCollection().gdm.metadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
163 count++;
164 // All done. On to next metadata.
165 }
166 }
167 target_file = null;
168 }
169 else {
170 Gatherer.println("Cannot retrieve metadata element " + name);
171 }
172 }
173 }
174 }
175 }
176 }
177 return count;
178 }
179
180 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
181
182 static public ArrayList extractMetadataElements(File archive_directory) {
183 ArrayList extracted_metadata_elements = new ArrayList();
184 File document_directories[] = archive_directory.listFiles();
185 for(int i = 0; i < document_directories.length; i++) {
186 // Find the doc.xml file within
187 if(document_directories[i].isDirectory()) {
188 File document_file = new File(document_directories[i], "doc.xml");
189 // Then extract the metadata from it.
190 if(document_file.exists()) {
191 try {
192 Document document = Utility.parse(document_file, false);
193 // Retrieve all of the Metadata sections.
194 Element archive_element = document.getDocumentElement();
195 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
196 // Now for each Metadata entry retrieved...
197 for(int j = 0; j < metadata_elements.getLength(); j++) {
198 Element metadata_element = (Element) metadata_elements.item(j);
199 String name = metadata_element.getAttribute("name");
200 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
201 boolean ignore = false;
202 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
203 ignore = name.startsWith(metadata_ignore_list[k]);
204 }
205 if(!ignore && !extracted_metadata_elements.contains(name)) {
206 extracted_metadata_elements.add(name);
207 }
208 name = null;
209 metadata_element = null;
210 }
211 metadata_elements = null;
212 archive_element = null;
213 document = null;
214 }
215 catch (Exception error) {
216 Gatherer.printStackTrace(error);
217 }
218 }
219 document_file = null;
220 }
221 }
222 document_directories = null;
223 return extracted_metadata_elements;
224 }
225}
Note: See TracBrowser for help on using the repository browser.