source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 6639

Last change on this file since 6639 was 6639, checked in by jmt12, 20 years ago

Prevent hand entered section metadata being extracted in the same way that the top level metadata is

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.msm;
38
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import org.greenstone.gatherer.Configuration;
43import org.greenstone.gatherer.Dictionary;
44import org.greenstone.gatherer.Gatherer;
45import org.greenstone.gatherer.collection.Collection;
46import org.greenstone.gatherer.collection.CollectionManager;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.msm.ElementWrapper;
49import org.greenstone.gatherer.msm.MetadataSet;
50import org.greenstone.gatherer.msm.MetadataSetManager;
51import org.greenstone.gatherer.msm.MSMUtils;
52import org.greenstone.gatherer.shell.GShell;
53import org.greenstone.gatherer.shell.GShellProgressMonitor;
54import org.greenstone.gatherer.util.StaticStrings;
55import org.greenstone.gatherer.util.Utility;
56import org.greenstone.gatherer.valuetree.GValueModel;
57import org.greenstone.gatherer.valuetree.GValueNode;
58import org.w3c.dom.*;
59
60public class GreenstoneArchiveParser {
61
62 private GShell shell;
63
64 static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source",
65
66 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
67 // We can only extract metadata if an extracted metadata set exists in our collection.
68 if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
69 this.shell = shell;
70 // The very firstist thing we do is remove any existing extracted metadata
71 if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
72 Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
73 }
74
75 // Determine the collection archive directory.
76 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
77 // For each of the hash coded directories within.
78 File document_directories[] = archive_directory.listFiles();
79 for(int i = 0; i < document_directories.length; i++) {
80 // Find the doc.xml file within
81 if(document_directories[i].isDirectory()) {
82 File document_file = new File(document_directories[i], "doc.xml");
83 // Then extract the metadata from it.
84 if(document_file.exists()) {
85 int count = extractMetadata(document_file);
86 // Display a pretty progress message.
87 String[] args = new String[2];
88 args[0] = document_directories[i].getName();
89 args[1] = String.valueOf(count);
90 shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
91 args = null;
92 progress.increment();
93 }
94 }
95 }
96 }
97 // All done. Outta here like a bald man.
98 }
99
100 private int extractMetadata(File file) {
101 int count = 0;
102 // Retrieve the DOM of the file.
103 Document document = Utility.parse(file, false);
104
105 Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
106 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
107 if(document != null) {
108 String file_path = null;
109 Element archive_element = document.getDocumentElement();
110 // Retrieve the initial Section element
111 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
112 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements.
113 if(section_elements.getLength() < 1) {
114 return count;
115 }
116 Element section_element = (Element) section_elements.item(0);
117 section_elements = null;
118 // Retrieve all of the Metadata sections.
119 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
120 section_element = null;
121 // We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!)
122 for(int i = 0; i < metadata_elements.getLength(); i++) {
123 Element metadata_element = (Element) metadata_elements.item(i);
124 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
125 if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
126 return 0;
127 }
128 }
129 // Now for each Metadata entry retrieved...
130 for(int i = 0; i < metadata_elements.getLength(); i++) {
131 Element metadata_element = (Element) metadata_elements.item(i);
132 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
133 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
134 if(name.equals("gsdlsourcefilename")) {
135 file_path = MSMUtils.getValue(metadata_element);
136 }
137 else {
138 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
139 boolean ignore = false;
140 for(int j = 0; !ignore && j < ignore_list.length; j++) {
141 ignore = name.startsWith(ignore_list[j]);
142 }
143 // Otherwise ensure the metadata is present in our collection.
144 if(!ignore && file_path != null) {
145 // If we successfully retrieved a record we can continue.
146 if(file_path != null) {
147 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
148 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
149 if(element == null) {
150 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
151 if(extracted_mds != null) {
152 element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
153 }
154 }
155 // If we successfully retrieved an element (and we should have) we can continue.
156 // WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy.
157 if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
158 // Retrieve the metadata for the current file
159 File target_file = new File(file_path);
160 String value = "";
161 try {
162 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
163 }
164 catch(UnsupportedEncodingException error) {
165 Gatherer.printStackTrace(error);
166 }
167 // If we successfully retrieved a value we can continue.
168 if(value != null) {
169 // Create a new metadata object.
170 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
171 GValueNode value_node = null;
172 if(value_tree != null) {
173 value_node = value_tree.getValue(value);
174 }
175 else {
176 value_node = new GValueNode(element.toString(), value);
177 }
178 Metadata metadata = new Metadata(element, value_node);
179 element.inc();
180 ///ystem.err.println("Adding extracted metadata: " + metadata);
181 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
182 count++;
183 // All done. On to next metadata.
184 }
185 value = null;
186 target_file = null;
187 }
188 else {
189 Gatherer.println("Cannot retrieve metadata element " + name);
190 }
191 }
192 }
193 }
194 }
195 }
196 return count;
197 }
198
199 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
200
201 static public ArrayList extractMetadataElements(File archive_directory) {
202 ArrayList extracted_metadata_elements = new ArrayList();
203 File document_directories[] = archive_directory.listFiles();
204 for(int i = 0; i < document_directories.length; i++) {
205 // Find the doc.xml file within
206 if(document_directories[i].isDirectory()) {
207 File document_file = new File(document_directories[i], "doc.xml");
208 // Then extract the metadata from it.
209 if(document_file.exists()) {
210 try {
211 Document document = Utility.parse(document_file, false);
212 // Retrieve all of the Metadata sections.
213 Element archive_element = document.getDocumentElement();
214 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
215 // Now for each Metadata entry retrieved...
216 for(int j = 0; j < metadata_elements.getLength(); j++) {
217 Element metadata_element = (Element) metadata_elements.item(j);
218 String name = metadata_element.getAttribute("name");
219 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
220 boolean ignore = false;
221 for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
222 ignore = name.startsWith(metadata_ignore_list[k]);
223 }
224 if(!ignore && !extracted_metadata_elements.contains(name)) {
225 extracted_metadata_elements.add(name);
226 }
227 name = null;
228 metadata_element = null;
229 }
230 metadata_elements = null;
231 archive_element = null;
232 document = null;
233 }
234 catch (Exception error) {
235 Gatherer.printStackTrace(error);
236 }
237 }
238 document_file = null;
239 }
240 }
241 document_directories = null;
242 return extracted_metadata_elements;
243 }
244}
Note: See TracBrowser for help on using the repository browser.