source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 4436

Last change on this file since 4436 was 4369, checked in by jmt12, 21 years ago

Bug fix to prevent the absence of the extracted metadata set causing an NPE during import phase - John

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1package org.greenstone.gatherer.msm;
2/**
3 *#########################################################################
4 *
5 * A component of the Gatherer application, part of the Greenstone digital
6 * library suite from the New Zealand Digital Library Project at the
7 * University of Waikato, New Zealand.
8 *
9 * <BR><BR>
10 *
11 * Author: John Thompson, Greenstone Digital Library, University of Waikato
12 *
13 * <BR><BR>
14 *
15 * Copyright (C) 1999 New Zealand Digital Library Project
16 *
17 * <BR><BR>
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation; either version 2 of the License, or
22 * (at your option) any later version.
23 *
24 * <BR><BR>
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * <BR><BR>
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with this program; if not, write to the Free Software
35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
36 *########################################################################
37 */
38import java.io.*;
39import java.net.*;
40import java.util.*;
41import org.greenstone.gatherer.Gatherer;
42import org.greenstone.gatherer.collection.Collection;
43import org.greenstone.gatherer.collection.CollectionManager;
44import org.greenstone.gatherer.file.FileNode;
45import org.greenstone.gatherer.msm.ElementWrapper;
46import org.greenstone.gatherer.msm.MetadataSet;
47import org.greenstone.gatherer.msm.MetadataSetManager;
48import org.greenstone.gatherer.msm.MSMUtils;
49import org.greenstone.gatherer.shell.GShell;
50import org.greenstone.gatherer.shell.GShellProgressMonitor;
51import org.greenstone.gatherer.util.Utility;
52import org.greenstone.gatherer.valuetree.GValueModel;
53import org.greenstone.gatherer.valuetree.GValueNode;
54import org.w3c.dom.*;
55
56public class GreenstoneArchiveParser {
57
58 private GShell shell;
59
60 static final String ignore_list[] = {"assocfilepath","gsdl","Identifier","Source","URL"};
61
62 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
63 // We can only extract metadata if an extracted metadata set exists in our collection.
64 if(Gatherer.c_man.msm.getSet("") != null) {
65 this.shell = shell;
66 // Determine the collection archive directory.
67 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
68 // For each of the hash coded directories within.
69 File document_directories[] = archive_directory.listFiles();
70 for(int i = 0; i < document_directories.length; i++) {
71 // Find the doc.xml file within
72 if(document_directories[i].isDirectory()) {
73 File document_file = new File(document_directories[i], "doc.xml");
74 // Then extract the metadata from it.
75 if(document_file.exists()) {
76 extractMetadata(document_file);
77 // Display a pretty progress message.
78 shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Gatherer.dictionary.get("GShell.Extracted", document_directories[i].getName()), GShell.OK);
79 progress.increment();
80 }
81 }
82 }
83 }
84 // All done. Outta here like a bald man.
85 }
86
87 private void extractMetadata(File file) {
88 // Retrieve the DOM of the file.
89 Document document = Utility.parse(file, false);
90 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
91 if(document != null) {
92 String file_path = null;
93 Element archive_element = document.getDocumentElement();
94 // Retrieve all of the Metadata sections.
95 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
96 // Now for each Metadata entry retrieved...
97 for(int i = 0; i < metadata_elements.getLength(); i++) {
98 Element metadata_element = (Element) metadata_elements.item(i);
99 String name = metadata_element.getAttribute("name");
100 // There is a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
101 if(name.equals("gsdlsourcefilename")) {
102 file_path = MSMUtils.getValue(metadata_element);
103 }
104 else {
105 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
106 boolean ignore = (name.indexOf(".") != -1);
107 for(int j = 0; !ignore && j < ignore_list.length; j++) {
108 ignore = name.startsWith(ignore_list[j]);
109 }
110 // Otherwise ensure the metadata is present in our collection.
111 if(!ignore && file_path != null) {
112 // If we successfully retrieved a record we can continue.
113 if(file_path != null) {
114 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
115 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
116 if(element == null) {
117 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet("ex");
118 if(extracted_mds != null) {
119 element = extracted_mds.addElement(name);
120 }
121 }
122 // If we successfully retrieved an element (and we should have) we can continue.
123 if(element != null) {
124 // Retrieve the metadata for the current file
125 File target_file = new File(file_path);
126 ArrayList metadatum = Gatherer.c_man.getCollection().gdm.getMetadata(target_file);
127 // If no metadata exists for the current element, add it
128 boolean found = false;
129 for(int k = 0; !found && k < metadatum.size(); k++) {
130 Metadata sibling = (Metadata) metadatum.get(k);
131 found = element.equals(sibling.getElement());
132 }
133 metadatum = null;
134 if(!found) {
135 String value = "";
136 try {
137 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
138 }
139 catch(UnsupportedEncodingException error) {
140 Gatherer.printStackTrace(error);
141 }
142 // If we successfully retrieved a value we can continue.
143 if(value != null) {
144 // Create a new metadata object.
145 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
146 GValueNode value_node = null;
147 if(value_tree != null) {
148 value_node = value_tree.getValue(value);
149 }
150 else {
151 value_node = new GValueNode(element.toString(), value);
152 }
153 Metadata metadata = new Metadata(element, value_node);
154 Gatherer.c_man.getCollection().gdm.metadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
155 // All done. On to next metadata.
156 }
157 }
158 target_file = null;
159 }
160 else {
161 Gatherer.println("Cannot retrieve metadata element " + name);
162 }
163 }
164 }
165 }
166 }
167 }
168 }
169}
Note: See TracBrowser for help on using the repository browser.