source: trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java@ 4319

Last change on this file since 4319 was 4319, checked in by jmt12, 21 years ago

bug fix

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1package org.greenstone.gatherer.msm;
2/**
3 *#########################################################################
4 *
5 * A component of the Gatherer application, part of the Greenstone digital
6 * library suite from the New Zealand Digital Library Project at the
7 * University of Waikato, New Zealand.
8 *
9 * <BR><BR>
10 *
11 * Author: John Thompson, Greenstone Digital Library, University of Waikato
12 *
13 * <BR><BR>
14 *
15 * Copyright (C) 1999 New Zealand Digital Library Project
16 *
17 * <BR><BR>
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation; either version 2 of the License, or
22 * (at your option) any later version.
23 *
24 * <BR><BR>
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * <BR><BR>
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with this program; if not, write to the Free Software
35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
36 *########################################################################
37 */
38import java.io.*;
39import java.net.*;
40import java.util.*;
41import org.greenstone.gatherer.Gatherer;
42import org.greenstone.gatherer.collection.Collection;
43import org.greenstone.gatherer.collection.CollectionManager;
44import org.greenstone.gatherer.file.FileNode;
45import org.greenstone.gatherer.msm.ElementWrapper;
46import org.greenstone.gatherer.msm.MetadataSet;
47import org.greenstone.gatherer.msm.MetadataSetManager;
48import org.greenstone.gatherer.msm.MSMUtils;
49import org.greenstone.gatherer.shell.GShell;
50import org.greenstone.gatherer.shell.GShellProgressMonitor;
51import org.greenstone.gatherer.util.Utility;
52import org.greenstone.gatherer.valuetree.GValueModel;
53import org.greenstone.gatherer.valuetree.GValueNode;
54import org.w3c.dom.*;
55
56public class GreenstoneArchiveParser {
57
58 private GShell shell;
59
60 static final String ignore_list[] = {"assocfilepath","gsdl","Identifier","Source","URL"};
61
62 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
63 // We can only extract metadata if an extracted metadata set exists in our collection.
64 if(Gatherer.c_man.msm.getSet("") != null) {
65 this.shell = shell;
66 // Determine the collection archive directory.
67 File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
68 // For each of the hash coded directories within.
69 File document_directories[] = archive_directory.listFiles();
70 for(int i = 0; i < document_directories.length; i++) {
71 // Find the doc.xml file within
72 if(document_directories[i].isDirectory()) {
73 File document_file = new File(document_directories[i], "doc.xml");
74 // Then extract the metadata from it.
75 if(document_file.exists()) {
76 extractMetadata(document_file);
77 // Display a pretty progress message.
78 shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Gatherer.dictionary.get("GShell.Extracted", document_directories[i].getName()), GShell.OK);
79 progress.increment();
80 }
81 }
82 }
83 }
84 // All done. Outta here like a bald man.
85 }
86
87 private void extractMetadata(File file) {
88 // Retrieve the DOM of the file.
89 Document document = Utility.parse(file, false);
90 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
91 if(document != null) {
92 String file_path = null;
93 Element archive_element = document.getDocumentElement();
94 // Retrieve all of the Metadata sections.
95 NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
96 // Now for each Metadata entry retrieved...
97 for(int i = 0; i < metadata_elements.getLength(); i++) {
98 Element metadata_element = (Element) metadata_elements.item(i);
99 String name = metadata_element.getAttribute("name");
100 // There is a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
101 if(name.equals("gsdlsourcefilename")) {
102 file_path = MSMUtils.getValue(metadata_element);
103 }
104 else {
105 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
106 boolean ignore = (name.indexOf(".") != -1);
107 for(int j = 0; !ignore && j < ignore_list.length; j++) {
108 ignore = name.startsWith(ignore_list[j]);
109 }
110 // Otherwise ensure the metadata is present in our collection.
111 if(!ignore && file_path != null) {
112 // If we successfully retrieved a record we can continue.
113 if(file_path != null) {
114 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
115 ElementWrapper element = Gatherer.c_man.msm.getElement(name);
116 if(element == null) {
117 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet("ex");
118 element = extracted_mds.addElement(name);
119 }
120 // If we successfully retrieved an element (and we should have) we can continue.
121 if(element != null) {
122 // Retrieve the metadata for the current file
123 File target_file = new File(file_path);
124 ArrayList metadatum = Gatherer.c_man.getCollection().gdm.getMetadata(target_file);
125 // If no metadata exists for the current element, add it
126 boolean found = false;
127 for(int k = 0; !found && k < metadatum.size(); k++) {
128 Metadata sibling = (Metadata) metadatum.get(k);
129 found = element.equals(sibling.getElement());
130 }
131 metadatum = null;
132 if(!found) {
133 String value = "";
134 try {
135 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
136 }
137 catch(UnsupportedEncodingException error) {
138 Gatherer.printStackTrace(error);
139 }
140 // If we successfully retrieved a value we can continue.
141 if(value != null) {
142 // Create a new metadata object.
143 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
144 GValueNode value_node = null;
145 if(value_tree != null) {
146 value_node = value_tree.getValue(value);
147 }
148 else {
149 value_node = new GValueNode(element.toString(), value);
150 }
151 Metadata metadata = new Metadata(element, value_node);
152 Gatherer.c_man.getCollection().gdm.metadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
153 // All done. On to next metadata.
154 }
155 }
156 target_file = null;
157 }
158 else {
159 Gatherer.println("Cannot retrieve metadata element " + name);
160 }
161 }
162 }
163 }
164 }
165 }
166 }
167}
Note: See TracBrowser for help on using the repository browser.