[4293] | 1 | package org.greenstone.gatherer.msm;
|
---|
| 2 | /**
|
---|
| 3 | *#########################################################################
|
---|
| 4 | *
|
---|
| 5 | * A component of the Gatherer application, part of the Greenstone digital
|
---|
| 6 | * library suite from the New Zealand Digital Library Project at the
|
---|
| 7 | * University of Waikato, New Zealand.
|
---|
| 8 | *
|
---|
| 9 | * <BR><BR>
|
---|
| 10 | *
|
---|
| 11 | * Author: John Thompson, Greenstone Digital Library, University of Waikato
|
---|
| 12 | *
|
---|
| 13 | * <BR><BR>
|
---|
| 14 | *
|
---|
| 15 | * Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 16 | *
|
---|
| 17 | * <BR><BR>
|
---|
| 18 | *
|
---|
| 19 | * This program is free software; you can redistribute it and/or modify
|
---|
| 20 | * it under the terms of the GNU General Public License as published by
|
---|
| 21 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 22 | * (at your option) any later version.
|
---|
| 23 | *
|
---|
| 24 | * <BR><BR>
|
---|
| 25 | *
|
---|
| 26 | * This program is distributed in the hope that it will be useful,
|
---|
| 27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 29 | * GNU General Public License for more details.
|
---|
| 30 | *
|
---|
| 31 | * <BR><BR>
|
---|
| 32 | *
|
---|
| 33 | * You should have received a copy of the GNU General Public License
|
---|
| 34 | * along with this program; if not, write to the Free Software
|
---|
| 35 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 36 | *########################################################################
|
---|
| 37 | */
|
---|
| 38 | import java.io.*;
|
---|
| 39 | import java.net.*;
|
---|
| 40 | import java.util.*;
|
---|
| 41 | import org.greenstone.gatherer.Gatherer;
|
---|
| 42 | import org.greenstone.gatherer.collection.Collection;
|
---|
| 43 | import org.greenstone.gatherer.collection.CollectionManager;
|
---|
| 44 | import org.greenstone.gatherer.file.FileNode;
|
---|
| 45 | import org.greenstone.gatherer.msm.ElementWrapper;
|
---|
| 46 | import org.greenstone.gatherer.msm.MetadataSet;
|
---|
| 47 | import org.greenstone.gatherer.msm.MetadataSetManager;
|
---|
| 48 | import org.greenstone.gatherer.msm.MSMUtils;
|
---|
| 49 | import org.greenstone.gatherer.shell.GShell;
|
---|
| 50 | import org.greenstone.gatherer.shell.GShellProgressMonitor;
|
---|
| 51 | import org.greenstone.gatherer.util.Utility;
|
---|
| 52 | import org.greenstone.gatherer.valuetree.GValueModel;
|
---|
| 53 | import org.greenstone.gatherer.valuetree.GValueNode;
|
---|
| 54 | import org.w3c.dom.*;
|
---|
| 55 |
|
---|
| 56 | public class GreenstoneArchiveParser {
|
---|
| 57 |
|
---|
[4365] | 58 | private GShell shell;
|
---|
[4293] | 59 |
|
---|
[4365] | 60 | static final String ignore_list[] = {"assocfilepath","gsdl","Identifier","Source","URL"};
|
---|
[4293] | 61 |
|
---|
[4365] | 62 | public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
|
---|
| 63 | // We can only extract metadata if an extracted metadata set exists in our collection.
|
---|
| 64 | if(Gatherer.c_man.msm.getSet("") != null) {
|
---|
| 65 | this.shell = shell;
|
---|
[4319] | 66 | // Determine the collection archive directory.
|
---|
[4365] | 67 | File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
|
---|
[4319] | 68 | // For each of the hash coded directories within.
|
---|
[4365] | 69 | File document_directories[] = archive_directory.listFiles();
|
---|
| 70 | for(int i = 0; i < document_directories.length; i++) {
|
---|
| 71 | // Find the doc.xml file within
|
---|
| 72 | if(document_directories[i].isDirectory()) {
|
---|
| 73 | File document_file = new File(document_directories[i], "doc.xml");
|
---|
| 74 | // Then extract the metadata from it.
|
---|
| 75 | if(document_file.exists()) {
|
---|
| 76 | extractMetadata(document_file);
|
---|
| 77 | // Display a pretty progress message.
|
---|
| 78 | shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Gatherer.dictionary.get("GShell.Extracted", document_directories[i].getName()), GShell.OK);
|
---|
| 79 | progress.increment();
|
---|
| 80 | }
|
---|
| 81 | }
|
---|
| 82 | }
|
---|
| 83 | }
|
---|
| 84 | // All done. Outta here like a bald man.
|
---|
| 85 | }
|
---|
[4293] | 86 |
|
---|
[4365] | 87 | private void extractMetadata(File file) {
|
---|
| 88 | // Retrieve the DOM of the file.
|
---|
| 89 | Document document = Utility.parse(file, false);
|
---|
| 90 | // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
|
---|
| 91 | if(document != null) {
|
---|
| 92 | String file_path = null;
|
---|
| 93 | Element archive_element = document.getDocumentElement();
|
---|
[4293] | 94 | // Retrieve all of the Metadata sections.
|
---|
[4365] | 95 | NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
|
---|
[4293] | 96 | // Now for each Metadata entry retrieved...
|
---|
[4365] | 97 | for(int i = 0; i < metadata_elements.getLength(); i++) {
|
---|
| 98 | Element metadata_element = (Element) metadata_elements.item(i);
|
---|
| 99 | String name = metadata_element.getAttribute("name");
|
---|
| 100 | // There is a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
|
---|
| 101 | if(name.equals("gsdlsourcefilename")) {
|
---|
| 102 | file_path = MSMUtils.getValue(metadata_element);
|
---|
| 103 | }
|
---|
| 104 | else {
|
---|
| 105 | // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
|
---|
| 106 | boolean ignore = (name.indexOf(".") != -1);
|
---|
| 107 | for(int j = 0; !ignore && j < ignore_list.length; j++) {
|
---|
| 108 | ignore = name.startsWith(ignore_list[j]);
|
---|
| 109 | }
|
---|
| 110 | // Otherwise ensure the metadata is present in our collection.
|
---|
| 111 | if(!ignore && file_path != null) {
|
---|
| 112 | // If we successfully retrieved a record we can continue.
|
---|
| 113 | if(file_path != null) {
|
---|
| 114 | // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
|
---|
| 115 | ElementWrapper element = Gatherer.c_man.msm.getElement(name);
|
---|
| 116 | if(element == null) {
|
---|
| 117 | MetadataSet extracted_mds = Gatherer.c_man.msm.getSet("ex");
|
---|
[4369] | 118 | if(extracted_mds != null) {
|
---|
| 119 | element = extracted_mds.addElement(name);
|
---|
| 120 | }
|
---|
[4365] | 121 | }
|
---|
| 122 | // If we successfully retrieved an element (and we should have) we can continue.
|
---|
| 123 | if(element != null) {
|
---|
| 124 | // Retrieve the metadata for the current file
|
---|
| 125 | File target_file = new File(file_path);
|
---|
| 126 | ArrayList metadatum = Gatherer.c_man.getCollection().gdm.getMetadata(target_file);
|
---|
| 127 | // If no metadata exists for the current element, add it
|
---|
| 128 | boolean found = false;
|
---|
| 129 | for(int k = 0; !found && k < metadatum.size(); k++) {
|
---|
| 130 | Metadata sibling = (Metadata) metadatum.get(k);
|
---|
| 131 | found = element.equals(sibling.getElement());
|
---|
[4293] | 132 | }
|
---|
[4365] | 133 | metadatum = null;
|
---|
| 134 | if(!found) {
|
---|
| 135 | String value = "";
|
---|
| 136 | try {
|
---|
| 137 | value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
|
---|
| 138 | }
|
---|
| 139 | catch(UnsupportedEncodingException error) {
|
---|
| 140 | Gatherer.printStackTrace(error);
|
---|
| 141 | }
|
---|
| 142 | // If we successfully retrieved a value we can continue.
|
---|
| 143 | if(value != null) {
|
---|
| 144 | // Create a new metadata object.
|
---|
| 145 | GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
|
---|
| 146 | GValueNode value_node = null;
|
---|
| 147 | if(value_tree != null) {
|
---|
| 148 | value_node = value_tree.getValue(value);
|
---|
| 149 | }
|
---|
| 150 | else {
|
---|
| 151 | value_node = new GValueNode(element.toString(), value);
|
---|
| 152 | }
|
---|
| 153 | Metadata metadata = new Metadata(element, value_node);
|
---|
| 154 | Gatherer.c_man.getCollection().gdm.metadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
|
---|
| 155 | // All done. On to next metadata.
|
---|
| 156 | }
|
---|
| 157 | }
|
---|
| 158 | target_file = null;
|
---|
| 159 | }
|
---|
| 160 | else {
|
---|
| 161 | Gatherer.println("Cannot retrieve metadata element " + name);
|
---|
| 162 | }
|
---|
| 163 | }
|
---|
| 164 | }
|
---|
| 165 | }
|
---|
| 166 | }
|
---|
| 167 | }
|
---|
| 168 | }
|
---|
[4293] | 169 | }
|
---|