/**
*#########################################################################
*
* A component of the Gatherer application, part of the Greenstone digital
* library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
*
*
* Author: John Thompson, Greenstone Digital Library, University of Waikato
*
*
*
* Copyright (C) 1999 New Zealand Digital Library Project
*
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.msm;
import java.io.*;
import java.net.*;
import java.util.*;
import org.greenstone.gatherer.Configuration;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.collection.Collection;
import org.greenstone.gatherer.collection.CollectionManager;
import org.greenstone.gatherer.file.FileNode;
import org.greenstone.gatherer.msm.ElementWrapper;
import org.greenstone.gatherer.msm.MetadataSet;
import org.greenstone.gatherer.msm.MetadataSetManager;
import org.greenstone.gatherer.msm.MSMUtils;
import org.greenstone.gatherer.shell.GShell;
import org.greenstone.gatherer.shell.GShellProgressMonitor;
import org.greenstone.gatherer.util.StaticStrings;
import org.greenstone.gatherer.util.Utility;
import org.greenstone.gatherer.valuetree.GValueModel;
import org.greenstone.gatherer.valuetree.GValueNode;
import org.w3c.dom.*;
public class GreenstoneArchiveParser {
private GShell shell;
static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source",
public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
// We can only extract metadata if an extracted metadata set exists in our collection.
if(Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) != null) {
this.shell = shell;
// The very firstist thing we do is remove any existing extracted metadata
if(Gatherer.c_man != null && Gatherer.c_man.getCollection() != null) {
Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
}
// Determine the collection archive directory.
File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
// For each of the hash coded directories within.
File document_directories[] = archive_directory.listFiles();
for(int i = 0; i < document_directories.length; i++) {
// Find the doc.xml file within
if(document_directories[i].isDirectory()) {
File document_file = new File(document_directories[i], "doc.xml");
// Then extract the metadata from it.
if(document_file.exists()) {
int count = extractMetadata(document_file);
// Display a pretty progress message.
String[] args = new String[2];
args[0] = document_directories[i].getName();
args[1] = String.valueOf(count);
shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
args = null;
progress.increment();
}
}
}
}
// All done. Outta here like a bald man.
}
private int extractMetadata(File file) {
int count = 0;
// Retrieve the DOM of the file.
Document document = Utility.parse(file, false);
Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
// If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags.
if(document != null) {
String file_path = null;
Element archive_element = document.getDocumentElement();
// Retrieve the initial Section element
NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
// It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements.
if(section_elements.getLength() < 1) {
return count;
}
Element section_element = (Element) section_elements.item(0);
section_elements = null;
// Retrieve all of the Metadata sections.
NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
section_element = null;
// We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!)
for(int i = 0; i < metadata_elements.getLength(); i++) {
Element metadata_element = (Element) metadata_elements.item(i);
String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
return 0;
}
}
// Now for each Metadata entry retrieved...
for(int i = 0; i < metadata_elements.getLength(); i++) {
Element metadata_element = (Element) metadata_elements.item(i);
String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
// There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to.
if(name.equals("gsdlsourcefilename")) {
file_path = MSMUtils.getValue(metadata_element);
}
else {
// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
boolean ignore = false;
for(int j = 0; !ignore && j < ignore_list.length; j++) {
ignore = name.startsWith(ignore_list[j]);
}
// Otherwise ensure the metadata is present in our collection.
if(!ignore && file_path != null) {
// If we successfully retrieved a record we can continue.
if(file_path != null) {
// We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all).
ElementWrapper element = Gatherer.c_man.msm.getElement(name);
if(element == null) {
MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
if(extracted_mds != null) {
element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
}
}
// If we successfully retrieved an element (and we should have) we can continue.
// WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy.
if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
// Retrieve the metadata for the current file
File target_file = new File(file_path);
String value = "";
try {
value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
}
catch(UnsupportedEncodingException error) {
Gatherer.printStackTrace(error);
}
// If we successfully retrieved a value we can continue.
if(value != null) {
// Create a new metadata object.
GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
GValueNode value_node = null;
if(value_tree != null) {
value_node = value_tree.getValue(value);
}
else {
value_node = new GValueNode(element.toString(), value);
}
Metadata metadata = new Metadata(element, value_node);
element.inc();
///ystem.err.println("Adding extracted metadata: " + metadata);
Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
count++;
// All done. On to next metadata.
}
value = null;
target_file = null;
}
else {
Gatherer.println("Cannot retrieve metadata element " + name);
}
}
}
}
}
}
return count;
}
static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
static public ArrayList extractMetadataElements(File archive_directory) {
ArrayList extracted_metadata_elements = new ArrayList();
File document_directories[] = archive_directory.listFiles();
for(int i = 0; i < document_directories.length; i++) {
// Find the doc.xml file within
if(document_directories[i].isDirectory()) {
File document_file = new File(document_directories[i], "doc.xml");
// Then extract the metadata from it.
if(document_file.exists()) {
try {
Document document = Utility.parse(document_file, false);
// Retrieve all of the Metadata sections.
Element archive_element = document.getDocumentElement();
NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
// Now for each Metadata entry retrieved...
for(int j = 0; j < metadata_elements.getLength(); j++) {
Element metadata_element = (Element) metadata_elements.item(j);
String name = metadata_element.getAttribute("name");
// Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata.
boolean ignore = false;
for(int k = 0; !ignore && k < metadata_ignore_list.length; k++) {
ignore = name.startsWith(metadata_ignore_list[k]);
}
if(!ignore && !extracted_metadata_elements.contains(name)) {
extracted_metadata_elements.add(name);
}
name = null;
metadata_element = null;
}
metadata_elements = null;
archive_element = null;
document = null;
}
catch (Exception error) {
Gatherer.printStackTrace(error);
}
}
document_file = null;
}
}
document_directories = null;
return extracted_metadata_elements;
}
}