/** *######################################################################### * * A component of the Gatherer application, part of the Greenstone digital * library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * *

* * Author: John Thompson, Greenstone Digital Library, University of Waikato * *

* * Copyright (C) 1999 New Zealand Digital Library Project * *

* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * *

* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * *

* * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.msm;

import java.io.*;
import java.net.*;
import java.util.*;
import org.greenstone.gatherer.Configuration;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.collection.Collection;
import org.greenstone.gatherer.collection.CollectionManager;
import org.greenstone.gatherer.file.FileNode;
import org.greenstone.gatherer.msm.ElementWrapper;
import org.greenstone.gatherer.msm.MetadataSet;
import org.greenstone.gatherer.msm.MetadataSetManager;
import org.greenstone.gatherer.msm.MSMUtils;
import org.greenstone.gatherer.shell.GShell;
import org.greenstone.gatherer.shell.GShellProgressMonitor;
import org.greenstone.gatherer.util.StaticStrings;
import org.greenstone.gatherer.util.Utility;
import org.greenstone.gatherer.valuetree.GValueModel;
import org.greenstone.gatherer.valuetree.GValueNode;
import org.w3c.dom.*;

/**
 * Reads the <code>doc.xml</code> files found in the immediate sub-directories of a
 * Greenstone archive directory.
 * <p>
 * Constructing an instance walks the current collection's archive directory and fires a
 * metadata-changed event for every extracted metadata value it finds. The static
 * {@link #extractMetadataElements(File)} only collects the distinct metadata names.
 */
public class GreenstoneArchiveParser {

    /** The shell passed to the constructor; only assigned when an extracted metadata set exists. */
    private GShell shell;

    /** Metadata whose name starts with one of these prefixes is skipped by the constructor's extraction. */
    static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source",

    /** Metadata whose name starts with one of these prefixes is skipped by {@link #extractMetadataElements(File)}. */
    static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};

    /**
     * Extracts the metadata stored in the current collection's archive directory.
     * <p>
     * Nothing happens unless the collection manager has a metadata set registered under
     * <code>Utility.EXTRACTED_METADATA_NAMESPACE</code>. Otherwise any previously extracted
     * metadata is removed and every <code>doc.xml</code> found one directory level below the
     * archive directory is processed.
     *
     * @param progress incremented once for each <code>doc.xml</code> processed
     * @param shell used to fire one import message per <code>doc.xml</code> processed
     */
    public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) {
        // We can only extract metadata if an extracted metadata set exists in our collection.
        // The manager itself is checked first: the original tested it for null only after
        // it had already been dereferenced.
        if (Gatherer.c_man == null || Gatherer.c_man.msm == null
                || Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE) == null) {
            return;
        }
        this.shell = shell;

        // The very first thing we do is remove any existing extracted metadata.
        if (Gatherer.c_man.getCollection() != null) {
            Gatherer.c_man.getCollection().getGDM().removeExtractedMetadata();
        }

        // Determine the collection archive directory.
        File archive_directory = new File(Gatherer.c_man.getCollectionArchive());
        File document_directories[] = archive_directory.listFiles();
        if (document_directories == null) {
            // listFiles() yields null when the path is not a directory or cannot be read
            // (for instance the collection has never been imported); there is nothing to parse.
            return;
        }

        // For each of the hash coded directories within, find the doc.xml file.
        for (int i = 0; i < document_directories.length; i++) {
            File document_directory = document_directories[i];
            if (!document_directory.isDirectory()) {
                continue;
            }
            File document_file = new File(document_directory, "doc.xml");
            if (!document_file.exists()) {
                continue;
            }
            int count = extractMetadata(document_file);

            // Report the directory name and the number of metadata values extracted from it.
            String[] args = new String[2];
            args[0] = document_directory.getName();
            args[1] = String.valueOf(count);
            shell.fireMessage(GShell.IMPORT, shell.typeAsString(GShell.IMPORT) + "> " + Dictionary.get("GShell.Extracted", args), GShell.OK, null);
            progress.increment();
        }
    }

    /**
     * Reports whether a metadata name starts with any of the given prefixes.
     *
     * @param name the metadata name to test
     * @param prefixes the prefixes that mark a name as ignorable
     * @return <code>true</code> if <code>name</code> starts with at least one prefix
     */
    private static boolean hasIgnoredPrefix(String name, String prefixes[]) {
        for (int i = 0; i < prefixes.length; i++) {
            if (name.startsWith(prefixes[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the metadata found in the first Section element of one archive document
     * and fires a metadata-changed event for each value.
     *
     * @param file the <code>doc.xml</code> file to read
     * @return the number of metadata values for which an event was fired; 0 if the file
     *         could not be parsed, has no section element, or contains a metadata entry
     *         named <code>StaticStrings.SOURCESEGMENT_VALUE</code>
     */
    private int extractMetadata(File file) {
        // Retrieve the DOM of the file.
        Document document = Utility.parse(file, false);
        Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());
        if (document == null) {
            return 0;
        }

        // Retrieve the initial Section element. An archive file may have none at all.
        Element archive_element = document.getDocumentElement();
        NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT);
        if (section_elements.getLength() < 1) {
            return 0;
        }
        Element section_element = (Element) section_elements.item(0);

        // Retrieve all of the Metadata elements beneath that section.
        NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT);
        int metadata_count = metadata_elements.getLength();

        // A 'SourceSegment' entry marks a document that came from a bibliographic type file.
        // No sense can be made of its extracted metadata (and there may be a huge amount of
        // it), so such a document is skipped entirely.
        for (int i = 0; i < metadata_count; i++) {
            Element metadata_element = (Element) metadata_elements.item(i);
            String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
            if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) {
                return 0;
            }
        }

        int count = 0;
        String file_path = null;
        for (int i = 0; i < metadata_count; i++) {
            Element metadata_element = (Element) metadata_elements.item(i);
            String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);

            // gsdlsourcefilename is special: its value is the path of the file that the
            // metadata entries following it are attached to.
            if (name.equals("gsdlsourcefilename")) {
                file_path = MSMUtils.getValue(metadata_element);
                continue;
            }
            // Entries seen before any source file name, or on the ignore list, are skipped.
            if (file_path == null || hasIgnoredPrefix(name, ignore_list)) {
                continue;
            }
            if (addExtractedMetadata(name, file_path, metadata_element)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Attaches one extracted metadata value to a source file by firing a metadata-changed
     * event on the current collection's metadata set manager.
     *
     * @param name the metadata element name
     * @param file_path the non-null path of the file the metadata belongs to
     * @param metadata_element the DOM element holding the encoded value
     * @return <code>true</code> if an event was fired, <code>false</code> otherwise
     */
    private boolean addExtractedMetadata(String name, String file_path, Element metadata_element) {
        // Retrieve the appropriate element; if no such element exists, create a new one in
        // the extracted metadata set. This happens even for MARC files, which are only
        // excluded from the value assignment further down.
        ElementWrapper element = Gatherer.c_man.msm.getElement(name);
        if (element == null) {
            MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE);
            if (extracted_mds != null) {
                element = extracted_mds.addElement(name, Gatherer.config.getLanguage());
            }
        }

        // WARNING: MARC records are a known exception. Adding the extracted elements is
        // fine, but adding the extracted metadata itself is not, so it is skipped. Elements
        // outside the empty and extracted namespaces are skipped as well.
        if (element == null
                || file_path.endsWith(StaticStrings.MARC_EXTENSION)
                || !(element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) {
            Gatherer.println("Cannot retrieve metadata element " + name);
            return false;
        }

        File target_file = new File(file_path);
        String value = "";
        try {
            value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8"));
        }
        catch (UnsupportedEncodingException error) {
            // value stays as the empty string in this case.
            Gatherer.printStackTrace(error);
        }
        if (value == null) {
            return false;
        }

        // Use the element's value tree when it has one, otherwise a stand-alone value node.
        GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element);
        GValueNode value_node;
        if (value_tree != null) {
            value_node = value_tree.getValue(value);
        }
        else {
            value_node = new GValueNode(element.toString(), value);
        }
        Metadata metadata = new Metadata(element, value_node);
        element.inc();
        Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata));
        return true;
    }

    /**
     * Collects the distinct metadata names used by the <code>doc.xml</code> files found one
     * directory level below the given archive directory.
     * <p>
     * Names starting with a prefix in {@link #metadata_ignore_list} are left out. A document
     * that cannot be parsed is reported and skipped.
     *
     * @param archive_directory the directory whose sub-directories are searched
     * @return the names in order of first appearance, as <code>String</code>s; empty (never
     *         <code>null</code>) when the argument is <code>null</code>, is not a directory,
     *         or cannot be listed
     */
    static public ArrayList extractMetadataElements(File archive_directory) {
        ArrayList extracted_metadata_elements = new ArrayList();
        if (archive_directory == null) {
            return extracted_metadata_elements;
        }
        File document_directories[] = archive_directory.listFiles();
        if (document_directories == null) {
            // Not a directory, or an I/O error occurred while listing it.
            return extracted_metadata_elements;
        }

        for (int i = 0; i < document_directories.length; i++) {
            // Find the doc.xml file within.
            if (!document_directories[i].isDirectory()) {
                continue;
            }
            File document_file = new File(document_directories[i], "doc.xml");
            if (!document_file.exists()) {
                continue;
            }
            try {
                Document document = Utility.parse(document_file, false);
                if (document == null) {
                    // Handle the parse failure explicitly rather than through a caught
                    // NullPointerException.
                    Gatherer.println("Could not parse greenstone archive document: " + document_file.getAbsolutePath());
                    continue;
                }
                // Retrieve all of the Metadata elements in the document.
                Element archive_element = document.getDocumentElement();
                NodeList metadata_elements = archive_element.getElementsByTagName("Metadata");
                for (int j = 0; j < metadata_elements.getLength(); j++) {
                    Element metadata_element = (Element) metadata_elements.item(j);
                    String name = metadata_element.getAttribute("name");
                    if (!hasIgnoredPrefix(name, metadata_ignore_list) && !extracted_metadata_elements.contains(name)) {
                        extracted_metadata_elements.add(name);
                    }
                }
            }
            catch (Exception error) {
                // Best effort: one unreadable document must not stop the scan of the rest.
                Gatherer.printStackTrace(error);
            }
        }
        return extracted_metadata_elements;
    }
}