/**
*############################################################################
* A component of the Greenstone Librarian Interface, part of the Greenstone
* digital library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
*
* Copyright (C) 2004 New Zealand Digital Library Project
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*############################################################################
*/
package org.greenstone.gatherer.metadata;
import java.io.*;
import java.util.*;
import java.net.URLDecoder;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.util.Utility;
/**
 * This class represents one doc.xml file (or an equivalent archive file, such as
 * docmets.xml, when used through a subclass).
 *
 * The file is never parsed as XML. It is read line by line, and the code relies on
 * each metadata item occupying a single line between the opening and closing lines
 * of its wrapper element. The wrapper and item element names are supplied by the
 * subclass through the constructor.
 */
public abstract class DocXMLFile extends File
{
    /**
     * Maps a source file path (relative to the "import" folder, using the separator
     * of the OS that is running) to an ArrayList of Integer line numbers. Each line
     * number is the line on which a wrapper element for that source file starts.
     * Populated by {@link #skimFile()}.
     */
    protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

    /** Name of the element that wraps a group of metadata items (mets:xmlData in METS parlance, Description in GreenstoneArchive format). */
    protected final String MetadataWrap;

    /** Name of the element that holds a single metadata item. */
    protected final String MetadataItem;

    /** Text that immediately precedes the value of a metadata item's name attribute. */
    private static final String NAME_ATTRIBUTE_PREFIX = " name=\"";

    /** Name of the folder that source file paths are made relative to. */
    private static final String IMPORT_FOLDER_NAME = "import";


    /**
     * @param doc_xml_file_path path of the file this object represents
     * @param metaWrap name of the element wrapping a group of metadata items
     * @param metaItem name of the element holding a single metadata item
     */
    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
    {
        super(doc_xml_file_path);
        this.MetadataWrap = metaWrap;
        this.MetadataItem = metaItem;
    }


    /**
     * Builds the list of extracted metadata that this file holds for the given source file.
     *
     * Only wrapper elements recorded by {@link #skimFile()} for the source file are read.
     * Metadata with a namespace, and metadata whose name starts with "gsdl", is ignored.
     * If a "SourceSegment" item is encountered the whole result is discarded.
     *
     * @param file the source file whose extracted metadata is wanted
     * @return an ArrayList of MetadataValue objects; empty if this file holds nothing
     *         for the source file, or if the file could not be read from the start
     */
    public ArrayList getMetadataExtractedFromFile(File file)
    {
        // Build up a list of metadata extracted from this file
        ArrayList metadata_values = new ArrayList();

        // NOTE(review): the first occurrence of "import" anywhere in the path is used, so a
        // parent directory whose name contains "import" would confuse this - confirm acceptable
        String file_relative_path = file.getAbsolutePath();
        int import_index = file_relative_path.indexOf(IMPORT_FOLDER_NAME);
        if (import_index != -1) {
            // Skip "import" and the separator after it, without running off the end of the path
            int relative_start = import_index + IMPORT_FOLDER_NAME.length() + 1;
            file_relative_path = file_relative_path.substring(Math.min(relative_start, file_relative_path.length()));
        }

        // Check whether this file contains extracted metadata for the specified file
        ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
        if (description_elements_list == null || description_elements_list.isEmpty()) {
            // ...it doesn't
            return metadata_values;
        }

        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        final String wrap_end_tag = "</" + MetadataWrap + ">";
        final String item_start_tag = "<" + MetadataItem + " ";
        final String item_end_tag = "</" + MetadataItem + ">";

        // Parse the file
        DebugStream.println("Applicable file: " + this);
        BufferedReader buffered_reader = null;
        try {
            // NOTE(review): FileReader decodes with the platform default charset - confirm this
            // matches the encoding the archive files are written in
            buffered_reader = new BufferedReader(new FileReader(this));

            int description_element_num = 0;
            int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
            boolean in_relevant_description_element = false;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // Check if this line contains the start of a relevant wrapper element
                if (line_num == next_description_element_start) {
                    in_relevant_description_element = true;
                    continue;
                }

                // If we're not in a relevant wrapper element we don't care about anything
                if (!in_relevant_description_element) {
                    continue;
                }

                // Check if this line contains the end of the relevant wrapper element
                if (line.indexOf(wrap_end_tag) != -1) {
                    description_element_num++;
                    if (description_element_num == description_elements_list.size()) {
                        break;
                    }
                    next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
                    in_relevant_description_element = false;
                    continue;
                }

                // If this line doesn't contain a complete metadata item, we're not interested
                if (line.indexOf(item_start_tag) == -1 || line.indexOf(item_end_tag) == -1) {
                    continue;
                }

                // Extract the metadata element name (skip items without a usable name attribute)
                int name_index = findNameIndex(line);
                String metadata_element_name = extractElementName(line, name_index);
                if (metadata_element_name == null) {
                    continue;
                }

                // If the metadata has a namespace it isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name);
                if (!metadata_set_namespace.equals("")) {
                    continue;
                }

                // We completely ignore bibliographic data
                if (metadata_element_name.equals("SourceSegment")) {
                    return new ArrayList();
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // Not defined yet (e.g. the file changed since it was skimmed), so create it
                    // for this session exactly as skimFile() does
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                    metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                    if (metadata_element == null) {
                        continue;
                    }
                }

                // The value is everything between the end of the opening tag and the closing tag
                int value_index = line.indexOf(">", name_index) + ">".length();
                int value_end_index = line.lastIndexOf(item_end_tag);
                if (value_index > value_end_index) {
                    // Malformed line
                    continue;
                }
                String metadata_element_value = line.substring(value_index, value_end_index);

                // Value trees are not stored for extracted metadata, so create a new value tree node now
                metadata_element.addMetadataValue(metadata_element_value);
                MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);

                // Add the new metadata value to the list
                metadata_values.add(new MetadataValue(metadata_element, metadata_value_tree_node));
            }
        }
        catch (FileNotFoundException exception) {
            DebugStream.printStackTrace(exception);
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
        finally {
            // Always release the file handle, including on early return and on error
            closeQuietly(buffered_reader);
        }

        return metadata_values;
    }


    /**
     * Every file must be skimmed when a collection is opened, for two reasons:
     *   - To build a mapping from source file to its corresponding wrapper elements in this file
     *   - To get a complete list of all extracted metadata elements
     */
    public void skimFile()
    {
        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        final String wrap_start_tag = "<" + MetadataWrap + ">";
        final String wrap_end_tag = "</" + MetadataWrap + ">";
        final String item_start_tag = "<" + MetadataItem + " ";

        // Skim the file as quickly as possible (don't parse as XML), looking at the metadata items
        DebugStream.println("Skimming " + this + "...");
        BufferedReader buffered_reader = null;
        try {
            buffered_reader = new BufferedReader(new FileReader(this));

            // Line number on which the current wrapper element started, or -1 when outside one
            int description_element_start = -1;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // This line contains the start of a wrapper element
                if (line.indexOf(wrap_start_tag) != -1) {
                    if (description_element_start != -1) {
                        System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
                    }
                    description_element_start = line_num;
                    continue;
                }

                // This line contains the end of a wrapper element
                if (line.indexOf(wrap_end_tag) != -1) {
                    if (description_element_start == -1) {
                        System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
                    }
                    description_element_start = -1;
                    continue;
                }

                // If we're not in a wrapper element there shouldn't be any metadata items
                if (description_element_start == -1) {
                    continue;
                }

                // This line doesn't contain a metadata item, so we're not interested
                if (line.indexOf(item_start_tag) == -1) {
                    DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
                    continue;
                }

                // Extract the metadata element name (skip items without a usable name attribute)
                int name_index = findNameIndex(line);
                String metadata_element_name = extractElementName(line, name_index);
                if (metadata_element_name == null) {
                    continue;
                }

                // If the metadata has a namespace it isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name);
                if (!metadata_set_namespace.equals("")) {
                    continue;
                }

                // Note which file this is for
                if (metadata_element_name.equals("gsdlsourcefilename")) {
                    int value_index = line.indexOf(">", name_index) + ">".length();
                    int value_end_index = line.indexOf("<", value_index);
                    if (value_end_index != -1) {
                        noteSourceFile(line.substring(value_index, value_end_index), description_element_start);
                    }
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // This element isn't defined in ex.mds, so create it for this session
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                }
            }
        }
        catch (FileNotFoundException exception) {
            DebugStream.printStackTrace(exception);
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
        finally {
            // Always release the file handle, including on error
            closeQuietly(buffered_reader);
        }
    }


    /**
     * Records that the wrapper element starting on the given line belongs to the source
     * file named by a gsdlsourcefilename value.
     *
     * @param gsdlsourcefilename_value the raw (URL encoded) gsdlsourcefilename value
     * @param description_element_start line number on which the enclosing wrapper element starts
     * @throws UnsupportedEncodingException if the system's file.encoding is not a supported encoding
     */
    private void noteSourceFile(String gsdlsourcefilename_value, int description_element_start)
        throws UnsupportedEncodingException
    {
        // We're only interested in the path relative to the import folder
        int import_index = gsdlsourcefilename_value.indexOf(IMPORT_FOLDER_NAME);
        String path_after_import = "";
        if (import_index != -1) {
            path_after_import = gsdlsourcefilename_value.substring(import_index + IMPORT_FOLDER_NAME.length());
        }

        if (path_after_import.length() > 0) {
            // The character straight after "import" is the separator, which tells us the path style
            boolean is_unix_path = path_after_import.startsWith("/");
            String relative_path = path_after_import.substring(1);

            // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
            // This is stored in the System's file.encoding property.
            relative_path = URLDecoder.decode(relative_path, System.getProperty("file.encoding"));

            // Make sure the path matches the OS that is running
            if (is_unix_path && Utility.isWindows()) {
                // Convert path from Unix to Windows
                relative_path = relative_path.replace('/', '\\');
            }
            else if (!is_unix_path && !Utility.isWindows()) {
                // Convert path from Windows to Unix
                relative_path = relative_path.replace('\\', '/');
            }

            // Remember this for quick access later
            ArrayList description_element_starts = (ArrayList) source_file_name_to_description_elements_mapping.get(relative_path);
            if (description_element_starts == null) {
                description_element_starts = new ArrayList();
                source_file_name_to_description_elements_mapping.put(relative_path, description_element_starts);
            }
            description_element_starts.add(new Integer(description_element_start));
        }
        // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
        // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
        // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
        // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
        else if (gsdlsourcefilename_value.indexOf("tmp") == -1
                 && !gsdlsourcefilename_value.endsWith("collect.cfg")
                 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
            // We don't really know what is going on...
            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
        }
    }


    /**
     * Finds where the value of the name attribute starts on a metadata item line.
     *
     * @return the index of the first character of the name, or -1 if the line has no name attribute
     */
    private static int findNameIndex(String line)
    {
        int attribute_index = line.indexOf(NAME_ATTRIBUTE_PREFIX);
        if (attribute_index == -1) {
            return -1;
        }
        return attribute_index + NAME_ATTRIBUTE_PREFIX.length();
    }


    /**
     * Extracts the value of the name attribute that starts at the given index.
     *
     * @param line the metadata item line
     * @param name_index the index returned by {@link #findNameIndex(String)}
     * @return the name, or null if name_index is -1 or the closing quote is missing
     */
    private static String extractElementName(String line, int name_index)
    {
        if (name_index == -1) {
            return null;
        }
        int name_end_index = line.indexOf("\"", name_index);
        if (name_end_index == -1) {
            return null;
        }
        return line.substring(name_index, name_end_index);
    }


    /** Closes the reader if it was opened, reporting (rather than propagating) any failure. */
    private static void closeQuietly(Reader reader)
    {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
    }
}