/** *############################################################################ * A component of the Greenstone Librarian Interface, part of the Greenstone * digital library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ * * Copyright (C) 2004 New Zealand Digital Library Project * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*############################################################################
 */
package org.greenstone.gatherer.metadata;

import java.io.*;
import java.util.*;
import java.net.URLDecoder;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.util.Utility;


/**
 * This class represents one archive metadata file (doc.xml, or docmets.xml when a
 * subclass supplies METS element names).
 *
 * <p>The file is never parsed as XML. Instead it is read line by line, looking for a
 * wrapper element ({@link #MetadataWrap}: "Description" in GreenstoneArchive format,
 * "mets:xmlData" in METS parlance) and the metadata item elements
 * ({@link #MetadataItem}) inside it. {@link #skimFile()} records, per source file, the
 * line numbers at which wrapper elements start;
 * {@link #getMetadataExtractedFromFile(File)} later uses those line numbers to jump
 * straight to the relevant sections.
 */
public abstract class DocXMLFile
    extends File
{
    /** Start of the attribute that carries a metadata element's name. */
    private static final String NAME_ATTRIBUTE_START = " name=\"";

    /**
     * Maps a source file path (relative to the import folder) to an ArrayList of
     * Integer line numbers, each being the line on which a wrapper element for that
     * source file starts. Filled in by {@link #skimFile()}.
     */
    protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

    /** Name of the element that wraps a group of metadata items. */
    protected final String MetadataWrap;
    /** Name of the element that holds a single metadata name/value pair. */
    protected final String MetadataItem;


    /**
     * @param doc_xml_file_path path of the file this object represents
     * @param metaWrap name of the wrapper element (without angle brackets)
     * @param metaItem name of the metadata item element (without angle brackets)
     */
    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
    {
        super(doc_xml_file_path);
        this.MetadataWrap = metaWrap;
        this.MetadataItem = metaItem;
    }


    /**
     * Builds the list of extracted metadata this file holds for the given source file.
     *
     * @param file the source file whose extracted metadata is wanted
     * @return an ArrayList of MetadataValue objects; empty if this file holds nothing
     *         for the source file, or if a "SourceSegment" element is encountered
     *         (bibliographic data is ignored completely)
     */
    public ArrayList getMetadataExtractedFromFile(File file)
    {
        // Build up a list of metadata extracted from this file
        ArrayList metadata_values = new ArrayList();

        String file_relative_path = file.getAbsolutePath();
        int import_index = file_relative_path.indexOf("import");
        if (import_index != -1) {
            // Skip "import" and the path separator after it. The path may end exactly at
            // "import" (no separator), in which case there is nothing left.
            int relative_start = import_index + "import".length() + 1;
            file_relative_path = (relative_start <= file_relative_path.length())
                ? file_relative_path.substring(relative_start)
                : "";
        }

        // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) contains
        // extracted metadata for the specified file
        ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
        if (description_elements_list == null || description_elements_list.isEmpty()) {
            // ...it doesn't
            return metadata_values;
        }

        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        String wrap_end_tag = "</" + MetadataWrap + ">";
        String item_start_tag = "<" + MetadataItem + " ";
        String item_end_tag = "</" + MetadataItem + ">";

        // Parse the file
        DebugStream.println("Applicable file: " + this);
        BufferedReader buffered_reader = null;
        try {
            buffered_reader = new BufferedReader(new FileReader(this));

            int description_element_num = 0;
            int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
            boolean in_relevant_description_element = false;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // Check if this line contains the start of a relevant wrapper element
                // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
                if (line_num == next_description_element_start) {
                    in_relevant_description_element = true;
                    continue;
                }

                // If we're not in a relevant wrapper element we don't care about anything
                if (!in_relevant_description_element) {
                    continue;
                }

                // Check if this line contains the end of the relevant wrapper element
                if (line.indexOf(wrap_end_tag) != -1) {
                    description_element_num++;
                    if (description_element_num == description_elements_list.size()) {
                        break;
                    }

                    next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
                    in_relevant_description_element = false;
                    continue;
                }

                // If this line doesn't contain a complete metadata item, we're not interested
                if (line.indexOf(item_start_tag) == -1 || line.indexOf(item_end_tag) == -1) {
                    continue;
                }

                // Extract the metadata element name; skip lines without a well-formed name attribute
                int name_index = findNameValueStart(line);
                int name_end = (name_index == -1) ? -1 : line.indexOf("\"", name_index);
                if (name_end == -1) {
                    continue;
                }
                String metadata_element_name = line.substring(name_index, name_end);

                // If the metadata has a namespace it isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name);
                if (!metadata_set_namespace.equals("")) {
                    continue;
                }

                // Extracted metadata!

                // We completely ignore bibliographic data (the reader is closed by the finally clause)
                if (metadata_element_name.equals("SourceSegment")) {
                    return new ArrayList();
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                // The value lies between the end of the start tag and the last closing item tag
                int value_index = line.indexOf(">", name_index) + ">".length();
                int value_end = line.lastIndexOf(item_end_tag);
                if (value_index == 0 || value_end < value_index) {
                    // Malformed line: no '>' after the name, or the closing tag precedes the value
                    continue;
                }
                String metadata_element_value = line.substring(value_index, value_end);

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // skimFile() registers undefined elements, but the file may not have been
                    // skimmed in its current state; register the element now, as skimFile() does
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                    metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                    if (metadata_element == null) {
                        continue;
                    }
                }

                // Value trees are not stored for extracted metadata, so create a new value tree node now
                metadata_element.addMetadataValue(metadata_element_value);
                MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);

                // Add the new metadata value to the list
                MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
                metadata_values.add(metadata_value);
            }
        }
        catch (IOException exception) {
            // Includes FileNotFoundException
            DebugStream.printStackTrace(exception);
        }
        finally {
            closeReader(buffered_reader);
        }

        return metadata_values;
    }


    /**
     * Every file must be skimmed when a collection is opened, for two reasons:
     *   - To build a mapping from source file to its corresponding doc.xml file
     *   - To get a complete list of all extracted metadata elements
     */
    public void skimFile()
    {
        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        String wrap_start_tag = "<" + MetadataWrap + ">";
        String wrap_end_tag = "</" + MetadataWrap + ">";
        String item_start_tag = "<" + MetadataItem + " ";

        // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
        DebugStream.println("Skimming " + this + "...");
        BufferedReader buffered_reader = null;
        try {
            buffered_reader = new BufferedReader(new FileReader(this));
            int description_element_start = -1;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // This line contains the start of a "MetadataWrap" element
                // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
                if (line.indexOf(wrap_start_tag) != -1) {
                    if (description_element_start != -1) {
                        System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
                    }
                    description_element_start = line_num;
                    continue;
                }

                // This line contains the end of a "MetadataWrap" element
                if (line.indexOf(wrap_end_tag) != -1) {
                    if (description_element_start == -1) {
                        System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
                    }
                    description_element_start = -1;
                    continue;
                }

                // If we're not in a "MetadataWrap" element there shouldn't be any Metadata elements
                if (description_element_start == -1) {
                    continue;
                }

                // This line doesn't contain a Metadata element, so we're not interested
                if (line.indexOf(item_start_tag) == -1) {
                    DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
                    continue;
                }

                // Extract the metadata element name; skip lines without a well-formed name attribute
                int name_index = findNameValueStart(line);
                int name_end = (name_index == -1) ? -1 : line.indexOf("\"", name_index);
                if (name_end == -1) {
                    continue;
                }
                String metadata_element_name = line.substring(name_index, name_end);

                // If the metadata has a namespace it isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name);
                if (!metadata_set_namespace.equals("")) {
                    continue;
                }

                // Extracted metadata!

                // Note which file this is for
                if (metadata_element_name.equals("gsdlsourcefilename")) {
                    noteSourceFile(line, name_index, description_element_start);
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // This element isn't defined in ex.mds, so create it for this session
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                }
            }
        }
        catch (IOException exception) {
            // Includes FileNotFoundException and UnsupportedEncodingException
            DebugStream.printStackTrace(exception);
        }
        finally {
            closeReader(buffered_reader);
        }
    }


    /**
     * Records that the wrapper element starting at the given line belongs to the source
     * file named by a gsdlsourcefilename metadata line.
     *
     * @param line the line holding the gsdlsourcefilename metadata item
     * @param name_index index in the line just after the opening quote of the name attribute
     * @param description_element_start line number on which the enclosing wrapper element started
     * @throws UnsupportedEncodingException if the system file.encoding is not a supported charset
     */
    private void noteSourceFile(String line, int name_index, int description_element_start)
        throws UnsupportedEncodingException
    {
        // Extract the gsdlsourcefilename element value
        int value_index = line.indexOf(">", name_index) + ">".length();
        int value_end = (value_index == 0) ? -1 : line.indexOf("<", value_index);
        if (value_end == -1) {
            // No '>' closing the start tag, or no '<' following the value
            System.err.println("Warning: Could not understand gsdlsourcefilename " + line);
            return;
        }
        String gsdlsourcefilename_value = line.substring(value_index, value_end);

        // We're only interested in the path relative to the import folder
        int import_index = gsdlsourcefilename_value.indexOf("import");
        if (import_index == -1) {
            // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
            // or (as in the case of using FLI) if it is the etc/collect.cfg file
            // This is true when the source files come from a zip file processed by ZIPPlug, for example
            if (gsdlsourcefilename_value.indexOf("tmp") == -1 && !gsdlsourcefilename_value.endsWith("collect.cfg")) {
                // We don't really know what is going on...
                System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
            }
            return;
        }

        String relative_path = gsdlsourcefilename_value.substring(import_index + "import".length());
        if (relative_path.length() == 0) {
            // The value ends at "import", so there is no file path to record
            System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
            return;
        }

        // The first character is the path separator, which tells us which OS wrote the path
        boolean is_unix_path = relative_path.startsWith("/");
        relative_path = decodeSourceFileName(relative_path.substring(1));

        // Make sure the path matches the OS that is running
        if (is_unix_path && Utility.isWindows()) {
            // Convert path from Unix to Windows
            relative_path = relative_path.replace('/', '\\');
        }
        else if (!is_unix_path && !Utility.isWindows()) {
            // Convert path from Windows to Unix
            relative_path = relative_path.replace('\\', '/');
        }

        // Remember this for quick access later
        ArrayList description_element_starts = (ArrayList) source_file_name_to_description_elements_mapping.get(relative_path);
        if (description_element_starts == null) {
            description_element_starts = new ArrayList();
            source_file_name_to_description_elements_mapping.put(relative_path, description_element_starts);
        }
        description_element_starts.add(Integer.valueOf(description_element_start));
    }


    /**
     * URL decodes a gsdlsourcefilename using the default system encoding, which is
     * stored in the System's file.encoding property (UTF-8 if that property is unset).
     * A value containing a malformed '%' escape is returned undecoded rather than
     * aborting the skim.
     */
    private static String decodeSourceFileName(String encoded_value)
        throws UnsupportedEncodingException
    {
        String encoding = System.getProperty("file.encoding");
        if (encoding == null) {
            encoding = "UTF-8";
        }

        try {
            return URLDecoder.decode(encoded_value, encoding);
        }
        catch (IllegalArgumentException exception) {
            DebugStream.println("Warning: Could not URL decode gsdlsourcefilename " + encoded_value);
            return encoded_value;
        }
    }


    /**
     * Returns the index just after the opening quote of the line's name attribute,
     * or -1 if the line has no name attribute.
     */
    private static int findNameValueStart(String line)
    {
        int attribute_index = line.indexOf(NAME_ATTRIBUTE_START);
        return (attribute_index == -1) ? -1 : attribute_index + NAME_ATTRIBUTE_START.length();
    }


    /** Closes the reader if it was opened, reporting (not throwing) any failure to do so. */
    private static void closeReader(Reader reader)
    {
        if (reader == null) {
            return;
        }

        try {
            reader.close();
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
    }
}