Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 9701

Last change on this file since 9701 was 9701, checked in by mdewsnip, 19 years ago
Prevented the "Could not understand gsdlsourcefilename" warning for odd gsdlsourcefilename values that point into the Greenstone "tmp" directory. This is the case for source files extracted from zip files by ZIPPlug, for example. It is not possible to view the extracted metadata for these files (because they are not visible on the Enrich pane).
Property svn:keywords set to `Author Date Id Revision`
File size: 10.6 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import org.greenstone.gatherer.DebugStream;
33	import org.greenstone.gatherer.util.Utility;
34
35
36	/** This class represents one doc.xml file */
37	public class DocXMLFile
38	extends File
39	{
40	private HashMap source_file_name_to_description_elements_mapping = new HashMap();
41
42
43	public DocXMLFile(String doc_xml_file_path)
44	{
45	super(doc_xml_file_path);
46	}
47
48
49	public ArrayList getMetadataExtractedFromFile(File file)
50	{
51	// Build up a list of metadata extracted from this file
52	ArrayList metadata_values = new ArrayList();
53
54	String file_relative_path = file.getAbsolutePath();
55	int import_index = file_relative_path.indexOf("import");
56	if (import_index != -1) {
57	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
58	}
59
60	// Check whether this doc.xml file contains extracted metadata for the specified file
61	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
62	if (description_elements_list == null) {
63	// ...it doesn't
64	return metadata_values;
65	}
66
67	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
68
69	// Parse the doc.xml file
70	DebugStream.println("Applicable doc.xml file: " + this);
71	try {
72	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
73
74	int description_element_num = 0;
75	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
76	boolean in_relevant_description_element = false;
77
78	String line = null;
79	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
80	// Check if this line contains the start of a relevant Description element
81	if (line_num == next_description_element_start) {
82	in_relevant_description_element = true;
83	continue;
84	}
85
86	// If we're not in a relevant Description element we don't care about anything
87	if (in_relevant_description_element == false) {
88	continue;
89	}
90
91	// Check if this line contains the end of the relevant Description element
92	if (line.indexOf("</Description>") != -1) {
93	description_element_num++;
94	if (description_element_num == description_elements_list.size()) {
95	break;
96	}
97
98	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
99	in_relevant_description_element = false;
100	continue;
101	}
102
103	// If this line doesn't contain a complete Metadata element, we're not interested
104	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
105	continue;
106	}
107
108	// Extract the metadata element name
109	int name_index = line.indexOf(" name=\"") + " name=\"".length();
110	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
111
112	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
113	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
114	if (!metadata_set_namespace.equals("")) {
115	continue;
116	}
117
118	// Extracted metadata!
119	String metadata_element_name = metadata_element_name_full;
120
121	// We completely ignore bibliographic data
122	if (metadata_element_name.equals("SourceSegment")) {
123	return new ArrayList();
124	}
125
126	// Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
127	// and those starting with '/' (/srclink)
128	char first_character = metadata_element_name.charAt(0);
129	if (Character.isLowerCase(first_character) \|\| first_character == '/') {
130	continue;
131	}
132
133	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
134
135	// Value trees are not stored for extracted metadata, so create a new value tree node now
136	int value_index = line.indexOf(">", name_index) + ">".length();
137	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
138
139	metadata_element.addMetadataValue(metadata_element_value);
140	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
141
142	// Add the new metadata value to the list
143	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
144	metadata_values.add(metadata_value);
145	}
146	}
147	catch (Exception ex) {
148	System.err.println("Exception: " + ex);
149	ex.printStackTrace();
150	}
151
152	return metadata_values;
153	}
154
155
156	/**
157	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
158	* - To build a mapping from source file to its corresponding doc.xml file
159	* - To get a complete list of all extracted metadata elements
160	*/
161	public void skimFile()
162	{
163	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
164
165	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
166	DebugStream.println("Skimming doc.xml file " + this + "...");
167	try {
168	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
169	int description_element_start = -1;
170
171	String line = null;
172	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
173	// This line contains the start of a Description element
174	if (line.indexOf("<Description>") != -1) {
175	if (description_element_start != -1) {
176	System.err.println("Parse error: previous Description element unfinished!");
177	}
178	description_element_start = line_num;
179	continue;
180	}
181
182	// This line contains the end of a Description element
183	if (line.indexOf("</Description>") != -1) {
184	if (description_element_start == -1) {
185	System.err.println("Parse error: Description element unstarted!");
186	}
187	description_element_start = -1;
188	continue;
189	}
190
191	// If we're not in a Description element there shouldn't be any Metadata elements
192	if (description_element_start == -1) {
193	continue;
194	}
195
196	// This line doesn't contain a Metadata element, so we're not interested
197	if (line.indexOf("<Metadata ") == -1) {
198	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
199	continue;
200	}
201
202	// Extract the metadata element name
203	int name_index = line.indexOf(" name=\"") + " name=\"".length();
204	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
205
206	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
207	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
208	if (!metadata_set_namespace.equals("")) {
209	continue;
210	}
211
212	// Extracted metadata!
213	String metadata_element_name = metadata_element_name_full;
214
215	// Note which file this doc.xml is for
216	if (metadata_element_name.equals("gsdlsourcefilename")) {
217	// Extract the gsdlsourcefilename element value
218	int value_index = line.indexOf(">", name_index) + ">".length();
219	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
220
221	// We're only interested in the path relative to the import folder
222	int import_index = gsdlsourcefilename_value.indexOf("import");
223	if (import_index != -1) {
224	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
225
226	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
227	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
228
229	// Make sure the path matches the OS that is running
230	if (is_unix_path && Utility.isWindows()) {
231	// Convert path from Unix to Windows
232	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
233	}
234	else if (!is_unix_path && !Utility.isWindows()) {
235	// Convert path from Windows to Unix
236	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
237	}
238
239	// Remember this for quick access later
240	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
241	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
242	}
243
244	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
245	}
246
247	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
248	// This is true when the source files come from a zip file processed by ZIPPlug, for example
249	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
250	// We don't really know what is going on...
251	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
252	}
253	}
254
255	// Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
256	// and those starting with '/' (/srclink)
257	char first_character = metadata_element_name.charAt(0);
258	if (Character.isLowerCase(first_character) \|\| first_character == '/') {
259	continue;
260	}
261
262	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
263	if (metadata_element == null) {
264	// This element isn't defined in ex.mds, so create it for this session
265	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
266	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
267	}
268	}
269	}
270	catch (Exception ex) {
271	System.err.println("Exception: " + ex);
272	ex.printStackTrace();
273	}
274	}
275	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: