Context Navigation

source: gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 16579

Last change on this file since 16579 was 16579, checked in by ak19, 16 years ago
Modified to work with changes in doc.pm: doc.pm Base64 encodes gsdlsourcefilename now, so this class must decode it.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.1 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import org.greenstone.gatherer.DebugStream;
33	import org.greenstone.gatherer.util.Utility;
34	import org.greenstone.gatherer.feedback.Base64;
35
36	/** This class represents one doc.xml file */
37	public class DocXMLFile
38	extends File
39	{
40	private HashMap source_file_name_to_description_elements_mapping = new HashMap();
41
42
43	public DocXMLFile(String doc_xml_file_path)
44	{
45	super(doc_xml_file_path);
46	}
47
48
49	public ArrayList getMetadataExtractedFromFile(File file)
50	{
51	// Build up a list of metadata extracted from this file
52	ArrayList metadata_values = new ArrayList();
53
54	String file_relative_path = file.getAbsolutePath();
55	int import_index = file_relative_path.indexOf("import");
56	if (import_index != -1) {
57	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
58	}
59
60	// Check whether this doc.xml file contains extracted metadata for the specified file
61	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
62	if (description_elements_list == null) {
63	// ...it doesn't
64	return metadata_values;
65	}
66
67	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
68
69	// Parse the doc.xml file
70	DebugStream.println("Applicable doc.xml file: " + this);
71	try {
72	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
73
74	int description_element_num = 0;
75	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
76	boolean in_relevant_description_element = false;
77
78	String line = null;
79	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
80	// Check if this line contains the start of a relevant Description element
81	if (line_num == next_description_element_start) {
82	in_relevant_description_element = true;
83	continue;
84	}
85
86	// If we're not in a relevant Description element we don't care about anything
87	if (in_relevant_description_element == false) {
88	continue;
89	}
90
91	// Check if this line contains the end of the relevant Description element
92	if (line.indexOf("</Description>") != -1) {
93	description_element_num++;
94	if (description_element_num == description_elements_list.size()) {
95	break;
96	}
97
98	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
99	in_relevant_description_element = false;
100	continue;
101	}
102
103	// If this line doesn't contain a complete Metadata element, we're not interested
104	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
105	continue;
106	}
107
108	// Extract the metadata element name
109	int name_index = line.indexOf(" name=\"") + " name=\"".length();
110	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
111
112	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
113	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
114	if (!metadata_set_namespace.equals("")) {
115	continue;
116	}
117
118	// Extracted metadata!
119	String metadata_element_name = metadata_element_name_full;
120
121	// We completely ignore bibliographic data
122	if (metadata_element_name.equals("SourceSegment")) {
123	buffered_reader.close();
124	return new ArrayList();
125	}
126
127	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
128	if (metadata_element_name.startsWith("gsdl")) {
129	continue;
130	}
131
132	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
133
134	// Value trees are not stored for extracted metadata, so create a new value tree node now
135	int value_index = line.indexOf(">", name_index) + ">".length();
136	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
137
138	metadata_element.addMetadataValue(metadata_element_value);
139	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
140
141	// Add the new metadata value to the list
142	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
143	metadata_values.add(metadata_value);
144	}
145
146	buffered_reader.close();
147	}
148	catch (FileNotFoundException exception) {
149	DebugStream.printStackTrace(exception);
150	}
151	catch (IOException exception) {
152	DebugStream.printStackTrace(exception);
153	}
154
155	return metadata_values;
156	}
157
158
159	/**
160	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
161	* - To build a mapping from source file to its corresponding doc.xml file
162	* - To get a complete list of all extracted metadata elements
163	*/
164	public void skimFile()
165	{
166	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
167
168	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
169	DebugStream.println("Skimming doc.xml file " + this + "...");
170	try {
171	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
172	int description_element_start = -1;
173
174	String line = null;
175	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
176	// This line contains the start of a Description element
177	if (line.indexOf("<Description>") != -1) {
178	if (description_element_start != -1) {
179	System.err.println("Parse error: previous Description element unfinished!");
180	}
181	description_element_start = line_num;
182	continue;
183	}
184
185	// This line contains the end of a Description element
186	if (line.indexOf("</Description>") != -1) {
187	if (description_element_start == -1) {
188	System.err.println("Parse error: Description element unstarted!");
189	}
190	description_element_start = -1;
191	continue;
192	}
193
194	// If we're not in a Description element there shouldn't be any Metadata elements
195	if (description_element_start == -1) {
196	continue;
197	}
198
199	// This line doesn't contain a Metadata element, so we're not interested
200	if (line.indexOf("<Metadata ") == -1) {
201	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
202	continue;
203	}
204
205	// Extract the metadata element name
206	int name_index = line.indexOf(" name=\"") + " name=\"".length();
207	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
208
209	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
210	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
211	if (!metadata_set_namespace.equals("")) {
212	continue;
213	}
214
215	// Extracted metadata!
216	String metadata_element_name = metadata_element_name_full;
217
218	// Note which file this doc.xml is for
219	if (metadata_element_name.equals("gsdlsourcefilename")) {
220	// Extract the gsdlsourcefilename element value
221	int value_index = line.indexOf(">", name_index) + ">".length();
222	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
223
224	// We're only interested in the path relative to the import folder
225	int import_index = gsdlsourcefilename_value.indexOf("import");
226	if (import_index != -1) {
227	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
228
229	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
230	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
231
232	// Apply Base64 decoding - note that encoding into base64 may have
233	// encoded some characters as slashes. Since slash characters are
234	// removed below the decoding must be done before that to ensure
235	// only genuine slashes remain for processing:
236	gsdlsourcefilename_value = new String(Base64.decode(gsdlsourcefilename_value));
237	//System.err.println("****decodedSrcFilename is: " + gsdlsourcefilename_value);
238
239	// Make sure the path matches the OS that is running
240	if (is_unix_path && Utility.isWindows()) {
241	// Convert path from Unix to Windows
242	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
243	}
244	else if (!is_unix_path && !Utility.isWindows()) {
245	// Convert path from Windows to Unix
246	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
247	}
248
249	// Remember this for quick access later
250	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
251	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
252	}
253
254	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
255	}
256
257	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
258	// This is true when the source files come from a zip file processed by ZIPPlug, for example
259	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
260	// We don't really know what is going on...
261	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
262	}
263	}
264
265	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
266	if (metadata_element_name.startsWith("gsdl")) {
267	continue;
268	}
269
270	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
271	if (metadata_element == null) {
272	// This element isn't defined in ex.mds, so create it for this session
273	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
274	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
275	}
276	}
277
278	buffered_reader.close();
279	}
280	catch (FileNotFoundException exception) {
281	DebugStream.printStackTrace(exception);
282	}
283	catch (IOException exception) {
284	DebugStream.printStackTrace(exception);
285	}
286	}
287	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: