Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34394

Last change on this file since 34394 was 34394, checked in by ak19, 4 years ago
Bugfix 1 for GLI metadata slowdown: selecting multiple gathered files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non-basic-ASCII filenames in GLI, which was making an expensive Windows operating system function call for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and Linux. Had to treat Windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename, which GLI now uses when the OS is Windows and compares against the hex-encoded name of a selected file. But on Linux the hex-encoded value generated by perl is not the same as the one Java generates, and after trying repeatedly I've not been able to get it to work. So the code behaves as before on Linux.
Property svn:keywords set to `Author Date Id Revision`
File size: 28.3 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.util.Utility;
35
36	//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37	import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39	/** This class represents one doc.xml file */
40
41	public abstract class DocXMLFile extends File
42	{
/** Result of Utility.isWindows(), evaluated once when the class is loaded. */
static boolean isWin = Utility.isWindows();

// Name of the metadata field whose value keys the source-file mapping below.
// For Linux, we continue using gsdlsourcefilename as key to the metadata mapping
// For Windows, we use the hex encoded long file paths as key
static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename";

/**
 * Maps a source file path (String, relative to the import folder) to an ArrayList of
 * Integer line numbers at which the MetadataWrap elements for that source file start
 * inside this file. Filled by skimFile(), read by getMetadataExtractedFromFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Tag name of the element that wraps a group of metadata items. */
protected final String MetadataWrap;
/** Tag name of a single metadata item element. */
protected final String MetadataItem;

// Recognised values of the gsdlsourcefilerenamemethod metadata field.
// These are true constants, so they are static: one copy for the class
// rather than three extra references in every DocXMLFile instance.
protected static final String FILE_RENAME_METHOD_NONE = "none";
protected static final String FILE_RENAME_METHOD_URL = "url";
protected static final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * Creates a handle on one doc.xml-style file. Nothing is read until skimFile() is called.
 *
 * @param doc_xml_file_path path of the file on disk
 * @param metaWrap tag name of the element wrapping a group of metadata items
 * @param metaItem tag name of an individual metadata item element
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    this.MetadataWrap = metaWrap;
    this.MetadataItem = metaItem;
}
63
64	/** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
65	* But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
66	public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
67	{
68	// Build up a list of metadata extracted from this file
69	ArrayList metadata_values = new ArrayList();
70
71	///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
72	/// System.err.println("\n@@@ relFilename: " + relFilename);
73	///}
74
75	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
76	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
77	if (description_elements_list == null) {
78	// ...it doesn't
79	///System.err.println("Unable to find meta for file path form " + file_relative_path);
80	return metadata_values; // we're done
81	}
82
83	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
84
85	// Parse the file
86	DebugStream.println("Applicable file: " + this);
87	try {
88	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
89
90	int description_element_num = 0;
91	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
92	boolean in_relevant_description_element = false;
93
94	String line = null;
95	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
96	// Check if this line contains the start of a relevant "Description" element
97	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
98	if (line_num == next_description_element_start) {
99	in_relevant_description_element = true;
100	continue;
101	}
102
103	// If we're not in a relevant Description element we don't care about anything
104	if (in_relevant_description_element == false) {
105	continue;
106	}
107
108	// Check if this line contains the end of the relevant Description element
109	if (line.indexOf("</"+MetadataWrap+">") != -1) {
110	description_element_num++;
111	if (description_element_num == description_elements_list.size()) {
112	break;
113	}
114
115	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
116	in_relevant_description_element = false;
117	continue;
118	}
119
120	// If this line doesn't contain a complete Metadata element, we're not interested
121	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
122	continue;
123	}
124
125	// Extract the metadata element name
126	int name_index = line.indexOf(" name=\"") + " name=\"".length();
127	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
128
129	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
130	// Actually, if it is ex. then we are interested
131	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
132
133	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
134	continue;
135	}
136
137	// Extracted metadata!
138	// do it like this just in case we have ex.
139	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
140
141	// We completely ignore bibliographic data
142	if (metadata_element_name.equals("SourceSegment")) {
143	buffered_reader.close();
144	return new ArrayList();
145	}
146
147	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
148	if (metadata_element_name.startsWith("gsdl")) {
149	continue;
150	}
151
152	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
153
154	// Value trees are not stored for extracted metadata, so create a new value tree node now
155	int value_index = line.indexOf(">", name_index) + ">".length();
156	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
157
158	metadata_element.addMetadataValue(metadata_element_value);
159	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
160
161	// Add the new metadata value to the list
162	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
163	metadata_values.add(metadata_value);
164	}
165
166	buffered_reader.close();
167	}
168	catch (FileNotFoundException exception) {
169	DebugStream.printStackTrace(exception);
170	}
171	catch (IOException exception) {
172	DebugStream.printStackTrace(exception);
173	}
174
175	return metadata_values;
176	}
177
178
179
180
181	/**
182	* Every file must be skimmed when a collection is opened, for two reasons:
183	* - To build a mapping from source file to its corresponding doc.xml file
184	* - To get a complete list of all extracted metadata elements
185	*/
186	public void skimFile()
187	{
188	String fileRenameMethod = null;
189	String gsdlsourcefilename_value = null;
190	boolean is_unix_path = false;
191
192	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
193
194	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
195	DebugStream.println("Skimming " + this + "...");
196	try {
197	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
198	int description_element_start = -1;
199
200	String line = null;
201	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
202	// This line contains the start of a "MetadataWrap" element
203	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
204	if (line.indexOf("<"+MetadataWrap+">") != -1) {
205	if (description_element_start != -1) {
206	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
207	}
208	description_element_start = line_num;
209	continue;
210	}
211
212	// This line contains the end of a "MetadataWrap" element
213	if (line.indexOf("</"+MetadataWrap+">") != -1) {
214	if (description_element_start == -1) {
215	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
216	}
217	description_element_start = -1;
218	continue;
219	}
220
221	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
222	if (description_element_start == -1) {
223	continue;
224	}
225
226	// This line doesn't contain a Metadata element, so we're not interested
227	if (line.indexOf("<"+MetadataItem+" ") == -1) {
228	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
229	continue;
230	}
231
232	// Extract the metadata element name
233	int name_index = line.indexOf(" name=\"") + " name=\"".length();
234	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
235
236	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
237	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
238	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
239	continue;
240	}
241
242	// Extracted metadata! May have ex. so make sure we remove that
243	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
244	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
245	// Extract the element value
246	int value_index = line.indexOf(">", name_index) + ">".length();
247	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
248	}
249
250	// Note which file this is for
251	//else if (metadata_element_name.equals("gsdlsourcefilename")) {
252	else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) {
253	// On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field
254	// which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding)
255	// On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that
256	// will be hex encoded for non-ASCII chars
257
258	// Extract the gsdlsourcefilename element value
259	int value_index = line.indexOf(">", name_index) + ">".length();
260	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
261
262	// We're only interested in the path relative to the import folder
263	int import_index = gsdlsourcefilename_value.indexOf("import");
264	if (import_index != -1) {
265	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
266
267	is_unix_path = gsdlsourcefilename_value.startsWith("/");
268	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
269
270	// (Will decode gsdlsourcefilename at end of this method, once we know
271	// for certain the fileRenameMethod that was used to encode it.)
272
273	// Make sure the path matches the OS that is running
274	if (is_unix_path && isWin) {
275	// Convert path from Unix to Windows
276	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
277	}
278	else if (!is_unix_path && !isWin) {
279	// Convert path from Windows to Unix
280	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
281	}
282
283	///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
284	// Remember this for quick access later
285	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
286	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
287	}
288
289	// Would be better to store hex src file name decoded? But how do we know what encoding the filename is in
290	// https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string
291
292
293	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
294	}
295
296	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
297	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
298	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
299	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
300	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
301	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
302	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
303	// We don't really know what is going on...
304	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
305	}
306	}
307
308	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
309	if (metadata_element_name.startsWith("gsdl")) {
310	continue;
311	}
312
313	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
314	if (metadata_element == null) {
315	// This element isn't defined in ex.mds, so create it for this session
316	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
317	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
318	}
319	}
320
321	buffered_reader.close();
322
323	// ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename,
324	// so needn't bother decoding gsdlsourcefilename as it's unused.
325	// On UNIX, continue decoding gsdlsourcefilename as before
326	if(!isWin) {
327	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
328	// based on whatever fileRenameMethod was used to encode it, so that we can
329	// at last properly compare properly against filenames on the file system
330	// in order to load the correct ex.meta for the file.
331	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
332	// we can finally perform the decoding of gsdlsourcefilename.
333	if(fileRenameMethod == null) {
334	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
335	}
336
337	// If gsdlsourcefilename was encoded, we remove it from the map under its encoded
338	// filename, decode it and add it back into map using its decoded filename.
339	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
340	ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
341	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
342	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
343	}
344	}
345
346	}
347	catch (FileNotFoundException exception) {
348	DebugStream.printStackTrace(exception);
349	}
350	catch (IOException exception) {
351	DebugStream.printStackTrace(exception);
352	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
353	DebugStream.printStackTrace(exception);
354	}
355	}
356
357	protected String decodeSourceFilename(String relative_sourcefile_path,
358	String encodingMethod, boolean is_unix_path)
359	throws Exception
360	{
361
362	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
363
364	// First get the file extension. Both in Base64 and URL encoded strings,
365	// the full-stop character (.) doesn't get encoded.
366	// That means getting the file extension is straightforward.
367
368	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
369	// 26 lowercase characters, 26 uppercase characters as well as the
370	// Plus sign (+) and the Forward Slash (/).
371	int fullstop = relative_sourcefile_path.indexOf(".");
372	String file_ext = "";
373	if(fullstop != -1) {
374	file_ext = relative_sourcefile_path.substring(fullstop);
375	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
376	}
377
378	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
379
380	String decoded_gsdlsourcefilename = "";
381
382	String separator = is_unix_path ? "/" : "\\";
383	for(int i = 0; i < importFilePathParts.length; i++) {
384	String decoded_filePathPart = "";
385	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
386	// URL decode each part of gsdlsourcefilename.
387	// Need to set the decoder to use the default system encoding
388	// This is stored in the System's file.encoding property.
389	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
390	}
391	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
392	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
393	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
394	// Using org.apache.commons.codec.binary.Base64 instead
395	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
396	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
397	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
398	///System.err.println("Got base64 string: " + importFilePathParts[i]);
399	///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
400	// Using system file.encoding to interpret the resulting bytestring as a String,
401	// just as we always did with URL decoding method
402	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
403	}
404
405	if(i == 0) {
406	decoded_gsdlsourcefilename = decoded_filePathPart;
407	} else {
408	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
409	}
410	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
411	}
412
413	// add the file extension back in
414	decoded_gsdlsourcefilename += file_ext;
415
416	///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
417
418	return decoded_gsdlsourcefilename;
419	}
420
421	/**
422	* Given a filepath, returns the parts between each file separator as an array.
423	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
424	*/
425	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
426	StringTokenizer tok;
427	if(is_unix_path) {
428	tok = new StringTokenizer(filepath, "/");
429	} else {
430	tok = new StringTokenizer(filepath, "\\");
431	}
432	String[] parts;
433	int count = tok.countTokens();
434	if(count <= 0) {
435	parts = new String[]{filepath};
436	} else {
437	int i = 0;
438	parts = new String[count];
439	while(tok.hasMoreTokens()) {
440	parts[i] = tok.nextToken();
441	//System.err.println("Next part: " + parts[i]);
442	i++;
443	}
444	}
445	return parts;
446	}
447
448	/*
449	public ArrayList getMetadataExtractedFromFile(File file)
450	{
451	// Build up a list of metadata extracted from this file
452	ArrayList metadata_values = new ArrayList();
453
454	String file_relative_path = file.getAbsolutePath();
455	int import_index = file_relative_path.indexOf("import");
456	if (import_index != -1) {
457	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
458	}
459
460	// Check whether this doc.xml file contains extracted metadata for the specified file
461	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
462	if (description_elements_list == null) {
463	// ...it doesn't
464	return metadata_values;
465	}
466
467	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
468
469	// Parse the doc.xml file
470	DebugStream.println("Applicable doc.xml file: " + this);
471	try {
472	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
473
474	int description_element_num = 0;
475	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
476	boolean in_relevant_description_element = false;
477
478	String line = null;
479	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
480	// Check if this line contains the start of a relevant Description element
481	if (line_num == next_description_element_start) {
482	in_relevant_description_element = true;
483	continue;
484	}
485
486	// If we're not in a relevant Description element we don't care about anything
487	if (in_relevant_description_element == false) {
488	continue;
489	}
490
491	// Check if this line contains the end of the relevant Description element
492	if (line.indexOf("</Description>") != -1) {
493	description_element_num++;
494	if (description_element_num == description_elements_list.size()) {
495	break;
496	}
497
498	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
499	in_relevant_description_element = false;
500	continue;
501	}
502
503	// If this line doesn't contain a complete Metadata element, we're not interested
504	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
505	continue;
506	}
507
508	// Extract the metadata element name
509	int name_index = line.indexOf(" name=\"") + " name=\"".length();
510	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
511
512	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
513	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
514	if (!metadata_set_namespace.equals("")) {
515	continue;
516	}
517
518	// Extracted metadata!
519	String metadata_element_name = metadata_element_name_full;
520
521	// We completely ignore bibliographic data
522	if (metadata_element_name.equals("SourceSegment")) {
523	buffered_reader.close();
524	return new ArrayList();
525	}
526
527	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
528	if (metadata_element_name.startsWith("gsdl")) {
529	continue;
530	}
531
532	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
533
534	// Value trees are not stored for extracted metadata, so create a new value tree node now
535	int value_index = line.indexOf(">", name_index) + ">".length();
536	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
537
538	metadata_element.addMetadataValue(metadata_element_value);
539	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
540
541	// Add the new metadata value to the list
542	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
543	metadata_values.add(metadata_value);
544	}
545
546	buffered_reader.close();
547	}
548	catch (FileNotFoundException exception) {
549	DebugStream.printStackTrace(exception);
550	}
551	catch (IOException exception) {
552	DebugStream.printStackTrace(exception);
553	}
554
555	return metadata_values;
556	}
557
558	*/
559
560	/**
561	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
562	* - To build a mapping from source file to its corresponding doc.xml file
563	* - To get a complete list of all extracted metadata elements
564	*/
565	/*
566	public void skimFile()
567	{
568	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
569
570	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
571	DebugStream.println("Skimming " + this + "...");
572	try {
573	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
574	int description_element_start = -1;
575
576	String line = null;
577	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
578	// This line contains the start of a Description element
579	if (line.indexOf("<Description>") != -1) {
580	if (description_element_start != -1) {
581	System.err.println("Parse error: previous Description element unfinished!");
582	}
583	description_element_start = line_num;
584	continue;
585	}
586
587	// This line contains the end of a Description element
588	if (line.indexOf("</Description>") != -1) {
589	if (description_element_start == -1) {
590	System.err.println("Parse error: Description element unstarted!");
591	}
592	description_element_start = -1;
593	continue;
594	}
595
596	// If we're not in a Description element there shouldn't be any Metadata elements
597	if (description_element_start == -1) {
598	continue;
599	}
600
601	// This line doesn't contain a Metadata element, so we're not interested
602	if (line.indexOf("<Metadata ") == -1) {
603	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
604	continue;
605	}
606
607	// Extract the metadata element name
608	int name_index = line.indexOf(" name=\"") + " name=\"".length();
609	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
610
611	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
612	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
613	if (!metadata_set_namespace.equals("")) {
614	continue;
615	}
616
617	// Extracted metadata!
618	String metadata_element_name = metadata_element_name_full;
619
620	// Note which file this doc.xml is for
621	if (metadata_element_name.equals("gsdlsourcefilename")) {
622	// Extract the gsdlsourcefilename element value
623	int value_index = line.indexOf(">", name_index) + ">".length();
624	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
625
626	// We're only interested in the path relative to the import folder
627	int import_index = gsdlsourcefilename_value.indexOf("import");
628	if (import_index != -1) {
629	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
630
631	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
632	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
633
634	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
635	// This is stored in the System's file.encoding property.
636	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
637
638	// Make sure the path matches the OS that is running
639	if (is_unix_path && isWin) {
640	// Convert path from Unix to Windows
641	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
642	}
643	else if (!is_unix_path && !isWin) {
644	// Convert path from Windows to Unix
645	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
646	}
647
648	// Remember this for quick access later
649	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
650	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
651	}
652
653	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
654	}
655
656	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
657	// This is true when the source files come from a zip file processed by ZIPPlug, for example
658	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
659	// We don't really know what is going on...
660	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
661	}
662	}
663
664	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
665	if (metadata_element_name.startsWith("gsdl")) {
666	continue;
667	}
668
669	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
670	if (metadata_element == null) {
671	// This element isn't defined in ex.mds, so create it for this session
672	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
673	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
674	}
675	}
676
677	buffered_reader.close();
678	}
679	catch (FileNotFoundException exception) {
680	DebugStream.printStackTrace(exception);
681	}
682	catch (IOException exception) {
683	DebugStream.printStackTrace(exception);
684	}
685	}
686	*/
687
688	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: