Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34510

Last change on this file since 34510 was 34510, checked in by ak19, 4 years ago
Dr Bainbridge didn't want a heuristic test on the tilde character for checking whether a gsdlsourcefilename stored in doc.xml was a Windows 8.3 short filename or not. The idea was to set a flag on the Perl end indicating whether the stored gsdlsourcefilename was a short filename. Unfortunately, the Perl code (DirectoryPlugin::read, variable subfile) gets short filenames directly from readdir rather than through any algorithm, so on Windows the value is always potentially a short filename, and only the presence of a tilde would indicate this. It therefore doesn't matter whether we test for the tilde in Perl and set a flag, or test for it in Java. So the test for the presence of a tilde has been taken out, though this means the Java code will always try to work out the long filename of a gsdlsourcefilename on Windows.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.0 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.Gatherer;
35	import org.greenstone.gatherer.util.Utility;
36
37	//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
38	import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
39
40	/** This class represents one doc.xml file */
41
42	public abstract class DocXMLFile extends File
43	{
44	static boolean isWin = Utility.isWindows();
45
46	protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
47
48	protected final String MetadataWrap;
49	protected final String MetadataItem;
50
51	protected final String FILE_RENAME_METHOD_NONE = "none";
52	protected final String FILE_RENAME_METHOD_URL = "url";
53	protected final String FILE_RENAME_METHOD_BASE64 = "base64";
54
/**
 * Creates a handle on one doc.xml-style file.
 *
 * @param doc_xml_file_path path of the file; passed straight to the File constructor
 * @param metaWrap name of the element that wraps a group of metadata items
 *                 (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
 * @param metaItem name of the element that holds one metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    // Both element names are fixed for the lifetime of this object (the fields are final)
    MetadataItem = metaItem;
    MetadataWrap = metaWrap;
}
61
62	/** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
63	* But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
64	public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
65	{
66	// Build up a list of metadata extracted from this file
67	ArrayList metadata_values = new ArrayList();
68
69	///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
70	/// System.err.println("\n@@@ relFilename: " + relFilename);
71	///}
72
73	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
74	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
75	if (description_elements_list == null) {
76	// ...it doesn't
77	///System.err.println("Unable to find meta for file path form " + file_relative_path);
78	return metadata_values; // we're done
79	} ///else { System.err.println("@@@ file rel path: " + file_relative_path + " matched" ); }
80
81	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
82
83	// Parse the file
84	DebugStream.println("Applicable file: " + this);
85	try {
86	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
87
88	int description_element_num = 0;
89	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
90	boolean in_relevant_description_element = false;
91
92	String line = null;
93	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
94	// Check if this line contains the start of a relevant "Description" element
95	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
96	if (line_num == next_description_element_start) {
97	in_relevant_description_element = true;
98	continue;
99	}
100
101	// If we're not in a relevant Description element we don't care about anything
102	if (in_relevant_description_element == false) {
103	continue;
104	}
105
106	// Check if this line contains the end of the relevant Description element
107	if (line.indexOf("</"+MetadataWrap+">") != -1) {
108	description_element_num++;
109	if (description_element_num == description_elements_list.size()) {
110	break;
111	}
112
113	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
114	in_relevant_description_element = false;
115	continue;
116	}
117
118	// If this line doesn't contain a complete Metadata element, we're not interested
119	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
120	continue;
121	}
122
123	// Extract the metadata element name
124	int name_index = line.indexOf(" name=\"") + " name=\"".length();
125	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
126
127	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
128	// Actually, if it is ex. then we are interested
129	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
130
131	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
132	continue;
133	}
134
135	// Extracted metadata!
136	// do it like this just in case we have ex.
137	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
138
139	// We completely ignore bibliographic data
140	if (metadata_element_name.equals("SourceSegment")) {
141	buffered_reader.close();
142	return new ArrayList();
143	}
144
145	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
146	if (metadata_element_name.startsWith("gsdl")) {
147	continue;
148	}
149
150	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
151
152	// Value trees are not stored for extracted metadata, so create a new value tree node now
153	int value_index = line.indexOf(">", name_index) + ">".length();
154	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
155
156	metadata_element.addMetadataValue(metadata_element_value);
157	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
158
159	// Add the new metadata value to the list
160	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
161	metadata_values.add(metadata_value);
162	}
163
164	buffered_reader.close();
165	}
166	catch (FileNotFoundException exception) {
167	DebugStream.printStackTrace(exception);
168	}
169	catch (IOException exception) {
170	DebugStream.printStackTrace(exception);
171	}
172
173	return metadata_values;
174	}
175
176
177
178
179	/**
180	* Every file must be skimmed when a collection is opened, for two reasons:
181	* - To build a mapping from source file to its corresponding doc.xml file
182	* - To get a complete list of all extracted metadata elements
183	*/
184	public void skimFile()
185	{
186	String fileRenameMethod = null;
187	String gsdlsourcefilename_value = null;
188	boolean is_unix_path = false;
189	int description_element_start_gsdlsourcefilename_value = -1;
190
191	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
192
193	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
194	DebugStream.println("Skimming " + this + "...");
195	try {
196	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
197	int description_element_start = -1;
198
199	String line = null;
200	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
201	// This line contains the start of a "MetadataWrap" element
202	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
203	if (line.indexOf("<"+MetadataWrap+">") != -1) {
204	if (description_element_start != -1) {
205	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
206	}
207	description_element_start = line_num;
208	continue;
209	}
210
211	// This line contains the end of a "MetadataWrap" element
212	if (line.indexOf("</"+MetadataWrap+">") != -1) {
213	if (description_element_start == -1) {
214	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
215	}
216	description_element_start = -1;
217	continue;
218	}
219
220	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
221	if (description_element_start == -1) {
222	continue;
223	}
224
225	// This line doesn't contain a Metadata element, so we're not interested
226	if (line.indexOf("<"+MetadataItem+" ") == -1) {
227	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
228	continue;
229	}
230
231	// Extract the metadata element name
232	int name_index = line.indexOf(" name=\"") + " name=\"".length();
233	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
234
235	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
236	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
237	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
238	continue;
239	}
240
241	// Extracted metadata! May have ex. so make sure we remove that
242	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
243	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
244	// Extract the element value
245	int value_index = line.indexOf(">", name_index) + ">".length();
246	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
247	}
248
249	// Note which file this is for
250	else if (metadata_element_name.equals("gsdlsourcefilename")) {
251	// the gsdlsourcefilename metadata field may be encoded by the encoding denoted
252	// in fileRenameMethod (and will need decoding)
253
254	// Extract the gsdlsourcefilename element value
255	int value_index = line.indexOf(">", name_index) + ">".length();
256	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
257
258	// We're only interested in the path relative to the import folder
259	int import_index = gsdlsourcefilename_value.indexOf("import");
260	if (import_index != -1) {
261
262	///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
263	///System.err.println("@@@@ Found description_element_start_gsdlsourcefilename_value: " + description_element_start);
264	description_element_start_gsdlsourcefilename_value = description_element_start;
265
266	}
267
268	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
269	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
270	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
271	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
272	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
273	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
274	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
275	// We don't really know what is going on...
276	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
277	}
278	}
279
280	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
281	if (metadata_element_name.startsWith("gsdl")) {
282	continue;
283	}
284
285	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
286	if (metadata_element == null) {
287	// This element isn't defined in ex.mds, so create it for this session
288	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
289	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
290	}
291	}
292
293	buffered_reader.close();
294
295	// Work out if is_unix_path
296	int import_index = gsdlsourcefilename_value.indexOf("import");
297	if (import_index != -1) {
298	String tempStr = gsdlsourcefilename_value.substring(import_index + "import".length());
299	is_unix_path = tempStr.startsWith("/");
300	}
301	// We're only interested in the path relative to the import folder
302	// Lop off "import" folder prefix
303	gsdlsourcefilename_value = adjustForRelativeToImportDir(gsdlsourcefilename_value);
304
305	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
306	// based on whatever fileRenameMethod was used to encode it, so that we can
307	// at last properly compare against filenames on the file system
308	// in order to load the correct ex.meta for the file.
309	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
310	// we can finally perform the decoding of gsdlsourcefilename.
311	if(fileRenameMethod == null) {
312	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
313	}
314	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
315	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
316	}
317
318	// Now we can finally put the gsdlsourcefilename path relative to import dir into the hashmap
319	///System.err.println("@@@ into map: " + gsdlsourcefilename_value);
320	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
321	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
322	}
323	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start_gsdlsourcefilename_value));
324
325	// Next, if Windows, check if dealing with Win 8.3 Short Filename
326	// In that case, convert short file name to long filename - works only if the file exists
327	if(isWin /&& gsdlsourcefilename_value.indexOf("~") != -1/) {
328
329	String long_gsdlsourcefilename = gsdlsourcefilename_value;
330
331	// gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext
332	// This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir
333	// and if resulting file exists, getCanonicalPath() which produces Win Long filename.
334	File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
335	File f = new File(currentCollectionFolder, "import" + File.separator + gsdlsourcefilename_value); // should work even if linux style slashes in gsdlsourcefilename_value
336	///System.err.println("### file: " + f.getAbsolutePath());
337
338	if(f.exists()) {
339	long_gsdlsourcefilename = f.getCanonicalPath();
340	///System.err.println("### canon: " + long_gsdlsourcefilename);
341	} // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is
342
343	// Again, we're only interested in the path relative to the import folder
344	long_gsdlsourcefilename = adjustForRelativeToImportDir(long_gsdlsourcefilename);
345	if(!gsdlsourcefilename_value.equals(long_gsdlsourcefilename)) { // truly distinct Win long and short file names
346	// Put a copy of the ref to gsdlsourcefilename's metadata list under the long filename as well
347	///System.err.println("@@@ long filename into map: " + long_gsdlsourcefilename);
348	Object arrList = source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value);
349	source_file_name_to_description_elements_mapping.put(long_gsdlsourcefilename, arrList);
350	}
351	}
352
353	}
354	catch (FileNotFoundException exception) {
355	DebugStream.printStackTrace(exception);
356	}
357	catch (IOException exception) {
358	DebugStream.printStackTrace(exception);
359	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
360	DebugStream.printStackTrace(exception);
361	}
362	}
363
364	private String adjustForRelativeToImportDir(String gsdlsourcefilename_value) {
365	int import_index = gsdlsourcefilename_value.indexOf("import");
366	if (import_index != -1) {
367	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
368
369	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
370	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
371
372	// (Will decode gsdlsourcefilename at end of this method, once we know
373	// for certain the fileRenameMethod that was used to encode it.)
374
375	// Make sure the path matches the OS that is running
376	if (is_unix_path && isWin) {
377	// Convert path from Unix to Windows
378	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
379	}
380	else if (!is_unix_path && !isWin) {
381	// Convert path from Windows to Unix
382	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
383	}
384	}
385	return gsdlsourcefilename_value;
386	}
387
388	protected String decodeSourceFilename(String relative_sourcefile_path,
389	String encodingMethod, boolean is_unix_path)
390	throws Exception
391	{
392
393	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
394
395	// First get the file extension. Both in Base64 and URL encoded strings,
396	// the full-stop character (.) doesn't get encoded.
397	// That means getting the file extension is straightforward.
398
399	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
400	// 26 lowercase characters, 26 uppercase characters as well as the
401	// Plus sign (+) and the Forward Slash (/).
402	int fullstop = relative_sourcefile_path.indexOf(".");
403	String file_ext = "";
404	if(fullstop != -1) {
405	file_ext = relative_sourcefile_path.substring(fullstop);
406	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
407	}
408
409	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
410
411	String decoded_gsdlsourcefilename = "";
412
413	String separator = is_unix_path ? "/" : "\\";
414	for(int i = 0; i < importFilePathParts.length; i++) {
415	String decoded_filePathPart = "";
416	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
417	// URL decode each part of gsdlsourcefilename.
418	// Need to set the decoder to use the default system encoding
419	// This is stored in the System's file.encoding property.
420	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
421	}
422	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
423	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
424	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
425	// Using org.apache.commons.codec.binary.Base64 instead
426	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
427	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
428	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
429	///System.err.println("Got base64 string: " + importFilePathParts[i]);
430	///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
431	// Using system file.encoding to interpret the resulting bytestring as a String,
432	// just as we always did with URL decoding method
433	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
434	}
435
436	if(i == 0) {
437	decoded_gsdlsourcefilename = decoded_filePathPart;
438	} else {
439	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
440	}
441	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
442	}
443
444	// add the file extension back in
445	decoded_gsdlsourcefilename += file_ext;
446
447	///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
448
449	return decoded_gsdlsourcefilename;
450	}
451
452	/**
453	* Given a filepath, returns the parts between each file separator as an array.
454	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
455	*/
456	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
457	StringTokenizer tok;
458	if(is_unix_path) {
459	tok = new StringTokenizer(filepath, "/");
460	} else {
461	tok = new StringTokenizer(filepath, "\\");
462	}
463	String[] parts;
464	int count = tok.countTokens();
465	if(count <= 0) {
466	parts = new String[]{filepath};
467	} else {
468	int i = 0;
469	parts = new String[count];
470	while(tok.hasMoreTokens()) {
471	parts[i] = tok.nextToken();
472	//System.err.println("Next part: " + parts[i]);
473	i++;
474	}
475	}
476	return parts;
477	}
478
479	/*
480	public ArrayList getMetadataExtractedFromFile(File file)
481	{
482	// Build up a list of metadata extracted from this file
483	ArrayList metadata_values = new ArrayList();
484
485	String file_relative_path = file.getAbsolutePath();
486	int import_index = file_relative_path.indexOf("import");
487	if (import_index != -1) {
488	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
489	}
490
491	// Check whether this doc.xml file contains extracted metadata for the specified file
492	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
493	if (description_elements_list == null) {
494	// ...it doesn't
495	return metadata_values;
496	}
497
498	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
499
500	// Parse the doc.xml file
501	DebugStream.println("Applicable doc.xml file: " + this);
502	try {
503	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
504
505	int description_element_num = 0;
506	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
507	boolean in_relevant_description_element = false;
508
509	String line = null;
510	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
511	// Check if this line contains the start of a relevant Description element
512	if (line_num == next_description_element_start) {
513	in_relevant_description_element = true;
514	continue;
515	}
516
517	// If we're not in a relevant Description element we don't care about anything
518	if (in_relevant_description_element == false) {
519	continue;
520	}
521
522	// Check if this line contains the end of the relevant Description element
523	if (line.indexOf("</Description>") != -1) {
524	description_element_num++;
525	if (description_element_num == description_elements_list.size()) {
526	break;
527	}
528
529	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
530	in_relevant_description_element = false;
531	continue;
532	}
533
534	// If this line doesn't contain a complete Metadata element, we're not interested
535	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
536	continue;
537	}
538
539	// Extract the metadata element name
540	int name_index = line.indexOf(" name=\"") + " name=\"".length();
541	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
542
543	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
544	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
545	if (!metadata_set_namespace.equals("")) {
546	continue;
547	}
548
549	// Extracted metadata!
550	String metadata_element_name = metadata_element_name_full;
551
552	// We completely ignore bibliographic data
553	if (metadata_element_name.equals("SourceSegment")) {
554	buffered_reader.close();
555	return new ArrayList();
556	}
557
558	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
559	if (metadata_element_name.startsWith("gsdl")) {
560	continue;
561	}
562
563	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
564
565	// Value trees are not stored for extracted metadata, so create a new value tree node now
566	int value_index = line.indexOf(">", name_index) + ">".length();
567	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
568
569	metadata_element.addMetadataValue(metadata_element_value);
570	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
571
572	// Add the new metadata value to the list
573	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
574	metadata_values.add(metadata_value);
575	}
576
577	buffered_reader.close();
578	}
579	catch (FileNotFoundException exception) {
580	DebugStream.printStackTrace(exception);
581	}
582	catch (IOException exception) {
583	DebugStream.printStackTrace(exception);
584	}
585
586	return metadata_values;
587	}
588
589	*/
590
591	/**
592	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
593	* - To build a mapping from source file to its corresponding doc.xml file
594	* - To get a complete list of all extracted metadata elements
595	*/
596	/*
597	public void skimFile()
598	{
599	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
600
601	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
602	DebugStream.println("Skimming " + this + "...");
603	try {
604	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
605	int description_element_start = -1;
606
607	String line = null;
608	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
609	// This line contains the start of a Description element
610	if (line.indexOf("<Description>") != -1) {
611	if (description_element_start != -1) {
612	System.err.println("Parse error: previous Description element unfinished!");
613	}
614	description_element_start = line_num;
615	continue;
616	}
617
618	// This line contains the end of a Description element
619	if (line.indexOf("</Description>") != -1) {
620	if (description_element_start == -1) {
621	System.err.println("Parse error: Description element unstarted!");
622	}
623	description_element_start = -1;
624	continue;
625	}
626
627	// If we're not in a Description element there shouldn't be any Metadata elements
628	if (description_element_start == -1) {
629	continue;
630	}
631
632	// This line doesn't contain a Metadata element, so we're not interested
633	if (line.indexOf("<Metadata ") == -1) {
634	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
635	continue;
636	}
637
638	// Extract the metadata element name
639	int name_index = line.indexOf(" name=\"") + " name=\"".length();
640	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
641
642	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
643	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
644	if (!metadata_set_namespace.equals("")) {
645	continue;
646	}
647
648	// Extracted metadata!
649	String metadata_element_name = metadata_element_name_full;
650
651	// Note which file this doc.xml is for
652	if (metadata_element_name.equals("gsdlsourcefilename")) {
653	// Extract the gsdlsourcefilename element value
654	int value_index = line.indexOf(">", name_index) + ">".length();
655	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
656
657	// We're only interested in the path relative to the import folder
658	int import_index = gsdlsourcefilename_value.indexOf("import");
659	if (import_index != -1) {
660	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
661
662	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
663	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
664
665	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
666	// This is stored in the System's file.encoding property.
667	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
668
669	// Make sure the path matches the OS that is running
670	if (is_unix_path && isWin) {
671	// Convert path from Unix to Windows
672	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
673	}
674	else if (!is_unix_path && !isWin) {
675	// Convert path from Windows to Unix
676	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
677	}
678
679	// Remember this for quick access later
680	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
681	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
682	}
683
684	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
685	}
686
687	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
688	// This is true when the source files come from a zip file processed by ZIPPlug, for example
689	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
690	// We don't really know what is going on...
691	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
692	}
693	}
694
695	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
696	if (metadata_element_name.startsWith("gsdl")) {
697	continue;
698	}
699
700	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
701	if (metadata_element == null) {
702	// This element isn't defined in ex.mds, so create it for this session
703	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
704	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
705	}
706	}
707
708	buffered_reader.close();
709	}
710	catch (FileNotFoundException exception) {
711	DebugStream.printStackTrace(exception);
712	}
713	catch (IOException exception) {
714	DebugStream.printStackTrace(exception);
715	}
716	}
717	*/
718
719	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: