Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34509

Last change on this file since 34509 was 34509, checked in by ak19, 4 years ago
Related to previous commits 34506-34508. Storing both the Win 8.3 Short filename of gsdlsourcefilename and its long filename version.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.0 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.Gatherer;
35	import org.greenstone.gatherer.util.Utility;
36
37	//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
38	import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
39
40	/** This class represents one doc.xml file */
41
42	public abstract class DocXMLFile extends File
43	{
44	static boolean isWin = Utility.isWindows();
45
46	protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
47
48	protected final String MetadataWrap;
49	protected final String MetadataItem;
50
51	protected final String FILE_RENAME_METHOD_NONE = "none";
52	protected final String FILE_RENAME_METHOD_URL = "url";
53	protected final String FILE_RENAME_METHOD_BASE64 = "base64";
54
/**
 * Creates a handle on one doc.xml-style file.
 *
 * @param doc_xml_file_path path of the file, passed straight to {@link File#File(String)}
 * @param metaWrap name of the XML element that wraps a group of metadata
 *                 (searched for as "&lt;metaWrap&gt;" / "&lt;/metaWrap&gt;" when reading the file)
 * @param metaItem name of the XML element holding a single metadata value
 *                 (searched for as "&lt;metaItem " / "&lt;/metaItem&gt;" when reading the file)
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);

    // The element names differ per subclass, so they are fixed once here
    MetadataItem = metaItem;
    MetadataWrap = metaWrap;
}
61
62	/** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
63	* But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
64	public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
65	{
66	// Build up a list of metadata extracted from this file
67	ArrayList metadata_values = new ArrayList();
68
69	///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
70	/// System.err.println("\n@@@ relFilename: " + relFilename);
71	///}
72
73	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
74	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
75	if (description_elements_list == null) {
76	// ...it doesn't
77	///System.err.println("Unable to find meta for file path form " + file_relative_path);
78	return metadata_values; // we're done
79	} ///else { System.err.println("@@@ file rel path: " + file_relative_path + " matched" ); }
80
81	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
82
83	// Parse the file
84	DebugStream.println("Applicable file: " + this);
85	try {
86	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
87
88	int description_element_num = 0;
89	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
90	boolean in_relevant_description_element = false;
91
92	String line = null;
93	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
94	// Check if this line contains the start of a relevant "Description" element
95	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
96	if (line_num == next_description_element_start) {
97	in_relevant_description_element = true;
98	continue;
99	}
100
101	// If we're not in a relevant Description element we don't care about anything
102	if (in_relevant_description_element == false) {
103	continue;
104	}
105
106	// Check if this line contains the end of the relevant Description element
107	if (line.indexOf("</"+MetadataWrap+">") != -1) {
108	description_element_num++;
109	if (description_element_num == description_elements_list.size()) {
110	break;
111	}
112
113	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
114	in_relevant_description_element = false;
115	continue;
116	}
117
118	// If this line doesn't contain a complete Metadata element, we're not interested
119	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
120	continue;
121	}
122
123	// Extract the metadata element name
124	int name_index = line.indexOf(" name=\"") + " name=\"".length();
125	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
126
127	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
128	// Actually, if it is ex. then we are interested
129	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
130
131	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
132	continue;
133	}
134
135	// Extracted metadata!
136	// do it like this just in case we have ex.
137	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
138
139	// We completely ignore bibliographic data
140	if (metadata_element_name.equals("SourceSegment")) {
141	buffered_reader.close();
142	return new ArrayList();
143	}
144
145	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
146	if (metadata_element_name.startsWith("gsdl")) {
147	continue;
148	}
149
150	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
151
152	// Value trees are not stored for extracted metadata, so create a new value tree node now
153	int value_index = line.indexOf(">", name_index) + ">".length();
154	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
155
156	metadata_element.addMetadataValue(metadata_element_value);
157	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
158
159	// Add the new metadata value to the list
160	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
161	metadata_values.add(metadata_value);
162	}
163
164	buffered_reader.close();
165	}
166	catch (FileNotFoundException exception) {
167	DebugStream.printStackTrace(exception);
168	}
169	catch (IOException exception) {
170	DebugStream.printStackTrace(exception);
171	}
172
173	return metadata_values;
174	}
175
176
177
178
179	/**
180	* Every file must be skimmed when a collection is opened, for two reasons:
181	* - To build a mapping from source file to its corresponding doc.xml file
182	* - To get a complete list of all extracted metadata elements
183	*/
184	public void skimFile()
185	{
186	String fileRenameMethod = null;
187	String gsdlsourcefilename_value = null;
188	boolean is_unix_path = false;
189	int description_element_start_gsdlsourcefilename_value = -1;
190
191	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
192
193	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
194	DebugStream.println("Skimming " + this + "...");
195	try {
196	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
197	int description_element_start = -1;
198
199	String line = null;
200	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
201	// This line contains the start of a "MetadataWrap" element
202	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
203	if (line.indexOf("<"+MetadataWrap+">") != -1) {
204	if (description_element_start != -1) {
205	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
206	}
207	description_element_start = line_num;
208	continue;
209	}
210
211	// This line contains the end of a "MetadataWrap" element
212	if (line.indexOf("</"+MetadataWrap+">") != -1) {
213	if (description_element_start == -1) {
214	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
215	}
216	description_element_start = -1;
217	continue;
218	}
219
220	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
221	if (description_element_start == -1) {
222	continue;
223	}
224
225	// This line doesn't contain a Metadata element, so we're not interested
226	if (line.indexOf("<"+MetadataItem+" ") == -1) {
227	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
228	continue;
229	}
230
231	// Extract the metadata element name
232	int name_index = line.indexOf(" name=\"") + " name=\"".length();
233	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
234
235	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
236	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
237	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
238	continue;
239	}
240
241	// Extracted metadata! May have ex. so make sure we remove that
242	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
243	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
244	// Extract the element value
245	int value_index = line.indexOf(">", name_index) + ">".length();
246	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
247	}
248
249	// Note which file this is for
250	else if (metadata_element_name.equals("gsdlsourcefilename")) {
251	// the gsdlsourcefilename metadata field may be encoded by the encoding denoted
252	// in fileRenameMethod (and will need decoding)
253
254	// Extract the gsdlsourcefilename element value
255	int value_index = line.indexOf(">", name_index) + ">".length();
256	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
257
258	// We're only interested in the path relative to the import folder
259	int import_index = gsdlsourcefilename_value.indexOf("import");
260	if (import_index != -1) {
261
262	///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
263	///System.err.println("@@@@ Found description_element_start_gsdlsourcefilename_value: " + description_element_start);
264	description_element_start_gsdlsourcefilename_value = description_element_start;
265
266	}
267
268	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
269	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
270	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
271	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
272	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
273	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
274	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
275	// We don't really know what is going on...
276	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
277	}
278	}
279
280	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
281	if (metadata_element_name.startsWith("gsdl")) {
282	continue;
283	}
284
285	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
286	if (metadata_element == null) {
287	// This element isn't defined in ex.mds, so create it for this session
288	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
289	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
290	}
291	}
292
293	buffered_reader.close();
294
295	// Work out if is_unix_path
296	int import_index = gsdlsourcefilename_value.indexOf("import");
297	if (import_index != -1) {
298	String tempStr = gsdlsourcefilename_value.substring(import_index + "import".length());
299	is_unix_path = tempStr.startsWith("/");
300	}
301	// We're only interested in the path relative to the import folder
302	// Lop off "import" folder prefix
303	gsdlsourcefilename_value = adjustForRelativeToImportDir(gsdlsourcefilename_value);
304
305	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
306	// based on whatever fileRenameMethod was used to encode it, so that we can
307	// at last properly compare against filenames on the file system
308	// in order to load the correct ex.meta for the file.
309	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
310	// we can finally perform the decoding of gsdlsourcefilename.
311	if(fileRenameMethod == null) {
312	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
313	}
314	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
315	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
316	}
317
318	// Now we can finally put the gsdlsourcefilename path relative to import dir into the hashmap
319	///System.err.println("@@@ into map: " + gsdlsourcefilename_value);
320	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
321	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
322	}
323	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start_gsdlsourcefilename_value));
324
325	// Next, if Windows, check if dealing with Win 8.3 Short Filename
326	// In that case, convert short file name to long filename - works only if the file exists
327	if(isWin && gsdlsourcefilename_value.indexOf("~") != -1) {
328
329	String long_gsdlsourcefilename = gsdlsourcefilename_value;
330
331	// gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext
332	// This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir
333	// and if resulting file exists, getCanonicalPath() which produces Win Long filename.
334	File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
335	File f = new File(currentCollectionFolder, "import" + File.separator + gsdlsourcefilename_value); // should work even if linux style slashes in gsdlsourcefilename_value
336	///System.err.println("### file: " + f.getAbsolutePath());
337
338	if(f.exists()) {
339	long_gsdlsourcefilename = f.getCanonicalPath();
340	///System.err.println("### canon: " + long_gsdlsourcefilename);
341	} // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is
342
343	// Again, we're only interested in the path relative to the import folder
344	long_gsdlsourcefilename = adjustForRelativeToImportDir(long_gsdlsourcefilename);
345	if(!gsdlsourcefilename_value.equals(long_gsdlsourcefilename)) { // truly distinct Win long and short file names
346	// Put a copy of the ref to gsdlsourcefilename's metadata list under the long filename as well
347	///System.err.println("@@@ long filename into map: " + long_gsdlsourcefilename);
348	Object arrList = source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value);
349	source_file_name_to_description_elements_mapping.put(long_gsdlsourcefilename, arrList);
350	}
351	}
352
353	}
354	catch (FileNotFoundException exception) {
355	DebugStream.printStackTrace(exception);
356	}
357	catch (IOException exception) {
358	DebugStream.printStackTrace(exception);
359	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
360	DebugStream.printStackTrace(exception);
361	}
362	}
363
364	private String adjustForRelativeToImportDir(String gsdlsourcefilename_value) {
365	int import_index = gsdlsourcefilename_value.indexOf("import");
366	if (import_index != -1) {
367	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
368
369	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
370	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
371
372	// (Will decode gsdlsourcefilename at end of this method, once we know
373	// for certain the fileRenameMethod that was used to encode it.)
374
375	// Make sure the path matches the OS that is running
376	if (is_unix_path && isWin) {
377	// Convert path from Unix to Windows
378	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
379	}
380	else if (!is_unix_path && !isWin) {
381	// Convert path from Windows to Unix
382	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
383	}
384	}
385	return gsdlsourcefilename_value;
386	}
387
388	protected String decodeSourceFilename(String relative_sourcefile_path,
389	String encodingMethod, boolean is_unix_path)
390	throws Exception
391	{
392
393	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
394
395	// First get the file extension. Both in Base64 and URL encoded strings,
396	// the full-stop character (.) doesn't get encoded.
397	// That means getting the file extension is straightforward.
398
399	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
400	// 26 lowercase characters, 26 uppercase characters as well as the
401	// Plus sign (+) and the Forward Slash (/).
402	int fullstop = relative_sourcefile_path.indexOf(".");
403	String file_ext = "";
404	if(fullstop != -1) {
405	file_ext = relative_sourcefile_path.substring(fullstop);
406	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
407	}
408
409	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
410
411	String decoded_gsdlsourcefilename = "";
412
413	String separator = is_unix_path ? "/" : "\\";
414	for(int i = 0; i < importFilePathParts.length; i++) {
415	String decoded_filePathPart = "";
416	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
417	// URL decode each part of gsdlsourcefilename.
418	// Need to set the decoder to use the default system encoding
419	// This is stored in the System's file.encoding property.
420	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
421	}
422	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
423	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
424	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
425	// Using org.apache.commons.codec.binary.Base64 instead
426	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
427	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
428	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
429	///System.err.println("Got base64 string: " + importFilePathParts[i]);
430	///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
431	// Using system file.encoding to interpret the resulting bytestring as a String,
432	// just as we always did with URL decoding method
433	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
434	}
435
436	if(i == 0) {
437	decoded_gsdlsourcefilename = decoded_filePathPart;
438	} else {
439	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
440	}
441	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
442	}
443
444	// add the file extension back in
445	decoded_gsdlsourcefilename += file_ext;
446
447	///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
448
449	return decoded_gsdlsourcefilename;
450	}
451
452	/**
453	* Given a filepath, returns the parts between each file separator as an array.
454	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
455	*/
456	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
457	StringTokenizer tok;
458	if(is_unix_path) {
459	tok = new StringTokenizer(filepath, "/");
460	} else {
461	tok = new StringTokenizer(filepath, "\\");
462	}
463	String[] parts;
464	int count = tok.countTokens();
465	if(count <= 0) {
466	parts = new String[]{filepath};
467	} else {
468	int i = 0;
469	parts = new String[count];
470	while(tok.hasMoreTokens()) {
471	parts[i] = tok.nextToken();
472	//System.err.println("Next part: " + parts[i]);
473	i++;
474	}
475	}
476	return parts;
477	}
478
479	/*
480	public ArrayList getMetadataExtractedFromFile(File file)
481	{
482	// Build up a list of metadata extracted from this file
483	ArrayList metadata_values = new ArrayList();
484
485	String file_relative_path = file.getAbsolutePath();
486	int import_index = file_relative_path.indexOf("import");
487	if (import_index != -1) {
488	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
489	}
490
491	// Check whether this doc.xml file contains extracted metadata for the specified file
492	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
493	if (description_elements_list == null) {
494	// ...it doesn't
495	return metadata_values;
496	}
497
498	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
499
500	// Parse the doc.xml file
501	DebugStream.println("Applicable doc.xml file: " + this);
502	try {
503	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
504
505	int description_element_num = 0;
506	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
507	boolean in_relevant_description_element = false;
508
509	String line = null;
510	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
511	// Check if this line contains the start of a relevant Description element
512	if (line_num == next_description_element_start) {
513	in_relevant_description_element = true;
514	continue;
515	}
516
517	// If we're not in a relevant Description element we don't care about anything
518	if (in_relevant_description_element == false) {
519	continue;
520	}
521
522	// Check if this line contains the end of the relevant Description element
523	if (line.indexOf("</Description>") != -1) {
524	description_element_num++;
525	if (description_element_num == description_elements_list.size()) {
526	break;
527	}
528
529	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
530	in_relevant_description_element = false;
531	continue;
532	}
533
534	// If this line doesn't contain a complete Metadata element, we're not interested
535	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
536	continue;
537	}
538
539	// Extract the metadata element name
540	int name_index = line.indexOf(" name=\"") + " name=\"".length();
541	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
542
543	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
544	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
545	if (!metadata_set_namespace.equals("")) {
546	continue;
547	}
548
549	// Extracted metadata!
550	String metadata_element_name = metadata_element_name_full;
551
552	// We completely ignore bibliographic data
553	if (metadata_element_name.equals("SourceSegment")) {
554	buffered_reader.close();
555	return new ArrayList();
556	}
557
558	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
559	if (metadata_element_name.startsWith("gsdl")) {
560	continue;
561	}
562
563	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
564
565	// Value trees are not stored for extracted metadata, so create a new value tree node now
566	int value_index = line.indexOf(">", name_index) + ">".length();
567	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
568
569	metadata_element.addMetadataValue(metadata_element_value);
570	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
571
572	// Add the new metadata value to the list
573	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
574	metadata_values.add(metadata_value);
575	}
576
577	buffered_reader.close();
578	}
579	catch (FileNotFoundException exception) {
580	DebugStream.printStackTrace(exception);
581	}
582	catch (IOException exception) {
583	DebugStream.printStackTrace(exception);
584	}
585
586	return metadata_values;
587	}
588
589	*/
590
591	/**
592	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
593	* - To build a mapping from source file to its corresponding doc.xml file
594	* - To get a complete list of all extracted metadata elements
595	*/
596	/*
597	public void skimFile()
598	{
599	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
600
601	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
602	DebugStream.println("Skimming " + this + "...");
603	try {
604	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
605	int description_element_start = -1;
606
607	String line = null;
608	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
609	// This line contains the start of a Description element
610	if (line.indexOf("<Description>") != -1) {
611	if (description_element_start != -1) {
612	System.err.println("Parse error: previous Description element unfinished!");
613	}
614	description_element_start = line_num;
615	continue;
616	}
617
618	// This line contains the end of a Description element
619	if (line.indexOf("</Description>") != -1) {
620	if (description_element_start == -1) {
621	System.err.println("Parse error: Description element unstarted!");
622	}
623	description_element_start = -1;
624	continue;
625	}
626
627	// If we're not in a Description element there shouldn't be any Metadata elements
628	if (description_element_start == -1) {
629	continue;
630	}
631
632	// This line doesn't contain a Metadata element, so we're not interested
633	if (line.indexOf("<Metadata ") == -1) {
634	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
635	continue;
636	}
637
638	// Extract the metadata element name
639	int name_index = line.indexOf(" name=\"") + " name=\"".length();
640	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
641
642	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
643	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
644	if (!metadata_set_namespace.equals("")) {
645	continue;
646	}
647
648	// Extracted metadata!
649	String metadata_element_name = metadata_element_name_full;
650
651	// Note which file this doc.xml is for
652	if (metadata_element_name.equals("gsdlsourcefilename")) {
653	// Extract the gsdlsourcefilename element value
654	int value_index = line.indexOf(">", name_index) + ">".length();
655	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
656
657	// We're only interested in the path relative to the import folder
658	int import_index = gsdlsourcefilename_value.indexOf("import");
659	if (import_index != -1) {
660	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
661
662	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
663	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
664
665	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
666	// This is stored in the System's file.encoding property.
667	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
668
669	// Make sure the path matches the OS that is running
670	if (is_unix_path && isWin) {
671	// Convert path from Unix to Windows
672	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
673	}
674	else if (!is_unix_path && !isWin) {
675	// Convert path from Windows to Unix
676	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
677	}
678
679	// Remember this for quick access later
680	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
681	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
682	}
683
684	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
685	}
686
687	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
688	// This is true when the source files come from a zip file processed by ZIPPlug, for example
689	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
690	// We don't really know what is going on...
691	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
692	}
693	}
694
695	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
696	if (metadata_element_name.startsWith("gsdl")) {
697	continue;
698	}
699
700	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
701	if (metadata_element == null) {
702	// This element isn't defined in ex.mds, so create it for this session
703	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
704	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
705	}
706	}
707
708	buffered_reader.close();
709	}
710	catch (FileNotFoundException exception) {
711	DebugStream.printStackTrace(exception);
712	}
713	catch (IOException exception) {
714	DebugStream.printStackTrace(exception);
715	}
716	}
717	*/
718
719	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: