Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33758

Last change on this file since 33758 was 33758, checked in by ak19, 4 years ago
Removed debugging and last bit of cleanup.
Property svn:keywords set to `Author Date Id Revision`
File size: 31.6 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.util.Utility;
35
36	//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37	import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39	/** This class represents one doc.xml file */
40
41	public abstract class DocXMLFile extends File
42	{
/**
 * Maps a source file path (relative to the import folder) to an ArrayList of Integer
 * line numbers; each number is the line of this file on which a metadata wrapper
 * element belonging to that source file starts. Filled in by skimFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the element that wraps a group of metadata items (Description in the GreenstoneArchive format, mets:xmlData in METS). */
protected final String MetadataWrap;
/** Name of the element that holds one metadata item inside a wrapper element. */
protected final String MetadataItem;

/** Value of the gsdlsourcefilerenamemethod metadata meaning the source file name was stored unencoded. */
protected final String FILE_RENAME_METHOD_NONE = "none";
/** Value of the gsdlsourcefilerenamemethod metadata meaning the source file name was URL encoded. */
protected final String FILE_RENAME_METHOD_URL = "url";
/** Value of the gsdlsourcefilerenamemethod metadata meaning the source file name was base64 encoded. */
protected final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * Creates the file object and records which element names to look for when skimming it.
 *
 * @param doc_xml_file_path path of the file this object represents
 * @param metaWrap name of the element wrapping a group of metadata items
 * @param metaItem name of the element holding a single metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    MetadataWrap = metaWrap;
    MetadataItem = metaItem;
}
58
59	/**
60	* Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
61	* occur in the source_file_name_to_description_elements_mapping map
62	*/
63	private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
64	ArrayList description_elements_list = null;
65
66	///System.err.println("Looking for key " + file_relative_path);
67	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
68	if(description_elements_list != null) {
69	///System.err.println(" Found key matching REGULAR filepath: " + file_relative_path);
70	return description_elements_list;
71	}
72	else if(!Utility.isWindows()) { // couldn't find a matching key, we're done
73	///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
74	return null;
75	}
76
77	// Now we can try windows short filename as map key
78
79	String win_short_file_relative_path = "";
80	try{
81	win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());
82	//System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
83	} catch(Exception e) { // we're done trying to find a matching key
84	System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);
85	return null;
86	}
87
88	// Got a windows short file name, lop off import folder again
89	int import_index = win_short_file_relative_path.indexOf("import");
90	if (import_index != -1) {
91	win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1);
92	}
93
94	///System.err.println("### Looking for Windows short file name \|" + win_short_file_relative_path + "\| in map of sourcefilenames to doc.xml's ex meta.");
95	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
96	if (description_elements_list != null) {
97	///System.err.println(" Found key matching FULL win shortfile path: " + win_short_file_relative_path);
98	return description_elements_list; // found
99	}
100
101	// else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
102	// - windows shortfilename's rel-dir-path with regular tailname
103	// - and regular rel-dir-path with windows shortfilename's tailname
104
105	String shortFileTailName = win_short_file_relative_path;
106	String shortFileRelDirPath = "";
107	int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
108	if(lastSep != -1) {
109	shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
110	shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
111	}
112
113	String fileTailName = file_relative_path;
114	String fileRelDirPath = "";
115	lastSep = file_relative_path.lastIndexOf(File.separator);
116	if(lastSep != -1) {
117	fileTailName = file_relative_path.substring(lastSep+1);
118	fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
119	}
120
121	String path = shortFileRelDirPath + fileTailName;
122	///System.err.println("### Looking for Windows short file name \|" + path + "\| in map of sourcefilenames to doc.xml's ex meta.");
123	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
124
125	if(description_elements_list != null) {
126	///System.err.println(" Found key matching MIX of win shortfile path and regular path: " + path);
127	return description_elements_list; // found
128	}
129
130	// try the other combination
131	path = fileRelDirPath + shortFileTailName;
132	///System.err.println("### Looking for Windows short file name \|" + path + "\| in map of sourcefilenames to doc.xml's ex meta.");
133	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
134
135	if(description_elements_list != null) {
136	///System.err.println(" Found key matching MIX of regular path and win shortfile path: " + path);
137	return description_elements_list; // found
138	}
139
140	// could not find gsdlsourcefilename in map
141	///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
142	///System.err.println(" Or for windows shortFile path form, or for combinations with regular file path form");
143
144	return description_elements_list; // returns null at this point
145	}
146
147
148	public ArrayList getMetadataExtractedFromFile(File file)
149	{
150	// Build up a list of metadata extracted from this file
151	ArrayList metadata_values = new ArrayList();
152
153	String file_relative_path = file.getAbsolutePath();
154	int import_index = file_relative_path.indexOf("import");
155	if (import_index != -1) {
156	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
157	}
158
159	///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
160	/// System.err.println("\n@@@ relFilename: " + relFilename);
161	///}
162
163	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
164	//ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
165	ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
166	if (description_elements_list == null) {
167	// ...it doesn't
168	return metadata_values; // we're done
169	}
170
171	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
172
173	// Parse the file
174	DebugStream.println("Applicable file: " + this);
175	try {
176	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
177
178	int description_element_num = 0;
179	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
180	boolean in_relevant_description_element = false;
181
182	String line = null;
183	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184	// Check if this line contains the start of a relevant "Description" element
185	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186	if (line_num == next_description_element_start) {
187	in_relevant_description_element = true;
188	continue;
189	}
190
191	// If we're not in a relevant Description element we don't care about anything
192	if (in_relevant_description_element == false) {
193	continue;
194	}
195
196	// Check if this line contains the end of the relevant Description element
197	if (line.indexOf("</"+MetadataWrap+">") != -1) {
198	description_element_num++;
199	if (description_element_num == description_elements_list.size()) {
200	break;
201	}
202
203	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
204	in_relevant_description_element = false;
205	continue;
206	}
207
208	// If this line doesn't contain a complete Metadata element, we're not interested
209	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
210	continue;
211	}
212
213	// Extract the metadata element name
214	int name_index = line.indexOf(" name=\"") + " name=\"".length();
215	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
216
217	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
218	// Actually, if it is ex. then we are interested
219	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220
221	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
222	continue;
223	}
224
225	// Extracted metadata!
226	// do it like this just in case we have ex.
227	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
228
229	// We completely ignore bibliographic data
230	if (metadata_element_name.equals("SourceSegment")) {
231	buffered_reader.close();
232	return new ArrayList();
233	}
234
235	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
236	if (metadata_element_name.startsWith("gsdl")) {
237	continue;
238	}
239
240	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
241
242	// Value trees are not stored for extracted metadata, so create a new value tree node now
243	int value_index = line.indexOf(">", name_index) + ">".length();
244	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
245
246	metadata_element.addMetadataValue(metadata_element_value);
247	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
248
249	// Add the new metadata value to the list
250	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
251	metadata_values.add(metadata_value);
252	}
253
254	buffered_reader.close();
255	}
256	catch (FileNotFoundException exception) {
257	DebugStream.printStackTrace(exception);
258	}
259	catch (IOException exception) {
260	DebugStream.printStackTrace(exception);
261	}
262
263	return metadata_values;
264	}
265
266
267
268
269	/**
270	* Every file must be skimmed when a collection is opened, for two reasons:
271	* - To build a mapping from source file to its corresponding doc.xml file
272	* - To get a complete list of all extracted metadata elements
273	*/
274	public void skimFile()
275	{
276	String fileRenameMethod = null;
277	String gsdlsourcefilename_value = null;
278	boolean is_unix_path = false;
279
280	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
281
282	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
283	DebugStream.println("Skimming " + this + "...");
284	try {
285	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
286	int description_element_start = -1;
287
288	String line = null;
289	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
290	// This line contains the start of a "MetadataWrap" element
291	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
292	if (line.indexOf("<"+MetadataWrap+">") != -1) {
293	if (description_element_start != -1) {
294	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
295	}
296	description_element_start = line_num;
297	continue;
298	}
299
300	// This line contains the end of a "MetadataWrap" element
301	if (line.indexOf("</"+MetadataWrap+">") != -1) {
302	if (description_element_start == -1) {
303	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
304	}
305	description_element_start = -1;
306	continue;
307	}
308
309	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
310	if (description_element_start == -1) {
311	continue;
312	}
313
314	// This line doesn't contain a Metadata element, so we're not interested
315	if (line.indexOf("<"+MetadataItem+" ") == -1) {
316	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
317	continue;
318	}
319
320	// Extract the metadata element name
321	int name_index = line.indexOf(" name=\"") + " name=\"".length();
322	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
323
324	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
325	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
326	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
327	continue;
328	}
329
330	// Extracted metadata! May have ex. so make sure we remove that
331	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
332	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
333	// Extract the element value
334	int value_index = line.indexOf(">", name_index) + ">".length();
335	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
336	}
337
338	// Note which file this is for
339	else if (metadata_element_name.equals("gsdlsourcefilename")) {
340	// Extract the gsdlsourcefilename element value
341	int value_index = line.indexOf(">", name_index) + ">".length();
342	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
343
344	// We're only interested in the path relative to the import folder
345	int import_index = gsdlsourcefilename_value.indexOf("import");
346	if (import_index != -1) {
347	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
348
349	is_unix_path = gsdlsourcefilename_value.startsWith("/");
350	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
351
352	// (Will decode gsdlsourcefilename at end of this method, once we know
353	// for certain the fileRenameMethod that was used to encode it.)
354
355	// Make sure the path matches the OS that is running
356	if (is_unix_path && Utility.isWindows()) {
357	// Convert path from Unix to Windows
358	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
359	}
360	else if (!is_unix_path && !Utility.isWindows()) {
361	// Convert path from Windows to Unix
362	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
363	}
364
365	///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
366	// Remember this for quick access later
367	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
368	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
369	}
370
371	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
372	}
373
374	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
375	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
376	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
377	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
378	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
379	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
380	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
381	// We don't really know what is going on...
382	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
383	}
384	}
385
386	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
387	if (metadata_element_name.startsWith("gsdl")) {
388	continue;
389	}
390
391	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
392	if (metadata_element == null) {
393	// This element isn't defined in ex.mds, so create it for this session
394	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
395	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
396	}
397	}
398
399	buffered_reader.close();
400
401	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
402	// based on whatever fileRenameMethod was used to encode it, so that we can
403	// at last properly compare properly against filenames on the file system
404	// in order to load the correct ex.meta for the file.
405	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
406	// we can finally perform the decoding of gsdlsourcefilename.
407	if(fileRenameMethod == null) {
408	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
409	}
410	// If gsdlsourcefilename was encoded, we remove it from the map under its encoded
411	// filename, decode it and add it back into map using its decoded filename.
412	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
413	ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
414	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
415	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
416	}
417	}
418	catch (FileNotFoundException exception) {
419	DebugStream.printStackTrace(exception);
420	}
421	catch (IOException exception) {
422	DebugStream.printStackTrace(exception);
423	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
424	DebugStream.printStackTrace(exception);
425	}
426	}
427
428	protected String decodeSourceFilename(String relative_sourcefile_path,
429	String encodingMethod, boolean is_unix_path)
430	throws Exception
431	{
432
433	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
434
435	// First get the file extension. Both in Base64 and URL encoded strings,
436	// the full-stop character (.) doesn't get encoded.
437	// That means getting the file extension is straightforward.
438
439	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
440	// 26 lowercase characters, 26 uppercase characters as well as the
441	// Plus sign (+) and the Forward Slash (/).
442	int fullstop = relative_sourcefile_path.indexOf(".");
443	String file_ext = "";
444	if(fullstop != -1) {
445	file_ext = relative_sourcefile_path.substring(fullstop);
446	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
447	}
448
449	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
450
451	String decoded_gsdlsourcefilename = "";
452
453	String separator = is_unix_path ? "/" : "\\";
454	for(int i = 0; i < importFilePathParts.length; i++) {
455	String decoded_filePathPart = "";
456	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
457	// URL decode each part of gsdlsourcefilename.
458	// Need to set the decoder to use the default system encoding
459	// This is stored in the System's file.encoding property.
460	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
461	}
462	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
463	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
464	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
465	// Using org.apache.commons.codec.binary.Base64 instead
466	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
467	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
468	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
469	///System.err.println("Got base64 string: " + importFilePathParts[i]);
470	///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
471	// Using system file.encoding to interpret the resulting bytestring as a String,
472	// just as we always did with URL decoding method
473	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
474	}
475
476	if(i == 0) {
477	decoded_gsdlsourcefilename = decoded_filePathPart;
478	} else {
479	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
480	}
481	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
482	}
483
484	// add the file extension back in
485	decoded_gsdlsourcefilename += file_ext;
486
487	///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
488
489	return decoded_gsdlsourcefilename;
490	}
491
492	/**
493	* Given a filepath, returns the parts between each file separator as an array.
494	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
495	*/
496	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
497	StringTokenizer tok;
498	if(is_unix_path) {
499	tok = new StringTokenizer(filepath, "/");
500	} else {
501	tok = new StringTokenizer(filepath, "\\");
502	}
503	String[] parts;
504	int count = tok.countTokens();
505	if(count <= 0) {
506	parts = new String[]{filepath};
507	} else {
508	int i = 0;
509	parts = new String[count];
510	while(tok.hasMoreTokens()) {
511	parts[i] = tok.nextToken();
512	//System.err.println("Next part: " + parts[i]);
513	i++;
514	}
515	}
516	return parts;
517	}
518
519	/*
520	public ArrayList getMetadataExtractedFromFile(File file)
521	{
522	// Build up a list of metadata extracted from this file
523	ArrayList metadata_values = new ArrayList();
524
525	String file_relative_path = file.getAbsolutePath();
526	int import_index = file_relative_path.indexOf("import");
527	if (import_index != -1) {
528	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
529	}
530
531	// Check whether this doc.xml file contains extracted metadata for the specified file
532	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
533	if (description_elements_list == null) {
534	// ...it doesn't
535	return metadata_values;
536	}
537
538	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
539
540	// Parse the doc.xml file
541	DebugStream.println("Applicable doc.xml file: " + this);
542	try {
543	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
544
545	int description_element_num = 0;
546	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
547	boolean in_relevant_description_element = false;
548
549	String line = null;
550	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
551	// Check if this line contains the start of a relevant Description element
552	if (line_num == next_description_element_start) {
553	in_relevant_description_element = true;
554	continue;
555	}
556
557	// If we're not in a relevant Description element we don't care about anything
558	if (in_relevant_description_element == false) {
559	continue;
560	}
561
562	// Check if this line contains the end of the relevant Description element
563	if (line.indexOf("</Description>") != -1) {
564	description_element_num++;
565	if (description_element_num == description_elements_list.size()) {
566	break;
567	}
568
569	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
570	in_relevant_description_element = false;
571	continue;
572	}
573
574	// If this line doesn't contain a complete Metadata element, we're not interested
575	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
576	continue;
577	}
578
579	// Extract the metadata element name
580	int name_index = line.indexOf(" name=\"") + " name=\"".length();
581	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
582
583	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
584	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
585	if (!metadata_set_namespace.equals("")) {
586	continue;
587	}
588
589	// Extracted metadata!
590	String metadata_element_name = metadata_element_name_full;
591
592	// We completely ignore bibliographic data
593	if (metadata_element_name.equals("SourceSegment")) {
594	buffered_reader.close();
595	return new ArrayList();
596	}
597
598	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
599	if (metadata_element_name.startsWith("gsdl")) {
600	continue;
601	}
602
603	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
604
605	// Value trees are not stored for extracted metadata, so create a new value tree node now
606	int value_index = line.indexOf(">", name_index) + ">".length();
607	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
608
609	metadata_element.addMetadataValue(metadata_element_value);
610	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
611
612	// Add the new metadata value to the list
613	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
614	metadata_values.add(metadata_value);
615	}
616
617	buffered_reader.close();
618	}
619	catch (FileNotFoundException exception) {
620	DebugStream.printStackTrace(exception);
621	}
622	catch (IOException exception) {
623	DebugStream.printStackTrace(exception);
624	}
625
626	return metadata_values;
627	}
628
629	*/
630
631	/**
632	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
633	* - To build a mapping from source file to its corresponding doc.xml file
634	* - To get a complete list of all extracted metadata elements
635	*/
636	/*
637	public void skimFile()
638	{
639	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
640
641	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
642	DebugStream.println("Skimming " + this + "...");
643	try {
644	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
645	int description_element_start = -1;
646
647	String line = null;
648	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
649	// This line contains the start of a Description element
650	if (line.indexOf("<Description>") != -1) {
651	if (description_element_start != -1) {
652	System.err.println("Parse error: previous Description element unfinished!");
653	}
654	description_element_start = line_num;
655	continue;
656	}
657
658	// This line contains the end of a Description element
659	if (line.indexOf("</Description>") != -1) {
660	if (description_element_start == -1) {
661	System.err.println("Parse error: Description element unstarted!");
662	}
663	description_element_start = -1;
664	continue;
665	}
666
667	// If we're not in a Description element there shouldn't be any Metadata elements
668	if (description_element_start == -1) {
669	continue;
670	}
671
672	// This line doesn't contain a Metadata element, so we're not interested
673	if (line.indexOf("<Metadata ") == -1) {
674	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
675	continue;
676	}
677
678	// Extract the metadata element name
679	int name_index = line.indexOf(" name=\"") + " name=\"".length();
680	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
681
682	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
683	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
684	if (!metadata_set_namespace.equals("")) {
685	continue;
686	}
687
688	// Extracted metadata!
689	String metadata_element_name = metadata_element_name_full;
690
691	// Note which file this doc.xml is for
692	if (metadata_element_name.equals("gsdlsourcefilename")) {
693	// Extract the gsdlsourcefilename element value
694	int value_index = line.indexOf(">", name_index) + ">".length();
695	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
696
697	// We're only interested in the path relative to the import folder
698	int import_index = gsdlsourcefilename_value.indexOf("import");
699	if (import_index != -1) {
700	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
701
702	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
703	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
704
705	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
706	// This is stored in the System's file.encoding property.
707	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
708
709	// Make sure the path matches the OS that is running
710	if (is_unix_path && Utility.isWindows()) {
711	// Convert path from Unix to Windows
712	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
713	}
714	else if (!is_unix_path && !Utility.isWindows()) {
715	// Convert path from Windows to Unix
716	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
717	}
718
719	// Remember this for quick access later
720	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
721	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
722	}
723
724	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
725	}
726
727	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
728	// This is true when the source files come from a zip file processed by ZIPPlug, for example
729	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
730	// We don't really know what is going on...
731	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
732	}
733	}
734
735	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
736	if (metadata_element_name.startsWith("gsdl")) {
737	continue;
738	}
739
740	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
741	if (metadata_element == null) {
742	// This element isn't defined in ex.mds, so create it for this session
743	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
744	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
745	}
746	}
747
748	buffered_reader.close();
749	}
750	catch (FileNotFoundException exception) {
751	DebugStream.printStackTrace(exception);
752	}
753	catch (IOException exception) {
754	DebugStream.printStackTrace(exception);
755	}
756	}
757	*/
758
759	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: