Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33757

Last change on this file since 33757 was 33757, checked in by ak19, 4 years ago
1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
Property svn:keywords set to `Author Date Id Revision`
File size: 31.5 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.util.Utility;
35
36	//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37	import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39	/** This class represents one doc.xml file */
40
41	public abstract class DocXMLFile extends File
42	{
43	protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
44
45	protected final String MetadataWrap;
46	protected final String MetadataItem;
47
48	protected final String FILE_RENAME_METHOD_NONE = "none";
49	protected final String FILE_RENAME_METHOD_URL = "url";
50	protected final String FILE_RENAME_METHOD_BASE64 = "base64";
51
/**
 * Creates the File object for one doc.xml-style file and records the element names
 * used when scanning it.
 *
 * @param doc_xml_file_path path of the file, passed straight to the File constructor
 * @param metaWrap name of the element that wraps a group of metadata items
 * @param metaItem name of the element that holds a single metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    MetadataItem = metaItem;
    MetadataWrap = metaWrap;
}
58
59	/**
60	* Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
61	* occur in the source_file_name_to_description_elements_mapping map
62	*/
63	private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
64	ArrayList description_elements_list = null;
65
66	System.err.println("Looking for key " + file_relative_path);
67	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
68	if(description_elements_list != null) {
69	System.err.println(" Found key matching REGULAR filepath: " + file_relative_path);
70	return description_elements_list;
71	}
72	else if(!Utility.isWindows()) { // couldn't find a matching key, we're done
73	System.err.println("Unable to find meta for regular file path form " + file_relative_path);
74	return null;
75	}
76
77	// Now we can try windows short filename as map key
78
79	String win_short_file_relative_path = "";
80	try{
81	win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());
82	//System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
83	} catch(Exception e) { // we're done trying to find a matching key
84	System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);
85	return null;
86	}
87
88	// Got a windows short file name, lop off import folder again
89	int import_index = win_short_file_relative_path.indexOf("import");
90	if (import_index != -1) {
91	win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1);
92	}
93
94	System.err.println("### Looking for Windows short file name \|" + win_short_file_relative_path + "\| in map of sourcefilenames to doc.xml's ex meta.");
95	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
96	if (description_elements_list != null) {
97	System.err.println(" Found key matching FULL win shortfile path: " + win_short_file_relative_path);
98	return description_elements_list; // found
99	}
100
101	// else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
102	// - windows shortfilename's rel-dir-path with regular tailname
103	// - and regular rel-dir-path with windows shortfilename's tailname
104
105	String shortFileTailName = win_short_file_relative_path;
106	String shortFileRelDirPath = "";
107	int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
108	if(lastSep != -1) {
109	shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
110	shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
111	}
112
113	String fileTailName = file_relative_path;
114	String fileRelDirPath = "";
115	lastSep = file_relative_path.lastIndexOf(File.separator);
116	if(lastSep != -1) {
117	fileTailName = file_relative_path.substring(lastSep+1);
118	fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
119	}
120
121	String path = shortFileRelDirPath + fileTailName;
122	System.err.println("### Looking for Windows short file name \|" + path + "\| in map of sourcefilenames to doc.xml's ex meta.");
123	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
124
125	if(description_elements_list != null) {
126	System.err.println(" Found key matching MIX of win shortfile path and regular path: " + path);
127	return description_elements_list; // found
128	}
129
130	// try the other combination
131	path = fileRelDirPath + shortFileTailName;
132	System.err.println("### Looking for Windows short file name \|" + path + "\| in map of sourcefilenames to doc.xml's ex meta.");
133	description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
134
135	if(description_elements_list != null) {
136	System.err.println(" Found key matching MIX of regular path and win shortfile path: " + path);
137	return description_elements_list; // found
138	}
139
140	return description_elements_list;
141	}
142
143
144	public ArrayList getMetadataExtractedFromFile(File file)
145	{
146	// Build up a list of metadata extracted from this file
147	ArrayList metadata_values = new ArrayList();
148
149	String file_relative_path = file.getAbsolutePath();
150	int import_index = file_relative_path.indexOf("import");
151	if (import_index != -1) {
152	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
153	}
154
155	for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
156	System.err.println("\n@@@ relFilename: " + relFilename);
157	}
158
159	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
160	//ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
161	ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
162	if (description_elements_list == null) {
163	// ...it doesn't
164	System.err.println("Unable to find meta for (regular file path form) " + file_relative_path);
165	if(Utility.isWindows()) {
166	System.err.println(" Or for windows shortFile path form, or for combinations with regular file path form");
167	}
168	return metadata_values; // we're done
169	}
170
171	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
172
173	// Parse the file
174	DebugStream.println("Applicable file: " + this);
175	try {
176	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
177
178	int description_element_num = 0;
179	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
180	boolean in_relevant_description_element = false;
181
182	String line = null;
183	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184	// Check if this line contains the start of a relevant "Description" element
185	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186	if (line_num == next_description_element_start) {
187	in_relevant_description_element = true;
188	continue;
189	}
190
191	// If we're not in a relevant Description element we don't care about anything
192	if (in_relevant_description_element == false) {
193	continue;
194	}
195
196	// Check if this line contains the end of the relevant Description element
197	if (line.indexOf("</"+MetadataWrap+">") != -1) {
198	description_element_num++;
199	if (description_element_num == description_elements_list.size()) {
200	break;
201	}
202
203	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
204	in_relevant_description_element = false;
205	continue;
206	}
207
208	// If this line doesn't contain a complete Metadata element, we're not interested
209	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
210	continue;
211	}
212
213	// Extract the metadata element name
214	int name_index = line.indexOf(" name=\"") + " name=\"".length();
215	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
216
217	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
218	// Actually, if it is ex. then we are interested
219	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220
221	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
222	continue;
223	}
224
225	// Extracted metadata!
226	// do it like this just in case we have ex.
227	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
228
229	// We completely ignore bibliographic data
230	if (metadata_element_name.equals("SourceSegment")) {
231	buffered_reader.close();
232	return new ArrayList();
233	}
234
235	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
236	if (metadata_element_name.startsWith("gsdl")) {
237	continue;
238	}
239
240	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
241
242	// Value trees are not stored for extracted metadata, so create a new value tree node now
243	int value_index = line.indexOf(">", name_index) + ">".length();
244	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
245
246	metadata_element.addMetadataValue(metadata_element_value);
247	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
248
249	// Add the new metadata value to the list
250	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
251	metadata_values.add(metadata_value);
252	}
253
254	buffered_reader.close();
255	}
256	catch (FileNotFoundException exception) {
257	DebugStream.printStackTrace(exception);
258	}
259	catch (IOException exception) {
260	DebugStream.printStackTrace(exception);
261	}
262
263	return metadata_values;
264	}
265
266
267
268
269	/**
270	* Every file must be skimmed when a collection is opened, for two reasons:
271	* - To build a mapping from source file to its corresponding doc.xml file
272	* - To get a complete list of all extracted metadata elements
273	*/
274	public void skimFile()
275	{
276	String fileRenameMethod = null;
277	String gsdlsourcefilename_value = null;
278	boolean is_unix_path = false;
279
280	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
281
282	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
283	DebugStream.println("Skimming " + this + "...");
284	try {
285	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
286	int description_element_start = -1;
287
288	String line = null;
289	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
290	// This line contains the start of a "MetadataWrap" element
291	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
292	if (line.indexOf("<"+MetadataWrap+">") != -1) {
293	if (description_element_start != -1) {
294	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
295	}
296	description_element_start = line_num;
297	continue;
298	}
299
300	// This line contains the end of a "MetadataWrap" element
301	if (line.indexOf("</"+MetadataWrap+">") != -1) {
302	if (description_element_start == -1) {
303	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
304	}
305	description_element_start = -1;
306	continue;
307	}
308
309	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
310	if (description_element_start == -1) {
311	continue;
312	}
313
314	// This line doesn't contain a Metadata element, so we're not interested
315	if (line.indexOf("<"+MetadataItem+" ") == -1) {
316	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
317	continue;
318	}
319
320	// Extract the metadata element name
321	int name_index = line.indexOf(" name=\"") + " name=\"".length();
322	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
323
324	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
325	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
326	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
327	continue;
328	}
329
330	// Extracted metadata! May have ex. so make sure we remove that
331	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
332	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
333	// Extract the element value
334	int value_index = line.indexOf(">", name_index) + ">".length();
335	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
336	}
337
338	// Note which file this is for
339	else if (metadata_element_name.equals("gsdlsourcefilename")) {
340	// Extract the gsdlsourcefilename element value
341	int value_index = line.indexOf(">", name_index) + ">".length();
342	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
343
344	// We're only interested in the path relative to the import folder
345	int import_index = gsdlsourcefilename_value.indexOf("import");
346	if (import_index != -1) {
347	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
348
349	is_unix_path = gsdlsourcefilename_value.startsWith("/");
350	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
351
352	// (Will decode gsdlsourcefilename at end of this method, once we know
353	// for certain the fileRenameMethod that was used to encode it.)
354
355	// Make sure the path matches the OS that is running
356	if (is_unix_path && Utility.isWindows()) {
357	// Convert path from Unix to Windows
358	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
359	}
360	else if (!is_unix_path && !Utility.isWindows()) {
361	// Convert path from Windows to Unix
362	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
363	}
364
365	System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
366	// Remember this for quick access later
367	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
368	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
369	}
370
371	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
372	}
373
374	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
375	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
376	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
377	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
378	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
379	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
380	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
381	// We don't really know what is going on...
382	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
383	}
384	}
385
386	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
387	if (metadata_element_name.startsWith("gsdl")) {
388	continue;
389	}
390
391	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
392	if (metadata_element == null) {
393	// This element isn't defined in ex.mds, so create it for this session
394	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
395	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
396	}
397	}
398
399	buffered_reader.close();
400
401	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
402	// based on whatever fileRenameMethod was used to encode it, so that we can
403	// at last properly compare properly against filenames on the file system
404	// in order to load the correct ex.meta for the file.
405	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
406	// we can finally perform the decoding of gsdlsourcefilename.
407	if(fileRenameMethod == null) {
408	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
409	}
410	// If gsdlsourcefilename was encoded, we remove it from the map under its encoded
411	// filename, decode it and add it back into map using its decoded filename.
412	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
413	ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
414	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
415	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
416	}
417	}
418	catch (FileNotFoundException exception) {
419	DebugStream.printStackTrace(exception);
420	}
421	catch (IOException exception) {
422	DebugStream.printStackTrace(exception);
423	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
424	DebugStream.printStackTrace(exception);
425	}
426	}
427
428	protected String decodeSourceFilename(String relative_sourcefile_path,
429	String encodingMethod, boolean is_unix_path)
430	throws Exception
431	{
432
433	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
434
435	// First get the file extension. Both in Base64 and URL encoded strings,
436	// the full-stop character (.) doesn't get encoded.
437	// That means getting the file extension is straightforward.
438
439	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
440	// 26 lowercase characters, 26 uppercase characters as well as the
441	// Plus sign (+) and the Forward Slash (/).
442	int fullstop = relative_sourcefile_path.indexOf(".");
443	String file_ext = "";
444	if(fullstop != -1) {
445	file_ext = relative_sourcefile_path.substring(fullstop);
446	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
447	}
448
449	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
450
451	String decoded_gsdlsourcefilename = "";
452
453	String separator = is_unix_path ? "/" : "\\";
454	for(int i = 0; i < importFilePathParts.length; i++) {
455	String decoded_filePathPart = "";
456	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
457	// URL decode each part of gsdlsourcefilename.
458	// Need to set the decoder to use the default system encoding
459	// This is stored in the System's file.encoding property.
460	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
461	}
462	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
463	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
464	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
465	// Using org.apache.commons.codec.binary.Base64 instead
466	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
467	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
468	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
469	System.err.println("Got base64 string: " + importFilePathParts[i]);
470	System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
471	// Using system file.encoding to interpret the resulting bytestring as a String,
472	// just as we always did with URL decoding method
473	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
474	}
475
476	if(i == 0) {
477	decoded_gsdlsourcefilename = decoded_filePathPart;
478	} else {
479	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
480	}
481	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
482	}
483
484	// add the file extension back in
485	decoded_gsdlsourcefilename += file_ext;
486
487	System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
488
489	return decoded_gsdlsourcefilename;
490	}
491
492	/**
493	* Given a filepath, returns the parts between each file separator as an array.
494	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
495	*/
496	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
497	StringTokenizer tok;
498	if(is_unix_path) {
499	tok = new StringTokenizer(filepath, "/");
500	} else {
501	tok = new StringTokenizer(filepath, "\\");
502	}
503	String[] parts;
504	int count = tok.countTokens();
505	if(count <= 0) {
506	parts = new String[]{filepath};
507	} else {
508	int i = 0;
509	parts = new String[count];
510	while(tok.hasMoreTokens()) {
511	parts[i] = tok.nextToken();
512	//System.err.println("Next part: " + parts[i]);
513	i++;
514	}
515	}
516	return parts;
517	}
518
519	/*
520	public ArrayList getMetadataExtractedFromFile(File file)
521	{
522	// Build up a list of metadata extracted from this file
523	ArrayList metadata_values = new ArrayList();
524
525	String file_relative_path = file.getAbsolutePath();
526	int import_index = file_relative_path.indexOf("import");
527	if (import_index != -1) {
528	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
529	}
530
531	// Check whether this doc.xml file contains extracted metadata for the specified file
532	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
533	if (description_elements_list == null) {
534	// ...it doesn't
535	return metadata_values;
536	}
537
538	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
539
540	// Parse the doc.xml file
541	DebugStream.println("Applicable doc.xml file: " + this);
542	try {
543	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
544
545	int description_element_num = 0;
546	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
547	boolean in_relevant_description_element = false;
548
549	String line = null;
550	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
551	// Check if this line contains the start of a relevant Description element
552	if (line_num == next_description_element_start) {
553	in_relevant_description_element = true;
554	continue;
555	}
556
557	// If we're not in a relevant Description element we don't care about anything
558	if (in_relevant_description_element == false) {
559	continue;
560	}
561
562	// Check if this line contains the end of the relevant Description element
563	if (line.indexOf("</Description>") != -1) {
564	description_element_num++;
565	if (description_element_num == description_elements_list.size()) {
566	break;
567	}
568
569	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
570	in_relevant_description_element = false;
571	continue;
572	}
573
574	// If this line doesn't contain a complete Metadata element, we're not interested
575	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
576	continue;
577	}
578
579	// Extract the metadata element name
580	int name_index = line.indexOf(" name=\"") + " name=\"".length();
581	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
582
583	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
584	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
585	if (!metadata_set_namespace.equals("")) {
586	continue;
587	}
588
589	// Extracted metadata!
590	String metadata_element_name = metadata_element_name_full;
591
592	// We completely ignore bibliographic data
593	if (metadata_element_name.equals("SourceSegment")) {
594	buffered_reader.close();
595	return new ArrayList();
596	}
597
598	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
599	if (metadata_element_name.startsWith("gsdl")) {
600	continue;
601	}
602
603	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
604
605	// Value trees are not stored for extracted metadata, so create a new value tree node now
606	int value_index = line.indexOf(">", name_index) + ">".length();
607	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
608
609	metadata_element.addMetadataValue(metadata_element_value);
610	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
611
612	// Add the new metadata value to the list
613	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
614	metadata_values.add(metadata_value);
615	}
616
617	buffered_reader.close();
618	}
619	catch (FileNotFoundException exception) {
620	DebugStream.printStackTrace(exception);
621	}
622	catch (IOException exception) {
623	DebugStream.printStackTrace(exception);
624	}
625
626	return metadata_values;
627	}
628
629	*/
630
631	/**
632	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
633	* - To build a mapping from source file to its corresponding doc.xml file
634	* - To get a complete list of all extracted metadata elements
635	*/
636	/*
637	public void skimFile()
638	{
639	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
640
641	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
642	DebugStream.println("Skimming " + this + "...");
643	try {
644	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
645	int description_element_start = -1;
646
647	String line = null;
648	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
649	// This line contains the start of a Description element
650	if (line.indexOf("<Description>") != -1) {
651	if (description_element_start != -1) {
652	System.err.println("Parse error: previous Description element unfinished!");
653	}
654	description_element_start = line_num;
655	continue;
656	}
657
658	// This line contains the end of a Description element
659	if (line.indexOf("</Description>") != -1) {
660	if (description_element_start == -1) {
661	System.err.println("Parse error: Description element unstarted!");
662	}
663	description_element_start = -1;
664	continue;
665	}
666
667	// If we're not in a Description element there shouldn't be any Metadata elements
668	if (description_element_start == -1) {
669	continue;
670	}
671
672	// This line doesn't contain a Metadata element, so we're not interested
673	if (line.indexOf("<Metadata ") == -1) {
674	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
675	continue;
676	}
677
678	// Extract the metadata element name
679	int name_index = line.indexOf(" name=\"") + " name=\"".length();
680	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
681
682	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
683	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
684	if (!metadata_set_namespace.equals("")) {
685	continue;
686	}
687
688	// Extracted metadata!
689	String metadata_element_name = metadata_element_name_full;
690
691	// Note which file this doc.xml is for
692	if (metadata_element_name.equals("gsdlsourcefilename")) {
693	// Extract the gsdlsourcefilename element value
694	int value_index = line.indexOf(">", name_index) + ">".length();
695	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
696
697	// We're only interested in the path relative to the import folder
698	int import_index = gsdlsourcefilename_value.indexOf("import");
699	if (import_index != -1) {
700	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
701
702	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
703	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
704
705	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
706	// This is stored in the System's file.encoding property.
707	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
708
709	// Make sure the path matches the OS that is running
710	if (is_unix_path && Utility.isWindows()) {
711	// Convert path from Unix to Windows
712	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
713	}
714	else if (!is_unix_path && !Utility.isWindows()) {
715	// Convert path from Windows to Unix
716	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
717	}
718
719	// Remember this for quick access later
720	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
721	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
722	}
723
724	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
725	}
726
727	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
728	// This is true when the source files come from a zip file processed by ZIPPlug, for example
729	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
730	// We don't really know what is going on...
731	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
732	}
733	}
734
735	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
736	if (metadata_element_name.startsWith("gsdl")) {
737	continue;
738	}
739
740	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
741	if (metadata_element == null) {
742	// This element isn't defined in ex.mds, so create it for this session
743	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
744	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
745	}
746	}
747
748	buffered_reader.close();
749	}
750	catch (FileNotFoundException exception) {
751	DebugStream.printStackTrace(exception);
752	}
753	catch (IOException exception) {
754	DebugStream.printStackTrace(exception);
755	}
756	}
757	*/
758
759	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: