Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33756

Last change on this file since 33756 was 33756, checked in by ak19, 4 years ago
Attempted bugfix for ex meta not always loading in gli for docs that are in subdirs when filenames are base64 encoded. This commit has only been tested, and works, on linux for my basic tests with subdirs and without. 1. Perl now encodes all subdirs and the filename in gsdlsourcefilename (but as before, not file extension). Can't encode entire relative path starting with import in one go, as other parts of the perl code do comparisons and remove file GSDLIMPORTDIR prefixes. 2. Perl now also writes out the file rename method used, which can be none, url or base64, into doc.xml. 3. GLI now decodes each part of the gsdlsourcefilename relative path based on the file rename method. e.g. for import/subdir/filename.ext the import, subdir and filename are decoded to reconstitute the filename as it originally was, with file extension stuck back on. This has allowed GLI to finally detect the ex meta associated with a gsdlsourcefilename in cases of subdirs in import or when dealing with base64 encoded filenames. Still need to test more complex cases on linux, then windows too.
Property svn:keywords set to `Author Date Id Revision`
File size: 27.0 KB

Line
1	/**
2	*############################################################################
3	* A component of the Greenstone Librarian Interface, part of the Greenstone
4	* digital library suite from the New Zealand Digital Library Project at the
5	* University of Waikato, New Zealand.
6	*
7	* Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8	*
9	* Copyright (C) 2004 New Zealand Digital Library Project
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*############################################################################
25	*/
26
27	package org.greenstone.gatherer.metadata;
28
29
30	import java.io.*;
31	import java.util.*;
32	import java.net.URLDecoder;
33	import org.greenstone.gatherer.DebugStream;
34	import org.greenstone.gatherer.util.Utility;
35
36	import org.apache.commons.codec.binary.Base64;
37
38	//import org.greenstone.gatherer.feedback.Base64;
39
40	/** This class represents one doc.xml file */
41
42	public abstract class DocXMLFile extends File
43	{
44	protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
45
46	protected final String MetadataWrap;
47	protected final String MetadataItem;
48
49	protected final String FILE_RENAME_METHOD_NONE = "none";
50	protected final String FILE_RENAME_METHOD_URL = "url";
51	protected final String FILE_RENAME_METHOD_BASE64 = "base64";
52
/**
 * Creates a handle on one doc.xml-style file.
 *
 * @param doc_xml_file_path path of the file, handed straight to the File constructor
 * @param metaWrap name of the XML element that wraps a group of metadata lines
 *                 (matched as "&lt;metaWrap&gt;" / "&lt;/metaWrap&gt;" when reading the file)
 * @param metaItem name of the XML element that carries a single metadata value
 *                 (matched as "&lt;metaItem " / "&lt;/metaItem&gt;" when reading the file)
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);

    // Element names are fixed for the lifetime of this object
    MetadataItem = metaItem;
    MetadataWrap = metaWrap;
}
59
60
61	public ArrayList getMetadataExtractedFromFile(File file)
62	{
63	// Build up a list of metadata extracted from this file
64	ArrayList metadata_values = new ArrayList();
65
66	String file_relative_path = file.getAbsolutePath();
67	int import_index = file_relative_path.indexOf("import");
68	if (import_index != -1) {
69	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
70	}
71
72	///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
73	/// System.err.println("@@@ relFilename: " + relFilename);
74	///}
75
76	// Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
77	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
78	if (description_elements_list == null) {
79	// ...it doesn't
80	return metadata_values;
81	}
82
83	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
84
85	// Parse the file
86	DebugStream.println("Applicable file: " + this);
87	try {
88	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
89
90	int description_element_num = 0;
91	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
92	boolean in_relevant_description_element = false;
93
94	String line = null;
95	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
96	// Check if this line contains the start of a relevant "Description" element
97	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
98	if (line_num == next_description_element_start) {
99	in_relevant_description_element = true;
100	continue;
101	}
102
103	// If we're not in a relevant Description element we don't care about anything
104	if (in_relevant_description_element == false) {
105	continue;
106	}
107
108	// Check if this line contains the end of the relevant Description element
109	if (line.indexOf("</"+MetadataWrap+">") != -1) {
110	description_element_num++;
111	if (description_element_num == description_elements_list.size()) {
112	break;
113	}
114
115	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
116	in_relevant_description_element = false;
117	continue;
118	}
119
120	// If this line doesn't contain a complete Metadata element, we're not interested
121	if (line.indexOf("<"+MetadataItem+" ") == -1 \|\| line.indexOf("</"+MetadataItem+">") == -1) {
122	continue;
123	}
124
125	// Extract the metadata element name
126	int name_index = line.indexOf(" name=\"") + " name=\"".length();
127	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
128
129	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
130	// Actually, if it is ex. then we are interested
131	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
132
133	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
134	continue;
135	}
136
137	// Extracted metadata!
138	// do it like this just in case we have ex.
139	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
140
141	// We completely ignore bibliographic data
142	if (metadata_element_name.equals("SourceSegment")) {
143	buffered_reader.close();
144	return new ArrayList();
145	}
146
147	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
148	if (metadata_element_name.startsWith("gsdl")) {
149	continue;
150	}
151
152	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
153
154	// Value trees are not stored for extracted metadata, so create a new value tree node now
155	int value_index = line.indexOf(">", name_index) + ">".length();
156	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
157
158	metadata_element.addMetadataValue(metadata_element_value);
159	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
160
161	// Add the new metadata value to the list
162	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
163	metadata_values.add(metadata_value);
164	}
165
166	buffered_reader.close();
167	}
168	catch (FileNotFoundException exception) {
169	DebugStream.printStackTrace(exception);
170	}
171	catch (IOException exception) {
172	DebugStream.printStackTrace(exception);
173	}
174
175	return metadata_values;
176	}
177
178
179
180
181	/**
182	* Every file must be skimmed when a collection is opened, for two reasons:
183	* - To build a mapping from source file to its corresponding doc.xml file
184	* - To get a complete list of all extracted metadata elements
185	*/
186	public void skimFile()
187	{
188	String fileRenameMethod = null;
189	String gsdlsourcefilename_value = null;
190	boolean is_unix_path = false;
191
192	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
193
194	// Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
195	DebugStream.println("Skimming " + this + "...");
196	try {
197	BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
198	int description_element_start = -1;
199
200	String line = null;
201	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
202	// This line contains the start of a "MetadataWrap" element
203	// (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
204	if (line.indexOf("<"+MetadataWrap+">") != -1) {
205	if (description_element_start != -1) {
206	System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
207	}
208	description_element_start = line_num;
209	continue;
210	}
211
212	// This line contains the end of a "MetadataWrap" element
213	if (line.indexOf("</"+MetadataWrap+">") != -1) {
214	if (description_element_start == -1) {
215	System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
216	}
217	description_element_start = -1;
218	continue;
219	}
220
221	// If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
222	if (description_element_start == -1) {
223	continue;
224	}
225
226	// This line doesn't contain a Metadata element, so we're not interested
227	if (line.indexOf("<"+MetadataItem+" ") == -1) {
228	DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
229	continue;
230	}
231
232	// Extract the metadata element name
233	int name_index = line.indexOf(" name=\"") + " name=\"".length();
234	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
235
236	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
237	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
238	if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
239	continue;
240	}
241
242	// Extracted metadata! May have ex. so make sure we remove that
243	String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
244	if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
245	// Extract the element value
246	int value_index = line.indexOf(">", name_index) + ">".length();
247	fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
248	}
249
250	// Note which file this is for
251	else if (metadata_element_name.equals("gsdlsourcefilename")) {
252	// Extract the gsdlsourcefilename element value
253	int value_index = line.indexOf(">", name_index) + ">".length();
254	gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
255
256	// We're only interested in the path relative to the import folder
257	int import_index = gsdlsourcefilename_value.indexOf("import");
258	if (import_index != -1) {
259	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
260
261	is_unix_path = gsdlsourcefilename_value.startsWith("/");
262	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
263
264	// (Will decode gsdlsourcefilename at end of this method, once we know
265	// for certain the fileRenameMethod that was used to encode it.)
266
267	// Make sure the path matches the OS that is running
268	if (is_unix_path && Utility.isWindows()) {
269	// Convert path from Unix to Windows
270	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
271	}
272	else if (!is_unix_path && !Utility.isWindows()) {
273	// Convert path from Windows to Unix
274	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
275	}
276
277	///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
278	// Remember this for quick access later
279	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
280	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
281	}
282
283	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
284	}
285
286	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
287	// (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
288	// which are the gsdlsourcefilenames for the fedora digital object representing a collection.
289	// This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
290	else if (gsdlsourcefilename_value.indexOf("tmp") == -1
291	&& !gsdlsourcefilename_value.endsWith("collect.cfg")
292	&& !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
293	// We don't really know what is going on...
294	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
295	}
296	}
297
298	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
299	if (metadata_element_name.startsWith("gsdl")) {
300	continue;
301	}
302
303	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
304	if (metadata_element == null) {
305	// This element isn't defined in ex.mds, so create it for this session
306	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
307	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
308	}
309	}
310
311	buffered_reader.close();
312
313	// Now that we're done skimming, we actually need to decode gsdlsourcefilename
314	// based on whatever fileRenameMethod was used to encode it, so that we can
315	// at last properly compare properly against filenames on the file system
316	// in order to load the correct ex.meta for the file.
317	// Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
318	// we can finally perform the decoding of gsdlsourcefilename.
319	if(fileRenameMethod == null) {
320	fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
321	}
322	// If gsdlsourcefilename was encoded, we remove it from the map under its encoded
323	// filename, decode it and add it back into map using its decoded filename.
324	if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
325	ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
326	gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
327	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
328	}
329	}
330	catch (FileNotFoundException exception) {
331	DebugStream.printStackTrace(exception);
332	}
333	catch (IOException exception) {
334	DebugStream.printStackTrace(exception);
335	} catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
336	DebugStream.printStackTrace(exception);
337	}
338	}
339
340	protected String decodeSourceFilename(String relative_sourcefile_path,
341	String encodingMethod, boolean is_unix_path)
342	throws Exception
343	{
344
345	///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
346
347	// First get the file extension. Both in Base64 and URL encoded strings,
348	// the full-stop character (.) doesn't get encoded.
349	// That means getting the file extension is straightforward.
350
351	// Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
352	// 26 lowercase characters, 26 uppercase characters as well as the
353	// Plus sign (+) and the Forward Slash (/).
354	int fullstop = relative_sourcefile_path.indexOf(".");
355	String file_ext = "";
356	if(fullstop != -1) {
357	file_ext = relative_sourcefile_path.substring(fullstop);
358	relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
359	}
360
361	String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
362
363	String decoded_gsdlsourcefilename = "";
364
365	String separator = is_unix_path ? "/" : "\\";
366	for(int i = 0; i < importFilePathParts.length; i++) {
367	String decoded_filePathPart = "";
368	if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
369	// URL decode each part of gsdlsourcefilename.
370	// Need to set the decoder to use the default system encoding
371	// This is stored in the System's file.encoding property.
372	decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
373	}
374	else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
375	// Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
376	//byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
377	// Using org.apache.commons.codec.binary.Base64 instead
378	// https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
379	// General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
380	byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
381	///System.err.println("Got base64 string: " + importFilePathParts[i]);
382	///System.err.println("Decoded from base64 to bytes: " + bytes);
383	// Using system file.encoding to interpret the resulting bytestring as a String,
384	// just as we always did with URL decoding method
385	decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
386	}
387
388	if(i == 0) {
389	decoded_gsdlsourcefilename = decoded_filePathPart;
390	} else {
391	decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
392	}
393	///System.err.println("Built up: " + decoded_gsdlsourcefilename);
394	}
395
396	// add the file extension back in
397	decoded_gsdlsourcefilename += file_ext;
398
399	///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
400
401	return decoded_gsdlsourcefilename;
402	}
403
404	/**
405	* Given a filepath, returns the parts between each file separator as an array.
406	* For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
407	*/
408	private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
409	StringTokenizer tok;
410	if(is_unix_path) {
411	tok = new StringTokenizer(filepath, "/");
412	} else {
413	tok = new StringTokenizer(filepath, "\\");
414	}
415	String[] parts;
416	int count = tok.countTokens();
417	if(count <= 0) {
418	parts = new String[]{filepath};
419	} else {
420	int i = 0;
421	parts = new String[count];
422	while(tok.hasMoreTokens()) {
423	parts[i] = tok.nextToken();
424	//System.err.println("Next part: " + parts[i]);
425	i++;
426	}
427	}
428	return parts;
429	}
430
431	/*
432	public ArrayList getMetadataExtractedFromFile(File file)
433	{
434	// Build up a list of metadata extracted from this file
435	ArrayList metadata_values = new ArrayList();
436
437	String file_relative_path = file.getAbsolutePath();
438	int import_index = file_relative_path.indexOf("import");
439	if (import_index != -1) {
440	file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
441	}
442
443	// Check whether this doc.xml file contains extracted metadata for the specified file
444	ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
445	if (description_elements_list == null) {
446	// ...it doesn't
447	return metadata_values;
448	}
449
450	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
451
452	// Parse the doc.xml file
453	DebugStream.println("Applicable doc.xml file: " + this);
454	try {
455	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
456
457	int description_element_num = 0;
458	int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
459	boolean in_relevant_description_element = false;
460
461	String line = null;
462	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
463	// Check if this line contains the start of a relevant Description element
464	if (line_num == next_description_element_start) {
465	in_relevant_description_element = true;
466	continue;
467	}
468
469	// If we're not in a relevant Description element we don't care about anything
470	if (in_relevant_description_element == false) {
471	continue;
472	}
473
474	// Check if this line contains the end of the relevant Description element
475	if (line.indexOf("</Description>") != -1) {
476	description_element_num++;
477	if (description_element_num == description_elements_list.size()) {
478	break;
479	}
480
481	next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
482	in_relevant_description_element = false;
483	continue;
484	}
485
486	// If this line doesn't contain a complete Metadata element, we're not interested
487	if (line.indexOf("<Metadata ") == -1 \|\| line.indexOf("</Metadata>") == -1) {
488	continue;
489	}
490
491	// Extract the metadata element name
492	int name_index = line.indexOf(" name=\"") + " name=\"".length();
493	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
494
495	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
496	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
497	if (!metadata_set_namespace.equals("")) {
498	continue;
499	}
500
501	// Extracted metadata!
502	String metadata_element_name = metadata_element_name_full;
503
504	// We completely ignore bibliographic data
505	if (metadata_element_name.equals("SourceSegment")) {
506	buffered_reader.close();
507	return new ArrayList();
508	}
509
510	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
511	if (metadata_element_name.startsWith("gsdl")) {
512	continue;
513	}
514
515	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
516
517	// Value trees are not stored for extracted metadata, so create a new value tree node now
518	int value_index = line.indexOf(">", name_index) + ">".length();
519	String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
520
521	metadata_element.addMetadataValue(metadata_element_value);
522	MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
523
524	// Add the new metadata value to the list
525	MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
526	metadata_values.add(metadata_value);
527	}
528
529	buffered_reader.close();
530	}
531	catch (FileNotFoundException exception) {
532	DebugStream.printStackTrace(exception);
533	}
534	catch (IOException exception) {
535	DebugStream.printStackTrace(exception);
536	}
537
538	return metadata_values;
539	}
540
541	*/
542
543	/**
544	* Every doc.xml file must be skimmed when a collection is opened, for two reasons:
545	* - To build a mapping from source file to its corresponding doc.xml file
546	* - To get a complete list of all extracted metadata elements
547	*/
548	/*
549	public void skimFile()
550	{
551	MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
552
553	// Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
554	DebugStream.println("Skimming " + this + "...");
555	try {
556	BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
557	int description_element_start = -1;
558
559	String line = null;
560	for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
561	// This line contains the start of a Description element
562	if (line.indexOf("<Description>") != -1) {
563	if (description_element_start != -1) {
564	System.err.println("Parse error: previous Description element unfinished!");
565	}
566	description_element_start = line_num;
567	continue;
568	}
569
570	// This line contains the end of a Description element
571	if (line.indexOf("</Description>") != -1) {
572	if (description_element_start == -1) {
573	System.err.println("Parse error: Description element unstarted!");
574	}
575	description_element_start = -1;
576	continue;
577	}
578
579	// If we're not in a Description element there shouldn't be any Metadata elements
580	if (description_element_start == -1) {
581	continue;
582	}
583
584	// This line doesn't contain a Metadata element, so we're not interested
585	if (line.indexOf("<Metadata ") == -1) {
586	DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
587	continue;
588	}
589
590	// Extract the metadata element name
591	int name_index = line.indexOf(" name=\"") + " name=\"".length();
592	String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
593
594	// If the metadata has a namespace it isn't extracted metadata, so we're not interested
595	String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
596	if (!metadata_set_namespace.equals("")) {
597	continue;
598	}
599
600	// Extracted metadata!
601	String metadata_element_name = metadata_element_name_full;
602
603	// Note which file this doc.xml is for
604	if (metadata_element_name.equals("gsdlsourcefilename")) {
605	// Extract the gsdlsourcefilename element value
606	int value_index = line.indexOf(">", name_index) + ">".length();
607	String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
608
609	// We're only interested in the path relative to the import folder
610	int import_index = gsdlsourcefilename_value.indexOf("import");
611	if (import_index != -1) {
612	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
613
614	boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
615	gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
616
617	// URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
618	// This is stored in the System's file.encoding property.
619	gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
620
621	// Make sure the path matches the OS that is running
622	if (is_unix_path && Utility.isWindows()) {
623	// Convert path from Unix to Windows
624	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
625	}
626	else if (!is_unix_path && !Utility.isWindows()) {
627	// Convert path from Windows to Unix
628	gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
629	}
630
631	// Remember this for quick access later
632	if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
633	source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
634	}
635
636	((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
637	}
638
639	// Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
640	// This is true when the source files come from a zip file processed by ZIPPlug, for example
641	else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
642	// We don't really know what is going on...
643	System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
644	}
645	}
646
647	// Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
648	if (metadata_element_name.startsWith("gsdl")) {
649	continue;
650	}
651
652	MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
653	if (metadata_element == null) {
654	// This element isn't defined in ex.mds, so create it for this session
655	DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
656	extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
657	}
658	}
659
660	buffered_reader.close();
661	}
662	catch (FileNotFoundException exception) {
663	DebugStream.printStackTrace(exception);
664	}
665	catch (IOException exception) {
666	DebugStream.printStackTrace(exception);
667	}
668	}
669	*/
670
671	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: