source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33756

Last change on this file since 33756 was 33756, checked in by ak19, 4 years ago

Attempted bugfix for ex meta not always loading in GLI for docs that are in subdirs when filenames are base64 encoded. This commit has only been tested, and works, on Linux for my basic tests with and without subdirs. 1. Perl now encodes all subdirs and the filename in gsdlsourcefilename (but, as before, not the file extension). We can't encode the entire relative path starting with import in one go, as other parts of the Perl code do comparisons and remove GSDLIMPORTDIR file prefixes. 2. Perl now also writes out the file rename method used, which can be none, url or base64, into doc.xml. 3. GLI now decodes each part of the gsdlsourcefilename relative path based on the file rename method, e.g. for import/subdir/filename.ext the import, subdir and filename are decoded to reconstitute the filename as it originally was, with the file extension stuck back on. This has allowed GLI to finally detect the ex meta associated with a gsdlsourcefilename in cases of subdirs in import or when dealing with base64 encoded filenames. Still need to test more complex cases on Linux, then Windows too.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.0 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36import org.apache.commons.codec.binary.Base64;
37
38//import org.greenstone.gatherer.feedback.Base64;
39
40/** This class represents one doc.xml file */
41
42public abstract class DocXMLFile extends File
43{
/**
 * Maps a source file's path relative to the import folder (a String) to an ArrayList of
 * Integers, each the zero-based line number on which a MetadataWrap element holding that
 * source file's metadata starts in this file. Filled in by skimFile().
 */
protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

/** Name of the XML element wrapping a group of metadata items (matched as "<name>" / "</name>"). */
protected final String MetadataWrap;
/** Name of the XML element holding a single metadata item (matched as "<name " / "</name>"). */
protected final String MetadataItem;

// Values of the gsdlsourcefilerenamemethod metadata: how gsdlsourcefilename was encoded.
// These are true constants, so they are shared by all instances rather than stored per object.
protected static final String FILE_RENAME_METHOD_NONE = "none";
protected static final String FILE_RENAME_METHOD_URL = "url";
protected static final String FILE_RENAME_METHOD_BASE64 = "base64";

/**
 * @param doc_xml_file_path path of the file this object represents
 * @param metaWrap name of the element that wraps a group of metadata items
 * @param metaItem name of the element that holds one metadata item
 */
public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
{
    super(doc_xml_file_path);
    this.MetadataWrap = metaWrap;
    this.MetadataItem = metaItem;
}
59
60
61 public ArrayList getMetadataExtractedFromFile(File file)
62 {
63 // Build up a list of metadata extracted from this file
64 ArrayList metadata_values = new ArrayList();
65
66 String file_relative_path = file.getAbsolutePath();
67 int import_index = file_relative_path.indexOf("import");
68 if (import_index != -1) {
69 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
70 }
71
72 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
73 /// System.err.println("@@@ relFilename: " + relFilename);
74 ///}
75
76 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
77 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
78 if (description_elements_list == null) {
79 // ...it doesn't
80 return metadata_values;
81 }
82
83 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
84
85 // Parse the file
86 DebugStream.println("Applicable file: " + this);
87 try {
88 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
89
90 int description_element_num = 0;
91 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
92 boolean in_relevant_description_element = false;
93
94 String line = null;
95 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
96 // Check if this line contains the start of a relevant "Description" element
97 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
98 if (line_num == next_description_element_start) {
99 in_relevant_description_element = true;
100 continue;
101 }
102
103 // If we're not in a relevant Description element we don't care about anything
104 if (in_relevant_description_element == false) {
105 continue;
106 }
107
108 // Check if this line contains the end of the relevant Description element
109 if (line.indexOf("</"+MetadataWrap+">") != -1) {
110 description_element_num++;
111 if (description_element_num == description_elements_list.size()) {
112 break;
113 }
114
115 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
116 in_relevant_description_element = false;
117 continue;
118 }
119
120 // If this line doesn't contain a complete Metadata element, we're not interested
121 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
122 continue;
123 }
124
125 // Extract the metadata element name
126 int name_index = line.indexOf(" name=\"") + " name=\"".length();
127 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
128
129 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
130 // Actually, if it is ex. then we are interested
131 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
132
133 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
134 continue;
135 }
136
137 // Extracted metadata!
138 // do it like this just in case we have ex.
139 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
140
141 // We completely ignore bibliographic data
142 if (metadata_element_name.equals("SourceSegment")) {
143 buffered_reader.close();
144 return new ArrayList();
145 }
146
147 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
148 if (metadata_element_name.startsWith("gsdl")) {
149 continue;
150 }
151
152 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
153
154 // Value trees are not stored for extracted metadata, so create a new value tree node now
155 int value_index = line.indexOf(">", name_index) + ">".length();
156 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
157
158 metadata_element.addMetadataValue(metadata_element_value);
159 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
160
161 // Add the new metadata value to the list
162 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
163 metadata_values.add(metadata_value);
164 }
165
166 buffered_reader.close();
167 }
168 catch (FileNotFoundException exception) {
169 DebugStream.printStackTrace(exception);
170 }
171 catch (IOException exception) {
172 DebugStream.printStackTrace(exception);
173 }
174
175 return metadata_values;
176 }
177
178
179
180
181 /**
182 * Every file must be skimmed when a collection is opened, for two reasons:
183 * - To build a mapping from source file to its corresponding doc.xml file
184 * - To get a complete list of all extracted metadata elements
185 */
186 public void skimFile()
187 {
188 String fileRenameMethod = null;
189 String gsdlsourcefilename_value = null;
190 boolean is_unix_path = false;
191
192 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
193
194 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
195 DebugStream.println("Skimming " + this + "...");
196 try {
197 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
198 int description_element_start = -1;
199
200 String line = null;
201 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
202 // This line contains the start of a "MetadataWrap" element
203 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
204 if (line.indexOf("<"+MetadataWrap+">") != -1) {
205 if (description_element_start != -1) {
206 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
207 }
208 description_element_start = line_num;
209 continue;
210 }
211
212 // This line contains the end of a "MetadataWrap" element
213 if (line.indexOf("</"+MetadataWrap+">") != -1) {
214 if (description_element_start == -1) {
215 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
216 }
217 description_element_start = -1;
218 continue;
219 }
220
221 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
222 if (description_element_start == -1) {
223 continue;
224 }
225
226 // This line doesn't contain a Metadata element, so we're not interested
227 if (line.indexOf("<"+MetadataItem+" ") == -1) {
228 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
229 continue;
230 }
231
232 // Extract the metadata element name
233 int name_index = line.indexOf(" name=\"") + " name=\"".length();
234 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
235
236 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
237 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
238 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
239 continue;
240 }
241
242 // Extracted metadata! May have ex. so make sure we remove that
243 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
244 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
245 // Extract the element value
246 int value_index = line.indexOf(">", name_index) + ">".length();
247 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
248 }
249
250 // Note which file this is for
251 else if (metadata_element_name.equals("gsdlsourcefilename")) {
252 // Extract the gsdlsourcefilename element value
253 int value_index = line.indexOf(">", name_index) + ">".length();
254 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
255
256 // We're only interested in the path relative to the import folder
257 int import_index = gsdlsourcefilename_value.indexOf("import");
258 if (import_index != -1) {
259 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
260
261 is_unix_path = gsdlsourcefilename_value.startsWith("/");
262 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
263
264 // (Will decode gsdlsourcefilename at end of this method, once we know
265 // for certain the fileRenameMethod that was used to encode it.)
266
267 // Make sure the path matches the OS that is running
268 if (is_unix_path && Utility.isWindows()) {
269 // Convert path from Unix to Windows
270 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
271 }
272 else if (!is_unix_path && !Utility.isWindows()) {
273 // Convert path from Windows to Unix
274 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
275 }
276
277 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
278 // Remember this for quick access later
279 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
280 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
281 }
282
283 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
284 }
285
286 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
287 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
288 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
289 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
290 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
291 && !gsdlsourcefilename_value.endsWith("collect.cfg")
292 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
293 // We don't really know what is going on...
294 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
295 }
296 }
297
298 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
299 if (metadata_element_name.startsWith("gsdl")) {
300 continue;
301 }
302
303 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
304 if (metadata_element == null) {
305 // This element isn't defined in ex.mds, so create it for this session
306 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
307 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
308 }
309 }
310
311 buffered_reader.close();
312
313 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
314 // based on whatever fileRenameMethod was used to encode it, so that we can
315 // at last properly compare properly against filenames on the file system
316 // in order to load the correct ex.meta for the file.
317 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
318 // we can finally perform the decoding of gsdlsourcefilename.
319 if(fileRenameMethod == null) {
320 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
321 }
322 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
323 // filename, decode it and add it back into map using its decoded filename.
324 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
325 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
326 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
327 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
328 }
329 }
330 catch (FileNotFoundException exception) {
331 DebugStream.printStackTrace(exception);
332 }
333 catch (IOException exception) {
334 DebugStream.printStackTrace(exception);
335 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
336 DebugStream.printStackTrace(exception);
337 }
338 }
339
340 protected String decodeSourceFilename(String relative_sourcefile_path,
341 String encodingMethod, boolean is_unix_path)
342 throws Exception
343 {
344
345 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
346
347 // First get the file extension. Both in Base64 and URL encoded strings,
348 // the full-stop character (.) doesn't get encoded.
349 // That means getting the file extension is straightforward.
350
351 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
352 // 26 lowercase characters, 26 uppercase characters as well as the
353 // Plus sign (+) and the Forward Slash (/).
354 int fullstop = relative_sourcefile_path.indexOf(".");
355 String file_ext = "";
356 if(fullstop != -1) {
357 file_ext = relative_sourcefile_path.substring(fullstop);
358 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
359 }
360
361 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
362
363 String decoded_gsdlsourcefilename = "";
364
365 String separator = is_unix_path ? "/" : "\\";
366 for(int i = 0; i < importFilePathParts.length; i++) {
367 String decoded_filePathPart = "";
368 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
369 // URL decode each part of gsdlsourcefilename.
370 // Need to set the decoder to use the default system encoding
371 // This is stored in the System's file.encoding property.
372 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
373 }
374 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
375 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
376 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
377 // Using org.apache.commons.codec.binary.Base64 instead
378 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
379 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
380 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
381 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
382 ///System.err.println("Decoded from base64 to bytes: " + bytes);
383 // Using system file.encoding to interpret the resulting bytestring as a String,
384 // just as we always did with URL decoding method
385 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
386 }
387
388 if(i == 0) {
389 decoded_gsdlsourcefilename = decoded_filePathPart;
390 } else {
391 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
392 }
393 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
394 }
395
396 // add the file extension back in
397 decoded_gsdlsourcefilename += file_ext;
398
399 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
400
401 return decoded_gsdlsourcefilename;
402 }
403
404 /**
405 * Given a filepath, returns the parts between each file separator as an array.
406 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
407 */
408 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
409 StringTokenizer tok;
410 if(is_unix_path) {
411 tok = new StringTokenizer(filepath, "/");
412 } else {
413 tok = new StringTokenizer(filepath, "\\");
414 }
415 String[] parts;
416 int count = tok.countTokens();
417 if(count <= 0) {
418 parts = new String[]{filepath};
419 } else {
420 int i = 0;
421 parts = new String[count];
422 while(tok.hasMoreTokens()) {
423 parts[i] = tok.nextToken();
424 //System.err.println("Next part: " + parts[i]);
425 i++;
426 }
427 }
428 return parts;
429 }
430
431 /*
432 public ArrayList getMetadataExtractedFromFile(File file)
433 {
434 // Build up a list of metadata extracted from this file
435 ArrayList metadata_values = new ArrayList();
436
437 String file_relative_path = file.getAbsolutePath();
438 int import_index = file_relative_path.indexOf("import");
439 if (import_index != -1) {
440 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
441 }
442
443 // Check whether this doc.xml file contains extracted metadata for the specified file
444 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
445 if (description_elements_list == null) {
446 // ...it doesn't
447 return metadata_values;
448 }
449
450 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
451
452 // Parse the doc.xml file
453 DebugStream.println("Applicable doc.xml file: " + this);
454 try {
455 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
456
457 int description_element_num = 0;
458 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
459 boolean in_relevant_description_element = false;
460
461 String line = null;
462 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
463 // Check if this line contains the start of a relevant Description element
464 if (line_num == next_description_element_start) {
465 in_relevant_description_element = true;
466 continue;
467 }
468
469 // If we're not in a relevant Description element we don't care about anything
470 if (in_relevant_description_element == false) {
471 continue;
472 }
473
474 // Check if this line contains the end of the relevant Description element
475 if (line.indexOf("</Description>") != -1) {
476 description_element_num++;
477 if (description_element_num == description_elements_list.size()) {
478 break;
479 }
480
481 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
482 in_relevant_description_element = false;
483 continue;
484 }
485
486 // If this line doesn't contain a complete Metadata element, we're not interested
487 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
488 continue;
489 }
490
491 // Extract the metadata element name
492 int name_index = line.indexOf(" name=\"") + " name=\"".length();
493 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
494
495 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
496 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
497 if (!metadata_set_namespace.equals("")) {
498 continue;
499 }
500
501 // Extracted metadata!
502 String metadata_element_name = metadata_element_name_full;
503
504 // We completely ignore bibliographic data
505 if (metadata_element_name.equals("SourceSegment")) {
506 buffered_reader.close();
507 return new ArrayList();
508 }
509
510 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
511 if (metadata_element_name.startsWith("gsdl")) {
512 continue;
513 }
514
515 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
516
517 // Value trees are not stored for extracted metadata, so create a new value tree node now
518 int value_index = line.indexOf(">", name_index) + ">".length();
519 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
520
521 metadata_element.addMetadataValue(metadata_element_value);
522 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
523
524 // Add the new metadata value to the list
525 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
526 metadata_values.add(metadata_value);
527 }
528
529 buffered_reader.close();
530 }
531 catch (FileNotFoundException exception) {
532 DebugStream.printStackTrace(exception);
533 }
534 catch (IOException exception) {
535 DebugStream.printStackTrace(exception);
536 }
537
538 return metadata_values;
539 }
540
541 */
542
543 /**
544 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
545 * - To build a mapping from source file to its corresponding doc.xml file
546 * - To get a complete list of all extracted metadata elements
547 */
548 /*
549 public void skimFile()
550 {
551 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
552
553 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
554 DebugStream.println("Skimming " + this + "...");
555 try {
556 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
557 int description_element_start = -1;
558
559 String line = null;
560 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
561 // This line contains the start of a Description element
562 if (line.indexOf("<Description>") != -1) {
563 if (description_element_start != -1) {
564 System.err.println("Parse error: previous Description element unfinished!");
565 }
566 description_element_start = line_num;
567 continue;
568 }
569
570 // This line contains the end of a Description element
571 if (line.indexOf("</Description>") != -1) {
572 if (description_element_start == -1) {
573 System.err.println("Parse error: Description element unstarted!");
574 }
575 description_element_start = -1;
576 continue;
577 }
578
579 // If we're not in a Description element there shouldn't be any Metadata elements
580 if (description_element_start == -1) {
581 continue;
582 }
583
584 // This line doesn't contain a Metadata element, so we're not interested
585 if (line.indexOf("<Metadata ") == -1) {
586 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
587 continue;
588 }
589
590 // Extract the metadata element name
591 int name_index = line.indexOf(" name=\"") + " name=\"".length();
592 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
593
594 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
595 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
596 if (!metadata_set_namespace.equals("")) {
597 continue;
598 }
599
600 // Extracted metadata!
601 String metadata_element_name = metadata_element_name_full;
602
603 // Note which file this doc.xml is for
604 if (metadata_element_name.equals("gsdlsourcefilename")) {
605 // Extract the gsdlsourcefilename element value
606 int value_index = line.indexOf(">", name_index) + ">".length();
607 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
608
609 // We're only interested in the path relative to the import folder
610 int import_index = gsdlsourcefilename_value.indexOf("import");
611 if (import_index != -1) {
612 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
613
614 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
615 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
616
617 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
618 // This is stored in the System's file.encoding property.
619 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
620
621 // Make sure the path matches the OS that is running
622 if (is_unix_path && Utility.isWindows()) {
623 // Convert path from Unix to Windows
624 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
625 }
626 else if (!is_unix_path && !Utility.isWindows()) {
627 // Convert path from Windows to Unix
628 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
629 }
630
631 // Remember this for quick access later
632 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
633 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
634 }
635
636 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
637 }
638
639 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
640 // This is true when the source files come from a zip file processed by ZIPPlug, for example
641 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
642 // We don't really know what is going on...
643 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
644 }
645 }
646
647 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
648 if (metadata_element_name.startsWith("gsdl")) {
649 continue;
650 }
651
652 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
653 if (metadata_element == null) {
654 // This element isn't defined in ex.mds, so create it for this session
655 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
656 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
657 }
658 }
659
660 buffered_reader.close();
661 }
662 catch (FileNotFoundException exception) {
663 DebugStream.printStackTrace(exception);
664 }
665 catch (IOException exception) {
666 DebugStream.printStackTrace(exception);
667 }
668 }
669 */
670
671}
Note: See TracBrowser for help on using the repository browser.