source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 33757

Last change on this file since 33757 was 33757, checked in by ak19, 4 years ago
  1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
  • Property svn:keywords set to Author Date Id Revision
File size: 31.5 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.util.Utility;
35
36//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
37import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
38
39/** This class represents one doc.xml file */
40
41public abstract class DocXMLFile extends File
42{
43 protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
44
45 protected final String MetadataWrap;
46 protected final String MetadataItem;
47
48 protected final String FILE_RENAME_METHOD_NONE = "none";
49 protected final String FILE_RENAME_METHOD_URL = "url";
50 protected final String FILE_RENAME_METHOD_BASE64 = "base64";
51
52 public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
53 {
54 super(doc_xml_file_path);
55 this.MetadataWrap = metaWrap;
56 this.MetadataItem = metaItem;
57 }
58
59 /**
60 * Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
61 * occur in the source_file_name_to_description_elements_mapping map
62 */
63 private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
64 ArrayList description_elements_list = null;
65
66 System.err.println("Looking for key " + file_relative_path);
67 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
68 if(description_elements_list != null) {
69 System.err.println(" Found key matching REGULAR filepath: " + file_relative_path);
70 return description_elements_list;
71 }
72 else if(!Utility.isWindows()) { // couldn't find a matching key, we're done
73 System.err.println("Unable to find meta for regular file path form " + file_relative_path);
74 return null;
75 }
76
77 // Now we can try windows short filename as map key
78
79 String win_short_file_relative_path = "";
80 try{
81 win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());
82 //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
83 } catch(Exception e) { // we're done trying to find a matching key
84 System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);
85 return null;
86 }
87
88 // Got a windows short file name, lop off import folder again
89 int import_index = win_short_file_relative_path.indexOf("import");
90 if (import_index != -1) {
91 win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1);
92 }
93
94 System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path + "| in map of sourcefilenames to doc.xml's ex meta.");
95 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
96 if (description_elements_list != null) {
97 System.err.println(" Found key matching FULL win shortfile path: " + win_short_file_relative_path);
98 return description_elements_list; // found
99 }
100
101 // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
102 // - windows shortfilename's rel-dir-path with regular tailname
103 // - and regular rel-dir-path with windows shortfilename's tailname
104
105 String shortFileTailName = win_short_file_relative_path;
106 String shortFileRelDirPath = "";
107 int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
108 if(lastSep != -1) {
109 shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
110 shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
111 }
112
113 String fileTailName = file_relative_path;
114 String fileRelDirPath = "";
115 lastSep = file_relative_path.lastIndexOf(File.separator);
116 if(lastSep != -1) {
117 fileTailName = file_relative_path.substring(lastSep+1);
118 fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
119 }
120
121 String path = shortFileRelDirPath + fileTailName;
122 System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta.");
123 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
124
125 if(description_elements_list != null) {
126 System.err.println(" Found key matching MIX of win shortfile path and regular path: " + path);
127 return description_elements_list; // found
128 }
129
130 // try the other combination
131 path = fileRelDirPath + shortFileTailName;
132 System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta.");
133 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
134
135 if(description_elements_list != null) {
136 System.err.println(" Found key matching MIX of regular path and win shortfile path: " + path);
137 return description_elements_list; // found
138 }
139
140 return description_elements_list;
141 }
142
143
144 public ArrayList getMetadataExtractedFromFile(File file)
145 {
146 // Build up a list of metadata extracted from this file
147 ArrayList metadata_values = new ArrayList();
148
149 String file_relative_path = file.getAbsolutePath();
150 int import_index = file_relative_path.indexOf("import");
151 if (import_index != -1) {
152 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
153 }
154
155 for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
156 System.err.println("\n@@@ relFilename: " + relFilename);
157 }
158
159 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
160 //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
161 ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
162 if (description_elements_list == null) {
163 // ...it doesn't
164 System.err.println("Unable to find meta for (regular file path form) " + file_relative_path);
165 if(Utility.isWindows()) {
166 System.err.println(" Or for windows shortFile path form, or for combinations with regular file path form");
167 }
168 return metadata_values; // we're done
169 }
170
171 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
172
173 // Parse the file
174 DebugStream.println("Applicable file: " + this);
175 try {
176 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
177
178 int description_element_num = 0;
179 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
180 boolean in_relevant_description_element = false;
181
182 String line = null;
183 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
184 // Check if this line contains the start of a relevant "Description" element
185 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
186 if (line_num == next_description_element_start) {
187 in_relevant_description_element = true;
188 continue;
189 }
190
191 // If we're not in a relevant Description element we don't care about anything
192 if (in_relevant_description_element == false) {
193 continue;
194 }
195
196 // Check if this line contains the end of the relevant Description element
197 if (line.indexOf("</"+MetadataWrap+">") != -1) {
198 description_element_num++;
199 if (description_element_num == description_elements_list.size()) {
200 break;
201 }
202
203 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
204 in_relevant_description_element = false;
205 continue;
206 }
207
208 // If this line doesn't contain a complete Metadata element, we're not interested
209 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
210 continue;
211 }
212
213 // Extract the metadata element name
214 int name_index = line.indexOf(" name=\"") + " name=\"".length();
215 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
216
217 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
218 // Actually, if it is ex. then we are interested
219 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
220
221 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
222 continue;
223 }
224
225 // Extracted metadata!
226 // do it like this just in case we have ex.
227 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
228
229 // We completely ignore bibliographic data
230 if (metadata_element_name.equals("SourceSegment")) {
231 buffered_reader.close();
232 return new ArrayList();
233 }
234
235 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
236 if (metadata_element_name.startsWith("gsdl")) {
237 continue;
238 }
239
240 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
241
242 // Value trees are not stored for extracted metadata, so create a new value tree node now
243 int value_index = line.indexOf(">", name_index) + ">".length();
244 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
245
246 metadata_element.addMetadataValue(metadata_element_value);
247 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
248
249 // Add the new metadata value to the list
250 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
251 metadata_values.add(metadata_value);
252 }
253
254 buffered_reader.close();
255 }
256 catch (FileNotFoundException exception) {
257 DebugStream.printStackTrace(exception);
258 }
259 catch (IOException exception) {
260 DebugStream.printStackTrace(exception);
261 }
262
263 return metadata_values;
264 }
265
266
267
268
269 /**
270 * Every file must be skimmed when a collection is opened, for two reasons:
271 * - To build a mapping from source file to its corresponding doc.xml file
272 * - To get a complete list of all extracted metadata elements
273 */
274 public void skimFile()
275 {
276 String fileRenameMethod = null;
277 String gsdlsourcefilename_value = null;
278 boolean is_unix_path = false;
279
280 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
281
282 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
283 DebugStream.println("Skimming " + this + "...");
284 try {
285 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
286 int description_element_start = -1;
287
288 String line = null;
289 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
290 // This line contains the start of a "MetadataWrap" element
291 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
292 if (line.indexOf("<"+MetadataWrap+">") != -1) {
293 if (description_element_start != -1) {
294 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
295 }
296 description_element_start = line_num;
297 continue;
298 }
299
300 // This line contains the end of a "MetadataWrap" element
301 if (line.indexOf("</"+MetadataWrap+">") != -1) {
302 if (description_element_start == -1) {
303 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
304 }
305 description_element_start = -1;
306 continue;
307 }
308
309 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
310 if (description_element_start == -1) {
311 continue;
312 }
313
314 // This line doesn't contain a Metadata element, so we're not interested
315 if (line.indexOf("<"+MetadataItem+" ") == -1) {
316 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
317 continue;
318 }
319
320 // Extract the metadata element name
321 int name_index = line.indexOf(" name=\"") + " name=\"".length();
322 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
323
324 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
325 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
326 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
327 continue;
328 }
329
330 // Extracted metadata! May have ex. so make sure we remove that
331 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
332 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
333 // Extract the element value
334 int value_index = line.indexOf(">", name_index) + ">".length();
335 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
336 }
337
338 // Note which file this is for
339 else if (metadata_element_name.equals("gsdlsourcefilename")) {
340 // Extract the gsdlsourcefilename element value
341 int value_index = line.indexOf(">", name_index) + ">".length();
342 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
343
344 // We're only interested in the path relative to the import folder
345 int import_index = gsdlsourcefilename_value.indexOf("import");
346 if (import_index != -1) {
347 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
348
349 is_unix_path = gsdlsourcefilename_value.startsWith("/");
350 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
351
352 // (Will decode gsdlsourcefilename at end of this method, once we know
353 // for certain the fileRenameMethod that was used to encode it.)
354
355 // Make sure the path matches the OS that is running
356 if (is_unix_path && Utility.isWindows()) {
357 // Convert path from Unix to Windows
358 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
359 }
360 else if (!is_unix_path && !Utility.isWindows()) {
361 // Convert path from Windows to Unix
362 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
363 }
364
365 System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
366 // Remember this for quick access later
367 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
368 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
369 }
370
371 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
372 }
373
374 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
375 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
376 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
377 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
378 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
379 && !gsdlsourcefilename_value.endsWith("collect.cfg")
380 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
381 // We don't really know what is going on...
382 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
383 }
384 }
385
386 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
387 if (metadata_element_name.startsWith("gsdl")) {
388 continue;
389 }
390
391 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
392 if (metadata_element == null) {
393 // This element isn't defined in ex.mds, so create it for this session
394 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
395 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
396 }
397 }
398
399 buffered_reader.close();
400
401 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
402 // based on whatever fileRenameMethod was used to encode it, so that we can
403 // at last properly compare properly against filenames on the file system
404 // in order to load the correct ex.meta for the file.
405 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
406 // we can finally perform the decoding of gsdlsourcefilename.
407 if(fileRenameMethod == null) {
408 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
409 }
410 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
411 // filename, decode it and add it back into map using its decoded filename.
412 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
413 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
414 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
415 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
416 }
417 }
418 catch (FileNotFoundException exception) {
419 DebugStream.printStackTrace(exception);
420 }
421 catch (IOException exception) {
422 DebugStream.printStackTrace(exception);
423 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
424 DebugStream.printStackTrace(exception);
425 }
426 }
427
428 protected String decodeSourceFilename(String relative_sourcefile_path,
429 String encodingMethod, boolean is_unix_path)
430 throws Exception
431 {
432
433 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
434
435 // First get the file extension. Both in Base64 and URL encoded strings,
436 // the full-stop character (.) doesn't get encoded.
437 // That means getting the file extension is straightforward.
438
439 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
440 // 26 lowercase characters, 26 uppercase characters as well as the
441 // Plus sign (+) and the Forward Slash (/).
442 int fullstop = relative_sourcefile_path.indexOf(".");
443 String file_ext = "";
444 if(fullstop != -1) {
445 file_ext = relative_sourcefile_path.substring(fullstop);
446 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
447 }
448
449 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
450
451 String decoded_gsdlsourcefilename = "";
452
453 String separator = is_unix_path ? "/" : "\\";
454 for(int i = 0; i < importFilePathParts.length; i++) {
455 String decoded_filePathPart = "";
456 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
457 // URL decode each part of gsdlsourcefilename.
458 // Need to set the decoder to use the default system encoding
459 // This is stored in the System's file.encoding property.
460 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
461 }
462 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
463 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
464 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
465 // Using org.apache.commons.codec.binary.Base64 instead
466 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
467 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
468 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
469 System.err.println("Got base64 string: " + importFilePathParts[i]);
470 System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
471 // Using system file.encoding to interpret the resulting bytestring as a String,
472 // just as we always did with URL decoding method
473 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
474 }
475
476 if(i == 0) {
477 decoded_gsdlsourcefilename = decoded_filePathPart;
478 } else {
479 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
480 }
481 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
482 }
483
484 // add the file extension back in
485 decoded_gsdlsourcefilename += file_ext;
486
487 System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
488
489 return decoded_gsdlsourcefilename;
490 }
491
492 /**
493 * Given a filepath, returns the parts between each file separator as an array.
494 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
495 */
496 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
497 StringTokenizer tok;
498 if(is_unix_path) {
499 tok = new StringTokenizer(filepath, "/");
500 } else {
501 tok = new StringTokenizer(filepath, "\\");
502 }
503 String[] parts;
504 int count = tok.countTokens();
505 if(count <= 0) {
506 parts = new String[]{filepath};
507 } else {
508 int i = 0;
509 parts = new String[count];
510 while(tok.hasMoreTokens()) {
511 parts[i] = tok.nextToken();
512 //System.err.println("Next part: " + parts[i]);
513 i++;
514 }
515 }
516 return parts;
517 }
518
519 /*
520 public ArrayList getMetadataExtractedFromFile(File file)
521 {
522 // Build up a list of metadata extracted from this file
523 ArrayList metadata_values = new ArrayList();
524
525 String file_relative_path = file.getAbsolutePath();
526 int import_index = file_relative_path.indexOf("import");
527 if (import_index != -1) {
528 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
529 }
530
531 // Check whether this doc.xml file contains extracted metadata for the specified file
532 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
533 if (description_elements_list == null) {
534 // ...it doesn't
535 return metadata_values;
536 }
537
538 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
539
540 // Parse the doc.xml file
541 DebugStream.println("Applicable doc.xml file: " + this);
542 try {
543 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
544
545 int description_element_num = 0;
546 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
547 boolean in_relevant_description_element = false;
548
549 String line = null;
550 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
551 // Check if this line contains the start of a relevant Description element
552 if (line_num == next_description_element_start) {
553 in_relevant_description_element = true;
554 continue;
555 }
556
557 // If we're not in a relevant Description element we don't care about anything
558 if (in_relevant_description_element == false) {
559 continue;
560 }
561
562 // Check if this line contains the end of the relevant Description element
563 if (line.indexOf("</Description>") != -1) {
564 description_element_num++;
565 if (description_element_num == description_elements_list.size()) {
566 break;
567 }
568
569 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
570 in_relevant_description_element = false;
571 continue;
572 }
573
574 // If this line doesn't contain a complete Metadata element, we're not interested
575 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
576 continue;
577 }
578
579 // Extract the metadata element name
580 int name_index = line.indexOf(" name=\"") + " name=\"".length();
581 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
582
583 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
584 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
585 if (!metadata_set_namespace.equals("")) {
586 continue;
587 }
588
589 // Extracted metadata!
590 String metadata_element_name = metadata_element_name_full;
591
592 // We completely ignore bibliographic data
593 if (metadata_element_name.equals("SourceSegment")) {
594 buffered_reader.close();
595 return new ArrayList();
596 }
597
598 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
599 if (metadata_element_name.startsWith("gsdl")) {
600 continue;
601 }
602
603 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
604
605 // Value trees are not stored for extracted metadata, so create a new value tree node now
606 int value_index = line.indexOf(">", name_index) + ">".length();
607 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
608
609 metadata_element.addMetadataValue(metadata_element_value);
610 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
611
612 // Add the new metadata value to the list
613 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
614 metadata_values.add(metadata_value);
615 }
616
617 buffered_reader.close();
618 }
619 catch (FileNotFoundException exception) {
620 DebugStream.printStackTrace(exception);
621 }
622 catch (IOException exception) {
623 DebugStream.printStackTrace(exception);
624 }
625
626 return metadata_values;
627 }
628
629 */
630
631 /**
632 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
633 * - To build a mapping from source file to its corresponding doc.xml file
634 * - To get a complete list of all extracted metadata elements
635 */
636 /*
637 public void skimFile()
638 {
639 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
640
641 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
642 DebugStream.println("Skimming " + this + "...");
643 try {
644 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
645 int description_element_start = -1;
646
647 String line = null;
648 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
649 // This line contains the start of a Description element
650 if (line.indexOf("<Description>") != -1) {
651 if (description_element_start != -1) {
652 System.err.println("Parse error: previous Description element unfinished!");
653 }
654 description_element_start = line_num;
655 continue;
656 }
657
658 // This line contains the end of a Description element
659 if (line.indexOf("</Description>") != -1) {
660 if (description_element_start == -1) {
661 System.err.println("Parse error: Description element unstarted!");
662 }
663 description_element_start = -1;
664 continue;
665 }
666
667 // If we're not in a Description element there shouldn't be any Metadata elements
668 if (description_element_start == -1) {
669 continue;
670 }
671
672 // This line doesn't contain a Metadata element, so we're not interested
673 if (line.indexOf("<Metadata ") == -1) {
674 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
675 continue;
676 }
677
678 // Extract the metadata element name
679 int name_index = line.indexOf(" name=\"") + " name=\"".length();
680 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
681
682 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
683 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
684 if (!metadata_set_namespace.equals("")) {
685 continue;
686 }
687
688 // Extracted metadata!
689 String metadata_element_name = metadata_element_name_full;
690
691 // Note which file this doc.xml is for
692 if (metadata_element_name.equals("gsdlsourcefilename")) {
693 // Extract the gsdlsourcefilename element value
694 int value_index = line.indexOf(">", name_index) + ">".length();
695 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
696
697 // We're only interested in the path relative to the import folder
698 int import_index = gsdlsourcefilename_value.indexOf("import");
699 if (import_index != -1) {
700 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
701
702 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
703 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
704
705 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
706 // This is stored in the System's file.encoding property.
707 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
708
709 // Make sure the path matches the OS that is running
710 if (is_unix_path && Utility.isWindows()) {
711 // Convert path from Unix to Windows
712 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
713 }
714 else if (!is_unix_path && !Utility.isWindows()) {
715 // Convert path from Windows to Unix
716 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
717 }
718
719 // Remember this for quick access later
720 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
721 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
722 }
723
724 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
725 }
726
727 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
728 // This is true when the source files come from a zip file processed by ZIPPlug, for example
729 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
730 // We don't really know what is going on...
731 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
732 }
733 }
734
735 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
736 if (metadata_element_name.startsWith("gsdl")) {
737 continue;
738 }
739
740 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
741 if (metadata_element == null) {
742 // This element isn't defined in ex.mds, so create it for this session
743 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
744 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
745 }
746 }
747
748 buffered_reader.close();
749 }
750 catch (FileNotFoundException exception) {
751 DebugStream.printStackTrace(exception);
752 }
753 catch (IOException exception) {
754 DebugStream.printStackTrace(exception);
755 }
756 }
757 */
758
759}
Note: See TracBrowser for help on using the repository browser.