source: main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 34508

Last change on this file since 34508 was 34508, checked in by ak19, 4 years ago

Fixing oversight in previous commit: ex.Metadata was not showing up in GLI.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.8 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import java.net.URLDecoder;
33import org.greenstone.gatherer.DebugStream;
34import org.greenstone.gatherer.Gatherer;
35import org.greenstone.gatherer.util.Utility;
36
37//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
38import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
39
40/** This class represents one doc.xml file */
41
42public abstract class DocXMLFile extends File
43{
44 static boolean isWin = Utility.isWindows();
45
46 protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
47
48 protected final String MetadataWrap;
49 protected final String MetadataItem;
50
51 protected final String FILE_RENAME_METHOD_NONE = "none";
52 protected final String FILE_RENAME_METHOD_URL = "url";
53 protected final String FILE_RENAME_METHOD_BASE64 = "base64";
54
55 public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
56 {
57 super(doc_xml_file_path);
58 this.MetadataWrap = metaWrap;
59 this.MetadataItem = metaItem;
60 }
61
62 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
63 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
64 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
65 {
66 // Build up a list of metadata extracted from this file
67 ArrayList metadata_values = new ArrayList();
68
69 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
70 /// System.err.println("\n@@@ relFilename: " + relFilename);
71 ///}
72
73 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
74 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
75 if (description_elements_list == null) {
76 // ...it doesn't
77 ///System.err.println("Unable to find meta for file path form " + file_relative_path);
78 return metadata_values; // we're done
79 } /// else { System.err.println("@@@ file rel path: " + file_relative_path + " matched" ); }
80
81 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
82
83 // Parse the file
84 DebugStream.println("Applicable file: " + this);
85 try {
86 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
87
88 int description_element_num = 0;
89 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
90 boolean in_relevant_description_element = false;
91
92 String line = null;
93 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
94 // Check if this line contains the start of a relevant "Description" element
95 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
96 if (line_num == next_description_element_start) {
97 in_relevant_description_element = true;
98 continue;
99 }
100
101 // If we're not in a relevant Description element we don't care about anything
102 if (in_relevant_description_element == false) {
103 continue;
104 }
105
106 // Check if this line contains the end of the relevant Description element
107 if (line.indexOf("</"+MetadataWrap+">") != -1) {
108 description_element_num++;
109 if (description_element_num == description_elements_list.size()) {
110 break;
111 }
112
113 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
114 in_relevant_description_element = false;
115 continue;
116 }
117
118 // If this line doesn't contain a complete Metadata element, we're not interested
119 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
120 continue;
121 }
122
123 // Extract the metadata element name
124 int name_index = line.indexOf(" name=\"") + " name=\"".length();
125 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
126
127 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
128 // Actually, if it is ex. then we are interested
129 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
130
131 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
132 continue;
133 }
134
135 // Extracted metadata!
136 // do it like this just in case we have ex.
137 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
138
139 // We completely ignore bibliographic data
140 if (metadata_element_name.equals("SourceSegment")) {
141 buffered_reader.close();
142 return new ArrayList();
143 }
144
145 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
146 if (metadata_element_name.startsWith("gsdl")) {
147 continue;
148 }
149
150 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
151
152 // Value trees are not stored for extracted metadata, so create a new value tree node now
153 int value_index = line.indexOf(">", name_index) + ">".length();
154 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">"));
155
156 metadata_element.addMetadataValue(metadata_element_value);
157 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
158
159 // Add the new metadata value to the list
160 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
161 metadata_values.add(metadata_value);
162 }
163
164 buffered_reader.close();
165 }
166 catch (FileNotFoundException exception) {
167 DebugStream.printStackTrace(exception);
168 }
169 catch (IOException exception) {
170 DebugStream.printStackTrace(exception);
171 }
172
173 return metadata_values;
174 }
175
176
177
178
179 /**
180 * Every file must be skimmed when a collection is opened, for two reasons:
181 * - To build a mapping from source file to its corresponding doc.xml file
182 * - To get a complete list of all extracted metadata elements
183 */
184 public void skimFile()
185 {
186 String fileRenameMethod = null;
187 String gsdlsourcefilename_value = null;
188 boolean is_unix_path = false;
189 int description_element_start_gsdlsourcefilename_value = -1;
190
191 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
192
193 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
194 DebugStream.println("Skimming " + this + "...");
195 try {
196 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
197 int description_element_start = -1;
198
199 String line = null;
200 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
201 // This line contains the start of a "MetadataWrap" element
202 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
203 if (line.indexOf("<"+MetadataWrap+">") != -1) {
204 if (description_element_start != -1) {
205 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
206 }
207 description_element_start = line_num;
208 continue;
209 }
210
211 // This line contains the end of a "MetadataWrap" element
212 if (line.indexOf("</"+MetadataWrap+">") != -1) {
213 if (description_element_start == -1) {
214 System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
215 }
216 description_element_start = -1;
217 continue;
218 }
219
220 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements
221 if (description_element_start == -1) {
222 continue;
223 }
224
225 // This line doesn't contain a Metadata element, so we're not interested
226 if (line.indexOf("<"+MetadataItem+" ") == -1) {
227 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
228 continue;
229 }
230
231 // Extract the metadata element name
232 int name_index = line.indexOf(" name=\"") + " name=\"".length();
233 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
234
235 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
236 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
237 if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
238 continue;
239 }
240
241 // Extracted metadata! May have ex. so make sure we remove that
242 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
243 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
244 // Extract the element value
245 int value_index = line.indexOf(">", name_index) + ">".length();
246 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));
247 }
248
249 // Note which file this is for
250 else if (metadata_element_name.equals("gsdlsourcefilename")) {
251 // the gsdlsourcefilename metadata field may be encoded by the encoding denoted
252 // in fileRenameMethod (and will need decoding)
253
254 // Extract the gsdlsourcefilename element value
255 int value_index = line.indexOf(">", name_index) + ">".length();
256 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
257
258 // We're only interested in the path relative to the import folder
259 int import_index = gsdlsourcefilename_value.indexOf("import");
260 if (import_index != -1) {
261
262 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
263 ///System.err.println("@@@@ Found description_element_start_gsdlsourcefilename_value: " + description_element_start);
264 description_element_start_gsdlsourcefilename_value = description_element_start;
265
266 }
267
268 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
269 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
270 // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
271 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
272 else if (gsdlsourcefilename_value.indexOf("tmp") == -1
273 && !gsdlsourcefilename_value.endsWith("collect.cfg")
274 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) {
275 // We don't really know what is going on...
276 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
277 }
278 }
279
280 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
281 if (metadata_element_name.startsWith("gsdl")) {
282 continue;
283 }
284
285 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
286 if (metadata_element == null) {
287 // This element isn't defined in ex.mds, so create it for this session
288 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
289 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
290 }
291 }
292
293 buffered_reader.close();
294
295
296 // Now that we're done skimming, we actually need to decode gsdlsourcefilename
297 // based on whatever fileRenameMethod was used to encode it, so that we can
298 // at last properly compare against filenames on the file system
299 // in order to load the correct ex.meta for the file.
300 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
301 // we can finally perform the decoding of gsdlsourcefilename.
302 if(fileRenameMethod == null) {
303 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
304 }
305
306 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
307 // filename, decode it and add it back into map using its decoded filename.
308 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
309 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
310 }
311
312 // Next, if Windows, check if dealing with Win 8.3 Short Filename
313 // In that case, convert short file name to full name - works only if the file exists
314 if(isWin && gsdlsourcefilename_value.indexOf("~") != -1) {
315 // gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext
316 // This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir
317 // and if resulting file exists, getCanonicalPath() which produces Win Long filename.
318 File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
319 File f = new File(currentCollectionFolder, /*"import" + File.separator +*/ gsdlsourcefilename_value);
320 ///System.err.println("### file: " + f.getAbsolutePath());
321
322 if(f.exists()) {
323 gsdlsourcefilename_value = f.getCanonicalPath();
324 ///System.err.println("### canon: " + gsdlsourcefilename_value);
325
326 } // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is
327 }
328
329 // We're only interested in the path relative to the import folder
330 int import_index = gsdlsourcefilename_value.indexOf("import");
331 if (import_index != -1) {
332 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
333
334 is_unix_path = gsdlsourcefilename_value.startsWith("/");
335 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
336
337 // (Will decode gsdlsourcefilename at end of this method, once we know
338 // for certain the fileRenameMethod that was used to encode it.)
339
340 // Make sure the path matches the OS that is running
341 if (is_unix_path && isWin) {
342 // Convert path from Unix to Windows
343 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
344 }
345 else if (!is_unix_path && !isWin) {
346 // Convert path from Windows to Unix
347 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
348 }
349
350 ///System.err.println("@@@ into map: " + gsdlsourcefilename_value);
351
352 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
353 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
354 }
355
356 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start_gsdlsourcefilename_value));
357 }
358 }
359 catch (FileNotFoundException exception) {
360 DebugStream.printStackTrace(exception);
361 }
362 catch (IOException exception) {
363 DebugStream.printStackTrace(exception);
364 } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
365 DebugStream.printStackTrace(exception);
366 }
367 }
368
369
370 protected String decodeSourceFilename(String relative_sourcefile_path,
371 String encodingMethod, boolean is_unix_path)
372 throws Exception
373 {
374
375 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
376
377 // First get the file extension. Both in Base64 and URL encoded strings,
378 // the full-stop character (.) doesn't get encoded.
379 // That means getting the file extension is straightforward.
380
381 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
382 // 26 lowercase characters, 26 uppercase characters as well as the
383 // Plus sign (+) and the Forward Slash (/).
384 int fullstop = relative_sourcefile_path.indexOf(".");
385 String file_ext = "";
386 if(fullstop != -1) {
387 file_ext = relative_sourcefile_path.substring(fullstop);
388 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
389 }
390
391 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
392
393 String decoded_gsdlsourcefilename = "";
394
395 String separator = is_unix_path ? "/" : "\\";
396 for(int i = 0; i < importFilePathParts.length; i++) {
397 String decoded_filePathPart = "";
398 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
399 // URL decode each part of gsdlsourcefilename.
400 // Need to set the decoder to use the default system encoding
401 // This is stored in the System's file.encoding property.
402 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
403 }
404 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
405 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
406 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
407 // Using org.apache.commons.codec.binary.Base64 instead
408 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
409 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
410 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
411 ///System.err.println("Got base64 string: " + importFilePathParts[i]);
412 ///System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
413 // Using system file.encoding to interpret the resulting bytestring as a String,
414 // just as we always did with URL decoding method
415 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
416 }
417
418 if(i == 0) {
419 decoded_gsdlsourcefilename = decoded_filePathPart;
420 } else {
421 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
422 }
423 ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
424 }
425
426 // add the file extension back in
427 decoded_gsdlsourcefilename += file_ext;
428
429 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
430
431 return decoded_gsdlsourcefilename;
432 }
433
434 /**
435 * Given a filepath, returns the parts between each file separator as an array.
436 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
437 */
438 private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
439 StringTokenizer tok;
440 if(is_unix_path) {
441 tok = new StringTokenizer(filepath, "/");
442 } else {
443 tok = new StringTokenizer(filepath, "\\");
444 }
445 String[] parts;
446 int count = tok.countTokens();
447 if(count <= 0) {
448 parts = new String[]{filepath};
449 } else {
450 int i = 0;
451 parts = new String[count];
452 while(tok.hasMoreTokens()) {
453 parts[i] = tok.nextToken();
454 //System.err.println("Next part: " + parts[i]);
455 i++;
456 }
457 }
458 return parts;
459 }
460
461 /*
462 public ArrayList getMetadataExtractedFromFile(File file)
463 {
464 // Build up a list of metadata extracted from this file
465 ArrayList metadata_values = new ArrayList();
466
467 String file_relative_path = file.getAbsolutePath();
468 int import_index = file_relative_path.indexOf("import");
469 if (import_index != -1) {
470 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
471 }
472
473 // Check whether this doc.xml file contains extracted metadata for the specified file
474 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
475 if (description_elements_list == null) {
476 // ...it doesn't
477 return metadata_values;
478 }
479
480 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
481
482 // Parse the doc.xml file
483 DebugStream.println("Applicable doc.xml file: " + this);
484 try {
485 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
486
487 int description_element_num = 0;
488 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
489 boolean in_relevant_description_element = false;
490
491 String line = null;
492 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
493 // Check if this line contains the start of a relevant Description element
494 if (line_num == next_description_element_start) {
495 in_relevant_description_element = true;
496 continue;
497 }
498
499 // If we're not in a relevant Description element we don't care about anything
500 if (in_relevant_description_element == false) {
501 continue;
502 }
503
504 // Check if this line contains the end of the relevant Description element
505 if (line.indexOf("</Description>") != -1) {
506 description_element_num++;
507 if (description_element_num == description_elements_list.size()) {
508 break;
509 }
510
511 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
512 in_relevant_description_element = false;
513 continue;
514 }
515
516 // If this line doesn't contain a complete Metadata element, we're not interested
517 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {
518 continue;
519 }
520
521 // Extract the metadata element name
522 int name_index = line.indexOf(" name=\"") + " name=\"".length();
523 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
524
525 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
526 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
527 if (!metadata_set_namespace.equals("")) {
528 continue;
529 }
530
531 // Extracted metadata!
532 String metadata_element_name = metadata_element_name_full;
533
534 // We completely ignore bibliographic data
535 if (metadata_element_name.equals("SourceSegment")) {
536 buffered_reader.close();
537 return new ArrayList();
538 }
539
540 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
541 if (metadata_element_name.startsWith("gsdl")) {
542 continue;
543 }
544
545 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
546
547 // Value trees are not stored for extracted metadata, so create a new value tree node now
548 int value_index = line.indexOf(">", name_index) + ">".length();
549 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>"));
550
551 metadata_element.addMetadataValue(metadata_element_value);
552 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);
553
554 // Add the new metadata value to the list
555 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
556 metadata_values.add(metadata_value);
557 }
558
559 buffered_reader.close();
560 }
561 catch (FileNotFoundException exception) {
562 DebugStream.printStackTrace(exception);
563 }
564 catch (IOException exception) {
565 DebugStream.printStackTrace(exception);
566 }
567
568 return metadata_values;
569 }
570
571 */
572
573 /**
574 * Every doc.xml file must be skimmed when a collection is opened, for two reasons:
575 * - To build a mapping from source file to its corresponding doc.xml file
576 * - To get a complete list of all extracted metadata elements
577 */
578 /*
579 public void skimFile()
580 {
581 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
582
583 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements
584 DebugStream.println("Skimming " + this + "...");
585 try {
586 BufferedReader buffered_reader = new BufferedReader(new FileReader(this));
587 int description_element_start = -1;
588
589 String line = null;
590 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
591 // This line contains the start of a Description element
592 if (line.indexOf("<Description>") != -1) {
593 if (description_element_start != -1) {
594 System.err.println("Parse error: previous Description element unfinished!");
595 }
596 description_element_start = line_num;
597 continue;
598 }
599
600 // This line contains the end of a Description element
601 if (line.indexOf("</Description>") != -1) {
602 if (description_element_start == -1) {
603 System.err.println("Parse error: Description element unstarted!");
604 }
605 description_element_start = -1;
606 continue;
607 }
608
609 // If we're not in a Description element there shouldn't be any Metadata elements
610 if (description_element_start == -1) {
611 continue;
612 }
613
614 // This line doesn't contain a Metadata element, so we're not interested
615 if (line.indexOf("<Metadata ") == -1) {
616 DebugStream.println("Warning: Description element line doesn't contain Metadata element.");
617 continue;
618 }
619
620 // Extract the metadata element name
621 int name_index = line.indexOf(" name=\"") + " name=\"".length();
622 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));
623
624 // If the metadata has a namespace it isn't extracted metadata, so we're not interested
625 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
626 if (!metadata_set_namespace.equals("")) {
627 continue;
628 }
629
630 // Extracted metadata!
631 String metadata_element_name = metadata_element_name_full;
632
633 // Note which file this doc.xml is for
634 if (metadata_element_name.equals("gsdlsourcefilename")) {
635 // Extract the gsdlsourcefilename element value
636 int value_index = line.indexOf(">", name_index) + ">".length();
637 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
638
639 // We're only interested in the path relative to the import folder
640 int import_index = gsdlsourcefilename_value.indexOf("import");
641 if (import_index != -1) {
642 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
643
644 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
645 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
646
647 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
648 // This is stored in the System's file.encoding property.
649 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
650
651 // Make sure the path matches the OS that is running
652 if (is_unix_path && isWin) {
653 // Convert path from Unix to Windows
654 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
655 }
656 else if (!is_unix_path && !isWin) {
657 // Convert path from Windows to Unix
658 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
659 }
660
661 // Remember this for quick access later
662 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
663 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList());
664 }
665
666 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
667 }
668
669 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory
670 // This is true when the source files come from a zip file processed by ZIPPlug, for example
671 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) {
672 // We don't really know what is going on...
673 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);
674 }
675 }
676
677 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
678 if (metadata_element_name.startsWith("gsdl")) {
679 continue;
680 }
681
682 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
683 if (metadata_element == null) {
684 // This element isn't defined in ex.mds, so create it for this session
685 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
686 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
687 }
688 }
689
690 buffered_reader.close();
691 }
692 catch (FileNotFoundException exception) {
693 DebugStream.printStackTrace(exception);
694 }
695 catch (IOException exception) {
696 DebugStream.printStackTrace(exception);
697 }
698 }
699 */
700
701}
Note: See TracBrowser for help on using the repository browser.