source: trunk/gli/src/org/greenstone/gatherer/msm/parsers/GreenstoneMetadataParser.java@ 8022

Last change on this file since 8022 was 8022, checked in by mdewsnip, 20 years ago

(Very) minor changes.

  • Property svn:keywords set to Author Date Id Revision
File size: 44.6 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.msm.parsers;
28
29/**************************************************************************************
30 * Written: ??/??/02
31 * Revised: ??/??/02 - Commented
32 * 25/07/03 - Fix to allow any valid greenstone metadata.xml to be imported from, not just those that occur within a collection. This functionality is accidental and caused by GLI attempting to find a collect.cfg to extract hierarchy file information from, and failing.
33 **************************************************************************************/
34import java.io.*;
35import java.lang.ref.*;
36import java.net.*;
37import java.util.*;
38import java.util.regex.*;
39import javax.swing.*;
40import javax.swing.tree.*;
41import org.greenstone.gatherer.Gatherer;
42import org.greenstone.gatherer.cdm.CommandTokenizer;
43import org.greenstone.gatherer.file.FileNode;
44import org.greenstone.gatherer.gui.WarningDialog;
45import org.greenstone.gatherer.msm.ElementWrapper;
46import org.greenstone.gatherer.msm.Metadata;
47import org.greenstone.gatherer.msm.MetadataParser;
48import org.greenstone.gatherer.msm.MSMUtils;
49import org.greenstone.gatherer.util.StaticStrings;
50import org.greenstone.gatherer.util.Utility;
51import org.greenstone.gatherer.valuetree.GValueModel;
52import org.greenstone.gatherer.valuetree.GValueNode;
53import org.w3c.dom.*;
54/** Provides a metadata parser implementation that knows how to locate, prepare for, then import metadata from a previous Greenstone collection. Is aware of such factors as the presence of Metadata Set files and hierarchy files. Updates the profiler where possible to allow for faster subsequent imports from a certain collection. Caches all the information about encountered collections in CollectCFG objects which are softly cached (ie are cached, but are reclaimed before an OutOfMemory exception would be thrown).
55 * @author John Thompson, Greenstone Digital Library, University of Waikato
56 * @version 2.3
57 */
58public class GreenstoneMetadataParser
59 extends LinkedHashMap
60 implements MetadataParser {
61
62 static final private int MAX_CFG_CACHE_SIZE = 10;
63 static final private int MAX_GDM_CACHE_SIZE = 10;
64 /** The default name and location for a collection configuration file (presuming that a collection file prefix will be added). */
65 static final private String CONFIG_FILENAME = "etc" + File.separator + "collect.cfg";
66 /** The pattern to match when searching for directory level assignments. */
67 static final private String DIRECTORY_FILENAME = ".*";
68 static final private String DIRECTORY_FILENAME_SUFFIX = "/.*";
69 static final private String DESCRIPTION_ELEMENT = "Description";
70 static final private String FILENAME_ELEMENT = "FileName";
71 static final private String FILESET_ELEMENT = "FileSet";
72 /** The name of a gdm file. */
73 static final private String GIMPORT = "gimport";
74 static final private String IMPORT = "import";
75 static final private String METADATA_ELEMENT = "Metadata";
76 static final private String METADATA_XML_FILENAME = "metadata.xml";
77 static final private String MODE_ATTRIBUTE = "mode";
78 static final private String NAME_ATTRIBUTE = "name";
79 static final private String SEPARATOR = "/";
80
81 /** A list of the collect.cfg paths that we should ignore. */
82 private ArrayList ignore_list = new ArrayList();
83 /** Has this process been cancelled. */
84 private boolean dialog_cancelled = false;
85 /** A cache of previously parsed collection configuration files. */
86 private CollectCFGCache cfg_cache = new CollectCFGCache();
87 /** A mapping from BasicMetadata to their fully enabled Metadata incarnation. */
88 private HashMap transform = new HashMap();
89
/** No-argument constructor, required so that this parser can be instantiated through dynamic class loading. All state is set up by the field initialisers. */
public GreenstoneMetadataParser() {
    super();
}
93 /** Locate and import any metadata parsed by this metadata parser given the file involved and its previous incarnation. */
94 public boolean process(FileNode destination, FileNode origin, boolean folder_level, boolean dummy_run) {
95 Gatherer.println("GreenstoneMetadataParser: Process " + origin + ": ");
96 int counter = 0;
97 dialog_cancelled = false;
98
99 // 1. Determine what collection the file is in, and load/parse the appropriate collect.cfg. Cache collect.cfg object.
100 ///ystem.err.print("1 ");
101 // Start at the origin node file. If its a file get its parent directory.
102 File collection_dir = origin.getFile();
103 if(collection_dir.isFile()) {
104 collection_dir = collection_dir.getParentFile();
105 }
106 // We're currently in the importing directory so we'll go one more step up.
107 collection_dir = collection_dir.getParentFile();
108 // We are looking for a directory which contains a etc/collect.cfg file and either an import or a gimport directory.
109 boolean found = false;
110 while(!found && collection_dir != null) {
111 File possible_cfg_file = new File(collection_dir, CONFIG_FILENAME);
112 File possible_gimport_directory = new File(collection_dir, GIMPORT);
113 File possible_import_directory = new File(collection_dir, IMPORT);
114 if(possible_cfg_file.exists() && (possible_gimport_directory.exists() || possible_import_directory.exists())) {
115 found = true;
116 Gatherer.println("Found greenstone collection at " + collection_dir.getAbsolutePath());
117 }
118 else {
119 collection_dir = collection_dir.getParentFile();
120 }
121 }
122
123 // Now retrieve the configuration file if there is one.
124 CollectCFG collect_cfg = null;
125 if(collection_dir != null) {
126 File collect_cfg_file = new File(collection_dir, CONFIG_FILENAME);
127 if(collect_cfg_file.exists()) {
128 collect_cfg = cfg_cache.get(collect_cfg_file);
129 }
130 }
131
132 // 2. If a collection configuration file was found, attempt to merge in any mdses and make note of those that are successfully imported (by removing reference from collect.cfg).
133 Gatherer.println("Merging in any metadata sets found.");
134 if(collect_cfg != null) {
135 ArrayList mdses = collect_cfg.getMetadataSets();
136 for(int i = 0; i < mdses.size(); i++) {
137 File mds_file = (File) mdses.get(i);
138 Gatherer.c_man.getCollection().msm.importMDS(mds_file, false);
139 }
140 mdses.clear();
141 mdses = null;
142 }
143
144 // 3. Locate all of the metadata.xml files that may have an affect on the origin file. Make sure the metadata.xml closest to the origin files directory is last (to ensure property inheritance regarding accumulate/overwrite).
145 Gatherer.println("Searching for metadata.xml files");
146 ArrayList search_files = new ArrayList();
147 File file = origin.getFile();
148 String filename = null;
149 boolean file_level;
150 if(file.isFile()) {
151 file_level = false;
152 filename = file.getName();
153 file = file.getParentFile();
154 }
155 else {
156 file_level = true;
157 }
158 while(file != null && (collection_dir == null || !file.equals(collection_dir))) {
159 File test_file = new File(file, Utility.METADATA_XML);
160 if(test_file.exists()) {
161 search_files.add(0, new MetadataXMLFileSearch(test_file, filename));
162 }
163 if(filename != null) {
164 filename = file.getName() + SEPARATOR + filename;
165 }
166 else {
167 filename = file.getName();
168 }
169 file = file.getParentFile();
170 }
171 filename = null;
172 file = null;
173 // Start with an initially empty ArrayList of metadata
174 ArrayList metadata = new ArrayList();
175 // Now search each of these metadata xml for metadata, remembering to accumulate or overwrite as we go along.
176 for(int i = 0; i < search_files.size(); i++) {
177 MetadataXMLFileSearch a_search = (MetadataXMLFileSearch) search_files.get(i);
178 Gatherer.println("Search " + a_search.file.getAbsolutePath() + " for " + (a_search.filename != null ? a_search.filename : ".*"));
179 Gatherer.println("Search at the " + (folder_level ? "Folder" : "Filename") + " level");
180 // Retrieve the document
181 BasicGDMDocument document = getDocument(a_search.file);
182 if(document != null) {
183 // If this is a dummy run, our original source file is actually the metadata.xml file and we retrieve all metadata for this collection, as if accumulated!
184 if(dummy_run) {
185 metadata = document.getAllMetadata();
186 }
187 else {
188 metadata = document.getMetadata(a_search.filename, metadata, folder_level);
189 }
190 document = null;
191 }
192 a_search = null;
193 }
194 search_files = null;
195 // Finally assign the metadata
196 Gatherer.println("Found " + metadata.size() + " pieces of metadata for " + destination);
197 if(metadata.size() > 0) {
198 addMetadata(origin, destination, metadata, collection_dir, collect_cfg, dummy_run);
199 }
200 return dialog_cancelled;
201 }
202
203 protected boolean removeEldestEntry(java.util.Map.Entry entry) {
204 return (size() > MAX_GDM_CACHE_SIZE);
205 }
206
207 private void addMetadata(FileNode origin, FileNode destination, ArrayList metadata, File collection_dir, CollectCFG collect_cfg, boolean dummy_run) {
208 // before we try to addMetadata, we need to check that there are some metadata sets for the collection - otherwise we cant add or import
209 Vector meta_sets = Gatherer.c_man.getCollection().msm.getSets(false);
210 if (meta_sets.size()==0) {
211 ///ystem.out.println("GreenstoneMetadataParser:Error: we have been asked to add metadata but there are no existing sets");
212 // print the warning dialog
213 if(Gatherer.f_man.complain_if_no_sets) {
214 WarningDialog dialog = new WarningDialog("warning.MissingMDS", true);
215 if (dialog.display() == JOptionPane.CANCEL_OPTION) {
216 // the user has cancelled
217 dialog_cancelled = true;
218 Gatherer.f_man.complain_if_no_sets = true;
219 }
220 else {
221 Gatherer.f_man.complain_if_no_sets = false;
222 }
223 }
224 return;
225 }
226 ///ystem.err.print("6 ");
227 // Used in a complicated test later on.
228 for(int i = 0; !dialog_cancelled && i < metadata.size(); i++) {
229 BasicMetadata basic_metadata = ((BasicMetadata) metadata.get(i)).copy();
230 BasicMetadata metadatum = (BasicMetadata) metadata.get(i);
231 metadatum.collection = collection_dir; // May be null. Doesn't matter.
232 Metadata final_metadata = null;
233 // If this BasicMetadata already exists in the transform cache then we can save ourselves a lot of work.
234 SoftReference reference = (SoftReference) transform.get(basic_metadata);
235 if(reference != null) {
236 final_metadata = (Metadata) reference.get();
237 }
238 if(final_metadata == null) {
239 ///ystem.err.println("No existing Metadata object for BasicMetadata: " + basic_metadata);
240 // 6a. Check if an hfile is associated with this metadata, and if so load it, cache it in the collection.cfg object, then resolve metadata value index. Of course we can only do this if a collection configuration file was found in the first place.
241 if(collect_cfg != null) {
242 HFile h_file = collect_cfg.getHFile(metadatum.element);
243 if(h_file != null && !dummy_run) {
244 ///ystem.err.print(metadata.value + " maps to ");
245 metadatum.value = h_file.getValue(metadatum.value);
246 ///ystem.err.println(metadatum.value);
247 }
248 h_file = null;
249 }
250 // 6b. Check if there is a profile regarding the current metadata. The profile may be stored for the collection directory, or if no such directory is available, then try the ancestor folders of the origin file.
251 ///ystem.err.println("Retrieve existing action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element);
252 if(collection_dir != null) {
253 // Note that the first test is whether a profile action exist, while the 'getAction' can return null as the profile action.
254 if(Gatherer.c_man.getCollection().msm.profiler.containsAction(collection_dir.getAbsolutePath(), metadatum.element)) {
255 String new_element_name = Gatherer.c_man.getCollection().msm.profiler.getAction(collection_dir.getAbsolutePath(), metadatum.element);
256 ///ystem.err.println("Profile result = " + new_element_name);
257 if(new_element_name == null) {
258 metadatum = null;
259 }
260 else {
261 metadatum.element = new_element_name;
262 }
263 new_element_name = null;
264 }
265 }
266 else {
267 boolean found = false;
268 File current_folder = origin.getFile().getParentFile();
269 while(!found && metadatum != null && current_folder != null) {
270 if(Gatherer.c_man.getCollection().msm.profiler.containsAction(current_folder.getAbsolutePath(), metadatum.element)) {
271 found = true;
272 String new_element_name = Gatherer.c_man.getCollection().msm.profiler.getAction(current_folder.getAbsolutePath(), metadatum.element);
273 ///ystem.err.println("Profile result = " + new_element_name);
274 if(new_element_name == null) {
275 metadatum = null;
276 }
277 else {
278 metadatum.element = new_element_name;
279 }
280 new_element_name = null;
281 }
282 else {
283 current_folder = current_folder.getParentFile();
284 }
285 }
286 current_folder = null;
287 }
288 ///atherer.println("Assigning metadatum.");
289 if(metadatum != null) {
290 ElementWrapper element = null;
291 // 6c. Try to add metadata.
292 // If we just happen to be importing files from our base collection, then we can assume that any non-namespaced elements are actually extracted metadata
293 if(Utility.isParentFolderOf(Gatherer.c_man.getCollection().getBaseCollection(), origin.getFile().getParentFile())) {
294 element = Gatherer.c_man.getCollection().msm.getElement(metadatum.element);// Perfect Match not required
295 }
296 // Typical case
297 else {
298 element = Gatherer.c_man.getCollection().msm.getElement(metadatum.element, true); // Perfect match
299 }
300
301 ///ystem.err.println("Closest match: " + element);
302 // 6ci. If no match exists, prompt the user to add/merge with specific metadata element. The user can also choose to ignore this metadata.
303 if(element == null) {
304 element = selectElement(metadatum.element);
305 if(!dialog_cancelled) {
306 // 6ciii. If either of the above work, remember to add to profile.
307 if(element == null) {
308 ///ystem.err.println("Adding profile action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element + ", null");
309 if(collection_dir != null) {
310 Gatherer.c_man.getCollection().msm.profiler.addAction(collection_dir.getAbsolutePath(), metadatum.element, null);
311 }
312 else {
313 Gatherer.c_man.getCollection().msm.profiler.addAction(origin.getFile().getParentFile().getAbsolutePath(), metadatum.element, null);
314 }
315 }
316 else {
317 ///ystem.err.println("Adding profile action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element + ", " + element.getName());
318 if(collection_dir != null) {
319 Gatherer.c_man.getCollection().msm.profiler.addAction(collection_dir.getAbsolutePath(), metadatum.element, element.getName());
320}
321 else {
322 Gatherer.c_man.getCollection().msm.profiler.addAction(origin.getFile().getParentFile().getAbsolutePath(), metadatum.element, element.getName());
323 }
324 }
325 }
326 }
327 // - Add metadata
328 if(!dummy_run && element != null && !dialog_cancelled) {
329 ///ystem.err.println("Retrieve the value tree for " + element.toString());
330 GValueModel model = Gatherer.c_man.getCollection().msm.getValueTree(element);
331 if(model != null) {
332 // One little 'fix' for importing from the demo or dls files. The Title metadata found in the metadata.xml isn't used in preference for the automatically extracted Titles. However we want to use them, so we should remove '.*(<filename>)$' for a certain file <filename>.
333 String raw_value = metadatum.value.trim();
334 String filename_munged = destination.getFile().getName();
335 int index = -1;
336 if((index = filename_munged.indexOf(".")) != -1) {
337 filename_munged = filename_munged.substring(0, index);
338 }
339 filename_munged = "(" + filename_munged + ")";
340 if(raw_value.endsWith(filename_munged)) {
341 raw_value = (raw_value.substring(0, raw_value.length() - filename_munged.length())).trim();
342 }
343 GValueNode node = model.addValue(raw_value);
344 final_metadata = new Metadata(element, node);
345 ///ystem.err.println("Adding final metadata: " + metadatum.toString());
346 node = null;
347 }
348 model = null;
349 }
350 element = null;
351 }
352 // If we have successfully created a Metadata from the BasicMetadata, store it
353 if(final_metadata != null && !dialog_cancelled) {
354 transform.put(basic_metadata, new SoftReference(final_metadata));
355 ///ystem.err.println("Add a Metadata object for BasicMetadata: " + basic_metadata);
356 }
357 }
358 else {
359 ///ystem.err.println("Found a Metadata object for BasicMetadata: " + basic_metadata);
360 }
361 if(!dummy_run && final_metadata != null && !dialog_cancelled) {
362 final_metadata.setAccumulate(metadatum.accumulates);
363 // Now we can finally add the metadata.
364 ///ystem.err.println("Adding Metadata: " + final_metadata);
365 Gatherer.c_man.getCollection().msm.fireMetadataChanged(0, destination, null, final_metadata);
366 }
367 // Otherwise there is no way to add this metadata. No value model no metadata value.
368 final_metadata = null;
369 metadatum = null;
370 }
371 }
372
373 /** Determine the different suffix between two string.
374 * @param base_str The base <strong>String</strong>, expected to be the short of the two strings provided.
375 * @param target_str The target <strong>String</strong>, whose differing suffix is returned.
376 * @return A <strong>String</strong> containing the suffix from target which is different from base.
377 */
378 private String diff(String base_str, String target_str) {
379 StringTokenizer base_tokenizer = new StringTokenizer(base_str, File.separator);
380 StringTokenizer target_tokenizer = new StringTokenizer(target_str, File.separator);
381 String base = null;
382 String target = null;
383 while(base_tokenizer.hasMoreTokens() && (base = base_tokenizer.nextToken()).equals((target = target_tokenizer.nextToken()))) {
384 }
385 StringBuffer result = new StringBuffer(target);
386 while(target_tokenizer.hasMoreTokens()) {
387 result.append(File.separator);
388 result.append(target_tokenizer.nextToken());
389 }
390 return result.toString();
391 }
392
393 /** Retrieve the BasicGDMDocument found at the given file, or null if there is no such file or if it isn't a valid BasicGDMDocument. */
394 private BasicGDMDocument getDocument(File file) {
395 ///ystem.err.println("Get Document at: " + file.getAbsolutePath());
396 BasicGDMDocument document = null;
397 if(!ignore_list.contains(file) && file.exists()) {
398 // Check cache
399 SoftReference reference = (SoftReference) get(file);
400 if(reference != null) {
401 ///ystem.err.println("Hit!!");
402 document = (BasicGDMDocument) reference.get();
403 reference = null;
404 }
405 // If that didn't work try to parse in the document
406 if(document == null) {
407 ///ystem.err.println("Miss or stale reference.");
408 document = new BasicGDMDocument(file);
409 if(document.isValid()) {
410 put(file, new SoftReference(document));
411 }
412 else {
413 ///ystem.err.println(file.getAbsolutePath() + " is not a valid GDM XML file.");
414 ignore_list.add(file);
415 document = null;
416 }
417 }
418 }
419 else {
420 ///ystem.err.println("Ignoring file or file doesn't exists.");
421 }
422 return document;
423 }
424
425
426 /** Display a prompt allowing a user to select a metadata element to attempt to force add/merge or ignore a metadata element to. For instance an old version of a metadata.xml from the DLS collection might have an assigned metadata value "Publisher=EC Courier", however Publisher won't automatically match to any metadata set. This prompt will be displayed, and some effort will be made to systematically locate the appropriate set. In this case this should be the DLS metadata set as dls.Publisher should be the closest match. Regardless the element selected is returned.
427 * @param element_name The name of the element we are trying to add, as a <strong>String</strong>.
428 * @return The <strong>ElementWrapper</strong> choosen by the user, or <i>null</i> to skip this metadata element.
429 */
430 private ElementWrapper selectElement(String element_name) {
431 ElementWrapper result = Gatherer.c_man.getCollection().msm.prompt.selectElement(element_name);
432 dialog_cancelled = Gatherer.c_man.getCollection().msm.prompt.wasDialogCancelled();
433 return result;
434 }
435
436 /** A 'basic' version of the more complete GDMDocument used elsewhere, this object provides the same functionality except that it doesn't use Metadata objects. These objects require live references to elements within the MetadataSetManager and GValueModels, but these may not yet exist (and indeed may never exist) for metadata parsed from metadata.xml's outside of our current collection. Thus this class returns a String (or an ArrayList of Strings) when asked for the metadata associated with a certain file. Also notice that this class provides no constructor method for creating a blank document, nor does it ever need a reference to the Gatherer.*/
437 private class BasicGDMDocument
438 extends HashMap {
439 /** The document this class sources its data from. */
440 private Document base_document;
441 /** This constructor takes the original document and parsed out and stores metadata with its association to filenames. */
442 public BasicGDMDocument(File file) {
443 ///ystem.err.println("New BasicGDMDocument: " + file.getAbsolutePath());
444 base_document = Utility.parse(file.getAbsolutePath(), false);
445 }
446 /** Retrieve all of the metadata in this file. */
447 public ArrayList getAllMetadata() {
448 ArrayList metadatum = new ArrayList();
449 // Don't search the cache as this would never have been added.
450 try {
451 // Retrieve the document element.
452 Element directorymetadata_element = base_document.getDocumentElement();
453 // Iterate through the filesets, checking the FileName child element against the target file's name using regular expression matching.
454 NodeList fileset_elements = directorymetadata_element.getElementsByTagName(FILESET_ELEMENT);
455 for(int i = 0; i < fileset_elements.getLength(); i++) {
456 Element fileset_element = (Element) fileset_elements.item(i);
457 NodeList filename_elements = fileset_element.getElementsByTagName(FILENAME_ELEMENT);
458 for(int j = 0; j < filename_elements.getLength(); j++) {
459 Element filename_element = (Element) filename_elements.item(j);
460 // If they match add all of the metadata found in the Description child element, overwriting any metadata with the same element
461 NodeList description_elements = fileset_element.getElementsByTagName(DESCRIPTION_ELEMENT);
462 for(int k = 0; k < description_elements.getLength(); k++) {
463 Element description_element = (Element) description_elements.item(k);
464 NodeList metadata_elements = description_element.getElementsByTagName(METADATA_ELEMENT);
465 for(int l = 0; l < metadata_elements.getLength(); l++) {
466 Element metadata_element = (Element) metadata_elements.item(l);
467 String element = metadata_element.getAttribute(NAME_ATTRIBUTE);
468 BasicMetadata metadata = new BasicMetadata(element, Utility.METADATA_XML, true);
469 // Remove any previous values for this metadata element.
470 for(int m = metadatum.size() - 1; m >= 0; m--) {
471 BasicMetadata old_metadata = (BasicMetadata) metadatum.get(m);
472 if(old_metadata.element.equals(element)) {
473 metadatum.remove(m);
474 }
475 old_metadata = null;
476 }
477 // Add the completed metadata and clean up
478 metadatum.add(metadata);
479 metadata = null;
480 element = null;
481 metadata_element = null;
482 }
483 metadata_elements = null;
484 description_element = null;
485 }
486 description_elements = null;
487 filename_element = null;
488 }
489 filename_elements = null;
490 fileset_element = null;
491 }
492 fileset_elements = null;
493 directorymetadata_element = null;
494 }
495 catch (Exception error) {
496 Gatherer.self.printStackTrace(error);
497 }
498 return metadatum;
499 }
500
501 /** Retrieve any metadata associated with a certain file. If filename is null we are attempting to find directory level metadata. */
502 public ArrayList getMetadata(String filename, ArrayList metadatum_so_far, boolean folder_level) {
503 ///ystem.err.println("Retrieving metadata for: " + filename + " [" + folder_level + "]");
504 ArrayList metadatum = null;
505 // We start by attempting to retrieve this metadata from the cache.
506 if(filename != null) {
507 metadatum = (ArrayList) get(filename);
508 }
509 else {
510 metadatum = (ArrayList) get(DIRECTORY_FILENAME);
511 }
512 // If that failed we consult the document for metadata.
513 if(metadatum == null) {
514 metadatum = new ArrayList();
515 if(metadatum_so_far == null) {
516 metadatum = new ArrayList();
517 }
518 else {
519 metadatum = metadatum_so_far;
520 }
521 try {
522 // Retrieve the document element.
523 Element directorymetadata_element = base_document.getDocumentElement();
524 // Iterate through the filesets, checking the FileName child element against the target file's name using regular expression matching.
525 NodeList fileset_elements = directorymetadata_element.getElementsByTagName(FILESET_ELEMENT);
526 for(int i = 0; i < fileset_elements.getLength(); i++) {
527 Element fileset_element = (Element) fileset_elements.item(i);
528 NodeList filename_elements = fileset_element.getElementsByTagName(FILENAME_ELEMENT);
529 for(int j = 0; j < filename_elements.getLength(); j++) {
530 Element filename_element = (Element) filename_elements.item(j);
531 String filename_text = MSMUtils.getValue(filename_element);
532 if(isMatchingFileSet(filename, filename_text, folder_level)) {
533 ///ystem.err.println("Match: " + (filename != null ? filename : ".*") + " => " + filename_text);
534 // If they match add all of the metadata found in the Description child element, remembering to abide by desired mode (accumulate vs. overwrite).
535 NodeList description_elements = fileset_element.getElementsByTagName(DESCRIPTION_ELEMENT);
536 for(int k = 0; k < description_elements.getLength(); k++) {
537 Element description_element = (Element) description_elements.item(k);
538 NodeList metadata_elements = description_element.getElementsByTagName(METADATA_ELEMENT);
539 for(int l = 0; l < metadata_elements.getLength(); l++) {
540 Element metadata_element = (Element) metadata_elements.item(l);
541 String element = metadata_element.getAttribute(NAME_ATTRIBUTE);
542 ///ystem.err.println("Found element: " + element);
543 //String language = metadata_element.getAttribute("language");
544 String mode = metadata_element.getAttribute(MODE_ATTRIBUTE);
545 // Add the new metadata to our list of metadata for this target file.
546 String value = Utility.stripNL(MSMUtils.getValue(metadata_element));
547 ///ystem.err.println("Found value: " + element);
548 BasicMetadata metadata = new BasicMetadata(element, value, mode.equals("accumulate"));
549 // If mode is overwrite, then remove any previous values for this metadata element.
550 if(!metadata.accumulates) {
551 for(int m = metadatum.size() - 1; m >= 0; m--) {
552 BasicMetadata old_metadata = (BasicMetadata) metadatum.get(m);
553 if(old_metadata.element.equals(element)) {
554 metadatum.remove(m);
555 }
556 old_metadata = null;
557 }
558 }
559 mode = null;
560
561 // Add the completed metadata and clean up
562 metadatum.add(metadata);
563 metadata = null;
564 value = null;
565 element = null;
566 metadata_element = null;
567 }
568 metadata_elements = null;
569 description_element = null;
570 }
571 description_elements = null;
572 }
573 else {
574 ///ystem.err.println("No Match!");
575 }
576 filename_text = null;
577 filename_element = null;
578 }
579 filename_elements = null;
580 fileset_element = null;
581 }
582 fileset_elements = null;
583 directorymetadata_element = null;
584 }
585 catch (Exception error) {
586 Gatherer.self.printStackTrace(error);
587 }
588 // Cache the result, given that these external metadata.xmls are taken to be static at the time of reading (if you happen to be sourcing information from a opened collection that someone is working on, too bad.
589 if(filename != null) {
590 put(filename, metadatum);
591 }
592 else {
593 put(DIRECTORY_FILENAME, metadatum);
594 }
595 }
596 return metadatum;
597 }
598
599 private boolean isMatchingFileSet(String filename, String filename_text, boolean folder_level) {
600 // Crappy. There are apparently two ways of assigning, say, directory level metadata to anything in the ac01ne directory from a parent directories metadata.xml.
601 // The developers guide way: ac01ne/.*
602 // The dls way: ac01ne
603 // So the three tests are:
604 // 1. Check for an exact match i.e "ac01ne/ac01ne.htm" matches "ac01ne/".*
605 // 2. Check for a parent folder match, in the absence of further pattern i.e "ac01ne/ac01ne.htm" matches "ac01ne"
606 // 3. Check for a folder level match if thats what we are looking for i.e "null" matches ".*"
607
608 ///ystem.err.println("Check for: " + (filename != null ? filename : ".*"));
609 ///ystem.err.println("Folder level = " + folder_level);
610 ///ystem.err.println("filename != null && '" + filename + "'.matches('" + filename_text + "') = " + (filename != null ? filename.matches(filename_text) && !filename_text.equals(DIRECTORY_FILENAME) : false));
611 ///ystem.err.println("filename != null && '" + filename + "'.matches('" + filename_text + DIRECTORY_FILENAME_SUFFIX + "') [folder level = " + folder_level + "] = " + (filename != null ? filename.matches(filename_text + DIRECTORY_FILENAME_SUFFIX) && folder_level: false));
612 ///ystem.err.println("filename == null && '" + filename_text + "'.equals('.*') = " + (filename == null ? filename_text.equals(DIRECTORY_FILENAME) : false));
613 if (filename != null) {
614 if(folder_level) {
615 return filename.matches(filename_text) || filename.matches(filename_text + DIRECTORY_FILENAME_SUFFIX);
616 }
617 else {
618 return filename.matches(filename_text) && !filename_text.equals(DIRECTORY_FILENAME);
619 }
620 }
621 else {
622 return filename_text.equals(DIRECTORY_FILENAME);
623 }
624 }
625
626 /** Determine is this is a valid Greenstone Directory Metadata file. It may of course just be some xml file with the name metadata.xml. */
627 public boolean isValid() {
628 // Just determine if the doctype is GreenstoneDirectoryMetadata and root node is called DirectoryMetadata.
629 String doctype_name = base_document.getDoctype().getName();
630 String root_name = base_document.getDocumentElement().getTagName();
631 return ((doctype_name.equals("DirectoryMetadata") || doctype_name.equals("GreenstoneDirectoryMetadata")) && (root_name.equals("DirectoryMetadata") || root_name.equals("GreenstoneDirectoryMetadata")));
632 }
633
634 /** Decode a string that was previously made Perl safe.
635 * @param safe The encoded <strong>String</strong> where dangerous characters have been escaped.
636 * @return A <strong>String</strong> with all the escaping removed.
637 */
638 private String decode(String safe) {
639 String dangerous = safe.replaceAll("\\\\.",".");
640 return dangerous;
641 }
642 }
643 /** A simplistic version of metadata, with no live references. */
644 private class BasicMetadata
645 implements Comparable {
646 public boolean accumulates;
647 /** The collection this metadata was extracted from. Important when attempting to map BasicMetadata to its Metadata incarnation. */
648 public File collection;
649 /** The metadata element. */
650 public String element = null;
651 /** The value. */
652 public String value = null;
653 /** Constructor takes initial values for element and value.
654 * @param element The metadata element as a <strong>String</strong>.
655 * @param value The value as a <strong>String</strong>.
656 */
657 public BasicMetadata(String element, String value, boolean accumulates) {
658 this.accumulates = accumulates;
659 this.element = element;
660 this.value = value;
661 }
662
663 public BasicMetadata copy() {
664 return new BasicMetadata(element, value, accumulates);
665 }
666
667 public int compareTo(Object other) {
668 return toString().compareTo(other.toString());
669 }
670 /** Compare two BasicMetadata objects for equality.
671 * @param object The other <strong>Object</strong>.
672 * @return <i>true</i> if this BasicMetadata matches the given object, <i>false</i> otherwise.
673 */
674 public boolean equals(Object object) {
675 BasicMetadata other = (BasicMetadata) object;
676 if(collection != null && other.collection != null) {
677 return (collection.equals(other.collection) && element.equals(other.element) && value.equals(other.value));
678 }
679 return (element.equals(other.element) && value.equals(other.value));
680 }
681 public String toString() {
682 return element + " = " + value;
683 }
684 }
685
686 /** This class provides a cache for the instances of parsed collect.cfg files and their associated data. Assures that the most recently cached CollectCFG will remain available. Older objects are maintained as soft references and are freed at the JVM implementations descretion, but are gareunteed to be garbage collected before an OutOfMemory exception is thrown. */
687 private class CollectCFGCache
688 extends LinkedHashMap {
689 /** Retrieve the CollectCFG object that matches the given collection file path.
690 * @param collect_cfg_file The <strong>File</strong> that references the collection's directory.
691 * @return The <strong>CollectCFG</strong> that belongs to this collection, or <i>null</i> if no such file exists (so we probably aren't in a collection!).
692 */
693 public CollectCFG get(File collect_cfg_file) {
694 ///ystem.err.println("Retrieve the collection configuration file at: " + collect_cfg_file);
695 CollectCFG collect_cfg = null;
696 // Attempt to load from cache.
697 SoftReference reference = (SoftReference) super.get(collect_cfg_file);
698 // If is doesn't exist, either because its never been loaded, or thats its cache reference has gone stale, attempt to load it again.
699 if(reference == null || (collect_cfg = (CollectCFG)reference.get()) == null) {
700 try {
701 collect_cfg = new CollectCFG(collect_cfg_file);
702 put(collect_cfg_file, new SoftReference(collect_cfg));
703 }
704 catch(Exception error) {
705 Gatherer.printStackTrace(error);
706 collect_cfg = null;
707 }
708 }
709 return collect_cfg;
710 }
711
712 protected boolean removeEldestEntry(java.util.Map.Entry entry) {
713 return (size() > MAX_CFG_CACHE_SIZE);
714 }
715 }
716
717 /** The CollectCFG object encapsulates important metadata information extracted from a collect.cfg file, such as required metadata sets, and hfile associations. As the former are merged, their references are removed from this object, whereas the for the later references are replaced a representation of the hfile itself. */
718 private class CollectCFG {
719 /** A list of the metadata sets associated with the collect.cfg file. */
720 private ArrayList metadatasets = null;
721 /** A hash mapping from metadata element name to hierarchy file, or possibly hierarchy object. */
722 private HashMap hfiles = null;
723 /** The token at the start of a classify command line within the collect.cfg. */
724 static final private String CLASSIFY_COMMAND = "classify";
725 /** The token at the start of a metadataset command line within the collect.cfg. */
726 static final private String METADATASET_COMMAND = "metadataset";
727 /** Constructor which takes a file assumed to be the location of a collect.cfg file belonging to a Greenstone Collection.
728 * @param file A <strong>File</strong> referencing a collect.cfg file.
729 */
730 public CollectCFG(File file)
731 throws Exception {
732 ///atherer.println("Loading a new collection configuration file: " + file.getAbsolutePath());
733 File etc_directory = file.getParentFile();
734 hfiles = new HashMap();
735 metadatasets = new ArrayList();
736 FileReader reader = new FileReader(file);
737 BufferedReader in = new BufferedReader(reader);
738 String command = null;
739 while((command = in.readLine()) != null) {
740 CommandTokenizer tokenizer = new CommandTokenizer(command);
741 if(tokenizer.hasMoreTokens()) {
742 String token = tokenizer.nextToken().toLowerCase();
743 if(token.equals(METADATASET_COMMAND)) {
744 String family_name = tokenizer.nextToken();
745 String file_str = tokenizer.nextToken();
746 if(file_str.startsWith("\"") && file_str.endsWith("\"") && !file_str.equals("\"\"")) {
747 file_str = file_str.substring(1, file_str.length() - 1);
748 }
749 // If the file str is -only- the filename then we add <col_dir>/metadata/
750 File mds_file = null;
751 if(file_str.indexOf(File.separator) == -1) {
752 mds_file = new File(file.getParentFile().getParentFile(), File.separator + "metadata" + File.separator + file_str);
753 }
754 else {
755 mds_file = new File(file_str);
756 }
757 ///ystem.err.println("Attempting to file mds file at " + file.getAbsolutePath());
758 if(mds_file.exists()) {
759 metadatasets.add(mds_file);
760 }
761 mds_file = null;
762 file_str = null;
763 family_name = null;
764 }
765 // Also look for any classify commands that include an hfile and element
766 else if(token.equals(CLASSIFY_COMMAND)) {
767 String hfile_name = null;
768 String element_name = null;
769 // Drop the classifier name
770 tokenizer.nextToken();
771 while(tokenizer.hasMoreTokens()) {
772 token = tokenizer.nextToken().toLowerCase();
773 if(token.equals("-hfile")) {
774 hfile_name = tokenizer.nextToken();
775 }
776 else if(token.equals("-metadata")) {
777 element_name = tokenizer.nextToken();
778 }
779 }
780 if(hfile_name != null && element_name != null) {
781 // If hfile_name has no path, append the etc directories one. Either way create a file reference
782 File hfile = null;
783 hfile_name = hfile_name.replace('\\', File.separatorChar);
784 hfile_name = hfile_name.replace('/', File.separatorChar);
785 if(hfile_name.indexOf(File.separator) == -1) {
786 hfile = new File(etc_directory, hfile_name);
787 }
788 else {
789 hfile = new File(hfile_name);
790 }
791 // Add to hfiles
792 ///atherer.println("Adding hfile reference: " + element_name + " -> " + hfile);
793 hfiles.put(element_name, hfile);
794 hfile = null;
795 }
796 element_name = null;
797 hfile_name = null;
798 }
799 tokenizer = null;
800 }
801 }
802 command = null;
803 in.close();
804 reader.close();
805 in = null;
806 reader = null;
807 // Now we search the etc directory for *.txt files which we attempt to parse as hfiles
808 File children[] = etc_directory.listFiles(); // We are sure there is at least one, collect.cfg
809 for(int i = 0; i < children.length; i++) {
810 // If this is a text file, extract the element name and process
811 String name = children[i].getName();
812 if(children[i].isFile() && name.endsWith(".txt")) {
813 String element_name = name.substring(0, name.lastIndexOf("."));
814 if(!hfiles.containsKey(element_name)) {
815 ///atherer.println("Adding hfile reference: " + element_name + " -> " + children[i]);
816 hfiles.put(element_name, children[i]);
817 }
818 element_name = null;
819 }
820 name = null;
821 }
822 children = null;
823 etc_directory = null;
824 file = null;
825 }
826 /** Attempts to retrieve the HFile object associated with a certain metadata element. This may have already been cached, or may need to be loaded. Then again it may not even be necessary.
827 * @param element The fully qualified name of a metadata element, as a <strong>String</strong>.
828 * @return The <strong>HFile</strong> associated with the given element, or <i>null</i> if its unnecessary.
829 * @see org.greenstone.gatherer.cdm.CommandTokenizer
830 */
831 public HFile getHFile(String element) {
832 HFile result = null;
833 Object target = hfiles.get(element);
834 // If target is non-null
835 if(target != null) {
836 // If we haven't already load and parse the file.
837 if(target instanceof File) {
838 ///ystem.err.println("\nHFILE-MISS!! Loading " + target.toString());
839 result = new HFile();
840 try {
841 FileReader in_filereader = new FileReader((File)target);
842 BufferedReader in = new BufferedReader(in_filereader);
843 String line = null;
844 while((line = in.readLine()) != null) {
845 CommandTokenizer tokenizer = new CommandTokenizer(line);
846 String alias = Utility.decodeGreenstone(tokenizer.nextToken());
847 String index = tokenizer.nextToken();
848 String value = Utility.decodeGreenstone(tokenizer.nextToken());
849 ///ystem.err.println("Read " + index + ", " + alias + ", " + value);
850 if(alias.startsWith("\"") && alias.endsWith("\"") && !alias.equals("\"\"")) {
851 alias = alias.substring(1, alias.length() - 1);
852 }
853 if(value.startsWith("\"") && value.endsWith("\"") && !value.equals("\"\"")) {
854 value = value.substring(1, value.length() - 1);
855 }
856 result.add(index, alias, value);
857 value = null;
858 alias = null;
859 index = null;
860 tokenizer = null;
861 }
862 line = null;
863 in.close();
864 in = null;
865 in_filereader = null;
866 hfiles.put(element, result);
867 }
868 catch (Exception error) {
869 error.printStackTrace();
870 hfiles.remove(element);
871 }
872 }
873 else {
874 ///ystem.err.print("HFILE-HIT!!");
875 result = (HFile) target;
876 }
877 }
878 // Else no hfile is needed for this element
879 target = null;
880 return result;
881 }
882 /** Retrieve the list of metadata sets associated with this collection.
883 * @return An <strong>ArrayList</strong> of metadata set Files.
884 */
885 public ArrayList getMetadataSets() {
886 return metadatasets;
887 }
888 }
889
890 /** The HFile object provides a container for the mappings from indexes, of the form 1.1.1, to alias-value pairs. It also provides method to retrieving the alias and value for a certain element, remembering that values must be expressed in terms of their absolute subject heirarchy path. */
891 private class HFile
892 extends HashMap {
893 /** Construct a new HFile object with no initial values. */
894 public HFile() {
895 super();
896 }
897 /** Add a new (index,(alias, value)) mapping.
898 * @param index The index of this mapping as a <strong>String</strong>.
899 * @param alias The alias of this mapping as a <strong>String</strong>.
900 * @param value And finally the value of this mapping as a, you guessed it, <strong>String</strong>.
901 */
902 public void add(String index, String alias, String value) {
903 Entry entry = new Entry(index, alias, value);
904 ///ystem.err.println("Adding entry: " + index + " \"" + alias + "\" \"" + value + "\"");
905 put(index, entry);
906 put(alias, entry);
907 }
908 public String getAlias(String index) {
909 String alias = "";
910 Entry entry = (Entry) get(index);
911 if(entry != null) {
912 alias = entry.alias;
913 }
914 entry = null;
915 return alias;
916 }
917 /** Retrieve the value associated with a certain index. This is harder than it first sounds as you must take into account the parent indexes of this one.
918 * @param index The index whose value you wish to calculate, as a <strong>String</strong>.
919 * @return The fully quantified path to the value that matches index, also as a <strong>String</strong>. Delimitiation between subject layers is denoted by the string "|"
920 */
921 public String getValue(String index) {
922 ///ystem.err.println("Retrieve value for the alias/index: '" + index + "'");
923 StringBuffer value = new StringBuffer("");
924 // If index isn't the index, it must be the alias. Replace it with the index dammit.
925 Entry entry = null;
926 if(!Utility.isIndex(index)) {
927 ///ystem.err.println("\tThis is an alias.");
928 // Store this for later, as its exactly the same entry we'd get had we found the last component of a proper index.
929 entry = (Entry) get(index);
930 index = entry.index;
931 ///ystem.err.println("\tIndex is actually: " + index);
932 }
933 // Now build the hierarchy if necessary.
934 int dot_index = -1;
935 if((dot_index = index.indexOf(".")) != -1) {
936 ///ystem.err.println("\tHierarchy information required -->");
937 value.append(getValue(index.substring(0, dot_index)));
938 value.append(StaticStrings.PIPE_STR);
939 ///ystem.err.println("\t<-- Hierarchy information complete");
940 }
941 if(entry == null) {
942 entry = (Entry) get(index);
943 }
944 if(entry != null) {
945 value.append(entry.value);
946 }
947 entry = null;
948 ///ystem.err.println("\tFinal value is: '" + value.toString() + "'\n");
949 return value.toString();
950 }
951
952 private class Entry {
953 public String alias = null;
954 public String index = null;
955 public String value = null;
956 public Entry(String index, String alias, String value) {
957 this.alias = alias;
958 this.index = index;
959 this.value = value;
960 }
961 }
962 }
963
964 private class MetadataXMLFileSearch {
965 public File file;
966 public String filename;
967 public MetadataXMLFileSearch(File file, String filename) {
968 this.file = file;
969 this.filename = filename;
970 }
971 }
972}
Note: See TracBrowser for help on using the repository browser.