source: trunk/gli/src/org/greenstone/gatherer/msm/parsers/GreenstoneMetadataParser.java@ 5153

Last change on this file since 5153 was 5153, checked in by jmt12, 21 years ago

Fix 203B143

  • Property svn:keywords set to Author Date Id Revision
File size: 44.3 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.msm.parsers;
28/**************************************************************************************
29 * Written: ??/??/02
30 * Revised: ??/??/02 - Commented
31 * 25/07/03 - Fix to allow any valid greenstone metadata.xml to be imported from, not just those that occur within a collection. This functionality is accidental and caused by GLI attempting to find a collect.cfg to extract hierarchy file information from, and failing.
32 **************************************************************************************/
33import java.io.*;
34import java.lang.ref.*;
35import java.net.*;
36import java.util.*;
37import java.util.regex.*;
38import javax.swing.*;
39import javax.swing.tree.*;
40import org.greenstone.gatherer.Gatherer;
41import org.greenstone.gatherer.cdm.CommandTokenizer;
42import org.greenstone.gatherer.file.FileNode;
43import org.greenstone.gatherer.gui.WarningDialog;
44import org.greenstone.gatherer.msm.ElementWrapper;
45import org.greenstone.gatherer.msm.Metadata;
46import org.greenstone.gatherer.msm.MetadataParser;
47import org.greenstone.gatherer.msm.MSMUtils;
48import org.greenstone.gatherer.util.DecodeHTMLReader;
49import org.greenstone.gatherer.util.StaticStrings;
50import org.greenstone.gatherer.util.Utility;
51import org.greenstone.gatherer.valuetree.GValueModel;
52import org.greenstone.gatherer.valuetree.GValueNode;
53import org.w3c.dom.*;
54/** Provides a metadata parser implementation that knows how to locate, prepare for, then import metadata from a previous Greenstone collection. Is aware of such factors as the presence of Metadata Set files and hierarchy files. Updates the profiler where possible to allow for faster subsequent imports from a certain collection. Caches all the information about encountered collections in CollectCFG objects which are softly cached (ie are cached, but are reclaimed before an OutOfMemory exception would be thrown).
55 * @author John Thompson, Greenstone Digital Library, University of Waikato
56 * @version 2.3
57 */
58public class GreenstoneMetadataParser
59 extends LinkedHashMap
60 implements MetadataParser {
61
62 static final private int MAX_CFG_CACHE_SIZE = 10;
63 static final private int MAX_GDM_CACHE_SIZE = 10;
64 /** The default name and location for a collection configuration file (presuming that a collection file prefix will be added). */
65 static final private String CONFIG_FILENAME = "etc" + File.separator + "collect.cfg";
66 /** The pattern to match when searching for directory level assignments. */
67 static final private String DIRECTORY_FILENAME = ".*";
68 static final private String DIRECTORY_FILENAME_SUFFIX = "/.*";
69 static final private String DESCRIPTION_ELEMENT = "Description";
70 static final private String FILENAME_ELEMENT = "FileName";
71 static final private String FILESET_ELEMENT = "FileSet";
72 /** The name of a gdm file. */
73 static final private String GIMPORT = "gimport";
74 static final private String IMPORT = "import";
75 static final private String METADATA_ELEMENT = "Metadata";
76 static final private String METADATA_XML_FILENAME = "metadata.xml";
77 static final private String MODE_ATTRIBUTE = "mode";
78 static final private String NAME_ATTRIBUTE = "name";
79 static final private String SEPARATOR = "/";
80
81 /** A list of the collect.cfg paths that we should ignore. */
82 private ArrayList ignore_list = new ArrayList();
83 /** Has this process been cancelled. */
84 private boolean dialog_cancelled = false;
85 /** A cache of previously parsed collection configuration files. */
86 private CollectCFGCache cfg_cache = new CollectCFGCache();
87 /** A mapping from BasicMetadata to their fully enabled Metadata incarnation. */
88 private HashMap transform = new HashMap();
89
/** Creates a parser with empty caches. The public no-argument form is required because the class is created through dynamic class loading. */
public GreenstoneMetadataParser() {
    super();
}
93 /** Locate and import any metadata parsed by this metadata parser given the file involved and its previous incarnation. */
94 public boolean process(FileNode destination, FileNode origin, boolean folder_level, boolean dummy_run) {
95 ///atherer.println("GreenstoneMetadataParser: Process " + origin + ": ");
96 int counter = 0;
97 dialog_cancelled = false;
98
99 // 1. Determine what collection the file is in, and load/parse the appropriate collect.cfg. Cache collect.cfg object.
100 ///ystem.err.print("1 ");
101 // Start at the origin node file. If its a file get its parent directory.
102 File collection_dir = origin.getFile();
103 if(collection_dir.isFile()) {
104 collection_dir = collection_dir.getParentFile();
105 }
106 // We're currently in the importing directory so we'll go one more step up.
107 collection_dir = collection_dir.getParentFile();
108 // We are looking for a directory which contains a etc/collect.cfg file and either an import or a gimport directory.
109 boolean found = false;
110 while(!found && collection_dir != null) {
111 File possible_cfg_file = new File(collection_dir, CONFIG_FILENAME);
112 File possible_gimport_directory = new File(collection_dir, GIMPORT);
113 File possible_import_directory = new File(collection_dir, IMPORT);
114 if(possible_cfg_file.exists() && (possible_gimport_directory.exists() || possible_import_directory.exists())) {
115 found = true;
116 ///ystem.err.println("Found greenstone collection at " + collection_dir.getAbsolutePath());
117 }
118 else {
119 collection_dir = collection_dir.getParentFile();
120 }
121 }
122
123 // Now retrieve the configuration file if there is one.
124 CollectCFG collect_cfg = null;
125 if(collection_dir != null) {
126 File collect_cfg_file = new File(collection_dir, CONFIG_FILENAME);
127 if(collect_cfg_file.exists()) {
128 collect_cfg = cfg_cache.get(collect_cfg_file);
129 }
130 }
131
132 // 2. If a collection configuration file was found, attempt to merge in any mdses and make note of those that are successfully imported (by removing reference from collect.cfg).
133 ///ystem.err.print("2 ");
134 if(collect_cfg != null) {
135 ArrayList mdses = collect_cfg.getMetadataSets();
136 for(int i = 0; i < mdses.size(); i++) {
137 File mds_file = (File) mdses.get(i);
138 Gatherer.c_man.getCollection().msm.importMDS(mds_file, false);
139 }
140 mdses.clear();
141 mdses = null;
142 }
143
144 // 3. Locate all of the metadata.xml files that may have an affect on the origin file. Make sure the metadata.xml closest to the origin files directory is last (to ensure property inheritance regarding accumulate/overwrite).
145 ///ystem.err.print("3 ");
146 ArrayList search_files = new ArrayList();
147 File file = origin.getFile();
148 String filename = null;
149 boolean file_level;
150 if(file.isFile()) {
151 file_level = false;
152 filename = file.getName();
153 file = file.getParentFile();
154 }
155 else {
156 file_level = true;
157 }
158 while(file != null && (collection_dir == null || !file.equals(collection_dir))) {
159 File test_file = new File(file, Utility.METADATA_XML);
160 if(test_file.exists()) {
161 search_files.add(0, new MetadataXMLFileSearch(test_file, filename));
162 }
163 if(filename != null) {
164 filename = file.getName() + SEPARATOR + filename;
165 }
166 else {
167 filename = file.getName();
168 }
169 file = file.getParentFile();
170 }
171 filename = null;
172 file = null;
173 // Start with an initially empty ArrayList of metadata
174 ArrayList metadata = new ArrayList();
175 // Now search each of these metadata xml for metadata, remembering to accumulate or overwrite as we go along.
176 for(int i = 0; i < search_files.size(); i++) {
177 MetadataXMLFileSearch a_search = (MetadataXMLFileSearch) search_files.get(i);
178 ///ystem.err.println("Search " + a_search.file.getAbsolutePath() + " for " + (a_search.filename != null ? a_search.filename : ".*"));
179 // Retrieve the document
180 BasicGDMDocument document = getDocument(a_search.file);
181 if(document != null) {
182 // If this is a dummy run, our original source file is actually the metadata.xml file and we retrieve all metadata for this collection, as if accumulated!
183 if(dummy_run) {
184 metadata = document.getAllMetadata();
185 }
186 else {
187 metadata = document.getMetadata(a_search.filename, metadata, folder_level);
188 }
189 document = null;
190 }
191 a_search = null;
192 }
193 search_files = null;
194 // Finally assign the metadata
195 ///ystem.err.println("Found " + metadata.size() + " pieces of metadata for " + destination);
196 if(metadata.size() > 0) {
197 addMetadata(origin, destination, metadata, collection_dir, collect_cfg, dummy_run);
198 }
199 return dialog_cancelled;
200 }
201
202 protected boolean removeEldestEntry(java.util.Map.Entry entry) {
203 return (size() > MAX_GDM_CACHE_SIZE);
204 }
205
206 private void addMetadata(FileNode origin, FileNode destination, ArrayList metadata, File collection_dir, CollectCFG collect_cfg, boolean dummy_run) {
207 // before we try to addMetadata, we need to check that there are some metadata sets for the collection - otherwise we cant add or import
208 Vector meta_sets = Gatherer.c_man.getCollection().msm.getSets(false);
209 if (meta_sets.size()==0) {
210 ///ystem.out.println("GreenstoneMetadataParser:Error: we have been asked to add metadata but there are no existing sets");
211 // print the warning dialog
212 WarningDialog dialog = new WarningDialog("warning.MissingMDS", true);
213 if (dialog.display() == JOptionPane.CANCEL_OPTION) {
214 // the user has cancelled
215 dialog_cancelled = true;
216 }
217 return;
218 }
219 ///ystem.err.print("6 ");
220 // Used in a complicated test later on.
221 for(int i = 0; !dialog_cancelled && i < metadata.size(); i++) {
222 BasicMetadata basic_metadata = ((BasicMetadata) metadata.get(i)).copy();
223 BasicMetadata metadatum = (BasicMetadata) metadata.get(i);
224 metadatum.collection = collection_dir; // May be null. Doesn't matter.
225 Metadata final_metadata = null;
226 // If this BasicMetadata already exists in the transform cache then we can save ourselves a lot of work.
227 SoftReference reference = (SoftReference) transform.get(basic_metadata);
228 if(reference != null) {
229 final_metadata = (Metadata) reference.get();
230 }
231 if(final_metadata == null) {
232 ///ystem.err.println("No existing Metadata object for BasicMetadata: " + basic_metadata);
233 // 6a. Check if an hfile is associated with this metadata, and if so load it, cache it in the collection.cfg object, then resolve metadata value index. Of course we can only do this if a collection configuration file was found in the first place.
234 if(collect_cfg != null) {
235 HFile h_file = collect_cfg.getHFile(metadatum.element);
236 if(h_file != null && !dummy_run) {
237 ///ystem.err.print(metadata.value + " maps to ");
238 metadatum.value = h_file.getValue(metadatum.value);
239 ///ystem.err.println(metadatum.value);
240 }
241 h_file = null;
242 }
243 // 6b. Check if there is a profile regarding the current metadata. The profile may be stored for the collection directory, or if no such directory is available, then try the ancestor folders of the origin file.
244 ///ystem.err.println("Retrieve existing action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element);
245 if(collection_dir != null) {
246 // Note that the first test is whether a profile action exist, while the 'getAction' can return null as the profile action.
247 if(Gatherer.c_man.getCollection().msm.profiler.containsAction(collection_dir.getAbsolutePath(), metadatum.element)) {
248 String new_element_name = Gatherer.c_man.getCollection().msm.profiler.getAction(collection_dir.getAbsolutePath(), metadatum.element);
249 ///ystem.err.println("Profile result = " + new_element_name);
250 if(new_element_name == null) {
251 metadatum = null;
252 }
253 else {
254 metadatum.element = new_element_name;
255 }
256 new_element_name = null;
257 }
258 }
259 else {
260 boolean found = false;
261 File current_folder = origin.getFile().getParentFile();
262 while(!found && metadatum != null && current_folder != null) {
263 if(Gatherer.c_man.getCollection().msm.profiler.containsAction(current_folder.getAbsolutePath(), metadatum.element)) {
264 found = true;
265 String new_element_name = Gatherer.c_man.getCollection().msm.profiler.getAction(current_folder.getAbsolutePath(), metadatum.element);
266 ///ystem.err.println("Profile result = " + new_element_name);
267 if(new_element_name == null) {
268 metadatum = null;
269 }
270 else {
271 metadatum.element = new_element_name;
272 }
273 new_element_name = null;
274 }
275 else {
276 current_folder = current_folder.getParentFile();
277 }
278 }
279 current_folder = null;
280 }
281 ///atherer.println("Assigning metadatum.");
282 if(metadatum != null) {
283 // 6c. Try to add metadata. If there is no matching metadata element:
284 ElementWrapper element = Gatherer.c_man.getCollection().msm.getElement(metadatum.element, true);
285 // Arg. The element returned may come from the Greenstone dls, which of course should never be involved during importing. To solve check the namespace isn't "" and if it is nullify the element. Nullify. NULLIFY, Bwuhahahaha...
286 if(element != null && element.getNamespace().equals("")) {
287 element = null;
288 }
289 // 6ci. If no match exists, prompt the user to add/merge with specific metadata element. The user can also choose to ignore this metadata.
290 if(element == null) {
291 element = selectElement(metadatum.element);
292 if(!dialog_cancelled) {
293 // 6ciii. If either of the above work, remember to add to profile.
294 if(element == null) {
295 ///ystem.err.println("Adding profile action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element + ", null");
296 if(collection_dir != null) {
297 Gatherer.c_man.getCollection().msm.profiler.addAction(collection_dir.getAbsolutePath(), metadatum.element, null);
298 }
299 else {
300 Gatherer.c_man.getCollection().msm.profiler.addAction(origin.getFile().getParentFile().getAbsolutePath(), metadatum.element, null);
301 }
302 }
303 else {
304 ///ystem.err.println("Adding profile action: " + collection_dir.getAbsolutePath() + ", " + metadatum.element + ", " + element.getName());
305 if(collection_dir != null) {
306 Gatherer.c_man.getCollection().msm.profiler.addAction(collection_dir.getAbsolutePath(), metadatum.element, element.getName());
307}
308 else {
309 Gatherer.c_man.getCollection().msm.profiler.addAction(origin.getFile().getParentFile().getAbsolutePath(), metadatum.element, element.getName());
310 }
311 }
312 }
313 }
314 // - Add metadata
315 if(!dummy_run && element != null && !dialog_cancelled) {
316 ///ystem.err.println("Retrieve the value tree for " + element.toString());
317 GValueModel model = Gatherer.c_man.getCollection().msm.getValueTree(element);
318 if(model != null) {
319 // One little 'fix' for importing from the demo or dls files. The Title metadata found in the metadata.xml isn't used in preference for the automatically extracted Titles. However we want to use them, so we should remove '.*(<filename>)$' for a certain file <filename>.
320 String raw_value = metadatum.value.trim();
321 String filename_munged = destination.getFile().getName();
322 int index = -1;
323 if((index = filename_munged.indexOf(".")) != -1) {
324 filename_munged = filename_munged.substring(0, index);
325 }
326 filename_munged = "(" + filename_munged + ")";
327 if(raw_value.endsWith(filename_munged)) {
328 raw_value = (raw_value.substring(0, raw_value.length() - filename_munged.length())).trim();
329 }
330 GValueNode node = model.addValue(raw_value);
331 final_metadata = new Metadata(element, node);
332 ///ystem.err.println("Adding final metadata: " + metadatum.toString());
333 node = null;
334 }
335 model = null;
336 }
337 element = null;
338 }
339 // If we have successfully created a Metadata from the BasicMetadata, store it
340 if(final_metadata != null && !dialog_cancelled) {
341 transform.put(basic_metadata, new SoftReference(final_metadata));
342 ///ystem.err.println("Add a Metadata object for BasicMetadata: " + basic_metadata);
343 }
344 }
345 else {
346 ///ystem.err.println("Found a Metadata object for BasicMetadata: " + basic_metadata);
347 }
348 if(!dummy_run && final_metadata != null && !dialog_cancelled) {
349 final_metadata.setAccumulate(metadatum.accumulates);
350 // Now we can finally add the metadata.
351 ///ystem.err.println("Adding Metadata: " + final_metadata);
352 Gatherer.c_man.getCollection().msm.fireMetadataChanged(0, destination, null, final_metadata);
353 }
354 // Otherwise there is no way to add this metadata. No value model no metadata value.
355 final_metadata = null;
356 metadatum = null;
357 }
358 }
359
360 /** Determine the different suffix between two string.
361 * @param base_str The base <strong>String</strong>, expected to be the short of the two strings provided.
362 * @param target_str The target <strong>String</strong>, whose differing suffix is returned.
363 * @return A <strong>String</strong> containing the suffix from target which is different from base.
364 */
365 private String diff(String base_str, String target_str) {
366 StringTokenizer base_tokenizer = new StringTokenizer(base_str, File.separator);
367 StringTokenizer target_tokenizer = new StringTokenizer(target_str, File.separator);
368 String base = null;
369 String target = null;
370 while(base_tokenizer.hasMoreTokens() && (base = base_tokenizer.nextToken()).equals((target = target_tokenizer.nextToken()))) {
371 }
372 StringBuffer result = new StringBuffer(target);
373 while(target_tokenizer.hasMoreTokens()) {
374 result.append(File.separator);
375 result.append(target_tokenizer.nextToken());
376 }
377 return result.toString();
378 }
379
380 /** Retrieve the BasicGDMDocument found at the given file, or null if there is no such file or if it isn't a valid BasicGDMDocument. */
381 private BasicGDMDocument getDocument(File file) {
382 ///ystem.err.println("Get Document at: " + file.getAbsolutePath());
383 BasicGDMDocument document = null;
384 if(!ignore_list.contains(file) && file.exists()) {
385 // Check cache
386 SoftReference reference = (SoftReference) get(file);
387 if(reference != null) {
388 ///ystem.err.println("Hit!!");
389 document = (BasicGDMDocument) reference.get();
390 reference = null;
391 }
392 // If that didn't work try to parse in the document
393 if(document == null) {
394 ///ystem.err.println("Miss or stale reference.");
395 document = new BasicGDMDocument(file);
396 if(document.isValid()) {
397 put(file, new SoftReference(document));
398 }
399 else {
400 ///ystem.err.println(file.getAbsolutePath() + " is not a valid GDM XML file.");
401 ignore_list.add(file);
402 document = null;
403 }
404 }
405 }
406 else {
407 ///ystem.err.println("Ignoring file or file doesn't exists.");
408 }
409 return document;
410 }
411
412
413 /** Display a prompt allowing a user to select a metadata element to attempt to force add/merge or ignore a metadata element to. For instance an old version of a metadata.xml from the DLS collection might have an assigned metadata value "Publisher=EC Courier", however Publisher won't automatically match to any metadata set. This prompt will be displayed, and some effort will be made to systematically locate the appropriate set. In this case this should be the DLS metadata set as dls.Publisher should be the closest match. Regardless the element selected is returned.
414 * @param element_name The name of the element we are trying to add, as a <strong>String</strong>.
415 * @return The <strong>ElementWrapper</strong> choosen by the user, or <i>null</i> to skip this metadata element.
416 */
417 private ElementWrapper selectElement(String element_name) {
418 ElementWrapper result = Gatherer.c_man.getCollection().msm.prompt.selectElement(element_name);
419 dialog_cancelled = Gatherer.c_man.getCollection().msm.prompt.wasDialogCancelled();
420 return result;
421 }
422
423 /** A 'basic' version of the more complete GDMDocument used elsewhere, this object provides the same functionality except that it doesn't use Metadata objects. These objects require live references to elements within the MetadataSetManager and GValueModels, but these may not yet exist (and indeed may never exist) for metadata parsed from metadata.xml's outside of our current collection. Thus this class returns a String (or an ArrayList of Strings) when asked for the metadata associated with a certain file. Also notice that this class provides no constructor method for creating a blank document, nor does it ever need a reference to the Gatherer.*/
424 private class BasicGDMDocument
425 extends HashMap {
426 /** The document this class sources its data from. */
427 private Document base_document;
428 /** This constructor takes the original document and parsed out and stores metadata with its association to filenames. */
429 public BasicGDMDocument(File file) {
430 ///ystem.err.println("New BasicGDMDocument: " + file.getAbsolutePath());
431 base_document = Utility.parse(file.getAbsolutePath(), false);
432 }
433 /** Retrieve all of the metadata in this file. */
434 public ArrayList getAllMetadata() {
435 ArrayList metadatum = new ArrayList();
436 // Don't search the cache as this would never have been added.
437 try {
438 // Retrieve the document element.
439 Element directorymetadata_element = base_document.getDocumentElement();
440 // Iterate through the filesets, checking the FileName child element against the target file's name using regular expression matching.
441 NodeList fileset_elements = directorymetadata_element.getElementsByTagName(FILESET_ELEMENT);
442 for(int i = 0; i < fileset_elements.getLength(); i++) {
443 Element fileset_element = (Element) fileset_elements.item(i);
444 NodeList filename_elements = fileset_element.getElementsByTagName(FILENAME_ELEMENT);
445 for(int j = 0; j < filename_elements.getLength(); j++) {
446 Element filename_element = (Element) filename_elements.item(j);
447 // If they match add all of the metadata found in the Description child element, overwriting any metadata with the same element
448 NodeList description_elements = fileset_element.getElementsByTagName(DESCRIPTION_ELEMENT);
449 for(int k = 0; k < description_elements.getLength(); k++) {
450 Element description_element = (Element) description_elements.item(k);
451 NodeList metadata_elements = description_element.getElementsByTagName(METADATA_ELEMENT);
452 for(int l = 0; l < metadata_elements.getLength(); l++) {
453 Element metadata_element = (Element) metadata_elements.item(l);
454 String element = metadata_element.getAttribute(NAME_ATTRIBUTE);
455 BasicMetadata metadata = new BasicMetadata(element, Utility.METADATA_XML, true);
456 // Remove any previous values for this metadata element.
457 for(int m = metadatum.size() - 1; m >= 0; m--) {
458 BasicMetadata old_metadata = (BasicMetadata) metadatum.get(m);
459 if(old_metadata.element.equals(element)) {
460 metadatum.remove(m);
461 }
462 old_metadata = null;
463 }
464 // Add the completed metadata and clean up
465 metadatum.add(metadata);
466 metadata = null;
467 element = null;
468 metadata_element = null;
469 }
470 metadata_elements = null;
471 description_element = null;
472 }
473 description_elements = null;
474 filename_element = null;
475 }
476 filename_elements = null;
477 fileset_element = null;
478 }
479 fileset_elements = null;
480 directorymetadata_element = null;
481 }
482 catch (Exception error) {
483 Gatherer.self.printStackTrace(error);
484 }
485 return metadatum;
486 }
487
488 /** Retrieve any metadata associated with a certain file. If filename is null we are attempting to find directory level metadata. */
489 public ArrayList getMetadata(String filename, ArrayList metadatum_so_far, boolean folder_level) {
490 ///ystem.err.println("Retrieving metadata for: " + filename + " [" + folder_level + "]");
491 ArrayList metadatum = null;
492 // We start by attempting to retrieve this metadata from the cache.
493 if(filename != null) {
494 metadatum = (ArrayList) get(filename);
495 }
496 else {
497 metadatum = (ArrayList) get(DIRECTORY_FILENAME);
498 }
499 // If that failed we consult the document for metadata.
500 if(metadatum == null) {
501 metadatum = new ArrayList();
502 if(metadatum_so_far == null) {
503 metadatum = new ArrayList();
504 }
505 else {
506 metadatum = metadatum_so_far;
507 }
508 try {
509 // Retrieve the document element.
510 Element directorymetadata_element = base_document.getDocumentElement();
511 // Iterate through the filesets, checking the FileName child element against the target file's name using regular expression matching.
512 NodeList fileset_elements = directorymetadata_element.getElementsByTagName(FILESET_ELEMENT);
513 for(int i = 0; i < fileset_elements.getLength(); i++) {
514 Element fileset_element = (Element) fileset_elements.item(i);
515 NodeList filename_elements = fileset_element.getElementsByTagName(FILENAME_ELEMENT);
516 for(int j = 0; j < filename_elements.getLength(); j++) {
517 Element filename_element = (Element) filename_elements.item(j);
518 String filename_text = MSMUtils.getValue(filename_element);
519 if(isMatchingFileSet(filename, filename_text, folder_level)) {
520 ///ystem.err.println("Match: " + (filename != null ? filename : ".*") + " => " + filename_text);
521 // If they match add all of the metadata found in the Description child element, remembering to abide by desired mode (accumulate vs. overwrite).
522 NodeList description_elements = fileset_element.getElementsByTagName(DESCRIPTION_ELEMENT);
523 for(int k = 0; k < description_elements.getLength(); k++) {
524 Element description_element = (Element) description_elements.item(k);
525 NodeList metadata_elements = description_element.getElementsByTagName(METADATA_ELEMENT);
526 for(int l = 0; l < metadata_elements.getLength(); l++) {
527 Element metadata_element = (Element) metadata_elements.item(l);
528 String element = metadata_element.getAttribute(NAME_ATTRIBUTE);
529 ///ystem.err.println("Found element: " + element);
530 //String language = metadata_element.getAttribute("language");
531 String mode = metadata_element.getAttribute(MODE_ATTRIBUTE);
532 // Add the new metadata to our list of metadata for this target file.
533 String value = Utility.stripNL(MSMUtils.getValue(metadata_element));
534 ///ystem.err.println("Found value: " + element);
535 BasicMetadata metadata = new BasicMetadata(element, value, mode.equals("accumulate"));
536 // If mode is overwrite, then remove any previous values for this metadata element.
537 if(!metadata.accumulates) {
538 for(int m = metadatum.size() - 1; m >= 0; m--) {
539 BasicMetadata old_metadata = (BasicMetadata) metadatum.get(m);
540 if(old_metadata.element.equals(element)) {
541 metadatum.remove(m);
542 }
543 old_metadata = null;
544 }
545 }
546 mode = null;
547
548 // Add the completed metadata and clean up
549 metadatum.add(metadata);
550 metadata = null;
551 value = null;
552 element = null;
553 metadata_element = null;
554 }
555 metadata_elements = null;
556 description_element = null;
557 }
558 description_elements = null;
559 }
560 else {
561 ///ystem.err.println("No Match!");
562 }
563 filename_text = null;
564 filename_element = null;
565 }
566 filename_elements = null;
567 fileset_element = null;
568 }
569 fileset_elements = null;
570 directorymetadata_element = null;
571 }
572 catch (Exception error) {
573 Gatherer.self.printStackTrace(error);
574 }
575 // Cache the result, given that these external metadata.xmls are taken to be static at the time of reading (if you happen to be sourcing information from a opened collection that someone is working on, too bad.
576 if(filename != null) {
577 put(filename, metadatum);
578 }
579 else {
580 put(DIRECTORY_FILENAME, metadatum);
581 }
582 }
583 return metadatum;
584 }
585
586 private boolean isMatchingFileSet(String filename, String filename_text, boolean folder_level) {
587 // Crappy. There are apparently two ways of assigning, say, directory level metadata to anything in the ac01ne directory from a parent directories metadata.xml.
588 // The developers guide way: ac01ne/.*
589 // The dls way: ac01ne
590 // So the three tests are:
591 // 1. Check for an exact match i.e "ac01ne/ac01ne.htm" matches "ac01ne/".*
592 // 2. Check for a parent folder match, in the absence of further pattern i.e "ac01ne/ac01ne.htm" matches "ac01ne"
593 // 3. Check for a folder level match if thats what we are looking for i.e "null" matches ".*"
594
595 ///ystem.err.println("Check for: " + (filename != null ? filename : ".*"));
596 ///ystem.err.println("Folder level = " + folder_level);
597 ///ystem.err.println("filename != null && '" + filename + "'.matches('" + filename_text + "') = " + (filename != null ? filename.matches(filename_text) && !filename_text.equals(DIRECTORY_FILENAME) : false));
598 ///ystem.err.println("filename != null && '" + filename + "'.matches('" + filename_text + DIRECTORY_FILENAME_SUFFIX + "') [folder level = " + folder_level + "] = " + (filename != null ? filename.matches(filename_text + DIRECTORY_FILENAME_SUFFIX) && folder_level: false));
599 ///ystem.err.println("filename == null && '" + filename_text + "'.equals('.*') = " + (filename == null ? filename_text.equals(DIRECTORY_FILENAME) : false));
600 if (filename != null) {
601 if(folder_level) {
602 return filename.matches(filename_text) || filename.matches(filename_text + DIRECTORY_FILENAME_SUFFIX);
603 }
604 else {
605 return filename.matches(filename_text) && !filename_text.equals(DIRECTORY_FILENAME);
606 }
607 }
608 else {
609 return filename_text.equals(DIRECTORY_FILENAME);
610 }
611 }
612
613 /** Determine is this is a valid Greenstone Directory Metadata file. It may of course just be some xml file with the name metadata.xml. */
614 public boolean isValid() {
615 // Just determine if the doctype is GreenstoneDirectoryMetadata and root node is called DirectoryMetadata.
616 String doctype_name = base_document.getDoctype().getName();
617 String root_name = base_document.getDocumentElement().getTagName();
618 return ((doctype_name.equals("DirectoryMetadata") || doctype_name.equals("GreenstoneDirectoryMetadata")) && (root_name.equals("DirectoryMetadata") || root_name.equals("GreenstoneDirectoryMetadata")));
619 }
620
621 /** Decode a string that was previously made Perl safe.
622 * @param safe The encoded <strong>String</strong> where dangerous characters have been escaped.
623 * @return A <strong>String</strong> with all the escaping removed.
624 */
625 private String decode(String safe) {
626 String dangerous = safe.replaceAll("\\\\.",".");
627 return dangerous;
628 }
629 }
630 /** A simplistic version of metadata, with no live references. */
631 private class BasicMetadata
632 implements Comparable {
633 public boolean accumulates;
634 /** The collection this metadata was extracted from. Important when attempting to map BasicMetadata to its Metadata incarnation. */
635 public File collection;
636 /** The metadata element. */
637 public String element = null;
638 /** The value. */
639 public String value = null;
640 /** Constructor takes initial values for element and value.
641 * @param element The metadata element as a <strong>String</strong>.
642 * @param value The value as a <strong>String</strong>.
643 */
644 public BasicMetadata(String element, String value, boolean accumulates) {
645 this.accumulates = accumulates;
646 this.element = element;
647 this.value = value;
648 }
649
650 public BasicMetadata copy() {
651 return new BasicMetadata(element, value, accumulates);
652 }
653
654 public int compareTo(Object other) {
655 return toString().compareTo(other.toString());
656 }
657 /** Compare two BasicMetadata objects for equality.
658 * @param object The other <strong>Object</strong>.
659 * @return <i>true</i> if this BasicMetadata matches the given object, <i>false</i> otherwise.
660 */
661 public boolean equals(Object object) {
662 BasicMetadata other = (BasicMetadata) object;
663 if(collection != null && other.collection != null) {
664 return (collection.equals(other.collection) && element.equals(other.element) && value.equals(other.value));
665 }
666 return (element.equals(other.element) && value.equals(other.value));
667 }
668 public String toString() {
669 return element + " = " + value;
670 }
671 }
672
673 /** This class provides a cache for the instances of parsed collect.cfg files and their associated data. Assures that the most recently cached CollectCFG will remain available. Older objects are maintained as soft references and are freed at the JVM implementations descretion, but are gareunteed to be garbage collected before an OutOfMemory exception is thrown. */
674 private class CollectCFGCache
675 extends LinkedHashMap {
676 /** Retrieve the CollectCFG object that matches the given collection file path.
677 * @param collection_file The <strong>File</strong> that references the collection's directory.
678 * @return The <strong>CollectCFG</strong> that belongs to this collection, or <i>null</i> if no such file exists (so we probably aren't in a collection!).
679 */
680 public CollectCFG get(File collect_cfg_file) {
681 ///ystem.err.println("Retrieve the collection configuration file at: " + collect_cfg_file);
682 CollectCFG collect_cfg = null;
683 // Attempt to load from cache.
684 SoftReference reference = (SoftReference) super.get(collect_cfg_file);
685 // If is doesn't exist, either because its never been loaded, or thats its cache reference has gone stale, attempt to load it again.
686 if(reference == null || (collect_cfg = (CollectCFG)reference.get()) == null) {
687 try {
688 collect_cfg = new CollectCFG(collect_cfg_file);
689 put(collect_cfg_file, new SoftReference(collect_cfg));
690 }
691 catch(Exception error) {
692 Gatherer.printStackTrace(error);
693 collect_cfg = null;
694 }
695 }
696 return collect_cfg;
697 }
698
699 protected boolean removeEldestEntry(java.util.Map.Entry entry) {
700 return (size() > MAX_CFG_CACHE_SIZE);
701 }
702 }
703
704 /** The CollectCFG object encapsulates important metadata information extracted from a collect.cfg file, such as required metadata sets, and hfile associations. As the former are merged, their references are removed from this object, whereas the for the later references are replaced a representation of the hfile itself. */
705 private class CollectCFG {
706 /** A list of the metadata sets associated with the collect.cfg file. */
707 private ArrayList metadatasets = null;
708 /** A hash mapping from metadata element name to hierarchy file, or possibly hierarchy object. */
709 private HashMap hfiles = null;
710 /** The token at the start of a classify command line within the collect.cfg. */
711 static final private String CLASSIFY_COMMAND = "classify";
712 /** The token at the start of a metadataset command line within the collect.cfg. */
713 static final private String METADATASET_COMMAND = "metadataset";
714 /** Constructor which takes a file assumed to be the location of a collect.cfg file belonging to a Greenstone Collection.
715 * @param file A <strong>File</strong> referencing a collect.cfg file.
716 */
717 public CollectCFG(File file)
718 throws Exception {
719 ///atherer.println("Loading a new collection configuration file: " + file.getAbsolutePath());
720 File etc_directory = file.getParentFile();
721 hfiles = new HashMap();
722 metadatasets = new ArrayList();
723 FileReader reader = new FileReader(file);
724 BufferedReader in = new BufferedReader(reader);
725 String command = null;
726 while((command = in.readLine()) != null) {
727 CommandTokenizer tokenizer = new CommandTokenizer(command);
728 if(tokenizer.hasMoreTokens()) {
729 String token = tokenizer.nextToken().toLowerCase();
730 if(token.equals(METADATASET_COMMAND)) {
731 String family_name = tokenizer.nextToken();
732 String file_str = tokenizer.nextToken();
733 if(file_str.startsWith("\"") && file_str.endsWith("\"") && !file_str.equals("\"\"")) {
734 file_str = file_str.substring(1, file_str.length() - 1);
735 }
736 // If the file str is -only- the filename then we add <col_dir>/metadata/
737 File mds_file = null;
738 if(file_str.indexOf(File.separator) == -1) {
739 mds_file = new File(file.getParentFile().getParentFile(), File.separator + "metadata" + File.separator + file_str);
740 }
741 else {
742 mds_file = new File(file_str);
743 }
744 ///ystem.err.println("Attempting to file mds file at " + file.getAbsolutePath());
745 if(mds_file.exists()) {
746 metadatasets.add(mds_file);
747 }
748 mds_file = null;
749 file_str = null;
750 family_name = null;
751 }
752 // Also look for any classify commands that include an hfile and element
753 else if(token.equals(CLASSIFY_COMMAND)) {
754 String hfile_name = null;
755 String element_name = null;
756 // Drop the classifier name
757 tokenizer.nextToken();
758 while(tokenizer.hasMoreTokens()) {
759 token = tokenizer.nextToken().toLowerCase();
760 if(token.equals("-hfile")) {
761 hfile_name = tokenizer.nextToken();
762 }
763 else if(token.equals("-metadata")) {
764 element_name = tokenizer.nextToken();
765 }
766 }
767 if(hfile_name != null && element_name != null) {
768 // If hfile_name has no path, append the etc directories one. Either way create a file reference
769 File hfile = null;
770 hfile_name = hfile_name.replace('\\', File.separatorChar);
771 hfile_name = hfile_name.replace('/', File.separatorChar);
772 if(hfile_name.indexOf(File.separator) == -1) {
773 hfile = new File(etc_directory, hfile_name);
774 }
775 else {
776 hfile = new File(hfile_name);
777 }
778 // Add to hfiles
779 ///atherer.println("Adding hfile reference: " + element_name + " -> " + hfile);
780 hfiles.put(element_name, hfile);
781 hfile = null;
782 }
783 element_name = null;
784 hfile_name = null;
785 }
786 tokenizer = null;
787 }
788 }
789 command = null;
790 in.close();
791 reader.close();
792 in = null;
793 reader = null;
794 // Now we search the etc directory for *.txt files which we attempt to parse as hfiles
795 File children[] = etc_directory.listFiles(); // We are sure there is at least one, collect.cfg
796 for(int i = 0; i < children.length; i++) {
797 // If this is a text file, extract the element name and process
798 String name = children[i].getName();
799 if(children[i].isFile() && name.endsWith(".txt")) {
800 String element_name = name.substring(0, name.lastIndexOf("."));
801 if(!hfiles.containsKey(element_name)) {
802 ///atherer.println("Adding hfile reference: " + element_name + " -> " + children[i]);
803 hfiles.put(element_name, children[i]);
804 }
805 element_name = null;
806 }
807 name = null;
808 }
809 children = null;
810 etc_directory = null;
811 file = null;
812 }
813 /** Attempts to retrieve the HFile object associated with a certain metadata element. This may have already been cached, or may need to be loaded. Then again it may not even be necessary.
814 * @param element The fully qualified name of a metadata element, as a <strong>String</strong>.
815 * @return The <strong>HFile</strong> associated with the given element, or <i>null</i> if its unnecessary.
816 * @see org.greenstone.gatherer.cdm.CommandTokenizer
817 */
818 public HFile getHFile(String element) {
819 HFile result = null;
820 Object target = hfiles.get(element);
821 // If target is non-null
822 if(target != null) {
823 // If we haven't already load and parse the file.
824 if(target instanceof File) {
825 ///ystem.err.println("\nHFILE-MISS!! Loading " + target.toString());
826 result = new HFile();
827 try {
828 FileReader in_filereader = new FileReader((File)target);
829 //DecodeHTMLReader in_decodehtmlreader = new DecodeHTMLReader(in_filereader);
830 BufferedReader in = new BufferedReader(in_filereader);
831 String line = null;
832 while((line = in.readLine()) != null) {
833 CommandTokenizer tokenizer = new CommandTokenizer(line);
834 String alias = Utility.decodeGreenstone(tokenizer.nextToken());
835 String index = tokenizer.nextToken();
836 String value = Utility.decodeGreenstone(tokenizer.nextToken());
837 ///ystem.err.println("Read " + index + ", " + alias + ", " + value);
838 if(alias.startsWith("\"") && alias.endsWith("\"") && !alias.equals("\"\"")) {
839 alias = alias.substring(1, alias.length() - 1);
840 }
841 if(value.startsWith("\"") && value.endsWith("\"") && !value.equals("\"\"")) {
842 value = value.substring(1, value.length() - 1);
843 }
844 result.add(index, alias, value);
845 value = null;
846 alias = null;
847 index = null;
848 tokenizer = null;
849 }
850 line = null;
851 in.close();
852 in = null;
853 //in_decodehtmlreader = null;
854 in_filereader = null;
855 hfiles.put(element, result);
856 }
857 catch (Exception error) {
858 error.printStackTrace();
859 hfiles.remove(element);
860 }
861 }
862 else {
863 ///ystem.err.print("HFILE-HIT!!! ");
864 result = (HFile) target;
865 }
866 }
867 // Else no hfile is needed for this element
868 target = null;
869 return result;
870 }
871 /** Retrieve the list of metadata sets associated with this collection.
872 * @return An <strong>ArrayList</strong> of metadata set Files.
873 */
874 public ArrayList getMetadataSets() {
875 return metadatasets;
876 }
877 }
878
879 /** The HFile object provides a container for the mappings from indexes, of the form 1.1.1, to alias-value pairs. It also provides method to retrieving the alias and value for a certain element, remembering that values must be expressed in terms of their absolute subject heirarchy path. */
880 private class HFile
881 extends HashMap {
882 /** Construct a new HFile object with no initial values. */
883 public HFile() {
884 super();
885 }
886 /** Add a new (index,(alias, value)) mapping.
887 * @param index The index of this mapping as a <strong>String</strong>.
888 * @param alias The alias of this mapping as a <strong>String</strong>.
889 * @param value And finally the value of this mapping as a, you guessed it, <strong>String</strong>.
890 */
891 public void add(String index, String alias, String value) {
892 Entry entry = new Entry(index, alias, value);
893 ///ystem.err.println("Adding entry: " + index + " \"" + alias + "\" \"" + value + "\"");
894 put(index, entry);
895 put(alias, entry);
896 }
897 public String getAlias(String index) {
898 String alias = "";
899 Entry entry = (Entry) get(index);
900 if(entry != null) {
901 alias = entry.alias;
902 }
903 entry = null;
904 return alias;
905 }
906 /** Retrieve the value associated with a certain index. This is harder than it first sounds as you must take into account the parent indexes of this one.
907 * @param index The index whose value you wish to calculate, as a <strong>String</strong>.
908 * @return The fully quantified path to the value that matches index, also as a <strong>String</strong>. Delimitiation between subject layers is denoted by the string "\\"
909 */
910 public String getValue(String index) {
911 ///ystem.err.println("Retrieve value for the alias/index: '" + index + "'");
912 StringBuffer value = new StringBuffer("");
913 // If index isn't the index, it must be the alias. Replace it with the index dammit.
914 Entry entry = null;
915 if(!Utility.isIndex(index)) {
916 ///ystem.err.println("\tThis is an alias.");
917 // Store this for later, as its exactly the same entry we'd get had we found the last component of a proper index.
918 entry = (Entry) get(index);
919 index = entry.index;
920 ///ystem.err.println("\tIndex is actually: " + index);
921 }
922 // Now build the hierarchy if necessary.
923 int dot_index = -1;
924 if((dot_index = index.indexOf(".")) != -1) {
925 ///ystem.err.println("\tHierarchy information required -->");
926 value.append(getValue(index.substring(0, dot_index)));
927 value.append(StaticStrings.ESCAPE_STR + StaticStrings.ESCAPE_STR);
928 ///ystem.err.println("\t<-- Hierarchy information complete");
929 }
930 if(entry == null) {
931 entry = (Entry) get(index);
932 }
933 if(entry != null) {
934 value.append(entry.value);
935 }
936 entry = null;
937 ///ystem.err.println("\tFinal value is: '" + value.toString() + "'\n");
938 return value.toString();
939 }
940
941 private class Entry {
942 public String alias = null;
943 public String index = null;
944 public String value = null;
945 public Entry(String index, String alias, String value) {
946 this.alias = alias;
947 this.index = index;
948 this.value = value;
949 }
950 }
951 }
952
953 private class MetadataXMLFileSearch {
954 public File file;
955 public String filename;
956 public MetadataXMLFileSearch(File file, String filename) {
957 this.file = file;
958 this.filename = filename;
959 }
960 }
961}
Note: See TracBrowser for help on using the repository browser.