source: trunk/gli/src/org/greenstone/gatherer/msm/LegacyCollectionImporter.java@ 7234

Last change on this file since 7234 was 7171, checked in by mdewsnip, 20 years ago

Hacked up something to rewrite metadata.xml files of legacy collections as lots of little metadata.xml files so the GLI can actually work with them.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.msm;
28
29
30import java.io.*;
31import java.util.*;
32import org.w3c.dom.*;
33import org.greenstone.gatherer.Gatherer;
34import org.greenstone.gatherer.cdm.Argument;
35import org.greenstone.gatherer.cdm.Classifier;
36import org.greenstone.gatherer.cdm.CollectionDesignManager;
37import org.greenstone.gatherer.msm.ElementWrapper;
38import org.greenstone.gatherer.msm.MetadataSetManager;
39import org.greenstone.gatherer.util.StaticStrings;
40import org.greenstone.gatherer.util.Utility;
41import org.greenstone.gatherer.valuetree.GValueModel;
42
43
44/**
45 * Rewrote almost all this class to fix a variety of bugs.
46 * Importing legacy collections involves three main steps:
47 *
48 * 1. Read the existing metadata.xml files and namespace them, usually by prompting the user
49 * to specify the mapping between old metadata elements and new elements.
50 *
51 * 2. Build complete value trees for the new metadata elements. This ensures that all the
52 * metadata will show up and that the hfiles are written out complete. Building the
53 * value trees involves parsing the old hfiles and processing hierarchical metadata
54 * into GLI format (where '|' is the level separator).
55 *
56 * 3. Fix up the classify commands in the collect.cfg file to specify the new element names.
57 *
58 * @author Michael Dewsnip
59 */
60public class LegacyCollectionImporter
61{
62 private File collection_folder;
63 private String collection_folder_path;
64 private CollectionDesignManager cdm;
65 private MetadataSetManager msm;
66 private HashMap source_metadata_to_hfile_mapping;
67 private boolean cancelled;
68
69
70 public LegacyCollectionImporter(File collection_folder, CollectionDesignManager cdm)
71 {
72 this.collection_folder = collection_folder;
73 this.collection_folder_path = collection_folder.getAbsolutePath();
74 this.cdm = cdm;
75 this.msm = Gatherer.c_man.getCollection().msm;
76
77 // Create a mapping from metadata element to hierarchy classifier
78 source_metadata_to_hfile_mapping = new HashMap();
79 ArrayList hierarchy_classifiers_list = cdm.classifier_manager.getHierarchyClassifiers();
80 for (int i = 0; i < hierarchy_classifiers_list.size(); i++) {
81 Classifier classifier = (Classifier) hierarchy_classifiers_list.get(i);
82 // System.err.println("Hierarchy classifier: " + classifier);
83
84 // Get the element name
85 Argument metadata_name_argument = classifier.getArgument(StaticStrings.METADATA_ARGUMENT);
86 String metadata_name = metadata_name_argument.getValue();
87 // System.err.println("Metadata name: " + metadata_name);
88
89 // Remove the extracted namespace if it has been added
90 if (metadata_name.startsWith(StaticStrings.EXTRACTED_NAMESPACE)) {
91 metadata_name = metadata_name.substring(StaticStrings.EXTRACTED_NAMESPACE.length());
92 }
93
94 // Parse the hfile for this Hierarchy classifier
95 Argument hfile_name_argument = classifier.getArgument(StaticStrings.HFILE_ARGUMENT);
96 String hfile_name = hfile_name_argument.getValue();
97
98 File hfile_file = new File(collection_folder, StaticStrings.ETC_FOLDER + File.separator + hfile_name);
99
100 // System.err.println("Checking hfile " + hfile_file + " for " + metadata_name);
101 HFile hfile = new HFile(hfile_file);
102 source_metadata_to_hfile_mapping.put(metadata_name, hfile);
103 }
104 }
105
106
107 // This copies all the existing metadata.xml files into a backup directory
108 public void backupMetadataXMLFiles(File collection_dir)
109 {
110 File import_dir = new File(collection_dir, Utility.IMPORT_DIR);
111 File import_bak_dir = new File(collection_dir, Utility.IMPORT_BAK_DIR);
112 import_bak_dir.mkdir();
113 copyMetadataXMLFiles(import_dir, import_bak_dir);
114 }
115
116
117 private void copyMetadataXMLFiles(File source_dir, File dest_dir)
118 {
119 // Find the metadata file in this dir
120 File meta_file = new File(source_dir, "metadata.xml");
121 if (meta_file.exists()) {
122 File new_meta_file = new File(dest_dir, "metadata.xml");
123 try {
124 dest_dir.mkdirs();
125 Gatherer.f_man.getQueue().copyFile(meta_file, new_meta_file, null);
126 if (!new_meta_file.exists()) {
127 throw new Exception("");
128 }
129 }
130 catch (Exception e) {
131 Gatherer.println("Exception: couldn't move the file " + meta_file.getPath() + e.getMessage());
132 }
133 }
134
135 // Now go through child directories
136 File [] children = source_dir.listFiles();
137 for (int i = 0; i < children.length; i++) {
138 File child = children[i];
139 if (child.isDirectory()) {
140 copyMetadataXMLFiles(child, new File(dest_dir, child.getName()));
141 }
142 }
143 }
144
145
146 public void importMetadata()
147 {
148 // Nothing to do if we don't have any metadata sets (apart from extracted) loaded
149 if (msm.getSets().size() <= 1) {
150 System.err.println("No metadata sets!");
151 return;
152 }
153
154 cancelled = false;
155 importMetadata(new File(collection_folder, StaticStrings.IMPORT_FOLDER));
156 }
157
158
159 private void importMetadata(File file)
160 {
161 if (file.isDirectory()) {
162 // Apply recursively to the contents of the directory
163 File[] files = file.listFiles();
164 if (files != null) {
165 for (int i = 0; i < files.length && !cancelled; i++) {
166 importMetadata(files[i]);
167 }
168 }
169
170 return;
171 }
172
173 // We only care about metadata.xml files
174 if (!file.getName().equals(StaticStrings.METADATA_XML)) {
175 return;
176 }
177
178 // Parse the metadata.xml file
179 // System.err.println("Importing metadata from " + file);
180 Document document = Utility.parse(file.getAbsolutePath(), false);
181
182 // Get a list of all the <Metadata> elements in the file, and put them in an array
183 NodeList metadata_elements_list = document.getDocumentElement().getElementsByTagName(StaticStrings.METADATA_ELEMENT);
184 Node[] metadata_elements = new Node[metadata_elements_list.getLength()];
185 for (int i = 0; i < metadata_elements_list.getLength(); i++) {
186 metadata_elements[i] = metadata_elements_list.item(i);
187 // System.err.println("Metadata element: " + MSMUtils.getValue(metadata_elements[i]));
188 // System.err.println("Metadata element parent: " + metadata_elements[i].getParentNode().getNodeName());
189 }
190
191 // Now, for each metadata element...
192 for (int i = 0; i < metadata_elements.length; i++) {
193 Element source_element = (Element) metadata_elements[i];
194 String source_element_name = source_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
195 // System.err.println("Source element name: " + source_element_name);
196
197 // Check if there is a profile already set up for this element
198 if (msm.profiler.containsAction(collection_folder_path, source_element_name)) {
199 String target_element_name = msm.profiler.getAction(collection_folder_path, source_element_name);
200
201 // Update the metadata element and move onto the next one
202 if (target_element_name != null) {
203 updateMetadataElement(source_element, target_element_name);
204 }
205 else {
206 // Element has been ignored, so remove it
207 source_element.getParentNode().removeChild(source_element);
208 }
209
210 continue;
211 }
212
213 // No profile, so check if the element is already in the metadata set
214 ElementWrapper target_element = msm.getElement(source_element_name, true);
215 if (target_element != null) {
216 String target_element_name = target_element.getName();
217
218 // Update the metadata element and move onto the next one
219 updateMetadataElement(source_element, target_element_name);
220 continue;
221 }
222
223 // We must ask the user how to process this metadata element
224 target_element = msm.prompt.selectElement(source_element_name);
225 if (msm.prompt.wasDialogCancelled()) {
226 cancelled = true;
227 return;
228 }
229
230 if (target_element == null) {
231 // The user has chosen to ignore this element, so remove it
232 source_element.getParentNode().removeChild(source_element);
233
234 // Add the user's choice to the profile for this collection
235 msm.profiler.addAction(collection_folder_path, source_element_name, null);
236 }
237 else {
238 // Replace the old metadata element name with the new one
239 String target_element_name = target_element.getName();
240
241 // Update the metadata element
242 updateMetadataElement(source_element, target_element_name);
243
244 // Add the user's choice to the profile for this collection
245 msm.profiler.addAction(collection_folder_path, source_element_name, target_element_name);
246 }
247 }
248
249 // ----------------------------------------------------------------------------------
250 // HACK CODE ADDED IN AT VERY LAST MINUTE FOR REWRITING METADATA.XML FILES
251
252 // Get a list of all the <FileSet> elements in the file, and put them in an array
253 NodeList fileset_elements_list = document.getDocumentElement().getElementsByTagName(MetadataXMLFile.FILESET_ELEMENT);
254 Node[] fileset_elements = new Node[fileset_elements_list.getLength()];
255 for (int i = 0; i < fileset_elements_list.getLength(); i++) {
256 fileset_elements[i] = fileset_elements_list.item(i);
257 }
258
259 // For each fileset element...
260 for (int i = 0; i < fileset_elements.length; i++) {
261 Node fileset_node = fileset_elements[i];
262 NodeList fileset_children = fileset_node.getChildNodes();
263 for (int j = 0; j < fileset_children.getLength(); j++) {
264 Node fileset_child = fileset_children.item(j);
265 if (fileset_child.getNodeName().equals(MetadataXMLFile.FILENAME_ELEMENT)) {
266 String child_filename = MSMUtils.getValue(fileset_child);
267 File child_file = new File(file.getParentFile(), child_filename);
268 if (child_file.isDirectory()) {
269 MetadataXMLFile child_metadata_xml_file = new MetadataXMLFile();
270 Document child_metadata_xml_file_document = child_metadata_xml_file.getDocument();
271
272 fileset_node = fileset_node.getParentNode().removeChild(fileset_node);
273
274 // Change the filename value to .*
275 MSMUtils.setValue((Element) fileset_child, ".*");
276
277 Node child_fileset_node = child_metadata_xml_file_document.importNode(fileset_node, true);
278 child_metadata_xml_file_document.getDocumentElement().appendChild(child_fileset_node);
279 Utility.export(child_metadata_xml_file_document, new File(child_file, StaticStrings.METADATA_XML));
280 }
281 break;
282 }
283 }
284 }
285
286 // END HACK CODE
287 // ----------------------------------------------------------------------------------
288
289 // Write the modified metadata.xml file back out
290 Utility.export(document, file);
291 }
292
293
294 private void updateMetadataElement(Element metadata_element, String new_element_name)
295 {
296 String source_element_name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE);
297 HFile hfile = (HFile) source_metadata_to_hfile_mapping.get(source_element_name);
298
299 // Get the value of this metadata element
300 String element_value = MSMUtils.getValue(metadata_element);
301 if (hfile != null) {
302 // Map to the full value
303 String full_element_value = hfile.getFullValue(element_value);
304 if (full_element_value != null) {
305 element_value = full_element_value;
306 }
307 }
308
309 // Update the name and value of the metadata element
310 metadata_element.setAttribute(StaticStrings.NAME_ATTRIBUTE, new_element_name);
311 MSMUtils.setValue(metadata_element, element_value);
312
313 // Add the value of this metadata element to the value tree
314 GValueModel value_model = msm.getValueTree(msm.getElement(new_element_name, true));
315 value_model.addValue(element_value);
316 }
317
318
319 public void updateClassifiers()
320 {
321 // Update the metadata elements in each of the classifiers
322 for (int i = 0; i < cdm.classifier_manager.getSize(); i++) {
323 Classifier classifier = cdm.classifier_manager.getClassifier(i);
324 // System.err.println("Classifier: " + classifier);
325
326 // Update the "-metadata" value
327 mapClassifierArgumentToNewValue(classifier, StaticStrings.METADATA_ARGUMENT);
328
329 // Update the "-sort" value
330 mapClassifierArgumentToNewValue(classifier, "-sort");
331
332 // With Hierarchy classifiers, update the hfile arguments
333 if (classifier.getName().equalsIgnoreCase(StaticStrings.HIERARCHY_CLASSIFIER)) {
334 // Update the "-hfile" value
335 Argument hfile_argument = classifier.getArgument(StaticStrings.HFILE_ARGUMENT);
336 String hfile_value = hfile_argument.getValue();
337
338 // Find the source metadata element
339 Iterator keys = source_metadata_to_hfile_mapping.keySet().iterator();
340 while (keys.hasNext()) {
341 String source_metadata = (String) keys.next();
342 String hfile_name = ((HFile) source_metadata_to_hfile_mapping.get(source_metadata)).hfile_name;
343 if (hfile_name.equals(hfile_value)) {
344 // Update the metadata value to the new (namespaced) one
345 if (msm.profiler.containsAction(collection_folder_path, source_metadata)) {
346 String target_value = msm.profiler.getAction(collection_folder_path, source_metadata);
347 hfile_argument.setValue(target_value + ".txt");
348 }
349
350 break;
351 }
352 }
353 }
354
355 // System.err.println("Classifier (after): " + classifier);
356 }
357 }
358
359
360 private void mapClassifierArgumentToNewValue(Classifier classifier, String argument_name)
361 {
362 Argument argument = classifier.getArgument(argument_name);
363 if (argument == null) {
364 // there is no such argument
365 return;
366 }
367 String value = argument.getValue();
368 // System.err.println("Value: " + value);
369
370 // Remove the extracted namespace if it has been added
371 if (value.startsWith(StaticStrings.EXTRACTED_NAMESPACE)) {
372 value = value.substring(StaticStrings.EXTRACTED_NAMESPACE.length());
373 }
374
375 // Update the metadata value to the new (namespaced) one
376 if (msm.profiler.containsAction(collection_folder_path, value)) {
377 String target_value = msm.profiler.getAction(collection_folder_path, value);
378 argument.setValue(target_value);
379 }
380 }
381
382
383 /** Another basic HFile wrapper. This one expects you to provide an element when you create it, then as it is built it generates the value tree as well. Later it allows you to provide an alias and retrieve the full path string (delimited by pipes) */
384 private class HFile
385 {
386 public String hfile_name;
387 private HashMap index_to_entry_mapping;
388 private HashMap alias_to_value_mapping;
389
390 public HFile(File file)
391 {
392 hfile_name = file.getName();
393 index_to_entry_mapping = new HashMap();
394 alias_to_value_mapping = new HashMap();
395
396 try {
397 // Read in the hfile, line by line, creating entry mappings
398 //FileReader file_reader = new FileReader(file);
399 //BufferedReader buffered_reader = new BufferedReader(file_reader);
400 BufferedReader buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
401
402 String line = null;
403 while ((line = buffered_reader.readLine()) != null) {
404 // Read alias
405 String alias = "";
406 int alias_end;
407 if (line.indexOf("\"") == 0) {
408 alias_end = line.indexOf("\"", 1);
409 alias = line.substring(1, alias_end);
410 }
411 else {
412 alias_end = line.indexOf(" ", 1);
413 alias = line.substring(0, alias_end);
414 }
415 // System.err.println("Alias: " + alias);
416
417 int value_start = line.indexOf("\"", alias_end + 1);
418 int value_end = line.indexOf("\"", value_start + 1);
419 String value = line.substring(value_start + 1, value_end);
420 // System.err.println("Value: " + value);
421
422 // if (!alias.equals(value)) {
423 // System.err.println("Alias (" + alias + ") and value (" + value + ") differ!");
424 // }
425
426 String index = line.substring(alias_end + 1, value_start).trim();
427 // System.err.println("Index: " + index);
428
429 index_to_entry_mapping.put(index, new Entry(alias, value));
430 }
431
432 buffered_reader.close();
433 }
434 catch (Exception ex) {
435 System.err.println("Exception reading hfile " + file);
436 ex.printStackTrace();
437 }
438
439 Iterator index_keys = index_to_entry_mapping.keySet().iterator();
440 while (index_keys.hasNext()) {
441 String index = (String) index_keys.next();
442 String alias = ((Entry) index_to_entry_mapping.get(index)).alias;
443 String value = ((Entry) index_to_entry_mapping.get(index)).value;
444
445 // Chop the last reference off index, as we already have it
446 if (index.indexOf(StaticStrings.STOP_CHARACTER) > -1) {
447 index = index.substring(0, index.lastIndexOf(StaticStrings.STOP_CHARACTER));
448 // Then while there are still futher indexes left, retrieve them
449 while (index.length() > 0) {
450 // Retrieve that value (if any).
451 Entry entry = (Entry) index_to_entry_mapping.get(index);
452 if (entry != null) {
453 // Precatenate with the current value separating with a pipe
454 value = entry.value + StaticStrings.PIPE_CHAR + value;
455 }
456 // Then trim the index down
457 if (index.indexOf(StaticStrings.STOP_CHARACTER) > -1) {
458 index = index.substring(0, index.lastIndexOf(StaticStrings.STOP_CHARACTER));
459 }
460 else {
461 index = "";
462 }
463 }
464 }
465
466 alias_to_value_mapping.put(alias, value);
467 }
468
469 index_to_entry_mapping.clear();
470 }
471
472
473 public String getFullValue(String alias)
474 {
475 return (String) alias_to_value_mapping.get(alias);
476 }
477
478
479 private class Entry
480 {
481 public String alias;
482 public String value;
483
484 public Entry(String alias, String value) {
485 this.alias = alias;
486 this.value = value;
487 }
488 }
489 }
490}
Note: See TracBrowser for help on using the repository browser.