source: main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java@ 34415

Last change on this file since 34415 was 34415, checked in by ak19, 4 years ago

Bugfix for slowdown when assigning meta to multiple gathered docs in GLI's Enrich pane. Tested on Windows. This is the simplest way I could think of to solve the problem: XMLParsing always resolves html entities (unless possibly when using the StAX parser, but that may not return the Document object as code expects). Entities start with ampersand and are resolved upon parsing, so too standalone ampersand signs. The earlier code, a bugfix for metadata not sticking to filenames/import folder structures containing non-ASCII or ampersands or plus signs, had caused the slow-down, as after each XML parse of the current metadata.xml file, the code would loop through each FileName element of the metadata.xml file and reintroduce the resolved html entities. The best and simplest solution that worked is simply to escape ampersands with %26 when writing out values for the FileName element and compare against filenames that have a similar substitution done. Still to test on Linux, but this reincorporates recent ideas for the bugfix that had worked on Linux (but then broke on Windows) so I feel somewhat confident that this commit is likely to largely work on Linux when I test it tomorrow.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.7 KB
Line 
1/**
2 *############################################################################
3 * A component of the Greenstone Librarian Interface, part of the Greenstone
4 * digital library suite from the New Zealand Digital Library Project at the
5 * University of Waikato, New Zealand.
6 *
7 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
8 *
9 * Copyright (C) 2004 New Zealand Digital Library Project
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *############################################################################
25 */
26
27package org.greenstone.gatherer.metadata;
28
29
30import java.io.*;
31import java.util.*;
32import org.greenstone.gatherer.DebugStream;
33import org.greenstone.gatherer.collection.CollectionTreeNode;
34import org.greenstone.gatherer.util.XMLTools;
35import org.w3c.dom.*;
36
37import org.greenstone.gatherer.util.Utility;
38
39/** This class represents one metadata.xml file */
40public class MetadataXMLFile
41 extends File
42{
43 static final private String DESCRIPTION_ELEMENT = "Description";
44 static final private String DIRECTORY_FILENAME = ".*";
45 static final private String FILENAME_ELEMENT = "FileName";
46 static final private String FILESET_ELEMENT = "FileSet";
47 static final private String METADATA_ELEMENT = "Metadata";
48 static final private String[] nonEscapingElements = new String[]{FILENAME_ELEMENT};
49
50 /** Special metadata field: the filename encoding is a unique sort of metadata in
51 * that it is not just information stored with a collection file, but also needs to
52 * be applied in real-time to the collection file (to its filename) for display. */
53 static final public String FILENAME_ENCODING_METADATA = "gs.filenameEncoding";
54
55 // To speed things up a bit we keep the last accessed metadata.xml file in memory
56 static private File loaded_file = null;
57 static private Document loaded_file_document = null;
58 static private boolean loaded_file_changed = false;
59
60
/**
 * Creates a handle on the metadata.xml file found at the given path.
 * The constructor only records the path: nothing is read or parsed here.
 *
 * @param metadata_xml_file_path the path of the metadata.xml file, handed on unchanged to {@link File#File(String)}
 */
public MetadataXMLFile(String metadata_xml_file_path) {
    super(metadata_xml_file_path);
}
65
66 public void clearAllMetadataInFile() {
67 // If this metadata.xml file isn't the one currently loaded, load it now
68 if (loaded_file != this) {
69 // First we must save out the currently loaded file
70 saveLoadedFile();
71
72 // Parse the metadata.xml file
73 Document document = XMLTools.parseXMLFile(this);
74 if (document == null) {
75 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
76 return;
77 }
78
79 loaded_file = this;
80 loaded_file_document = document;
81 }
82
83 Element root = loaded_file_document.getDocumentElement();
84 while(root.hasChildNodes()) {
85 root.removeChild(root.getFirstChild());
86 }
87 loaded_file_changed = true;
88 //saveLoadedFile(); // this final metaxml file being cleared of meta will get saved by MetaXMLFileManager.clearAllMetadataInCollection()
89 }
90
91
92 public void addMetadata(CollectionTreeNode file_node, ArrayList metadata_values)
93 {
94 // If this metadata.xml file isn't the one currently loaded, load it now
95 if (loaded_file != this) {
96 // First we must save out the currently loaded file
97 saveLoadedFile();
98
99 // Parse the metadata.xml file
100 Document document = XMLTools.parseXMLFile(this);
101 if (document == null) {
102 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
103 return;
104 }
105
106 loaded_file = this;
107 loaded_file_document = document;
108 }
109
110 // Determine the file's path relative to the location of the metadata.xml file
111 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
112 String file_relative_path = file_node.getURLEncodedFilePath();
113 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
114
115 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
116 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
117 }
118
119 // Form a regular expression that specifies the scope of the metadata
120 String file_path_regexp;
121 if (file_relative_path.equals("")) {
122 // Special case for matching all files in the directory
123 file_path_regexp = DIRECTORY_FILENAME;
124 }
125 else {
126 // When XML files are parsed, predefined XML entities get resolved, which includes & in & and &#x...;
127 // see https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML
128 // (and https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en)
129 // We don't want &/entities in FileName elements stored in metadata.xml, as we'd have to put the entities
130 // back (undo the xml entity resolution) after each XML parse operation, which is costly and slows GLI down
131 // when assigning meta to multiple docs.
132 // Instead, when writing out or comparing against FileName elements in metadata.xml, we ensure all
133 // ampersands are replaced by their hex URL encoded value of %26.
134 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
135
136 // Convert the file path into a regular expression that will match it
137 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
138 }
139
140 // LEAVE THIS DEBUGGING STATEMENT IN - USEFUL TO DEBUG FILENAME ENCODING ISSUES WHEN META ASSIGNED
141 //System.err.println("MetadataXMLFile.addMetadata() Adding meta for file regexp: "
142 // + file_path_regexp + " - " + org.greenstone.gatherer.util.Utility.debugUnicodeString(file_path_regexp));
143
144 // Find the appropriate FileSet element for this file
145 Element appropriate_fileset_element = null;
146
147 // Read all the FileSet elements in the file
148 NodeList fileset_elements_nodelist = loaded_file_document.getElementsByTagName(FILESET_ELEMENT);
149 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
150 Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
151
152 // Check the FileName elements of the FileSet to see if we have a match
153 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
154 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
155 Element current_filename_element = (Element) filename_elements_nodelist.item(j);
156 String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
157
158 // Only exact matches can be extended with new metadata
159 if (current_filename_element_value.equals(file_path_regexp)) {
160 appropriate_fileset_element = current_fileset_element;
161 break;
162 }
163 }
164 }
165
166 // If no appropriate FileSet element exists create a new one for this file
167 if (appropriate_fileset_element == null) {
168 DebugStream.println("Creating new FileSet element for file since none exists..."+file_path_regexp);
169 appropriate_fileset_element = loaded_file_document.createElement(FILESET_ELEMENT);
170
171 Element new_filename_element = loaded_file_document.createElement(FILENAME_ELEMENT);
172 new_filename_element.appendChild(loaded_file_document.createTextNode(file_path_regexp));
173 appropriate_fileset_element.appendChild(new_filename_element);
174
175 Element new_description_element = loaded_file_document.createElement(DESCRIPTION_ELEMENT);
176 appropriate_fileset_element.appendChild(new_description_element);
177
178 // add the fileset element for .* at the top: especially important for
179 // non-accumulating (and override mode) meta. Other type fileset elements can be appended
180 if(file_path_regexp.equals(DIRECTORY_FILENAME)) {
181 loaded_file_document.getDocumentElement().insertBefore(appropriate_fileset_element,
182 loaded_file_document.getDocumentElement().getFirstChild());
183 } else {
184 loaded_file_document.getDocumentElement().appendChild(appropriate_fileset_element);
185 }
186 }
187
188 // Add each of the metadata values to the FileSet's Description element
189 Element description_element = (Element) appropriate_fileset_element.getElementsByTagName(DESCRIPTION_ELEMENT).item(0);
190 for (int i = 0; i < metadata_values.size(); i++) {
191 MetadataValue metadata_value = (MetadataValue) metadata_values.get(i);
192 String metadata_element_name_full = metadata_value.getMetadataElement().getFullName();
193
194 // Remove any characters that are invalid in XML
195 String metadata_value_string = XMLTools.removeInvalidCharacters(metadata_value.getFullValue());
196
197 // Square brackets need to be escaped because they are a special character in Greenstone
198 metadata_value_string = metadata_value_string.replaceAll("\\[", "&#091;");
199 metadata_value_string = metadata_value_string.replaceAll("\\]", "&#093;");
200
201 // the gs.filenameEncoding metadata is unique in that, when added, removed or
202 // changed, it must be applied on the file(name) whose metadata has been adjusted
203 if(metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) {
204 metadata_value_string = processFilenameEncoding(file_path_regexp, // file_path_regexp has & replaced by HEX_AMPERSAND but processFilenameEncoding doesn't use param
205 file_node, metadata_value_string, false);
206 // true only if removing meta
207 }
208
209 // Check if this piece of metadata has already been assigned to this FileSet element
210 boolean metadata_already_assigned = false;
211 NodeList metadata_elements_nodelist = description_element.getElementsByTagName(METADATA_ELEMENT);
212 for (int k = 0; k < metadata_elements_nodelist.getLength(); k++) {
213 Element current_metadata_element = (Element) metadata_elements_nodelist.item(k);
214
215 // Check if the metadata element name matches
216 String current_metadata_element_name_full = current_metadata_element.getAttribute("name");
217 if (current_metadata_element_name_full.equals(metadata_element_name_full)) {
218 // if the metadata must not accumulate, then edit the current value
219 if (!metadata_value.isAccumulatingMetadata()) {
220 XMLTools.setNodeText(current_metadata_element, metadata_value_string);
221 metadata_already_assigned = true;
222 break;
223 }
224 // Check if the metadata element value matches
225 String current_metadata_value_string = XMLTools.getElementTextValue(current_metadata_element);
226 if (current_metadata_value_string.equals(metadata_value_string)) {
227 // Metadata already assigned
228 metadata_already_assigned = true;
229 break;
230 }
231 }
232 }
233
234 // If the piece of metadata hasn't already been assigned, add it now
235 if (!metadata_already_assigned) {
236 // Create a new Metadata element to record this metadata
237 Element new_metadata_element = loaded_file_document.createElement(METADATA_ELEMENT);
238 new_metadata_element.setAttribute("name", metadata_value.getMetadataElement().getFullName());
239 new_metadata_element.setAttribute("mode", (metadata_value.isAccumulatingMetadata() ? "accumulate" : "override"));
240 new_metadata_element.appendChild(loaded_file_document.createTextNode(metadata_value_string));
241
242 // Accumulating metadata: add at the end
243 if (metadata_value.isAccumulatingMetadata()) {
244 description_element.appendChild(new_metadata_element);
245 }
246 // Override metadata: add at the start (so it overrides inherited metadata without affecting other assigned metadata)
247 else {
248 description_element.insertBefore(new_metadata_element, description_element.getFirstChild());
249 }
250 }
251 }
252
253 // Remember that we've changed the file so it gets saved when a new one is loaded
254 loaded_file_changed = true;
255 }
256
257 // DO NOT REMOVE THE System.err DEBUGGING STATEMENTS FROM THIS METHOD: HELPS WITH TESTING/DEBUGGING
258 // WHEN FILE-LEVEL META IS ASSIGNED TO NON-ASCII ENCODED FILENAMES OR WITH FILENAMES CONTAINING +/ampersand
259
260
261 // By default, XML parsing automatically resolves certain predefined XML entities including the ampersand.
262 // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references#Predefined_entities_in_XML
263 // "The XML specification defines five "predefined entities" representing special characters, and requires that all XML processors honor them. The entities can be explicitly declared in a DTD, as well, but if this is done, the replacement text must be the same as the built-in definitions. XML also allows other named entities of any size to be defined on a per-document basis."
264 // Also https://stackoverflow.com/questions/1777878/is-there-a-java-xml-api-that-can-parse-a-document-without-resolving-character-en
265 // which suggests using StAX instead of SAX or DOM parsers allows us to bypass automatic entity resolution.
266 // However, https://docs.oracle.com/javase/tutorial/jaxp/stax/why.html and
267 // https://docs.oracle.com/javase/tutorial/jaxp/stax/api.html show that StAX works like SAX rather than DOM parser
268 // while the XMLTools.parseXML() that we use throughout this file relies on DOMParser behaviour to get access to the
269 // XML DOM Document, so that it's not straightforward to replace DOMParser's use in Document XMLTools.parseXML() with
270 // an equivalent using a streambased StAX parser.
271 // Instead, method reEncodeFilenamesInMetadataXML(Doc doc) has been removed, as the solution is to no longer store
272 // ampersands: no longer encoding ampersands to entities but to %26, and all hex entities in filenames are further
273 // protected from XML's entity resolution because their ampersand prefixes are encoded as %26 (i.e. &#xDDDD; is
274 // stored as %26#xDDDD;) and therefore we no longer need to go over the XML Doc reinstating entities after parseXML
275 // either, entities being now preserved though with %26 prefixed in place of the & prefix.
276
277 public ArrayList getMetadataAssignedToFile(File file, boolean fileEncodingOnly)
278 {
279 // If this metadata.xml file isn't the one currently loaded, load it now
280 if (loaded_file != this) {
281 // First we must save out the currently loaded file
282 saveLoadedFile();
283
284 // Parse the metadata.xml file
285 Document document = XMLTools.parseXMLFile(this);
286 if (document == null) {
287 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
288 return new ArrayList();
289 }
290
291 loaded_file = this;
292 loaded_file_document = document;
293 }
294
295 // Determine the file's path relative to the location of the metadata.xml file
296 String file_relative_path = FilenameEncoding.fileToURLEncoding(file);
297 File metadata_xml_file_directory = getParentFile();
298 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(metadata_xml_file_directory);
299 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
300
301 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
302 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
303 }
304
305 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
306 // To compare apples with apples convert any & to its hex url encoded value of %26
307 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
308
309 // Build up a list of metadata assigned to this file
310 ArrayList metadata_values = new ArrayList();
311
312 // Read all the FileSet elements in the file
313 NodeList fileset_elements_nodelist = loaded_file_document.getElementsByTagName(FILESET_ELEMENT);
314 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
315 Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
316 boolean current_fileset_matches = false;
317 boolean is_one_file_only_metadata = true;
318 File folder_metadata_inherited_from = null;
319
320 // Check the FileName elements of the FileSet to see if we have a match
321 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
322 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
323 Element current_filename_element = (Element) filename_elements_nodelist.item(j);
324 String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
325
326 //System.err.println("\n Original TAIL filename was: " + Utility.debugUnicodeString(file.getName()));
327 //System.err.println("Looking in meta.xml for file_relative_path: " + file_relative_path);
328 //+ " - debug version: " + Utility.debugUnicodeString(file_relative_path));
329
330 // Does this fileset specify metadata for one file only?
331 is_one_file_only_metadata = true;
332 if (current_filename_element_value.indexOf("*") != -1 && !current_filename_element_value.equals(DIRECTORY_FILENAME)) {
333 // No, it specifies metadata for multiple files (but not all the files in the directory)
334 is_one_file_only_metadata = false;
335 }
336
337 //System.err.println(" Checking to see if it matches " + current_filename_element_value);// + " - debug: " + Utility.debugUnicodeString(current_filename_element_value));
338
339 // This fileset specifies metadata for the file
340 // MetadataXMLFile.addMetadata(CollectionTreeNode, ArrayList) stored filename in uppercase hex
341 // so need to make sure everything hex has been decoded (no more hex) to compare apples with apples
342 if (file_relative_path.matches(current_filename_element_value)) {
343 //System.err.println(" @@@ Found a match in meta.xml for file_relative_path: " + file_relative_path + "\n");
344 current_fileset_matches = true;
345 if (!file_relative_path.equals("") && current_filename_element_value.equals(DIRECTORY_FILENAME)) {
346 folder_metadata_inherited_from = metadata_xml_file_directory;
347 }
348 break;
349 } //else {
350 //System.err.println(" ###" + file_relative_path + " does not match " + current_filename_element_value);
351 //System.err.println( Utility.debugUnicodeString(file_relative_path) + " does not match " + Utility.debugUnicodeString(current_filename_element_value));
352 //}
353
354 // This fileset specifies metadata for the folder the file is in
355 if (file_relative_path.startsWith(current_filename_element_value + FilenameEncoding.URL_FILE_SEPARATOR)) {
356 current_fileset_matches = true;
357 folder_metadata_inherited_from = new File(metadata_xml_file_directory, current_filename_element_value);
358 break;
359 }
360 }
361
362 // The FileSet doesn't apply, so move onto the next one
363 if (current_fileset_matches == false) {
364 continue;
365 }
366
367 // Read all the Metadata elements in the fileset
368 NodeList metadata_elements_nodelist = current_fileset_element.getElementsByTagName(METADATA_ELEMENT);
369 for (int k = 0; k < metadata_elements_nodelist.getLength(); k++) {
370 Element current_metadata_element = (Element) metadata_elements_nodelist.item(k);
371 String metadata_element_name_full = current_metadata_element.getAttribute("name");
372 // if we're only looking for fileEncoding metadata and this isn't it, skip to the next
373 if(fileEncodingOnly && !metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) {
374 continue;
375 }
376 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
377
378 // Ignore legacy crap
379 if (metadata_set_namespace.equals("hidden")) {
380 continue;
381 }
382
383 MetadataSet metadata_set = MetadataSetManager.getMetadataSet(metadata_set_namespace);
384 if (metadata_set == null) {
385 // The metadata set isn't loaded, so give the option of mapping the element into a loaded set
386 String target_metadata_element_name_full = MetadataSetManager.mapUnloadedMetadataElement(metadata_element_name_full);
387 if (target_metadata_element_name_full == null || target_metadata_element_name_full.equals("")) {
388 // Skip this element if we still don't have a loaded element for it
389 continue;
390 }
391
392 metadata_element_name_full = target_metadata_element_name_full;
393 metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
394 metadata_set = MetadataSetManager.getMetadataSet(metadata_set_namespace);
395 }
396
397 MetadataElement metadata_element = MetadataTools.getMetadataElementWithName(metadata_element_name_full);
398
399 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
400 // If the element doesn't exist in the metadata set, we're not interested
401 //Shaoqun modified. It needs to be added to metadata_set because the user might disable skim file
402 if (metadata_element == null) {
403 metadata_element = metadata_set.addMetadataElementForThisSession(metadata_element_name);
404 // continue;
405 }
406
407 // Square brackets need to be escaped because they are a special character in Greenstone
408 String metadata_value_string = XMLTools.getElementTextValue(current_metadata_element);
409 metadata_value_string = metadata_value_string.replaceAll("&#091;", "[");
410 metadata_value_string = metadata_value_string.replaceAll("&#093;", "]");
411
412 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_value_string);
413
414 // If there is no metadata value tree node for this value, create it
415 if (metadata_value_tree_node == null) {
416 DebugStream.println("Note: No value tree node for metadata value \"" + metadata_value_string + "\"");
417 metadata_element.addMetadataValue(metadata_value_string);
418 metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_value_string);
419 }
420
421 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
422 metadata_value.inheritsMetadataFromFolder(folder_metadata_inherited_from);
423 metadata_value.setIsOneFileOnlyMetadata(is_one_file_only_metadata);
424
425 // Is this accumulating metadata?
426 if (current_metadata_element.getAttribute("mode").equals("accumulate")) {
427 metadata_value.setIsAccumulatingMetadata(true);
428 }
429
430 // Add the new metadata value to the list
431 metadata_values.add(metadata_value);
432 }
433 }
434
435 return metadata_values;
436 }
437
438
439 public void removeMetadata(CollectionTreeNode file_node, ArrayList metadata_values)
440 {
441 // If this metadata.xml file isn't the one currently loaded, load it now
442 if (loaded_file != this) {
443 // First we must save out the currently loaded file
444 saveLoadedFile();
445
446 // Parse the metadata.xml file
447 Document document = XMLTools.parseXMLFile(this);
448 if (document == null) {
449 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
450 return;
451 }
452
453 loaded_file = this;
454 loaded_file_document = document;
455 }
456
457 // Determine the file's path relative to the location of the metadata.xml file
458 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
459 String file_relative_path = file_node.getURLEncodedFilePath();
460 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
461 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
462 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
463 }
464
465 // Form a regular expression that specifies the scope of the metadata
466 String file_path_regexp;
467 if (file_relative_path.equals("")) {
468 // Special case for matching all files in the directory
469 file_path_regexp = DIRECTORY_FILENAME;
470 }
471 else {
472 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
473 // To compare apples with apples convert any & to its hex url encoded value of %26
474 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
475
476 // Convert the file path into a regular expression that will match it
477 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
478 }
479
480 // Find the appropriate FileSet element for this file
481 Element appropriate_fileset_element = null;
482
483 // Read all the FileSet elements in the file
484 NodeList fileset_elements_nodelist = loaded_file_document.getElementsByTagName(FILESET_ELEMENT);
485 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
486 Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
487
488 // Check the FileName elements of the FileSet to see if we have a match
489 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
490 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
491 Element current_filename_element = (Element) filename_elements_nodelist.item(j);
492 String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
493
494 // Only exact matches can be extended with new metadata
495 if (current_filename_element_value.equals(file_path_regexp)) {
496 appropriate_fileset_element = current_fileset_element;
497 break;
498 }
499 }
500 }
501
502 // If no appropriate FileSet element exists the metadata isn't assigned in this metadata.xml file
503 if (appropriate_fileset_element == null) {
504 DebugStream.println("Note: No appropriate FileSet element found when removing metadata from " + this);
505 return;
506 }
507
508 // Remove each of the metadata values from the FileSet's Description element
509 for (int i = 0; i < metadata_values.size(); i++) {
510 MetadataValue metadata_value = (MetadataValue) metadata_values.get(i);
511
512 // Remove any characters that are invalid in XML
513 String metadata_value_string = XMLTools.removeInvalidCharacters(metadata_value.getFullValue());
514
515 // Square brackets need to be escaped because they are a special character in Greenstone
516 metadata_value_string = metadata_value_string.replaceAll("\\[", "&#091;");
517 metadata_value_string = metadata_value_string.replaceAll("\\]", "&#093;");
518
519 // Find the Metadata element to delete from the fileset
520 String metadata_element_name_full = metadata_value.getMetadataElement().getFullName();
521 NodeList metadata_elements_nodelist = appropriate_fileset_element.getElementsByTagName(METADATA_ELEMENT);
522 for (int k = 0; k < metadata_elements_nodelist.getLength(); k++) {
523 Element current_metadata_element = (Element) metadata_elements_nodelist.item(k);
524
525 // Check the metadata element name matches
526 String current_metadata_element_name_full = current_metadata_element.getAttribute("name");
527 if (current_metadata_element_name_full.equals(metadata_element_name_full)) {
528 // Check the metadata element value matches
529 String current_metadata_value_string = XMLTools.getElementTextValue(current_metadata_element);
530 if (current_metadata_value_string.equals(metadata_value_string)) {
531
532 // Remove this Metadata element
533 current_metadata_element.getParentNode().removeChild(current_metadata_element);
534
535 // the gs.filenameEncoding metadata is unique in that, when added, removed or
536 // changed, it must be applied on the file(name) whose metadata has been adjusted
537 if(current_metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) {
538
539 // metadata_value_string will hereafter be the inherited gs.FilenameEncoding
540 // metadata (if any), now that the value at this level has been removed
541 metadata_value_string = processFilenameEncoding(file_path_regexp,
542 file_node, "", true); // true only if *removing* this meta
543 }
544
545 // If there are no Metadata elements left now, remove the (empty) FileSet element
546 if (metadata_elements_nodelist.getLength() == 0) {
547 appropriate_fileset_element.getParentNode().removeChild(appropriate_fileset_element);
548 }
549
550 break;
551 }
552 }
553 }
554 }
555
556 // Remember that we've changed the file so it gets saved when a new one is loaded
557 loaded_file_changed = true;
558 }
559
560
561 public void replaceMetadata(CollectionTreeNode file_node, MetadataValue old_metadata_value, MetadataValue new_metadata_value)
562 {
563 // If this metadata.xml file isn't the one currently loaded, load it now
564 if (loaded_file != this) {
565 // First we must save out the currently loaded file
566 saveLoadedFile();
567
568 // Parse the metadata.xml file
569 Document document = XMLTools.parseXMLFile(this);
570 if (document == null) {
571 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
572 return;
573 }
574
575 loaded_file = this;
576 loaded_file_document = document;
577 }
578
579 // Determine the file's path relative to the location of the metadata.xml file
580 String metadata_xml_file_directory_path = FilenameEncoding.fileToURLEncoding(getParentFile());
581 String file_relative_path = file_node.getURLEncodedFilePath();
582 file_relative_path = file_relative_path.substring(metadata_xml_file_directory_path.length());
583 if (file_relative_path.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
584 file_relative_path = file_relative_path.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
585 }
586
587 // Form a regular expression that specifies the scope of the metadata
588 String file_path_regexp;
589 if (file_relative_path.equals("")) {
590 // Special case for matching all files in the directory
591 file_path_regexp = DIRECTORY_FILENAME;
592 }
593 else {
594 // Ampersands and entities in FileName elements are protected from being resolved on parsing back the XML.
595 // To compare apples with apples convert any & to its hex url encoded value of %26
596 file_relative_path = file_relative_path.replace("&", FilenameEncoding.HEX_AMPERSAND);
597
598 // Convert the file path into a regular expression that will match it
599 file_path_regexp = MetadataTools.getRegularExpressionThatMatchesFilePath(file_relative_path);
600 }
601
602 // Remove any characters that are invalid in XML
603 String old_metadata_value_string = XMLTools.removeInvalidCharacters(old_metadata_value.getFullValue());
604 String new_metadata_value_string = XMLTools.removeInvalidCharacters(new_metadata_value.getFullValue());
605
606 // Square brackets need to be escaped because they are a special character in Greenstone
607 old_metadata_value_string = old_metadata_value_string.replaceAll("\\[", "&#091;");
608 old_metadata_value_string = old_metadata_value_string.replaceAll("\\]", "&#093;");
609 new_metadata_value_string = new_metadata_value_string.replaceAll("\\[", "&#091;");
610 new_metadata_value_string = new_metadata_value_string.replaceAll("\\]", "&#093;");
611
612 // Read all the FileSet elements in the file
613 NodeList fileset_elements_nodelist = loaded_file_document.getElementsByTagName(FILESET_ELEMENT);
614 for (int i = 0; i < fileset_elements_nodelist.getLength(); i++) {
615 Element current_fileset_element = (Element) fileset_elements_nodelist.item(i);
616 boolean current_fileset_matches = false;
617
618 // Check the FileName elements of the FileSet to see if we have a match
619 NodeList filename_elements_nodelist = current_fileset_element.getElementsByTagName(FILENAME_ELEMENT);
620 for (int j = 0; j < filename_elements_nodelist.getLength(); j++) {
621 Element current_filename_element = (Element) filename_elements_nodelist.item(j);
622 String current_filename_element_value = XMLTools.getElementTextValue(current_filename_element);
623
624 // Only exact matches can be edited
625 if (current_filename_element_value.equals(file_path_regexp)) {
626 current_fileset_matches = true;
627 break;
628 }
629 }
630
631 // The FileSet doesn't apply, so move onto the next one
632 if (current_fileset_matches == false) {
633 continue;
634 }
635
636 // Each metadata value is only allowed to be assigned once
637 boolean new_metadata_value_already_exists = false;
638 Element metadata_element_to_edit = null;
639
640 // Find the Metadata element to replace in the fileset
641 String metadata_element_name_full = old_metadata_value.getMetadataElement().getFullName();
642 NodeList metadata_elements_nodelist = current_fileset_element.getElementsByTagName(METADATA_ELEMENT);
643 for (int k = 0; k < metadata_elements_nodelist.getLength(); k++) {
644 Element current_metadata_element = (Element) metadata_elements_nodelist.item(k);
645
646 // Check the metadata element name matches
647 String current_metadata_element_name_full = current_metadata_element.getAttribute("name");
648 if (!current_metadata_element_name_full.equals(metadata_element_name_full)) {
649 continue;
650 }
651
652 // Check the new metadata value doesn't already exist
653 String current_metadata_value_string = XMLTools.getElementTextValue(current_metadata_element);
654 if (current_metadata_value_string.equals(new_metadata_value_string)) {
655 new_metadata_value_already_exists = true;
656 }
657
658 // Check the metadata element value matches
659 if (current_metadata_value_string.equals(old_metadata_value_string)) {
660 metadata_element_to_edit = current_metadata_element;
661 }
662 }
663
664 // If the new metadata value already existed, remove the original value
665 if (new_metadata_value_already_exists) {
666 if(metadata_element_to_edit != null) { //?????????
667 metadata_element_to_edit.getParentNode().removeChild(metadata_element_to_edit);
668 } else {
669 System.err.println("ERROR MetadataXMLFile: metadata_element_to_edit is null");
670 }
671 }
672 // Otherwise replace the old value with the new value
673 // Ensure metadata_element_to_edit isn't null (may occur when multiple files are selected)
674 else if (metadata_element_to_edit != null) {
675
676 // the gs.filenameEncoding metadata is unique in that, when added, removed or
677 // changed, it must be applied on the file(name) whose metadata has been adjusted
678 if(metadata_element_name_full.equals(FILENAME_ENCODING_METADATA)) {
679 new_metadata_value_string = processFilenameEncoding(file_path_regexp, file_node, new_metadata_value_string, false);
680 // true only if removing meta
681 }
682 XMLTools.setElementTextValue(metadata_element_to_edit, new_metadata_value_string);
683 }
684 }
685
686 // Remember that we've changed the file so it gets saved when a new one is loaded
687 loaded_file_changed = true;
688 }
689
690
691 static public void saveLoadedFile()
692 {
693 // If we have a file loaded into memory and it has been modified, save it now
694 if (loaded_file != null && loaded_file_changed == true) {
695 //System.err.println("START saveLoadedFile(), loaded_file_document:\n" + XMLTools.elementToString(loaded_file_document.getDocumentElement(), true));
696
697 XMLTools.writeXMLFile(loaded_file, loaded_file_document, nonEscapingElements);
698
699 loaded_file_changed = false;
700 }
701 }
702
703 /**
704 * Every metadata.xml file must be skimmed when a collection is opened, for three very important reasons:
705 * - To handle any non-namespaced metadata in the metadata.xml files (this is mapped and the files rewritten)
706 * - To get a complete list of the metadata elements in the collection (used in Design and Format panes)
707 * - To build complete and accurate metadata value trees (used in the Enrich pane)
708 */
709 public void skimFile()
710 {
711 boolean file_changed = false;
712
713 // Parse the metadata.xml file
714 DebugStream.println("Skimming metadata.xml file " + this + "...");
715
716 Document document = XMLTools.parseXMLFile(this);
717 if (document == null) {
718 System.err.println("Error: Could not parse metadata.xml file " + getAbsolutePath());
719 return;
720 }
721
722 // Read all the Metadata elements in the file
723 HashMap target_metadata_element_name_attrs_cache = new HashMap();
724 NodeList metadata_elements_nodelist = document.getElementsByTagName(METADATA_ELEMENT);
725 for (int i = 0; i < metadata_elements_nodelist.getLength(); i++) {
726 Element current_metadata_element = (Element) metadata_elements_nodelist.item(i);
727 String metadata_element_name_full = current_metadata_element.getAttribute("name");
728 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
729
730 // Ignore legacy crap
731 if (metadata_set_namespace.equals("hidden")) {
732 continue;
733 }
734
735 MetadataSet metadata_set = MetadataSetManager.getMetadataSet(metadata_set_namespace);
736 if (metadata_set == null) {
737 // The metadata set isn't loaded, so give the option of mapping the element into a loaded set
738 String target_metadata_element_name_full = MetadataSetManager.mapUnloadedMetadataElement(metadata_element_name_full);
739 if (target_metadata_element_name_full == null || target_metadata_element_name_full.equals("")) {
740 // Skip this element if we still don't have a loaded element for it
741 continue;
742 }
743
744 // Update the metadata.xml file to have the new (namespaced) element name
745 // Instead of using current_metadata_element.setAttribute("name", target_metadata_element_name_full)
746 // we create an Attr object for each target metadata element name, and cache them
747 // This makes a *huge* difference (namespacing a metadata.xml file with 45000 metadata entries now
748 // takes 45 seconds instead of 30 minutes!) -- why is setting the value of a Node so slow?
749 Attr target_metadata_element_name_attr = (Attr) target_metadata_element_name_attrs_cache.get(target_metadata_element_name_full);
750 if (target_metadata_element_name_attr == null) {
751 target_metadata_element_name_attr = document.createAttribute("name");
752 target_metadata_element_name_attr.setValue(target_metadata_element_name_full);
753 target_metadata_element_name_attrs_cache.put(target_metadata_element_name_full, target_metadata_element_name_attr);
754 }
755
756 // Remove the old name attribute and add the new (namespaced) one
757 current_metadata_element.removeAttribute("name");
758 current_metadata_element.setAttributeNode((Attr) target_metadata_element_name_attr.cloneNode(false));
759 file_changed = true;
760
761 metadata_element_name_full = target_metadata_element_name_full;
762 metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
763 metadata_set = MetadataSetManager.getMetadataSet(metadata_set_namespace);
764 }
765
766 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
767 MetadataElement metadata_element = metadata_set.getMetadataElementWithName(metadata_element_name);
768
769 // If the element doesn't exist in the metadata set, add it
770 if (metadata_element == null) {
771 metadata_element = metadata_set.addMetadataElementForThisSession(metadata_element_name);
772 }
773
774 // Square brackets need to be escaped because they are a special character in Greenstone
775 String metadata_value_string = XMLTools.getElementTextValue(current_metadata_element);
776 metadata_value_string = metadata_value_string.replaceAll("&#091;", "[");
777 metadata_value_string = metadata_value_string.replaceAll("&#093;", "]");
778
779 metadata_element.addMetadataValue(metadata_value_string);
780 }
781
782 // Rewrite the metadata.xml file if it has changed
783 if (file_changed) {
784 XMLTools.writeXMLFile(this, document);
785 }
786 }
787
788 /**
789 * The gs.filenameEncoding metadata is unique in that, when added, removed or
790 * replaced, it must be applied on the file(name) whose metadata has been
791 * adjusted.
792 * This method handles all that, given the regular expression or filepath name
793 * to match on (.* matches subdirectories), the affected fileNode, the new
794 * encoding value and whether a new encoding value has been added/an existing
795 * one has been replaced or whether the encoding metadata has been removed.
796 * The new adjusted value for the encoding metadata is returned.
797 *
798 * MetadataXMLFileManager maintains a hashmap of (URL-encoded filepaths, encoding)
799 * to allow fast access to previously assigned gs.filenameEncoding metadata (if
800 * any) for each file. This hashmap also needs to be updated, but this update
801 * is complicated by the fact that it concerns regular expressions that could
802 * affect multiple filenames.
803 */
804 public String processFilenameEncoding(String file_path_regexp, CollectionTreeNode file_node,
805 String encoding_metadata_value, boolean removingMetadata)
806 {
807 if(!FilenameEncoding.MULTIPLE_FILENAME_ENCODINGS_SUPPORTED) {
808 return encoding_metadata_value;
809 }
810
811 // Work out this filenode's new encoding and apply it:
812
813 if(removingMetadata) { // encoding_metadata_value = ""
814 // gs.filenameEncoding metadata being removed, work out
815 // any inherited metadata to replace it with in the meta-table
816 encoding_metadata_value = FilenameEncoding.getInheritedFilenameEncoding(
817 file_node.getURLEncodedFilePath(), file_node.getFile());
818 // should be canonical encoding already
819 }
820 else if(!encoding_metadata_value.equals("")) {
821 // if adding or replacing filename encoding,
822 // get the canonical encoding name for this alias
823 encoding_metadata_value = FilenameEncoding.canonicalEncodingName(encoding_metadata_value);
824 }
825 // Reencode the display of this filenode only as any affected
826 // childnodes will be reencoded on FileNode.refreshDescendantEncodings()
827 file_node.reencodeDisplayName(encoding_metadata_value);
828
829
830 // Whether removing or adding/replacing the file's gs.filename encoding meta,
831 // store this in the file-to-encoding map for fast access, since the map stores
832 // empty string values when no meta has been assigned at this file level.
833 // In the case of removingMetadata, the value stored will be the fallback value
834
835 String urlpath = file_node.getURLEncodedFilePath();
836 if(removingMetadata) {
837 // remove it from the map instead of inserting "", so that when folders in the collectiontree
838 // are being deleted or shifted, the removemetada (and addmetadata) calls that get fired
839 // for each affected filenodes does not cause the undesirable effect of multiple "" to be
840 // entered into the filename-to-encoding map for filepaths that no longer exist .
841 FilenameEncoding.map.remove(urlpath);
842 } else { // for adding and replacing, put the encoding into the map (also replaces any existing encoding for it)
843 FilenameEncoding.map.put(urlpath, encoding_metadata_value);
844 }
845
846 // If new folder-level metadata (or metadata for a set of files fitting a pattern) has been
847 // assigned, the file_to_encodings map will be cleared for all descendant folders and files,
848 // so that these can be re-calculated upon refreshing the visible parts of the CollectionTree.
849 // Mark the state as requiring a refresh of the CollectionTree.
850 // This next step also serves to prevent the MetadataValueTableModel from trying to update
851 // itself while a refresh (involving re-encoding of filenames of visible nodes) is in progress.
852 FilenameEncoding.setRefreshRequired(true);
853
854 return encoding_metadata_value;
855 }
856}
Note: See TracBrowser for help on using the repository browser.