source: main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataToCSV.java@ 34264

Last change on this file since 34264 was 34264, checked in by ak19, 4 years ago
  1. Added moveMetaXMLToCSV 2. Both this and exportMetaAsCSV now also made to work for the remote case. 3. Bugfix to oversight in GUIManager that in the previous commit used to ignore user selected csvfile and always created a metadata.csv in import folder. 4. Tidied up MetadataToCSV.java some more.
File size: 21.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 2020 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.metadata;
38
39import java.io.*;
40import java.util.*;
41import javax.swing.filechooser.FileNameExtensionFilter;
42import javax.swing.JFileChooser;
43import javax.swing.JFrame;
44import javax.swing.JOptionPane;
45
46import org.apache.commons.csv.*;
47
48import org.greenstone.gatherer.util.SafeProcess;
49import org.greenstone.gatherer.DebugStream;
50import org.greenstone.gatherer.Dictionary;
51import org.greenstone.gatherer.metadata.MetadataElement;
52import org.greenstone.gatherer.metadata.MetadataValue;
53import org.greenstone.gatherer.metadata.MetadataXMLFileManager;
54
55
56
57/**
58 * Class to export GLI metadata of a collection to a metadata.csv file.
59 * This class can also merge GLI meta for the collection onto an existing metadata.csv file.
60 * Merging is a cumulative process.
61 * Duplicate entries and values are not preserved.
62 * Uses TreeMap and TreeSet to keep everything alphabetically ordered.
63 * TODO: What about ordering by unicode. Is that the natural ordering for Java Strings?
64 * If so, this would support keeping metadata values ordered regardless of script used.
65*/
66public class MetadataToCSV implements FileFilter {
67 private char meta_field_sep = ','; // comma is default field separator for CSV, comma separated values
68 private String meta_value_sep_re = "\\|"; // must escape | to get regex
69 private char meta_value_sep_char = '|'; // when written out to file
70 private String collection_directory_path = "";
71 private String coll_importdir_path = "";
72 private final int import_path_length;
73
74 /** The CSV metadata file to be read and rewritten. */
75 //private String metadataCSVFilename = "metadata.csv";
76 private File metadataCSVFile;
77
78 /** TODO: Is this useful?
79 * Not yet implemented: if this flag is true, then if a file mentioned in metadata.csv does not exist,
80 * its entry is dropped and won't appear again when the metadata.csv is written out again.
81 */
82 //private boolean removeMetaForFilesThatDoNotExist = false;
83
84 private final String IMPORT_DIRNAME = "import";
85
86 /** A Map of all files/docs in this collection and their metadata,
87 * itself tuples of metadata field names and their (possibly multiple) metadata values. */
88 TreeMap<File, TreeMap<String,TreeSet<String>>> collMetaMap = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
89
90 public MetadataToCSV(String collDirPath) {
91 this.collection_directory_path = collDirPath;
92 this.coll_importdir_path = collDirPath + IMPORT_DIRNAME + File.separator; //new File(collDirPath, IMPORT_DIRNAME).getAbsolutePath();
93 import_path_length = this.coll_importdir_path.length();
94 this.metadataCSVFile = new File(coll_importdir_path, "metadata.csv");
95 }
96
/**
 * Creates an exporter for the collection at {@code collDirPath} that reads and rewrites
 * the given CSV file rather than the default {@code metadata.csv} in the import folder.
 *
 * @param collDirPath path of the collection directory (see the single-argument constructor)
 * @param metadataCSV the CSV metadata file to use; when {@code null} the default file
 *        chosen by the single-argument constructor is kept
 */
public MetadataToCSV(String collDirPath, File metadataCSV) {
    this(collDirPath);
    // The parameter is called metadataCSV. Assigning "metadataCSVFile" here would refer to
    // the field on both sides (a self-assignment) and silently discard the caller's file.
    if (metadataCSV != null) {
        this.metadataCSVFile = metadataCSV;
    }
}
101
/**
 * Creates an exporter with custom separators. Collection path and CSV file are handled by
 * delegating to the two-argument constructor.
 *
 * @param collDirPath path of the collection directory
 * @param metadataCSVFile the CSV metadata file, passed on to the two-argument constructor
 * @param metafieldSepChar character separating the fields (columns) of the CSV file
 * @param readMetaValSepExpression regular expression used to split a cell into multiple
 *        metadata values when reading
 * @param writeMetaValSepChar character placed between multiple metadata values of a cell
 *        when writing
 */
public MetadataToCSV(String collDirPath, File metadataCSVFile, char metafieldSepChar, String readMetaValSepExpression, char writeMetaValSepChar) {
    this(collDirPath, metadataCSVFile);
    meta_value_sep_char = writeMetaValSepChar;
    meta_value_sep_re = readMetaValSepExpression;
    meta_field_sep = metafieldSepChar;
}
108
109 /** Remove import path prefix from given file. Returned is the path of file relative to import. */
110 public String fileToRelativeString(File f) {
111 String fullPath = f.getAbsolutePath();
112 //System.err.println("@@@ fullpath: " + fullPath);
113 //System.err.println("@@@ coll_importdir_path: " + this.coll_importdir_path);
114 int indexMatch = fullPath.indexOf(coll_importdir_path);
115 if(indexMatch == -1) {
116 return fullPath;
117 } else {
118 return fullPath.substring(indexMatch+import_path_length);
119 }
120 }
121
122
123 /** helper methods to export metadata for collection files to csv
124 * Returns a Navigable Sorted Map of file names in the collection (relative to import folder), ordered alphabetically,
125 * mapped to each file's metadata, sorted alphabetically by metadata field name, and list of metadata values sorted alphabetically
126 */
127 public TreeMap<File, TreeMap<String,TreeSet<String>>> getAllAssignedMetadataForAllFiles() {
128 TreeMap<File, TreeMap<String,TreeSet<String>>> files_with_meta = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
129
130 ArrayList<File> files = listFilesInCollection(this.collection_directory_path);
131 Iterator<File> i = files.iterator();
132
133 while(i.hasNext()) {
134 File f = i.next();
135 ArrayList file_meta = MetadataXMLFileManager.getMetadataAssignedToFile(f);
136
137 //files_with_meta.put(f, file_meta);
138 TreeMap<String,TreeSet<String>> fileToMetaMap = new TreeMap<String,TreeSet<String>>();
139
140 // debugging display
141 ///System.err.println("Meta for file: " + f.getAbsolutePath());
142 Iterator it = file_meta.iterator();
143 while(it.hasNext()) {
144 MetadataValue meta = (MetadataValue)it.next();
145 String metaValue = meta.getValue();
146 MetadataElement metaEl = meta.getMetadataElement();
147 String metaFieldName = metaEl.getFullName();
148 ///System.err.println(" field: " + metaFieldName);
149 ///System.err.println(" value: " + metaValue);
150
151 TreeSet<String> vals = fileToMetaMap.get(metaFieldName);
152 if(vals == null) {
153 vals = new TreeSet<String>();
154 vals.add(metaValue);
155 fileToMetaMap.put(metaFieldName, vals);
156 } else {
157 vals.add(metaValue);
158 }
159 }
160
161 files_with_meta.put(f, fileToMetaMap);
162 }
163
164 return files_with_meta;
165 }
166
167 // Get all meta in any metadata.csv file
168 // and add to it all meta assigned for docs in this collection
169 public void amalgamateAllMeta() {
170 TreeMap<File, TreeMap<String,TreeSet<String>>> assignedMeta = getAllAssignedMetadataForAllFiles();
171 TreeMap<File, TreeMap<String,TreeSet<String>>> csvFileMeta = loadMetaFromCSVFile(this.metadataCSVFile);
172
173 if(collMetaMap.size() == 0) {
174
175 if(assignedMeta.keySet().size() > csvFileMeta.keySet().size()) {
176 collMetaMap = assignedMeta;
177 merge(collMetaMap, csvFileMeta);
178 } else {
179 collMetaMap = csvFileMeta;
180 merge(collMetaMap, assignedMeta);
181 }
182 } else {
183
184 merge(collMetaMap, assignedMeta);
185 merge(collMetaMap, csvFileMeta);
186 }
187
188 }
189
190 public TreeSet<String> getAllCollHeadings(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap) {
191 TreeSet<String> collHeadings = new TreeSet<String>();
192
193 if(metaMap == null || metaMap.size() == 0) {
194 return collHeadings;
195 }
196 // get all meta field names and add into collHeadings. As it's a TreeSet,
197 // duplicates will be automatically ignored and collheadings will be sorted
198 Iterator<File> iFiles = metaMap.keySet().iterator();
199 while(iFiles.hasNext()) {
200 File f = iFiles.next();
201 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
202 Iterator<String> iMetaFields = metaFields.keySet().iterator();
203 while(iMetaFields.hasNext()) {
204 String fieldName = iMetaFields.next();
205 collHeadings.add(fieldName);
206 }
207 }
208
209 return collHeadings;
210 }
211
212 /** merge metaMap param into baseMetaMap: only portions not already present in baseMetaMap are added in
213 * whether these are new file entries, new metadata field entries for extant files, or metadata values for extant fields of files.
214 * A simple map.putALL() will not do the trick as collMetaMap is a complicated data structure.
215 */
216 public void merge(TreeMap<File, TreeMap<String,TreeSet<String>>> baseMetaMap, TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap) {
217
218 if(metaMap == null || metaMap.size() == 0) {
219 // nothing to do
220 return;
221 }
222
223 Iterator<File> iFiles = metaMap.keySet().iterator();
224 while(iFiles.hasNext()) {
225 File f = iFiles.next();
226
227 // check if this file already has an entry in baseMetaMap
228 TreeMap<String, TreeSet<String>> origMetaFields = baseMetaMap.get(f);
229
230 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
231 Iterator<String> iMetaFields = metaFields.keySet().iterator();
232
233 // if file in metaMap didn't exist in baseMetaMap, easy: just copy its entry across in entirety
234 if(origMetaFields == null) {
235 metaMap.put(f, metaFields);
236 continue;
237 }
238
239 // else, file already exists in baseMetaMap, need to check if we have to merge any meta on the file
240 while(iMetaFields.hasNext()) {
241 String fieldName = iMetaFields.next();
242 TreeSet<String> metaValues = metaFields.get(fieldName);
243
244 // check if this metadata field exists for the same file in baseMetaMap
245 TreeSet<String> origMetaValues = origMetaFields.get(fieldName);
246 if(origMetaValues == null) { // this metadata field name did not exist for file in baseMetaMap,
247 // so copy all vals for this fieldName into baseMetaMap's entry for this file
248 origMetaFields.put(fieldName, metaValues);
249 continue; // continue on inner loop
250 }
251
252 // else the meta fieldName existed for that file in baseMetaMap
253 // Check if any of the metadata values didn't already exist, else add them in
254 Iterator<String> iMetaValues = metaValues.iterator();
255 while(iMetaValues.hasNext()) {
256 String metaValue = iMetaValues.next();
257
258 if(!origMetaValues.contains(metaValue)) {
259 origMetaValues.add(metaValue);
260 }
261 }
262
263 }
264 }
265 }
266
267
268 /** If successfully wrote out collection's meta from to a CSV file,
269 * then will need to remove all meta from GLI (metadata.xml files).
270 * Just del or rename those files to .bak?
271 * This dangerous method goes through all the metadata.xml files that were in use so far
272 * and removes all the child elements from meta xml files' DirectoryMetadata root elements
273 */
274 public boolean moveMetaXMLToCSV(File csvFile, JFrame parent) {
275
276 // Warn the user about the operation being destructive
277 int result = JOptionPane.showConfirmDialog(parent,
278 Dictionary.get("ExportMeta.MoveMetaXMLToCSV_Warning_Message"),
279 Dictionary.get("General.Warning"),
280 JOptionPane.OK_CANCEL_OPTION,
281 JOptionPane.WARNING_MESSAGE);
282 if(result == JOptionPane.CANCEL_OPTION || result == JOptionPane.CLOSED_OPTION) {
283 // NO_OPTION shouldn't happen
284 return false;
285 }
286
287 boolean success = exportMetaXMLToCSV(csvFile);
288
289 if(success) { // now it's backed up to a metadatacsv file, can clear all metadata from metaXML files
290
291 System.err.println("About to clear all metadata in collection...");
292 MetadataXMLFileManager.clearAllMetadataInCollection();
293 } else {
294 JOptionPane.showMessageDialog(parent,
295 Dictionary.get("ExportMeta.MoveMetaXMLToCSV_Failed_Message"),
296 Dictionary.get("General.Error"),
297 JOptionPane.ERROR_MESSAGE);
298 //System.err.println("@@@ Failed to properly export metadata.xml files' contents for this collection to CSV. Will not remove metadata.xml files");
299 }
300
301 return success;
302 }
303
304 /** If given a new file to create, creates the specified meta csv file from GLI's meta for the current collection.
305 * If the file exists, this will append the GLI metadata without checking if the file already contains the same entries. */
306 public boolean exportMetaXMLToCSV(File csvFile) {
307 boolean appendSetting = false;
308 boolean success = false;
309
310 if(csvFile.exists()) {
311 //appendSetting = true; // better to call the other version of this method in this case?
312 amalgamateAllMeta();
313 success = writeMetaToCSV(collMetaMap, csvFile, appendSetting);
314 } else { // no preexisting metadata.csv file, just write out GLI meta
315 TreeMap<File, TreeMap<String,TreeSet<String>>> assignedMeta = getAllAssignedMetadataForAllFiles();
316 success = writeMetaToCSV(assignedMeta, csvFile, appendSetting);
317 }
318
319 return success;
320 }
321
322 private boolean writeMetaToCSV(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap, File csvFile, boolean appendSetting) {
323 boolean success = true;
324
325 // First would need to write the row of all headings
326 TreeSet<String> metaFieldColumnHeadings = getAllCollHeadings(metaMap);
327 // Careful, collHeadings are alphabetically ordered, but not all docs may have meta for each column heading/metadata field name
328 // Need metadataFieldNames in an indexed array
329 Vector<String> columnHeadings = new Vector<String>(metaFieldColumnHeadings.size());
330 // put the Filename column as first item
331 columnHeadings.add("Filename");
332 columnHeadings.addAll(metaFieldColumnHeadings); // now have an indexed, yet still ordered, list of all column headings(the meta fieldnames)
333
334 CSVFormat customCSVFormat = CSVFormat.DEFAULT
335 .withDelimiter(meta_field_sep)
336 .withIgnoreSurroundingSpaces(false)
337 .withQuoteMode(QuoteMode.MINIMAL)
338 .withTrim();
339
340 try (CSVPrinter printer = new CSVPrinter(new FileWriter(csvFile, appendSetting), customCSVFormat)) {
341 printer.printRecord(columnHeadings);
342 // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
343 Iterator<File> iFiles = metaMap.keySet().iterator();
344 while(iFiles.hasNext()) {
345 File f = iFiles.next();
346 String relFilename = fileToRelativeString(f);
347 // write out the filename field of this record
348 printer.print(relFilename);
349
350 TreeMap<String, TreeSet<String>> fileMetadata = metaMap.get(f);
351 // now get each metadata field's value in the order of the column headings, and write them out
352 //for(String metaFieldName : columnHeadings) {
353 for(int i = 1; i < columnHeadings.size(); i++) { // skip past Filename coll heading, already written out
354 String metaFieldName = columnHeadings.get(i);
355 TreeSet<String> metavalues = fileMetadata.get(metaFieldName);
356 StringBuffer allMetaValuesForField = new StringBuffer();
357 if(metavalues == null || metavalues.size() == 0) {
358 // this file does not have (metavalues) such a metaFieldName, the cell for this column is empty
359 //System.err.println("No meta values for fieldname: " + metaFieldName);
360 printer.print(allMetaValuesForField);
361 } else {
362 for(String metavalue : metavalues) {
363 //metavalue = metavalue.trim();
364 allMetaValuesForField.append(meta_value_sep_char);
365 allMetaValuesForField.append(metavalue);
366 }
367 // write out the current metadata field of this record
368 // remove the extra meta_value_separator_char added the first time
369 printer.print(allMetaValuesForField.substring(1));
370 }
371 }
372
373 printer.println(); // done writing a record
374 }
375 } catch (IOException ex) {
376 success = false;
377 DebugStream.printStackTrace(ex);
378 System.err.println("Caught exception when writing meta to CSVFile " + csvFile.getAbsolutePath());
379 System.err.println("\t" + ex.getMessage());
380 }
381
382 return success;
383 }
384
385
386 public TreeMap<File, TreeMap<String,TreeSet<String>>> loadMetaFromCSVFile(File csvFile) {
387 TreeMap<File, TreeMap<String,TreeSet<String>>> csvFileMeta = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
388
389 if(!csvFile.exists()) {
390 return csvFileMeta;
391 }
392
393 Reader in = null;
394 //try(Reader in = new FileReader(csvFile);) { // try-with-resources may break on older Java that we use to build GS3 binaries
395 try {
396 in = new FileReader(csvFile);
397 boolean headingRow = true;
398
399 // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
400 CSVFormat lenientCSVFormat = CSVFormat.DEFAULT
401 .withDelimiter(meta_field_sep)
402 .withFirstRecordAsHeader()
403 .withCommentMarker('#')
404 .withIgnoreSurroundingSpaces()
405 .withTrim();
406
407 // https://stackoverflow.com/questions/36269387/get-csv-file-header-using-apache-commons
408 // The first col heading which is the Filename
409 // the remaining CSV column headings are the metadata field names
410
411 CSVParser parser = lenientCSVFormat.parse(in);
412
413 //String[] metaFieldNames = lenientCSVFormat.getHeader(); // didn't work
414 // getHeaders() returns List<String>, convert to String[] array
415 String[] metaFieldNames = parser.getHeaderNames().toArray(new String[0]);
416
417 for (CSVRecord record : parser) {
418
419 // a new row, represents a new file's meta
420 TreeMap<String,TreeSet<String>> meta = new TreeMap<String,TreeSet<String>>();
421
422 for(int i = 0; i < record.size(); i++) { //for (String field : record) {
423 String field = record.get(i);
424
425 if(i == 0) { // col 0 = Filename
426 String filename = field;
427 // TODO: filenames are stored relative to import folder, convert to full path for internal use?
428 File fullPathFile = new File(coll_importdir_path + filename);
429 ///System.err.println("Found Filename meta: " + filename);
430 csvFileMeta.put(fullPathFile, meta);
431 } else {
432 // not Filename, but metadata field name, add into meta map for this file
433 TreeSet<String> metaValues = new TreeSet<String>();
434 String metadataFieldName = metaFieldNames[i]; // get column heading=meta field name for current cell
435 meta.put(metadataFieldName, metaValues);
436 ///System.err.println("Found value for meta field: " + metadataFieldName);
437 // Split the field to get all metavalues for this metadata field name
438 // and add to metaValues set
439 String unparsedMetaVal = field.trim();
440 String[] metadataValues = unparsedMetaVal.split(meta_value_sep_re);
441 for(String metaVal : metadataValues) {
442 metaVal = metaVal.trim(); // get rid of whitespaces around separator char
443 if(!metaVal.equals("")) {
444 ///System.err.println("Found value for meta field: " + metaVal);
445 metaValues.add(metaVal);
446 }
447 }
448 }
449 }
450 }
451 } catch(Exception e) {
452 DebugStream.printStackTrace(e);
453 DebugStream.println("@@@ Error reading from CSV file: " + csvFile.getAbsolutePath());
454 } finally {
455 SafeProcess.closeResource(in);
456 }
457
458 //this.print(csvFileMeta);
459 return csvFileMeta;
460 }
461
462 /** For debugging */
463 public void print(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap ) {
464 Iterator<File> iFiles = metaMap.keySet().iterator();
465 while(iFiles.hasNext()) {
466 File f = iFiles.next();
467 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
468 if(metaFields != null) {
469 System.err.println("Meta for file: " + fileToRelativeString(f)); //f.getAbsolutePath());
470 }
471 Iterator<String> iMetaFields = metaFields.keySet().iterator();
472 if(!iMetaFields.hasNext()) {
473 System.err.println("No meta for file!");
474 }
475 while(iMetaFields.hasNext()) {
476 String fieldName = iMetaFields.next();
477 System.err.println("\tMetafield: " + fieldName);
478
479 TreeSet<String> metaValues = metaFields.get(fieldName);
480 Iterator<String> iMetaValues = metaValues.iterator();
481 while(iMetaValues.hasNext()) {
482 String metaValue = iMetaValues.next();
483 System.err.println("\t\tValue: " + metaValue);
484 }
485 }
486 }
487 }
488
489 /** For debugging */
490 public void printOrderedCollectionMeta() {
491 //TreeMap<File, TreeMap<String,TreeSet<String>>> collMetaMap = getAllAssignedMetadataForAllFiles();
492
493 amalgamateAllMeta();
494 this.print(collMetaMap);
495 }
496
497 public ArrayList<File> listFilesInCollection(String collection_directory_path) {
498
499 ///System.err.println("coll dir path: " + collection_directory_path);
500
501 // only files in import folder have meta. Don't list files outside import folder
502 File collDir = new File(collection_directory_path, IMPORT_DIRNAME);
503
504 ArrayList<File> files = new ArrayList<File>();
505
506 //FileFilter collDocsFilter = new CollectionDocFileFilter();
507 getAllFiles(files, collDir, this);
508
509 return files;
510 }
511
512 public void getAllFiles(ArrayList<File> files, File path, FileFilter filter) {
513 File[] fileList = path.listFiles(filter);
514 for(int i = 0; i < fileList.length; i++) {
515 File f = fileList[i];
516 if(f.isFile()) {
517 files.add(f);
518 } else {
519 getAllFiles(files, f, filter);
520 }
521 }
522 }
523
524 /** Filter to only accept Gathered GS documents
525 * to produce the list of files for which we need to export GLI metadata info to CSV.
526 */
527 //private class CollectionDocFileFilter implements FileFilter {
528 @Override
529 public boolean accept(File pathname) {
530 String tailname = pathname.getName();
531 if(pathname.isDirectory()) {
532 if(tailname.equals(".svn")) {
533 return false;
534 }
535 } else {
536 if(pathname.equals(metadataCSVFile)) { // skip any meta csv file user exported/put into import
537 return false;
538 } else if(tailname.equals("metadata.xml")) {
539 return false;
540 } else if(tailname.endsWith("~")) {
541 return false;
542 } else if(tailname.endsWith(".bak")) {
543 return false;
544 }
545 }
546 // accept all other file types
547 return true;
548 }
549 //}
550
551 public static File chooseMetaCSVFile(String defaultSearchPath, JFrame parent) {
552 JFileChooser chooser = new JFileChooser(defaultSearchPath);
553 chooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
554 chooser.setDialogTitle(Dictionary.get("ExportMeta.ChooseMetaCSVFile"));
555 FileNameExtensionFilter filter = new FileNameExtensionFilter("CSV spreadsheet file", "csv");
556 chooser.setFileFilter(filter);//.addChoosableFileFilter(filter);
557 int returnVal = chooser.showOpenDialog(parent);
558 if(returnVal == JFileChooser.APPROVE_OPTION) {
559 File selectedFile = chooser.getSelectedFile();
560 ///System.err.println("File selected: " + selectedFile.getAbsolutePath());
561 return selectedFile;
562 } else {
563 return null;
564 }
565 }
566}
567
568
569
570
Note: See TracBrowser for help on using the repository browser.