source: main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataToCSV.java@ 34263

Last change on this file since 34263 was 34263, checked in by ak19, 4 years ago

Option in GLI file menu to Export collection meta to CSV (new CSV file or add to existing). Uses apache commons CSV, so had to include the csv subfolder of commons into apache.jar, where all other apache jars used by GLI are bundled into. Updated the gli/lib/README about this.

File size: 21.3 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 2020 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.metadata;
38
39import java.io.*;
40import java.util.*;
41import javax.swing.JFileChooser;
42import javax.swing.filechooser.FileNameExtensionFilter;
43import javax.swing.JFrame;
44
45import org.apache.commons.csv.*;
46
47import org.greenstone.gatherer.util.SafeProcess;
48//import org.greenstone.gatherer.Configuration;
49import org.greenstone.gatherer.DebugStream;
50import org.greenstone.gatherer.Dictionary;
51import org.greenstone.gatherer.Gatherer;
52//import org.greenstone.gatherer.gui.WarningDialog;
53import org.greenstone.gatherer.collection.Collection;
54import org.greenstone.gatherer.metadata.MetadataChangedListener;
55import org.greenstone.gatherer.metadata.MetadataElement; //
56import org.greenstone.gatherer.metadata.MetadataSet;
57import org.greenstone.gatherer.metadata.MetadataSetManager;
58import org.greenstone.gatherer.metadata.MetadataXMLFileManager;
59import org.greenstone.gatherer.metadata.MetadataValue;//
60
61
/**
 * Class to export the GLI metadata of a collection to a metadata.csv file.
 * This class can also merge the GLI metadata for the collection into an existing metadata.csv file.
 * Merging is a cumulative process: entries are added to what is already there.
 * Duplicate entries and duplicate values are collapsed into a single one.
 * Uses TreeMap and TreeSet to keep everything in sorted order.
 * TODO: What about ordering by Unicode? Is that the natural ordering for Java Strings?
 * If so, this would support keeping metadata values ordered regardless of the script used.
 */
71public class MetadataToCSV implements FileFilter {
72 private char meta_field_sep = ','; // comma is default field separator for CSV, comma separated values
73 private String meta_value_sep_re = "\\|"; // must escape | to get regex
74 private char meta_value_sep_char = '|'; // when written out to file
75 private String collection_directory_path = "";
76 private String coll_importdir_path = "";
77 private final int import_path_length;
78
79 /** The CSV metadata file to be read and rewritten. */
80 //private String metadataCSVFilename = "metadata.csv";
81 private File metadataCSVFile;
82
83 /** Is this useful?
84 * Not yet implemented: if this flag is true, then if a file mentioned in metadata.csv does not exist,
85 * its entry is dropped and won't appear again when the metadata.csv is written out again.
86 */
87 //private boolean removeMetaForFilesThatDoNotExist = false;
88
89 private final String IMPORT_DIRNAME = "import";
90
91 /** A Map of all files/docs in this collection and their metadata,
92 * itself tuples of metadata field names and their (possibly multiple) metadata values. */
93 TreeMap<File, TreeMap<String,TreeSet<String>>> collMetaMap = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
94
95 public MetadataToCSV(String collDirPath) {
96 this.collection_directory_path = collDirPath;
97 this.coll_importdir_path = collDirPath + IMPORT_DIRNAME + File.separator; //new File(collDirPath, IMPORT_DIRNAME).getAbsolutePath();
98 import_path_length = this.coll_importdir_path.length();
99 this.metadataCSVFile = new File(coll_importdir_path, "metadata.csv");
100 }
101
/**
 * Creates an exporter for the collection stored in the given directory that reads from
 * and writes to the given CSV file instead of the default one in the import folder.
 *
 * @param collDirPath path of the collection directory
 * @param metadataCSV the CSV metadata file to use
 */
public MetadataToCSV(String collDirPath, File metadataCSV) {
    this(collDirPath);
    // Assign the parameter: the field and the parameter have different names, so
    // "this.metadataCSVFile = metadataCSVFile" would merely assign the field to itself.
    this.metadataCSVFile = metadataCSV;
}
106
/**
 * Creates an exporter with custom separators.
 *
 * @param collDirPath path of the collection directory
 * @param metadataCSVFile the CSV metadata file to use
 * @param metafieldSepChar character separating the columns of the CSV file
 * @param readMetaValSepExpression regular expression used to split a cell into metadata values when reading
 * @param writeMetaValSepChar character written between the metadata values of one cell
 */
public MetadataToCSV(String collDirPath, File metadataCSVFile, char metafieldSepChar, String readMetaValSepExpression, char writeMetaValSepChar) {
    this(collDirPath, metadataCSVFile);

    // override the default separators set by the field initialisers
    this.meta_value_sep_char = writeMetaValSepChar;
    this.meta_value_sep_re = readMetaValSepExpression;
    this.meta_field_sep = metafieldSepChar;
}
113
114 /** Remove import path prefix from given file. Returned is the path of file relative to import. */
115 public String fileToRelativeString(File f) {
116 String fullPath = f.getAbsolutePath();
117 //System.err.println("@@@ fullpath: " + fullPath);
118 //System.err.println("@@@ coll_importdir_path: " + this.coll_importdir_path);
119 int indexMatch = fullPath.indexOf(coll_importdir_path);
120 if(indexMatch == -1) {
121 return fullPath;
122 } else {
123 return fullPath.substring(indexMatch+import_path_length);
124 }
125 }
126
127
128 /** helper methods to export metadata for collection files to csv
129 * Returns a Navigable Sorted Map of file names in the collection (relative to import folder), ordered alphabetically,
130 * mapped to each file's metadata, sorted alphabetically by metadata field name, and list of metadata values sorted alphabetically
131 */
132 public TreeMap<File, TreeMap<String,TreeSet<String>>> getAllAssignedMetadataForAllFiles() {
133 TreeMap<File, TreeMap<String,TreeSet<String>>> files_with_meta = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
134
135 ArrayList<File> files = listFilesInCollection(this.collection_directory_path);
136 Iterator<File> i = files.iterator();
137
138 while(i.hasNext()) {
139 File f = i.next();
140 ArrayList file_meta = MetadataXMLFileManager.getMetadataAssignedToFile(f);
141
142 //files_with_meta.put(f, file_meta);
143 TreeMap<String,TreeSet<String>> fileToMetaMap = new TreeMap<String,TreeSet<String>>();
144
145 // debugging display
146 ///System.err.println("Meta for file: " + f.getAbsolutePath());
147 Iterator it = file_meta.iterator();
148 while(it.hasNext()) {
149 MetadataValue meta = (MetadataValue)it.next();
150 String metaValue = meta.getValue();
151 MetadataElement metaEl = meta.getMetadataElement();
152 String metaFieldName = metaEl.getFullName();
153 ///System.err.println(" field: " + metaFieldName);
154 ///System.err.println(" value: " + metaValue);
155
156 TreeSet<String> vals = fileToMetaMap.get(metaFieldName);
157 if(vals == null) {
158 vals = new TreeSet<String>();
159 vals.add(metaValue);
160 fileToMetaMap.put(metaFieldName, vals);
161 } else {
162 vals.add(metaValue);
163 }
164 }
165
166 files_with_meta.put(f, fileToMetaMap);
167 }
168
169 return files_with_meta;
170 }
171
172 // Get all meta in any metadata.csv file
173 // and add to it all meta assigned for docs in this collection
174 public void amalgamateAllMeta() {
175 TreeMap<File, TreeMap<String,TreeSet<String>>> assignedMeta = getAllAssignedMetadataForAllFiles();
176 TreeMap<File, TreeMap<String,TreeSet<String>>> csvFileMeta = loadMetaFromCSVFile(this.metadataCSVFile);
177
178 if(collMetaMap.size() == 0) {
179
180 if(assignedMeta.keySet().size() > csvFileMeta.keySet().size()) {
181 collMetaMap = assignedMeta;
182 merge(collMetaMap, csvFileMeta);
183 } else {
184 collMetaMap = csvFileMeta;
185 merge(collMetaMap, assignedMeta);
186 }
187 } else {
188
189 merge(collMetaMap, assignedMeta);
190 merge(collMetaMap, csvFileMeta);
191 }
192
193 }
194
195 public TreeSet<String> getAllCollHeadings(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap) {
196 TreeSet<String> collHeadings = new TreeSet<String>();
197
198 if(metaMap == null || metaMap.size() == 0) {
199 return collHeadings;
200 }
201 // get all meta field names and add into collHeadings. As it's a TreeSet,
202 // duplicates will be automatically ignored and collheadings will be sorted
203 Iterator<File> iFiles = metaMap.keySet().iterator();
204 while(iFiles.hasNext()) {
205 File f = iFiles.next();
206 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
207 Iterator<String> iMetaFields = metaFields.keySet().iterator();
208 while(iMetaFields.hasNext()) {
209 String fieldName = iMetaFields.next();
210 collHeadings.add(fieldName);
211 }
212 }
213
214 return collHeadings;
215 }
216
217 /** merge metaMap param into baseMetaMap: only portions not already present in baseMetaMap are added in
218 * whether these are new file entries, new metadata field entries for extant files, or metadata values for extant fields of files.
219 * A simple map.putALL() will not do the trick as collMetaMap is a complicated data structure.
220 */
221 public void merge(TreeMap<File, TreeMap<String,TreeSet<String>>> baseMetaMap, TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap) {
222
223 if(metaMap == null || metaMap.size() == 0) {
224 // nothing to do
225 return;
226 }
227
228 Iterator<File> iFiles = metaMap.keySet().iterator();
229 while(iFiles.hasNext()) {
230 File f = iFiles.next();
231
232 // check if this file already has an entry in baseMetaMap
233 TreeMap<String, TreeSet<String>> origMetaFields = baseMetaMap.get(f);
234
235 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
236 Iterator<String> iMetaFields = metaFields.keySet().iterator();
237
238 // if file in metaMap didn't exist in baseMetaMap, easy: just copy its entry across in entirety
239 if(origMetaFields == null) {
240 metaMap.put(f, metaFields);
241 continue;
242 }
243
244 // else, file already exists in baseMetaMap, need to check if we have to merge any meta on the file
245 while(iMetaFields.hasNext()) {
246 String fieldName = iMetaFields.next();
247 TreeSet<String> metaValues = metaFields.get(fieldName);
248
249 // check if this metadata field exists for the same file in baseMetaMap
250 TreeSet<String> origMetaValues = origMetaFields.get(fieldName);
251 if(origMetaValues == null) { // this metadata field name did not exist for file in baseMetaMap,
252 // so copy all vals for this fieldName into baseMetaMap's entry for this file
253 origMetaFields.put(fieldName, metaValues);
254 continue; // continue on inner loop
255 }
256
257 // else the meta fieldName existed for that file in baseMetaMap
258 // Check if any of the metadata values didn't already exist, else add them in
259 Iterator<String> iMetaValues = metaValues.iterator();
260 while(iMetaValues.hasNext()) {
261 String metaValue = iMetaValues.next();
262
263 if(!origMetaValues.contains(metaValue)) {
264 origMetaValues.add(metaValue);
265 }
266 }
267
268 }
269 }
270 }
271
272
273 /** If successfully wrote out collection's meta from to a CSV file,
274 * then will need to remove all meta from GLI (metadata.xml files).
275 * Just del or rename those files to .bak?
276 */
277 public void moveGLIMetaToCSV(File csvFile) {
278 boolean success = exportGLIMetaToCSV(csvFile);
279 // TODO
280 if(success) {
281 } else {
282 System.err.println("Failed to export GLI metadata for this collection to CSV properly. Will not remove metadata.xml files");
283 }
284 }
285
286 /** If given a new file to create, creates the specified meta csv file from GLI's meta for the current collection.
287 * If the file exists, this will append the GLI metadata without checking if the file already contains the same entries. */
288 public boolean exportGLIMetaToCSV(File csvFile) {
289 boolean appendSetting = false;
290 boolean success = false;
291
292 // if(csvFile.exists()) {
293 // appendSetting = true; // TODO: better to call the other version of this method in this case?
294 // }
295 // TreeMap<File, TreeMap<String,TreeSet<String>>> assignedMeta = getAllAssignedMetadataForAllFiles();
296 // writeMetaToCSV(assignedMeta, csvFile, appendSetting);
297
298 if(csvFile.exists()) {
299 //appendSetting = true; // better to call the other version of this method in this case?
300 amalgamateAllMeta();
301 success = writeMetaToCSV(collMetaMap, csvFile, appendSetting);
302 } else { // no preexisting metadata.csv file, just write out GLI meta
303 TreeMap<File, TreeMap<String,TreeSet<String>>> assignedMeta = getAllAssignedMetadataForAllFiles();
304 success = writeMetaToCSV(assignedMeta, csvFile, appendSetting);
305 }
306
307 return success;
308 }
309
310 private boolean writeMetaToCSV(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap, File csvFile, boolean appendSetting) {
311 boolean success = true;
312
313 // First would need to write the row of all headings
314 TreeSet<String> metaFieldColumnHeadings = getAllCollHeadings(metaMap);
315 // Careful, collHeadings are alphabetically ordered, but not all docs may have meta for each column heading/metadata field name
316 // Need metadataFieldNames in an indexed array
317 Vector<String> columnHeadings = new Vector<String>(metaFieldColumnHeadings.size());
318 // put the Filename column as first item
319 columnHeadings.add("Filename");
320 columnHeadings.addAll(metaFieldColumnHeadings); // now have an indexed, yet still ordered, list of all column headings(the meta fieldnames)
321
322 CSVFormat customCSVFormat = CSVFormat.DEFAULT
323 .withDelimiter(meta_field_sep)
324 .withIgnoreSurroundingSpaces(false)
325 .withQuoteMode(QuoteMode.MINIMAL)
326 .withTrim();
327
328 try (CSVPrinter printer = new CSVPrinter(new FileWriter(csvFile, appendSetting), customCSVFormat)) {
329 printer.printRecord(columnHeadings);
330 // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
331 Iterator<File> iFiles = metaMap.keySet().iterator();
332 while(iFiles.hasNext()) {
333 File f = iFiles.next();
334 String relFilename = fileToRelativeString(f);
335 // write out the filename field of this record
336 printer.print(relFilename);
337
338 TreeMap<String, TreeSet<String>> fileMetadata = metaMap.get(f);
339 // now get each metadata field's value in the order of the column headings, and write them out
340 //for(String metaFieldName : columnHeadings) {
341 for(int i = 1; i < columnHeadings.size(); i++) { // skip past Filename coll heading, already written out
342 String metaFieldName = columnHeadings.get(i);
343 TreeSet<String> metavalues = fileMetadata.get(metaFieldName);
344 StringBuffer allMetaValuesForField = new StringBuffer();
345 if(metavalues == null || metavalues.size() == 0) {
346 // this file does not have (metavalues) such a metaFieldName, the cell for this column is empty
347 //System.err.println("No meta values for fieldname: " + metaFieldName);
348 printer.print(allMetaValuesForField);
349 } else {
350 for(String metavalue : metavalues) {
351 //metavalue = metavalue.trim();
352 allMetaValuesForField.append(meta_value_sep_char);
353 allMetaValuesForField.append(metavalue);
354 }
355 // write out the current metadata field of this record
356 // remove the extra meta_value_separator_char added the first time
357 printer.print(allMetaValuesForField.substring(1));
358 }
359 }
360
361 printer.println(); // done writing a record
362 }
363 } catch (IOException ex) {
364 success = false;
365 DebugStream.printStackTrace(ex);
366 System.err.println("Caught exception when writing meta to CSVFile " + csvFile.getAbsolutePath());
367 System.err.println("\t" + ex.getMessage());
368 }
369
370 return success;
371 }
372
373
374 public TreeMap<File, TreeMap<String,TreeSet<String>>> loadMetaFromCSVFile(File csvFile) {
375 TreeMap<File, TreeMap<String,TreeSet<String>>> csvFileMeta = new TreeMap<File, TreeMap<String,TreeSet<String>>>();
376
377 if(!csvFile.exists()) {
378 return csvFileMeta;
379 }
380
381 Reader in = null;
382 //try(Reader in = new FileReader(csvFile);) { // try-with-resources may break on older Java that we use to build GS3 binaries
383 try {
384 in = new FileReader(csvFile);
385 boolean headingRow = true;
386
387 // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
388 CSVFormat lenientCSVFormat = CSVFormat.DEFAULT
389 .withDelimiter(meta_field_sep)
390 .withFirstRecordAsHeader()
391 .withCommentMarker('#')
392 .withIgnoreSurroundingSpaces()
393 .withTrim();
394
395 // https://stackoverflow.com/questions/36269387/get-csv-file-header-using-apache-commons
396 // The first col heading which is the Filename
397 // the remaining CSV column headings are the metadata field names
398
399 CSVParser parser = lenientCSVFormat.parse(in);
400
401 //String[] metaFieldNames = lenientCSVFormat.getHeader(); // didn't work
402 // getHeaders() returns List<String>, convert to String[] array
403 String[] metaFieldNames = parser.getHeaderNames().toArray(new String[0]);
404
405 for (CSVRecord record : parser) {
406
407 // a new row, represents a new file's meta
408 TreeMap<String,TreeSet<String>> meta = new TreeMap<String,TreeSet<String>>();
409
410 for(int i = 0; i < record.size(); i++) { //for (String field : record) {
411 String field = record.get(i);
412
413 if(i == 0) { // col 0 = Filename
414 String filename = field;
415 // TODO: filenames are stored relative to import folder, convert to full path for internal use?
416 File fullPathFile = new File(coll_importdir_path + filename);
417 ///System.err.println("Found Filename meta: " + filename);
418 csvFileMeta.put(fullPathFile, meta);
419 } else {
420 // not Filename, but metadata field name, add into meta map for this file
421 TreeSet<String> metaValues = new TreeSet<String>();
422 String metadataFieldName = metaFieldNames[i]; // get column heading=meta field name for current cell
423 meta.put(metadataFieldName, metaValues);
424 ///System.err.println("Found value for meta field: " + metadataFieldName);
425 // Split the field to get all metavalues for this metadata field name
426 // and add to metaValues set
427 String unparsedMetaVal = field.trim();
428 String[] metadataValues = unparsedMetaVal.split(meta_value_sep_re);
429 for(String metaVal : metadataValues) {
430 metaVal = metaVal.trim(); // get rid of whitespaces around separator char
431 if(!metaVal.equals("")) {
432 ///System.err.println("Found value for meta field: " + metaVal);
433 metaValues.add(metaVal);
434 }
435 }
436 }
437 }
438 }
439 } catch(Exception e) {
440 DebugStream.printStackTrace(e);
441 DebugStream.println("@@@ Error reading from CSV file: " + csvFile.getAbsolutePath());
442 } finally {
443 SafeProcess.closeResource(in);
444 }
445
446 //this.print(csvFileMeta);
447 return csvFileMeta;
448 }
449
450 /** For debugging */
451 public void print(TreeMap<File, TreeMap<String,TreeSet<String>>> metaMap ) {
452 Iterator<File> iFiles = metaMap.keySet().iterator();
453 while(iFiles.hasNext()) {
454 File f = iFiles.next();
455 TreeMap<String, TreeSet<String>> metaFields = metaMap.get(f);
456 if(metaFields != null) {
457 System.err.println("Meta for file: " + fileToRelativeString(f)); //f.getAbsolutePath());
458 }
459 Iterator<String> iMetaFields = metaFields.keySet().iterator();
460 if(!iMetaFields.hasNext()) {
461 System.err.println("No meta for file!");
462 }
463 while(iMetaFields.hasNext()) {
464 String fieldName = iMetaFields.next();
465 System.err.println("\tMetafield: " + fieldName);
466
467 TreeSet<String> metaValues = metaFields.get(fieldName);
468 Iterator<String> iMetaValues = metaValues.iterator();
469 while(iMetaValues.hasNext()) {
470 String metaValue = iMetaValues.next();
471 System.err.println("\t\tValue: " + metaValue);
472 }
473 }
474 }
475 }
476
477 /** For debugging */
478 public void printOrderedCollectionMeta() {
479 //TreeMap<File, TreeMap<String,TreeSet<String>>> collMetaMap = getAllAssignedMetadataForAllFiles();
480
481 amalgamateAllMeta();
482 this.print(collMetaMap);
483 }
484
485 public ArrayList<File> listFilesInCollection(String collection_directory_path) {
486
487 ///System.err.println("coll dir path: " + collection_directory_path);
488
489 // only files in import folder have meta. Don't list files outside import folder
490 File collDir = new File(collection_directory_path, IMPORT_DIRNAME);
491
492 ArrayList<File> files = new ArrayList<File>();
493
494 //FileFilter collDocsFilter = new CollectionDocFileFilter();
495 getAllFiles(files, collDir, this);
496
497 return files;
498 }
499
500 public void getAllFiles(ArrayList<File> files, File path, FileFilter filter) {
501 File[] fileList = path.listFiles(filter);
502 for(int i = 0; i < fileList.length; i++) {
503 File f = fileList[i];
504 if(f.isFile()) {
505 files.add(f);
506 } else {
507 getAllFiles(files, f, filter);
508 }
509 }
510 }
511
512 /** Filter to only allow Gathered GS documents
513 * to produce the list of files for which we need to export GLI metadata info to CSV.
514 */
515 //private class CollectionDocFileFilter implements FileFilter {
516 @Override
517 public boolean accept(File pathname) {
518 String tailname = pathname.getName();
519 if(pathname.isDirectory()) {
520 if(tailname.equals(".svn")) {
521 return false;
522 }
523 } else {
524 if(pathname.equals(metadataCSVFile)) { // skip any meta csv file user exported/put into import
525 return false;
526 } else if(tailname.equals("metadata.xml")) {
527 return false;
528 } else if(tailname.endsWith("~")) {
529 return false;
530 } else if(tailname.endsWith(".bak")) {
531 return false;
532 }
533 }
534 // accept all other file types
535 return true;
536 }
537 //}
538
539 public static File chooseMetaCSVFile(String defaultSearchPath, JFrame parent) {
540 JFileChooser chooser = new JFileChooser(defaultSearchPath);
541 chooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
542 chooser.setDialogTitle(Dictionary.get("ExportMeta.ChooseMetaCSVFile"));
543 FileNameExtensionFilter filter = new FileNameExtensionFilter("CSV spreadsheet file", "csv");
544 chooser.setFileFilter(filter);//.addChoosableFileFilter(filter);
545 int returnVal = chooser.showOpenDialog(parent);
546 if(returnVal == JFileChooser.APPROVE_OPTION) {
547 File selectedFile = chooser.getSelectedFile();
548 ///System.err.println("File selected: " + selectedFile.getAbsolutePath());
549 return selectedFile;
550 } else {
551 return null;
552 }
553 }
554}
555
556
557
558
Note: See TracBrowser for help on using the repository browser.