/** *######################################################################### * * A component of the Gatherer application, part of the Greenstone digital * library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * *

* * Author: Greenstone Digital Library, University of Waikato * *

* * Copyright (C) 2020 New Zealand Digital Library Project * *

* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * *

* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * *

* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *######################################################################## */ package org.greenstone.gatherer.metadata; import java.io.*; import java.util.*; import javax.swing.JFileChooser; import javax.swing.filechooser.FileNameExtensionFilter; import javax.swing.JFrame; import org.apache.commons.csv.*; import org.greenstone.gatherer.util.SafeProcess; //import org.greenstone.gatherer.Configuration; import org.greenstone.gatherer.DebugStream; import org.greenstone.gatherer.Dictionary; import org.greenstone.gatherer.Gatherer; //import org.greenstone.gatherer.gui.WarningDialog; import org.greenstone.gatherer.collection.Collection; import org.greenstone.gatherer.metadata.MetadataChangedListener; import org.greenstone.gatherer.metadata.MetadataElement; // import org.greenstone.gatherer.metadata.MetadataSet; import org.greenstone.gatherer.metadata.MetadataSetManager; import org.greenstone.gatherer.metadata.MetadataXMLFileManager; import org.greenstone.gatherer.metadata.MetadataValue;// /** * Class to export GLI metadata of a collection to a metadata.csv file. * This class can also merge GLI meta for the collection onto an existing metadata.csv file. * Merging is a cumulative process. * Duplicate entries and values are not preserved. * Uses TreeMap and TreeSet to keep everything alphabetically ordered. * TODO: What about ordering by unicode. Is that the natural ordering for Java Strings? * If so, this would support keeping metadata values ordered regardless of script used. */ public class MetadataToCSV implements FileFilter { private char meta_field_sep = ','; // comma is default field separator for CSV, comma separated values private String meta_value_sep_re = "\\|"; // must escape | to get regex private char meta_value_sep_char = '|'; // when written out to file private String collection_directory_path = ""; private String coll_importdir_path = ""; private final int import_path_length; /** The CSV metadata file to be read and rewritten. */ //private String metadataCSVFilename = "metadata.csv"; private File metadataCSVFile; /** Is this useful? * Not yet implemented: if this flag is true, then if a file mentioned in metadata.csv does not exist, * its entry is dropped and won't appear again when the metadata.csv is written out again. */ //private boolean removeMetaForFilesThatDoNotExist = false; private final String IMPORT_DIRNAME = "import"; /** A Map of all files/docs in this collection and their metadata, * itself tuples of metadata field names and their (possibly multiple) metadata values. */ TreeMap>> collMetaMap = new TreeMap>>(); public MetadataToCSV(String collDirPath) { this.collection_directory_path = collDirPath; this.coll_importdir_path = collDirPath + IMPORT_DIRNAME + File.separator; //new File(collDirPath, IMPORT_DIRNAME).getAbsolutePath(); import_path_length = this.coll_importdir_path.length(); this.metadataCSVFile = new File(coll_importdir_path, "metadata.csv"); } public MetadataToCSV(String collDirPath, File metadataCSV) { this(collDirPath); this.metadataCSVFile = metadataCSVFile; } public MetadataToCSV(String collDirPath, File metadataCSVFile, char metafieldSepChar, String readMetaValSepExpression, char writeMetaValSepChar) { this(collDirPath, metadataCSVFile); this.meta_field_sep = metafieldSepChar; this.meta_value_sep_re = readMetaValSepExpression; this.meta_value_sep_char = writeMetaValSepChar; } /** Remove import path prefix from given file. Returned is the path of file relative to import. */ public String fileToRelativeString(File f) { String fullPath = f.getAbsolutePath(); //System.err.println("@@@ fullpath: " + fullPath); //System.err.println("@@@ coll_importdir_path: " + this.coll_importdir_path); int indexMatch = fullPath.indexOf(coll_importdir_path); if(indexMatch == -1) { return fullPath; } else { return fullPath.substring(indexMatch+import_path_length); } } /** helper methods to export metadata for collection files to csv * Returns a Navigable Sorted Map of file names in the collection (relative to import folder), ordered alphabetically, * mapped to each file's metadata, sorted alphabetically by metadata field name, and list of metadata values sorted alphabetically */ public TreeMap>> getAllAssignedMetadataForAllFiles() { TreeMap>> files_with_meta = new TreeMap>>(); ArrayList files = listFilesInCollection(this.collection_directory_path); Iterator i = files.iterator(); while(i.hasNext()) { File f = i.next(); ArrayList file_meta = MetadataXMLFileManager.getMetadataAssignedToFile(f); //files_with_meta.put(f, file_meta); TreeMap> fileToMetaMap = new TreeMap>(); // debugging display ///System.err.println("Meta for file: " + f.getAbsolutePath()); Iterator it = file_meta.iterator(); while(it.hasNext()) { MetadataValue meta = (MetadataValue)it.next(); String metaValue = meta.getValue(); MetadataElement metaEl = meta.getMetadataElement(); String metaFieldName = metaEl.getFullName(); ///System.err.println(" field: " + metaFieldName); ///System.err.println(" value: " + metaValue); TreeSet vals = fileToMetaMap.get(metaFieldName); if(vals == null) { vals = new TreeSet(); vals.add(metaValue); fileToMetaMap.put(metaFieldName, vals); } else { vals.add(metaValue); } } files_with_meta.put(f, fileToMetaMap); } return files_with_meta; } // Get all meta in any metadata.csv file // and add to it all meta assigned for docs in this collection public void amalgamateAllMeta() { TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles(); TreeMap>> csvFileMeta = loadMetaFromCSVFile(this.metadataCSVFile); if(collMetaMap.size() == 0) { if(assignedMeta.keySet().size() > csvFileMeta.keySet().size()) { collMetaMap = assignedMeta; merge(collMetaMap, csvFileMeta); } else { collMetaMap = csvFileMeta; merge(collMetaMap, assignedMeta); } } else { merge(collMetaMap, assignedMeta); merge(collMetaMap, csvFileMeta); } } public TreeSet getAllCollHeadings(TreeMap>> metaMap) { TreeSet collHeadings = new TreeSet(); if(metaMap == null || metaMap.size() == 0) { return collHeadings; } // get all meta field names and add into collHeadings. As it's a TreeSet, // duplicates will be automatically ignored and collheadings will be sorted Iterator iFiles = metaMap.keySet().iterator(); while(iFiles.hasNext()) { File f = iFiles.next(); TreeMap> metaFields = metaMap.get(f); Iterator iMetaFields = metaFields.keySet().iterator(); while(iMetaFields.hasNext()) { String fieldName = iMetaFields.next(); collHeadings.add(fieldName); } } return collHeadings; } /** merge metaMap param into baseMetaMap: only portions not already present in baseMetaMap are added in * whether these are new file entries, new metadata field entries for extant files, or metadata values for extant fields of files. * A simple map.putALL() will not do the trick as collMetaMap is a complicated data structure. */ public void merge(TreeMap>> baseMetaMap, TreeMap>> metaMap) { if(metaMap == null || metaMap.size() == 0) { // nothing to do return; } Iterator iFiles = metaMap.keySet().iterator(); while(iFiles.hasNext()) { File f = iFiles.next(); // check if this file already has an entry in baseMetaMap TreeMap> origMetaFields = baseMetaMap.get(f); TreeMap> metaFields = metaMap.get(f); Iterator iMetaFields = metaFields.keySet().iterator(); // if file in metaMap didn't exist in baseMetaMap, easy: just copy its entry across in entirety if(origMetaFields == null) { metaMap.put(f, metaFields); continue; } // else, file already exists in baseMetaMap, need to check if we have to merge any meta on the file while(iMetaFields.hasNext()) { String fieldName = iMetaFields.next(); TreeSet metaValues = metaFields.get(fieldName); // check if this metadata field exists for the same file in baseMetaMap TreeSet origMetaValues = origMetaFields.get(fieldName); if(origMetaValues == null) { // this metadata field name did not exist for file in baseMetaMap, // so copy all vals for this fieldName into baseMetaMap's entry for this file origMetaFields.put(fieldName, metaValues); continue; // continue on inner loop } // else the meta fieldName existed for that file in baseMetaMap // Check if any of the metadata values didn't already exist, else add them in Iterator iMetaValues = metaValues.iterator(); while(iMetaValues.hasNext()) { String metaValue = iMetaValues.next(); if(!origMetaValues.contains(metaValue)) { origMetaValues.add(metaValue); } } } } } /** If successfully wrote out collection's meta from to a CSV file, * then will need to remove all meta from GLI (metadata.xml files). * Just del or rename those files to .bak? */ public void moveGLIMetaToCSV(File csvFile) { boolean success = exportGLIMetaToCSV(csvFile); // TODO if(success) { } else { System.err.println("Failed to export GLI metadata for this collection to CSV properly. Will not remove metadata.xml files"); } } /** If given a new file to create, creates the specified meta csv file from GLI's meta for the current collection. * If the file exists, this will append the GLI metadata without checking if the file already contains the same entries. */ public boolean exportGLIMetaToCSV(File csvFile) { boolean appendSetting = false; boolean success = false; // if(csvFile.exists()) { // appendSetting = true; // TODO: better to call the other version of this method in this case? // } // TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles(); // writeMetaToCSV(assignedMeta, csvFile, appendSetting); if(csvFile.exists()) { //appendSetting = true; // better to call the other version of this method in this case? amalgamateAllMeta(); success = writeMetaToCSV(collMetaMap, csvFile, appendSetting); } else { // no preexisting metadata.csv file, just write out GLI meta TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles(); success = writeMetaToCSV(assignedMeta, csvFile, appendSetting); } return success; } private boolean writeMetaToCSV(TreeMap>> metaMap, File csvFile, boolean appendSetting) { boolean success = true; // First would need to write the row of all headings TreeSet metaFieldColumnHeadings = getAllCollHeadings(metaMap); // Careful, collHeadings are alphabetically ordered, but not all docs may have meta for each column heading/metadata field name // Need metadataFieldNames in an indexed array Vector columnHeadings = new Vector(metaFieldColumnHeadings.size()); // put the Filename column as first item columnHeadings.add("Filename"); columnHeadings.addAll(metaFieldColumnHeadings); // now have an indexed, yet still ordered, list of all column headings(the meta fieldnames) CSVFormat customCSVFormat = CSVFormat.DEFAULT .withDelimiter(meta_field_sep) .withIgnoreSurroundingSpaces(false) .withQuoteMode(QuoteMode.MINIMAL) .withTrim(); try (CSVPrinter printer = new CSVPrinter(new FileWriter(csvFile, appendSetting), customCSVFormat)) { printer.printRecord(columnHeadings); // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html Iterator iFiles = metaMap.keySet().iterator(); while(iFiles.hasNext()) { File f = iFiles.next(); String relFilename = fileToRelativeString(f); // write out the filename field of this record printer.print(relFilename); TreeMap> fileMetadata = metaMap.get(f); // now get each metadata field's value in the order of the column headings, and write them out //for(String metaFieldName : columnHeadings) { for(int i = 1; i < columnHeadings.size(); i++) { // skip past Filename coll heading, already written out String metaFieldName = columnHeadings.get(i); TreeSet metavalues = fileMetadata.get(metaFieldName); StringBuffer allMetaValuesForField = new StringBuffer(); if(metavalues == null || metavalues.size() == 0) { // this file does not have (metavalues) such a metaFieldName, the cell for this column is empty //System.err.println("No meta values for fieldname: " + metaFieldName); printer.print(allMetaValuesForField); } else { for(String metavalue : metavalues) { //metavalue = metavalue.trim(); allMetaValuesForField.append(meta_value_sep_char); allMetaValuesForField.append(metavalue); } // write out the current metadata field of this record // remove the extra meta_value_separator_char added the first time printer.print(allMetaValuesForField.substring(1)); } } printer.println(); // done writing a record } } catch (IOException ex) { success = false; DebugStream.printStackTrace(ex); System.err.println("Caught exception when writing meta to CSVFile " + csvFile.getAbsolutePath()); System.err.println("\t" + ex.getMessage()); } return success; } public TreeMap>> loadMetaFromCSVFile(File csvFile) { TreeMap>> csvFileMeta = new TreeMap>>(); if(!csvFile.exists()) { return csvFileMeta; } Reader in = null; //try(Reader in = new FileReader(csvFile);) { // try-with-resources may break on older Java that we use to build GS3 binaries try { in = new FileReader(csvFile); boolean headingRow = true; // https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html CSVFormat lenientCSVFormat = CSVFormat.DEFAULT .withDelimiter(meta_field_sep) .withFirstRecordAsHeader() .withCommentMarker('#') .withIgnoreSurroundingSpaces() .withTrim(); // https://stackoverflow.com/questions/36269387/get-csv-file-header-using-apache-commons // The first col heading which is the Filename // the remaining CSV column headings are the metadata field names CSVParser parser = lenientCSVFormat.parse(in); //String[] metaFieldNames = lenientCSVFormat.getHeader(); // didn't work // getHeaders() returns List, convert to String[] array String[] metaFieldNames = parser.getHeaderNames().toArray(new String[0]); for (CSVRecord record : parser) { // a new row, represents a new file's meta TreeMap> meta = new TreeMap>(); for(int i = 0; i < record.size(); i++) { //for (String field : record) { String field = record.get(i); if(i == 0) { // col 0 = Filename String filename = field; // TODO: filenames are stored relative to import folder, convert to full path for internal use? File fullPathFile = new File(coll_importdir_path + filename); ///System.err.println("Found Filename meta: " + filename); csvFileMeta.put(fullPathFile, meta); } else { // not Filename, but metadata field name, add into meta map for this file TreeSet metaValues = new TreeSet(); String metadataFieldName = metaFieldNames[i]; // get column heading=meta field name for current cell meta.put(metadataFieldName, metaValues); ///System.err.println("Found value for meta field: " + metadataFieldName); // Split the field to get all metavalues for this metadata field name // and add to metaValues set String unparsedMetaVal = field.trim(); String[] metadataValues = unparsedMetaVal.split(meta_value_sep_re); for(String metaVal : metadataValues) { metaVal = metaVal.trim(); // get rid of whitespaces around separator char if(!metaVal.equals("")) { ///System.err.println("Found value for meta field: " + metaVal); metaValues.add(metaVal); } } } } } } catch(Exception e) { DebugStream.printStackTrace(e); DebugStream.println("@@@ Error reading from CSV file: " + csvFile.getAbsolutePath()); } finally { SafeProcess.closeResource(in); } //this.print(csvFileMeta); return csvFileMeta; } /** For debugging */ public void print(TreeMap>> metaMap ) { Iterator iFiles = metaMap.keySet().iterator(); while(iFiles.hasNext()) { File f = iFiles.next(); TreeMap> metaFields = metaMap.get(f); if(metaFields != null) { System.err.println("Meta for file: " + fileToRelativeString(f)); //f.getAbsolutePath()); } Iterator iMetaFields = metaFields.keySet().iterator(); if(!iMetaFields.hasNext()) { System.err.println("No meta for file!"); } while(iMetaFields.hasNext()) { String fieldName = iMetaFields.next(); System.err.println("\tMetafield: " + fieldName); TreeSet metaValues = metaFields.get(fieldName); Iterator iMetaValues = metaValues.iterator(); while(iMetaValues.hasNext()) { String metaValue = iMetaValues.next(); System.err.println("\t\tValue: " + metaValue); } } } } /** For debugging */ public void printOrderedCollectionMeta() { //TreeMap>> collMetaMap = getAllAssignedMetadataForAllFiles(); amalgamateAllMeta(); this.print(collMetaMap); } public ArrayList listFilesInCollection(String collection_directory_path) { ///System.err.println("coll dir path: " + collection_directory_path); // only files in import folder have meta. Don't list files outside import folder File collDir = new File(collection_directory_path, IMPORT_DIRNAME); ArrayList files = new ArrayList(); //FileFilter collDocsFilter = new CollectionDocFileFilter(); getAllFiles(files, collDir, this); return files; } public void getAllFiles(ArrayList files, File path, FileFilter filter) { File[] fileList = path.listFiles(filter); for(int i = 0; i < fileList.length; i++) { File f = fileList[i]; if(f.isFile()) { files.add(f); } else { getAllFiles(files, f, filter); } } } /** Filter to only allow Gathered GS documents * to produce the list of files for which we need to export GLI metadata info to CSV. */ //private class CollectionDocFileFilter implements FileFilter { @Override public boolean accept(File pathname) { String tailname = pathname.getName(); if(pathname.isDirectory()) { if(tailname.equals(".svn")) { return false; } } else { if(pathname.equals(metadataCSVFile)) { // skip any meta csv file user exported/put into import return false; } else if(tailname.equals("metadata.xml")) { return false; } else if(tailname.endsWith("~")) { return false; } else if(tailname.endsWith(".bak")) { return false; } } // accept all other file types return true; } //} public static File chooseMetaCSVFile(String defaultSearchPath, JFrame parent) { JFileChooser chooser = new JFileChooser(defaultSearchPath); chooser.setFileSelectionMode(JFileChooser.FILES_ONLY); chooser.setDialogTitle(Dictionary.get("ExportMeta.ChooseMetaCSVFile")); FileNameExtensionFilter filter = new FileNameExtensionFilter("CSV spreadsheet file", "csv"); chooser.setFileFilter(filter);//.addChoosableFileFilter(filter); int returnVal = chooser.showOpenDialog(parent); if(returnVal == JFileChooser.APPROVE_OPTION) { File selectedFile = chooser.getSelectedFile(); ///System.err.println("File selected: " + selectedFile.getAbsolutePath()); return selectedFile; } else { return null; } } }