/**
*#########################################################################
*
* A component of the Gatherer application, part of the Greenstone digital
* library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
*
*
* Author: Greenstone Digital Library, University of Waikato
*
*
*
* Copyright (C) 2020 New Zealand Digital Library Project
*
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.metadata;
import java.io.*;
import java.util.*;
import javax.swing.JFileChooser;
import javax.swing.filechooser.FileNameExtensionFilter;
import javax.swing.JFrame;
import org.apache.commons.csv.*;
import org.greenstone.gatherer.util.SafeProcess;
//import org.greenstone.gatherer.Configuration;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
//import org.greenstone.gatherer.gui.WarningDialog;
import org.greenstone.gatherer.collection.Collection;
import org.greenstone.gatherer.metadata.MetadataChangedListener;
import org.greenstone.gatherer.metadata.MetadataElement; //
import org.greenstone.gatherer.metadata.MetadataSet;
import org.greenstone.gatherer.metadata.MetadataSetManager;
import org.greenstone.gatherer.metadata.MetadataXMLFileManager;
import org.greenstone.gatherer.metadata.MetadataValue;//
/**
* Class to export GLI metadata of a collection to a metadata.csv file.
* This class can also merge GLI meta for the collection onto an existing metadata.csv file.
* Merging is a cumulative process.
* Duplicate entries and values are not preserved.
* Uses TreeMap and TreeSet to keep everything alphabetically ordered.
* TODO: What about ordering by unicode. Is that the natural ordering for Java Strings?
* If so, this would support keeping metadata values ordered regardless of script used.
*/
public class MetadataToCSV implements FileFilter {
private char meta_field_sep = ','; // comma is default field separator for CSV, comma separated values
private String meta_value_sep_re = "\\|"; // must escape | to get regex
private char meta_value_sep_char = '|'; // when written out to file
private String collection_directory_path = "";
private String coll_importdir_path = "";
private final int import_path_length;
/** The CSV metadata file to be read and rewritten. */
//private String metadataCSVFilename = "metadata.csv";
private File metadataCSVFile;
/** Is this useful?
* Not yet implemented: if this flag is true, then if a file mentioned in metadata.csv does not exist,
* its entry is dropped and won't appear again when the metadata.csv is written out again.
*/
//private boolean removeMetaForFilesThatDoNotExist = false;
private final String IMPORT_DIRNAME = "import";
/** A Map of all files/docs in this collection and their metadata,
* itself tuples of metadata field names and their (possibly multiple) metadata values. */
TreeMap>> collMetaMap = new TreeMap>>();
public MetadataToCSV(String collDirPath) {
this.collection_directory_path = collDirPath;
this.coll_importdir_path = collDirPath + IMPORT_DIRNAME + File.separator; //new File(collDirPath, IMPORT_DIRNAME).getAbsolutePath();
import_path_length = this.coll_importdir_path.length();
this.metadataCSVFile = new File(coll_importdir_path, "metadata.csv");
}
public MetadataToCSV(String collDirPath, File metadataCSV) {
this(collDirPath);
this.metadataCSVFile = metadataCSVFile;
}
public MetadataToCSV(String collDirPath, File metadataCSVFile, char metafieldSepChar, String readMetaValSepExpression, char writeMetaValSepChar) {
this(collDirPath, metadataCSVFile);
this.meta_field_sep = metafieldSepChar;
this.meta_value_sep_re = readMetaValSepExpression;
this.meta_value_sep_char = writeMetaValSepChar;
}
/** Remove import path prefix from given file. Returned is the path of file relative to import. */
public String fileToRelativeString(File f) {
String fullPath = f.getAbsolutePath();
//System.err.println("@@@ fullpath: " + fullPath);
//System.err.println("@@@ coll_importdir_path: " + this.coll_importdir_path);
int indexMatch = fullPath.indexOf(coll_importdir_path);
if(indexMatch == -1) {
return fullPath;
} else {
return fullPath.substring(indexMatch+import_path_length);
}
}
/** helper methods to export metadata for collection files to csv
* Returns a Navigable Sorted Map of file names in the collection (relative to import folder), ordered alphabetically,
* mapped to each file's metadata, sorted alphabetically by metadata field name, and list of metadata values sorted alphabetically
*/
public TreeMap>> getAllAssignedMetadataForAllFiles() {
TreeMap>> files_with_meta = new TreeMap>>();
ArrayList files = listFilesInCollection(this.collection_directory_path);
Iterator i = files.iterator();
while(i.hasNext()) {
File f = i.next();
ArrayList file_meta = MetadataXMLFileManager.getMetadataAssignedToFile(f);
//files_with_meta.put(f, file_meta);
TreeMap> fileToMetaMap = new TreeMap>();
// debugging display
///System.err.println("Meta for file: " + f.getAbsolutePath());
Iterator it = file_meta.iterator();
while(it.hasNext()) {
MetadataValue meta = (MetadataValue)it.next();
String metaValue = meta.getValue();
MetadataElement metaEl = meta.getMetadataElement();
String metaFieldName = metaEl.getFullName();
///System.err.println(" field: " + metaFieldName);
///System.err.println(" value: " + metaValue);
TreeSet vals = fileToMetaMap.get(metaFieldName);
if(vals == null) {
vals = new TreeSet();
vals.add(metaValue);
fileToMetaMap.put(metaFieldName, vals);
} else {
vals.add(metaValue);
}
}
files_with_meta.put(f, fileToMetaMap);
}
return files_with_meta;
}
// Get all meta in any metadata.csv file
// and add to it all meta assigned for docs in this collection
public void amalgamateAllMeta() {
TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles();
TreeMap>> csvFileMeta = loadMetaFromCSVFile(this.metadataCSVFile);
if(collMetaMap.size() == 0) {
if(assignedMeta.keySet().size() > csvFileMeta.keySet().size()) {
collMetaMap = assignedMeta;
merge(collMetaMap, csvFileMeta);
} else {
collMetaMap = csvFileMeta;
merge(collMetaMap, assignedMeta);
}
} else {
merge(collMetaMap, assignedMeta);
merge(collMetaMap, csvFileMeta);
}
}
public TreeSet getAllCollHeadings(TreeMap>> metaMap) {
TreeSet collHeadings = new TreeSet();
if(metaMap == null || metaMap.size() == 0) {
return collHeadings;
}
// get all meta field names and add into collHeadings. As it's a TreeSet,
// duplicates will be automatically ignored and collheadings will be sorted
Iterator iFiles = metaMap.keySet().iterator();
while(iFiles.hasNext()) {
File f = iFiles.next();
TreeMap> metaFields = metaMap.get(f);
Iterator iMetaFields = metaFields.keySet().iterator();
while(iMetaFields.hasNext()) {
String fieldName = iMetaFields.next();
collHeadings.add(fieldName);
}
}
return collHeadings;
}
/** merge metaMap param into baseMetaMap: only portions not already present in baseMetaMap are added in
* whether these are new file entries, new metadata field entries for extant files, or metadata values for extant fields of files.
* A simple map.putALL() will not do the trick as collMetaMap is a complicated data structure.
*/
public void merge(TreeMap>> baseMetaMap, TreeMap>> metaMap) {
if(metaMap == null || metaMap.size() == 0) {
// nothing to do
return;
}
Iterator iFiles = metaMap.keySet().iterator();
while(iFiles.hasNext()) {
File f = iFiles.next();
// check if this file already has an entry in baseMetaMap
TreeMap> origMetaFields = baseMetaMap.get(f);
TreeMap> metaFields = metaMap.get(f);
Iterator iMetaFields = metaFields.keySet().iterator();
// if file in metaMap didn't exist in baseMetaMap, easy: just copy its entry across in entirety
if(origMetaFields == null) {
metaMap.put(f, metaFields);
continue;
}
// else, file already exists in baseMetaMap, need to check if we have to merge any meta on the file
while(iMetaFields.hasNext()) {
String fieldName = iMetaFields.next();
TreeSet metaValues = metaFields.get(fieldName);
// check if this metadata field exists for the same file in baseMetaMap
TreeSet origMetaValues = origMetaFields.get(fieldName);
if(origMetaValues == null) { // this metadata field name did not exist for file in baseMetaMap,
// so copy all vals for this fieldName into baseMetaMap's entry for this file
origMetaFields.put(fieldName, metaValues);
continue; // continue on inner loop
}
// else the meta fieldName existed for that file in baseMetaMap
// Check if any of the metadata values didn't already exist, else add them in
Iterator iMetaValues = metaValues.iterator();
while(iMetaValues.hasNext()) {
String metaValue = iMetaValues.next();
if(!origMetaValues.contains(metaValue)) {
origMetaValues.add(metaValue);
}
}
}
}
}
/** If successfully wrote out collection's meta from to a CSV file,
* then will need to remove all meta from GLI (metadata.xml files).
* Just del or rename those files to .bak?
*/
public void moveGLIMetaToCSV(File csvFile) {
boolean success = exportGLIMetaToCSV(csvFile);
// TODO
if(success) {
} else {
System.err.println("Failed to export GLI metadata for this collection to CSV properly. Will not remove metadata.xml files");
}
}
/** If given a new file to create, creates the specified meta csv file from GLI's meta for the current collection.
* If the file exists, this will append the GLI metadata without checking if the file already contains the same entries. */
public boolean exportGLIMetaToCSV(File csvFile) {
boolean appendSetting = false;
boolean success = false;
// if(csvFile.exists()) {
// appendSetting = true; // TODO: better to call the other version of this method in this case?
// }
// TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles();
// writeMetaToCSV(assignedMeta, csvFile, appendSetting);
if(csvFile.exists()) {
//appendSetting = true; // better to call the other version of this method in this case?
amalgamateAllMeta();
success = writeMetaToCSV(collMetaMap, csvFile, appendSetting);
} else { // no preexisting metadata.csv file, just write out GLI meta
TreeMap>> assignedMeta = getAllAssignedMetadataForAllFiles();
success = writeMetaToCSV(assignedMeta, csvFile, appendSetting);
}
return success;
}
private boolean writeMetaToCSV(TreeMap>> metaMap, File csvFile, boolean appendSetting) {
boolean success = true;
// First would need to write the row of all headings
TreeSet metaFieldColumnHeadings = getAllCollHeadings(metaMap);
// Careful, collHeadings are alphabetically ordered, but not all docs may have meta for each column heading/metadata field name
// Need metadataFieldNames in an indexed array
Vector columnHeadings = new Vector(metaFieldColumnHeadings.size());
// put the Filename column as first item
columnHeadings.add("Filename");
columnHeadings.addAll(metaFieldColumnHeadings); // now have an indexed, yet still ordered, list of all column headings(the meta fieldnames)
CSVFormat customCSVFormat = CSVFormat.DEFAULT
.withDelimiter(meta_field_sep)
.withIgnoreSurroundingSpaces(false)
.withQuoteMode(QuoteMode.MINIMAL)
.withTrim();
try (CSVPrinter printer = new CSVPrinter(new FileWriter(csvFile, appendSetting), customCSVFormat)) {
printer.printRecord(columnHeadings);
// https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
Iterator iFiles = metaMap.keySet().iterator();
while(iFiles.hasNext()) {
File f = iFiles.next();
String relFilename = fileToRelativeString(f);
// write out the filename field of this record
printer.print(relFilename);
TreeMap> fileMetadata = metaMap.get(f);
// now get each metadata field's value in the order of the column headings, and write them out
//for(String metaFieldName : columnHeadings) {
for(int i = 1; i < columnHeadings.size(); i++) { // skip past Filename coll heading, already written out
String metaFieldName = columnHeadings.get(i);
TreeSet metavalues = fileMetadata.get(metaFieldName);
StringBuffer allMetaValuesForField = new StringBuffer();
if(metavalues == null || metavalues.size() == 0) {
// this file does not have (metavalues) such a metaFieldName, the cell for this column is empty
//System.err.println("No meta values for fieldname: " + metaFieldName);
printer.print(allMetaValuesForField);
} else {
for(String metavalue : metavalues) {
//metavalue = metavalue.trim();
allMetaValuesForField.append(meta_value_sep_char);
allMetaValuesForField.append(metavalue);
}
// write out the current metadata field of this record
// remove the extra meta_value_separator_char added the first time
printer.print(allMetaValuesForField.substring(1));
}
}
printer.println(); // done writing a record
}
} catch (IOException ex) {
success = false;
DebugStream.printStackTrace(ex);
System.err.println("Caught exception when writing meta to CSVFile " + csvFile.getAbsolutePath());
System.err.println("\t" + ex.getMessage());
}
return success;
}
public TreeMap>> loadMetaFromCSVFile(File csvFile) {
TreeMap>> csvFileMeta = new TreeMap>>();
if(!csvFile.exists()) {
return csvFileMeta;
}
Reader in = null;
//try(Reader in = new FileReader(csvFile);) { // try-with-resources may break on older Java that we use to build GS3 binaries
try {
in = new FileReader(csvFile);
boolean headingRow = true;
// https://javadoc.io/doc/org.apache.commons/commons-csv/latest/index.html
CSVFormat lenientCSVFormat = CSVFormat.DEFAULT
.withDelimiter(meta_field_sep)
.withFirstRecordAsHeader()
.withCommentMarker('#')
.withIgnoreSurroundingSpaces()
.withTrim();
// https://stackoverflow.com/questions/36269387/get-csv-file-header-using-apache-commons
// The first col heading which is the Filename
// the remaining CSV column headings are the metadata field names
CSVParser parser = lenientCSVFormat.parse(in);
//String[] metaFieldNames = lenientCSVFormat.getHeader(); // didn't work
// getHeaders() returns List, convert to String[] array
String[] metaFieldNames = parser.getHeaderNames().toArray(new String[0]);
for (CSVRecord record : parser) {
// a new row, represents a new file's meta
TreeMap> meta = new TreeMap>();
for(int i = 0; i < record.size(); i++) { //for (String field : record) {
String field = record.get(i);
if(i == 0) { // col 0 = Filename
String filename = field;
// TODO: filenames are stored relative to import folder, convert to full path for internal use?
File fullPathFile = new File(coll_importdir_path + filename);
///System.err.println("Found Filename meta: " + filename);
csvFileMeta.put(fullPathFile, meta);
} else {
// not Filename, but metadata field name, add into meta map for this file
TreeSet metaValues = new TreeSet();
String metadataFieldName = metaFieldNames[i]; // get column heading=meta field name for current cell
meta.put(metadataFieldName, metaValues);
///System.err.println("Found value for meta field: " + metadataFieldName);
// Split the field to get all metavalues for this metadata field name
// and add to metaValues set
String unparsedMetaVal = field.trim();
String[] metadataValues = unparsedMetaVal.split(meta_value_sep_re);
for(String metaVal : metadataValues) {
metaVal = metaVal.trim(); // get rid of whitespaces around separator char
if(!metaVal.equals("")) {
///System.err.println("Found value for meta field: " + metaVal);
metaValues.add(metaVal);
}
}
}
}
}
} catch(Exception e) {
DebugStream.printStackTrace(e);
DebugStream.println("@@@ Error reading from CSV file: " + csvFile.getAbsolutePath());
} finally {
SafeProcess.closeResource(in);
}
//this.print(csvFileMeta);
return csvFileMeta;
}
/** For debugging */
public void print(TreeMap>> metaMap ) {
Iterator iFiles = metaMap.keySet().iterator();
while(iFiles.hasNext()) {
File f = iFiles.next();
TreeMap> metaFields = metaMap.get(f);
if(metaFields != null) {
System.err.println("Meta for file: " + fileToRelativeString(f)); //f.getAbsolutePath());
}
Iterator iMetaFields = metaFields.keySet().iterator();
if(!iMetaFields.hasNext()) {
System.err.println("No meta for file!");
}
while(iMetaFields.hasNext()) {
String fieldName = iMetaFields.next();
System.err.println("\tMetafield: " + fieldName);
TreeSet metaValues = metaFields.get(fieldName);
Iterator iMetaValues = metaValues.iterator();
while(iMetaValues.hasNext()) {
String metaValue = iMetaValues.next();
System.err.println("\t\tValue: " + metaValue);
}
}
}
}
/** For debugging */
public void printOrderedCollectionMeta() {
//TreeMap>> collMetaMap = getAllAssignedMetadataForAllFiles();
amalgamateAllMeta();
this.print(collMetaMap);
}
public ArrayList listFilesInCollection(String collection_directory_path) {
///System.err.println("coll dir path: " + collection_directory_path);
// only files in import folder have meta. Don't list files outside import folder
File collDir = new File(collection_directory_path, IMPORT_DIRNAME);
ArrayList files = new ArrayList();
//FileFilter collDocsFilter = new CollectionDocFileFilter();
getAllFiles(files, collDir, this);
return files;
}
public void getAllFiles(ArrayList files, File path, FileFilter filter) {
File[] fileList = path.listFiles(filter);
for(int i = 0; i < fileList.length; i++) {
File f = fileList[i];
if(f.isFile()) {
files.add(f);
} else {
getAllFiles(files, f, filter);
}
}
}
/** Filter to only allow Gathered GS documents
* to produce the list of files for which we need to export GLI metadata info to CSV.
*/
//private class CollectionDocFileFilter implements FileFilter {
@Override
public boolean accept(File pathname) {
String tailname = pathname.getName();
if(pathname.isDirectory()) {
if(tailname.equals(".svn")) {
return false;
}
} else {
if(pathname.equals(metadataCSVFile)) { // skip any meta csv file user exported/put into import
return false;
} else if(tailname.equals("metadata.xml")) {
return false;
} else if(tailname.endsWith("~")) {
return false;
} else if(tailname.endsWith(".bak")) {
return false;
}
}
// accept all other file types
return true;
}
//}
public static File chooseMetaCSVFile(String defaultSearchPath, JFrame parent) {
JFileChooser chooser = new JFileChooser(defaultSearchPath);
chooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
chooser.setDialogTitle(Dictionary.get("ExportMeta.ChooseMetaCSVFile"));
FileNameExtensionFilter filter = new FileNameExtensionFilter("CSV spreadsheet file", "csv");
chooser.setFileFilter(filter);//.addChoosableFileFilter(filter);
int returnVal = chooser.showOpenDialog(parent);
if(returnVal == JFileChooser.APPROVE_OPTION) {
File selectedFile = chooser.getSelectedFile();
///System.err.println("File selected: " + selectedFile.getAbsolutePath());
return selectedFile;
} else {
return null;
}
}
}