###########################################################################
#
# CSVDeprecatedPlugin.pm -- A plugin for files in comma-separated value format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package CSVDeprecatedPlugin;

use SplitTextFile;
use MetadataRead;
use CSVFieldSeparator;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use Text::CSV;

# Set up inheritance at compile time: CSVDeprecatedPlugin derives from
# MetadataRead, SplitTextFile and CSVFieldSeparator (listed in @ISA in
# that order).
BEGIN {
    @CSVDeprecatedPlugin::ISA = qw(MetadataRead SplitTextFile CSVFieldSeparator);
}


# Argument descriptors contributed by this plugin.  Each entry is a hash
# with the keys name/desc/type/reqd/deft; the defaults are taken from the
# get_default_*_exp subs defined further down in this file.
# NOTE(review): 'hiddengli' presumably hides split_exp from the GLI
# interface -- confirm against the argument-parsing code.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name      => 'split_exp',
        desc      => '{SplitTextFile.split_exp}',
        type      => 'regexp',
        reqd      => 'no',
        deft      => get_default_split_exp(),
        hiddengli => 'yes',
    },
];


# Plugin description record; new() pushes it onto the shared "OptList".
# The 'args' entry points at the argument descriptors declared above.
my $options = {
    name     => 'CSVDeprecatedPlugin',
    desc     => '{CSVDeprecatedPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};


# Default value for the "process_exp" argument: a regular expression
# (as a string) that matches file names ending in ".csv", ignoring case.
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}

    
# Default value for the "split_exp" argument: a regular expression (as a
# string) matching one line ending, either LF or CRLF, so that the input
# text is split into one record per line.
sub get_default_split_exp {
    my $line_break_exp = '\r?\n';
    return $line_break_exp;
}


# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by SplitTextFile's constructor, re-blessed
# into $class.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # The object returned by CSVFieldSeparator's constructor is discarded;
    # presumably the call is made only for its effect on the shared
    # argument/option lists -- TODO confirm.
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}


# Read a CSV file into $$textref, then strip and parse its header line.
#
# Parameters (all forwarded unchanged to SUPER::read_file):
#   $filename  - path of the file; also the key under which the parsed
#                header field names are stored
#   $encoding  - not used here other than being forwarded
#   $language  - not used here other than being forwarded
#   $textref   - reference to the scalar holding the file's text
#
# Side effects:
#   * blank lines and the first (header) line are removed from $$textref
#   * $self->{'csv_file_fields'}->{$filename} is set to an array ref of the
#     header field names (empty if the header line could not be parsed)
#   * $self->{'separate_char'} is set to the separator used for the header,
#     so that process() can parse the data lines with the same character
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Read in file the usual ReadTextFile way
    # This ensures that $textref is a unicode aware string
    $self->SUPER::read_file(@_);

    #
    # Now top-up the processing of the text with what this plugin
    # needs
    #

    # Remove any blank lines so the data is split and processed properly
    $$textref =~ s/\n(\s*)\n/\n/g;

    # The first line contains the metadata element names.  The line is
    # terminated either by a line break or by the end of the text (a file
    # holding nothing but a header line).  $1 is only read after a
    # successful match, so a stale capture can never be picked up.
    my $csv_file_field_line = "";
    if ($$textref =~ s/^(.*?)(?:\r?\n|\z)//) {
        $csv_file_field_line = $1;
    }
    my @csv_file_fields = ();

    my $separate_char = $self->{'csv_field_separator'};
    if (defined $separate_char && $separate_char =~ m/^auto$/i) {
        # Replace the 'auto' setting with the value resolved from the header
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
    }
    # Always record the separator actually in use (not only in the 'auto'
    # case): process() reads $self->{'separate_char'} for every data line.
    $self->{'separate_char'} = $separate_char;

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if ($csv->parse($csv_file_field_line)) {
        @csv_file_fields = $csv->fields;
    }
    else {
        print STDERR "Error: Badly formatted CSV field line: $csv_file_field_line.\n";
    }

    # Field names are stored per file, keyed by the file name
    $self->{'csv_file_fields'}->{$filename} = \@csv_file_fields;
    ###print STDERR "**** CSV file fields joined ($filename) = ", join(" ||| ", @{$self->{'csv_file_fields'}->{$filename}}), "\n";

}


# Turn one CSV data line into document text and metadata.
#
# Parameters:
#   $textref  - reference to the text of one record (a single CSV line)
#   $pluginfo - unused here
#   $base_dir - joined with $file to form the key used to look up the
#               header field names stored by read_file()
#   $file     - see $base_dir
#   $metadata - unused here
#   $doc_obj  - document object that receives the text and the metadata
#   $gli      - unused here
#
# The raw line is added as the text of the document's top section.  Each
# non-empty value is added as metadata under the header name found at the
# same column position; values in columns without a header name are
# skipped.  When $self->{'metadata_value_separator'} is a non-empty string
# it is used as a regular expression to split a value into several
# metadata values.
#
# Always returns 1, including when the line cannot be parsed (an error is
# printed to STDERR in that case).
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $section = $doc_obj->get_top_section();
    my $csv_line = $$textref;

    # NOTE(review): assumes this path equals the $filename that read_file()
    # used as its key -- confirm.  A missing entry is treated as "no header
    # fields" rather than relying on dereferencing an undefined value.
    my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
    my @csv_file_fields = @{ $self->{'csv_file_fields'}->{$filename_full_path} || [] };

    ###print STDERR "**** CSV file fields joined = ", join(" ||| ", @csv_file_fields), "\n";

    # Add the raw line as the document text
    $doc_obj->add_utf8_text($section, $csv_line);

    # Use the separator recorded while reading the file; if none was
    # recorded, fall back to the configured separator so the parser is
    # never handed an undefined value.
    my $separate_char = $self->{'separate_char'};
    if (!defined $separate_char) {
        $separate_char = $self->{'csv_field_separator'};
    }

    # An empty separator setting means "do not split values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);

    if ($csv->parse($csv_line)) {
        my @md_vals = $csv->fields;

        for my $i (0 .. $#md_vals) {
            my $md_val  = $md_vals[$i];
            my $md_name = $csv_file_fields[$i];

            # Only bother with non-empty values in columns that have a name
            next unless (defined $md_val && $md_val ne "" && defined $md_name);

            my @within_md_vals = (defined $md_val_sep)
                ? split(/${md_val_sep}/, $md_val)
                : ($md_val);

            foreach my $within_md_val (@within_md_vals) {
                $doc_obj->add_utf8_metadata($section, $md_name, $within_md_val);
            }
        }
    }
    else {
        print STDERR "Error: Badly formatted CSV line: $csv_line.\n";
    }

    # Report the record as handled
    return 1;
}


1;