###########################################################################
#
# ISISPlug.pm -- A plugin for CDS/ISIS databases
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright 1999-2004 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package ISISPlug;


use multiread;
use SplitPlug;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# Inheritance: ISISPlug derives from SplitPlug.  The assignment is made in a
# BEGIN block so that @ISA is already populated while the rest of the file
# is being compiled.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}


# Argument table for this plugin.  Each entry gives the argument's name, a
# description key, its type, whether it is required, and its default value.
# The first three entries take their defaults from the get_default_*_exp
# subs in this file.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_block_exp(),
    },
    {
        name => 'split_exp',
        desc => '{SplitPlug.split_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_split_exp(),
    },

    # Options specific to ISISPlug: the strings placed between the entries
    # of a field and between the subfields of an entry (see process()).
    {
        name => 'entry_separator',
        desc => '{ISISPlug.entry_separator}',
        type => 'string',
        reqd => 'no',
        deft => '<br>',
    },
    {
        name => 'subfield_separator',
        desc => '{ISISPlug.subfield_separator}',
        type => 'string',
        reqd => 'no',
        deft => ', ',
    },
];

# Plugin description record; 'args' points at the argument table above.
my $options = {
    name     => 'ISISPlug',
    desc     => '{ISISPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};


# Default process_exp: a case-insensitive pattern matching filenames that
# end in ".mst".
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}


# Default block_exp: a case-insensitive pattern matching filenames that end
# in ".fdt" or ".xrf".
sub get_default_block_exp {
    my $companion_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $companion_suffix_exp;
}

    
# Default split_exp: a pattern matching a line made of exactly ten hyphens,
# together with the line break before and after it (LF or CRLF).
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}


# Constructor.
#
# Appends this class name to @$pluginlist, adds this plugin's argument and
# option tables to the "ArgList" and "OptList" entries of $hashArgOptLists,
# builds the object through the SplitPlug constructor and reblesses the
# result into $class.
sub new
{
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    # Hand the collected argument/option lists on to the parent constructor
    # when there are any.
    my $self;
    if (defined $hashArgOptLists) {
        $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = SplitPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}


# Export an ISIS database as text, ready to be split into records.
#
# Runs the external IsisGdl program over the master file, reads its output
# into $$textref, and then replaces the number in every "tag=<number> "
# prefix with the field title taken from the database's Field Definition
# Table (.fdt).
#
# Arguments:
#   $filename - path of the ISIS master file (must end in ".mst")
#   $encoding - encoding passed to multiread for the export and the FDT file
#   $language - not used by this sub
#   $textref  - reference to the scalar that receives the exported text
#
# Dies if $filename does not end in ".mst", if the matching .fdt or .xrf
# file does not exist, or if the IsisGdl pipe cannot be opened.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Without the ".mst" suffix there is no database name to derive the
    # companion filenames from, so report that rather than a missing ".fdt".
    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $databasename) {
        die "Error: ISIS master file $filename does not have a .mst extension.\n";
    }

    # Check the associated .fdt and .xrf files exist.
    # These files must have a lowercase extension for the IsisGdl program to
    # work; this only matters on case-sensitive (Unix) filesystems.
    my $fdtfilename = $databasename . ".fdt";
    if (! -e $fdtfilename) {
        die "Error: Could not find ISIS FDT file $fdtfilename.\n";
    }
    my $xrffilename = $databasename . ".xrf";
    if (! -e $xrffilename) {
        die "Error: Could not find ISIS XRF file $xrffilename.\n";
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # filename containing a double quote or shell metacharacters would break
    # (or alter) the command.
    open(FILE, "IsisGdl \"$filename\" |")
        or die "Error: Could not run IsisGdl on $filename: $!\n";

    # multiread is given the handle by name, hence the package-qualified
    # bareword handle.
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdtmapping = &parse_field_definition_table($fdtfilename, $encoding);

    # Map the tag numbers to tag names, using the FDT mapping.  A tag number
    # with no FDT entry ends up with an empty name.
    $$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;

    # Remove the separator line at the very start so the text is split and
    # processed properly; accept CRLF as well as LF, like the tag rewrite
    # above and the default split expression.
    $$textref =~ s/^----------\r?\n//;
}


# Turn the text of one ISIS record into metadata and text on $doc_obj.
#
# Every line of the form "tag=<name> data=<value>" becomes metadata on the
# document's top section:
#   * the field name is the tag name with each word's first letter
#     upper-cased (a-z only), whitespace removed and '&' stripped;
#   * the value is split on '%' into entries, and each entry into subfields
#     introduced by '^' plus one lower-case letter; a subfield is stored
#     under "<Field>^<letter>", a part without a subfield letter under
#     "<Field>";
#   * for the "Keywords" field each "<...>" group is stored as its own value;
#   * "<Field>^all" receives the whole value, with subfields joined by the
#     'subfield_separator' setting and entries by 'entry_separator'.
# The record text itself (with '<' and '>' escaped) is added as the document
# text, and FileFormat metadata is set to "CDS/ISIS".
#
# Arguments:
#   $textref  - reference to the record text (modified: '<' and '>' escaped)
#   $file     - name used in the progress messages
#   $doc_obj  - document object receiving the metadata and text
#   $gli      - when true, a <Processing> line is printed on STDERR
#   $pluginfo, $base_dir, $metadata - not used by this sub
#
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $cursection = $doc_obj->get_top_section();
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n"
        if ($self->{'verbosity'} > 1);

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Only "tag=... data=..." lines carry metadata.  The match result is
        # tested explicitly: after a failed match $1 and $2 still hold the
        # captures of some earlier successful match.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my $rawtagname = $1;
        my $rawtagdata = $2;
        next if ($rawtagname eq "");

        # Metadata field names: title case, then remove spaces
        my $tagname = "";
        foreach my $word (split(/\s+/, $rawtagname)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $tagname .= $word;
        }

        # Make sure there is nothing bad in the tag names
        $tagname =~ s/&//g;

        # Handle each piece of metadata ('%' separated)
        my $completetagvalue = "";
        foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
            $completetagvalue .= $entry_separator unless ($completetagvalue eq "");

            # Metadata field values: take care with subfields
            my $completeentryvalue = "";
            while ($rawtagvalue ne "") {
                # If there is a subfield specifier, parse it off.  The caret
                # is always consumed (so the loop makes progress), but the
                # subfield letter is only taken when it directly follows the
                # caret; an unanchored match would pull a letter out of the
                # middle of the value.
                my $subfieldname = "";
                if ($rawtagvalue =~ s/^\^//) {
                    if ($rawtagvalue =~ s/^([a-z])//) {
                        $subfieldname = "^$1";
                    }
                }

                # Parse the metadata value off: everything up to the next
                # caret.  This pattern always matches (possibly empty).
                $rawtagvalue =~ s/^([^\^]*)//;
                my $metadatafieldvalue = $1;
                my $metadatafieldname = $tagname . $subfieldname;

                # Handle Keywords specially: each <...> group is a keyword
                if ($metadatafieldname eq "Keywords") {
                    my $keywordmetadatavalue = $metadatafieldvalue;
                    my $keywordlist = "";
                    while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
                        my $keyword = $1;
                        $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
                        $keywordlist .= ", " unless ($keywordlist eq "");
                        $keywordlist .= $keyword;
                    }

                    $metadatafieldvalue = $keywordlist;
                }

                # Escape any '<' and '>' characters so they appear correctly in the final collection
                $metadatafieldvalue =~ s/\</&lt;/g;
                $metadatafieldvalue =~ s/\>/&gt;/g;

                # We have already added Keywords metadata above
                unless ($metadatafieldname eq "Keywords") {
                    $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
                }

                $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
                $completeentryvalue .= $metadatafieldvalue;
            }

            $completetagvalue .= $completeentryvalue;
        }

        $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
    }

    # Add the full record as the document text
    $$textref =~ s/\</&lt;/g;
    $$textref =~ s/\>/&gt;/g;
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Parse an ISIS Field Definition Table (.fdt) file.
#
# Lines up to and including the "***" line are ignored.  Every later line is
# read as a fixed-width record: columns 0-29 hold the field title, columns
# 30-49 the subfield letters, and the remainder the field specs, which must
# begin with the numeric tag.
#
# Arguments:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding passed to multiread when reading the file
#
# Returns a hash (as a list) mapping each tag number to
#   { 'title' => <field title>, 'subfields' => <subfield letters> }.
# Lines after "***" that carry no tag number (blank or too short) are skipped.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open, so the filename is never interpreted as a mode.
    open(FDT_FILE, '<', $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # multiread is given the handle by name and fills the scalar it is given
    # a reference to.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        # Everything before the "***" marker is header material
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A line that does not reach past the specs column cannot hold a tag
        next if (length($fdtfileline) <= 50);

        my $fieldtitle     = substr($fdtfileline,  0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs     = substr($fdtfileline, 50);

        # Remove the padding
        $fieldtitle =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;

        # Map from tag number to metadata field title and subfields
        my ($fieldtag) = ($fieldspecs =~ /^\s*(\d+)\s+/);
        next unless (defined $fieldtag);
        $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                                   'subfields' => $fieldsubfields };
    }

    return %fdtmapping;
}


1;