###########################################################################
#
# SplitPlug.pm - a plugin for splitting input files into segments that
#                will then be individually processed.
#
#
# Copyright 2000 Gordon W. Paynter (gwp@cs.waikato.ac.nz)
# Copyright 2000 The New Zealand Digital Library Project
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# SplitPlug is a plugin for splitting input files into segments that will
# then be individually processed.  

# This plugin should not be called directly.  Instead, if you need to
# process input files that contain several documents, you should write a
# plugin with a process function that will handle one of those documents
# and have it inherit from SplitPlug.  See ReferPlug for an example.


package SplitPlug;

use BasPlug;
use gsprintf 'gsprintf';
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# SplitPlug is a sub-class of BasPlug.
sub BEGIN {
    @SplitPlug::ISA = ('BasPlug');
}


my $arguments =
    [ { 'name' => "split_exp",
	'desc' => "{SplitPlug.split_exp}",
	'type' => "regexp",
	#'deft' => &get_default_split_exp(),
	'deft' => "",
	'reqd' => "no" } ];

my $options = { 'name'     => "SplitPlug",
		'desc'     => "{SplitPlug.desc}",
		'abstract' => "yes",
		'inherits' => "yes",
	        'args'     => $arguments };


sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);

    $self->{'textcat_store'} = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
    return bless $self, $class;
}

sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle, $failhandle);

    if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {

	$self->{'process_exp'} = $self->get_default_process_exp ();
	if ($self->{'process_exp'} eq "") {
	    warn ref($self) . " Warning: plugin has no process_exp\n";
	}
    }


    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
	$self->{'split_exp'} = $self->get_default_split_exp ();
    }

}

# This plugin recurs over the segments it finds
sub is_recursive {
    return 1;
}

# By default, we split the input text at blank lines
sub get_default_split_exp {
    return q^\n\s*\n^;
}

sub metadata_read {
    my $self = shift (@_);  
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, 
					      $metadata, $extrametakeys, 
					      $extrametadata, $processor, 
					      $maxdocs, $gli);
    my $split_matched = undef;

    if ($matched) {

	my $outhandle = $self->{'outhandle'};
	my $filename = &util::filename_cat($base_dir, $file);

	my $plugin_name = ref ($self);
	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

	$self->{'metapass_srcdoc'}->{$file} = {};

	# Do encoding stuff
	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
	my $le_rec = { 'language' => $language, 'encoding' => $encoding };
	$self->{'textcat_store'}->{$file} = $le_rec;

	# Read in file ($text will be in utf8)
	my $text = "";
	$self->read_file ($filename, $encoding, $language, \$text);

	if ($text !~ /\w/) {
	    gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
		     $file)
		if $self->{'verbosity'};
	    
	    my $failhandle = $self->{'failhandle'};
	    print $failhandle "$file: " . ref($self) . ": file contains no text\n";
	    $self->{'num_not_processed'} ++;

	    $self->{'textcat_store'}->{$file} = undef;

	    return 0; 
	}
    
    
	# Split the text into several smaller segments
	my $split_exp = $self->{'split_exp'};
	my @segments = split(/$split_exp/, $text);
	print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n" 
	    if $self->{'verbosity'};
	
	$self->{'split_segments'}->{$file} = \@segments;
	$split_matched = scalar(@segments);
    }
    
    return $split_matched;
}


# The read function opens a file and splits it into parts. 
# Each part is sent to the process function
#
# Returns: Number of document objects created (or undef if it fails)

sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    #check process and block exps, smart block, etc
    my ($block_status,$filename) = $self->read_block(@_);    
    return $block_status if ((!defined $block_status) || ($block_status==0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my $le_rec = $self->{'textcat_store'}->{$file};
    if (!defined $le_rec) {
	# means no text was found;
	return 0; # not processed but no point in passing it on
    }

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};
    $self->{'textcat_store'}->{$file} = undef;

    my $segments = $self->{'split_segments'}->{$file};
    $self->{'split_segments'}->{$file} = undef;

    # Process each segment in turn
    my ($count, $segment, $segtext, $status, $id);
    $segment = 0;
    $count = 0;
    foreach $segtext (@$segments) {
	$segment++;

	if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
	    # metadata is attached to a srcdoc
	    next;
	}

	# create a new document
	my $doc_obj = new doc ($filename, "indexed_doc");
	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
	my ($filemeta) = $file =~ /([^\\\/]+)$/;
	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
	if ($self->{'cover_image'}) {
	    $self->associate_cover_image($doc_obj, $filename);
	}
	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
	#$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");

	# Calculate a "base" document ID.
	if (!defined $id) {
	    $doc_obj->set_OID();
	    $id = $doc_obj->get_OID();
	}
    
	# include any metadata passed in from previous plugins 
	# note that this metadata is associated with the top level section
	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

	# do plugin specific processing of doc_obj
	print $outhandle "segment $segment - " if ($self->{'verbosity'});
	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
	if (!defined $status) {
	    print $outhandle "WARNING - no plugin could process segment $segment of $file\n" 
		if ($verbosity >= 2);
	    next;
	}
	# If the plugin returned 0, it threw away this part
	if ($status == 0) {
	    next;
	}
	$count += $status;

	# do any automatic metadata extraction
	$self->auto_extract_metadata ($doc_obj);

	# add an OID
	$self->set_OID($doc_obj, $id, $segment);

	# process the document
	$processor->process($doc_obj);

	$self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count; 
}

sub set_OID {
    my $self = shift (@_);
    my ($doc_obj, $id, $segment_number) = @_;
    
    $doc_obj->set_OID($id . "s" . $segment_number);
}

1;