###########################################################################
#
# PrePlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# plugin that processes simple html files like those output 
# by prescript when converting postscript to html. 
# prescript output has <p> tags separating paragraphs
# and <!--End Of Page--> tags separating pages.
# it may also have <!--Page No--> tags at top or bottom of 
# some or all pages.

# if a .html1 version of a .html file exists then that version 
# will be used so that the author name can be extracted
# (.html1 files were generated by software which automatically 
# extracts author names from .html files).

# also looks for a .info file containing metadata (as used by 
# the cstr collection)

package PrePlug;

use BasPlug;
use sorttools;

sub BEGIN {
    @ISA = ('BasPlug');
}

sub new {
    my ($class) = @_;
    $self = new BasPlug ();

    return bless $self, $class;
}

sub is_recursive {
    my $self = shift (@_);

    return 0; # this is not a recursive plugin
}


# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might 
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/);

    return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename));

    my $gz = 0;
    $gz = 1 if (defined $2 && $2 eq ".gz");

    my $filename1 = $filename;
    if ($gz) {
	$filename1 =~ s/\.html\.gz$/\.html1\.gz/;
    } else {
	$filename1 .= "1";
    }
    
    $filename = $filename1 if (-e $filename1);

    print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    my $doc_obj = new doc ($file, "indexed_doc");

    if ($gz) {
	open (FILE, "zcat $filename |") || die "PrePlug::read - zcat can't open $filename\n";
    } else {
	open (FILE, $filename) || die "PrePlug::read - can't open $filename\n";
    }
    my $cursection = $doc_obj->get_top_section();

    my $text = "";
    my $line = "";
    my $pagenumber = 1;
    my @creators = ();
    my $numlines = 0;
    while (defined ($line = <FILE>)) {
	if ($filename =~ /html1$/) {
	    while ($text =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) {
		push (@creators, $1);
	    }
	}

	# numlines sorts out documents that just have a single <!-- End of Page--> tag at the end
	if ($line =~ /<!--End\s+Of\s+Page-->/i && $numlines < 200) {

	    $numlines = -1000;
	    $cursection = 
		$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));

	    $text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//i;
#	    if (defined $1 && $1 != $pagenumber) {
#		print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
#	    }
	    $doc_obj->add_text ($cursection, $text);
	    $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
	    $pagenumber ++;
	    $text = "";
	} else {
	    $text .= $line;
	    $numlines ++; 
	}
    }
    close FILE;

    if ($cursection eq $doc_obj->get_top_section()) {
	# there weren't any <!--End Of Page--> tags
	
	# see if there were any <!--Page No--> tags to split on
	if ($text =~ /<!--Page No-->/i) {
	    while ($text =~ s/^(.*?)<!--Page No-->\s*<p>\s*<center>\s*<b>\s*\d+<\/b>\s*<\/center>\s*//i) {
		$cursection = 
		    $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
		$doc_obj->add_text ($cursection, $text);
		$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
		$pagenumber ++;
	    }
	}
	# otherwise we'll just have to split pages on set number of lines
	else {
	    my $pagetext = "";
	    my $line = "";
	    my $count = 0;
	    while (length ($text) && $text =~ s/^(.*?)(\n|$)//) {
		$line = $1;
		$line = "" unless defined $line;
		$pagetext .= $line . "\n";
		if (($count >= 50 && $line !~ /\w/) || $count == 80) {
		    $cursection = 
			$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
		    $doc_obj->add_text ($cursection, $pagetext);
		    $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
		    $pagetext = "";
		    $pagenumber ++;
		    $count = 0;
		} else {
		    $count ++;
		}
	    }
	}
	    
    }
	    
    # final section
    if ($text =~ /\w/) {
	$cursection = 
	    $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
	
	$text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//;
#	if (defined $1 && $1 != $pagenumber) {
#	    print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
#	}
	$doc_obj->add_text ($cursection, $text);
	$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
    }

    # add meta data for top level of document
    map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators;

    my ($filesuff) = $filename =~ /^(.*?)\.html1?/;
    if (!-e $filesuff . ".info") {
	print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
    } else {
	my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/;
	open (INFO, $filesuff . ".info") || die "PrePlug::read - can't open $filesuff.info\n";
	my $line = "";
	while (defined ($line = <INFO>)) {
	    chop $line;
	    my ($key, $value) = $line =~ /^<([^>]*)>(.*)$/;
	    next if $key =~ /^(pages|compressedsize|\/?info|size)$/i;
	    next if $value !~ /\w/;
	    if ($key =~ /^abstract$/i) {
		$key = "Description";
	    } elsif ($key =~ /^url$/i) {
		$key = "Source";
	    } elsif ($key =~ /^filedate$/i) {
		$key = "Date";
		my ($day, $month, $year) = split /\//, $value;
		$value = &sorttools::format_date ($day, $month, $year);
	    } elsif ($key =~ /^transferdate$/i) {
		my ($day, $month, $year) = split /\//, $value;
		$value = &sorttools::format_date ($day, $month, $year);
	    } elsif ($key =~ /^facsimiles$/i) {
		$value =~ s/^\s+//;
		$value =~ s/\s+$//;
		my @facsimiles = split /\s+/, $value;
		$value = join ",", @facsimiles;
		foreach $facsimile (@facsimiles) {
		    # assume images are in the same directory
		    my $imagefile = $dir . "~" . $facsimile;
		    if (-e $imagefile) {
			my ($imagetype) = $facsimile =~ /\.([^\.]*)$/;
			$doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype);
		    } else {
			print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n";
			next;
		    }
		}
	    } elsif ($key =~ /^figures$/i) {
		$value =~ s/^\s+//;
		$value =~ s/\s+$//;
		my @figures = split /\s+/, $value;
		foreach $figure (@figures) {
		    # assume images are in the same directory
		    my $imagefile = $dir . "~" . $figure;
		    if (-e $imagefile) {
			my ($imagetype) = $figure =~ /\.([^\.]*)$/;
			$doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype);
		    } else {
			print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n";
			next;
		    }
		    $value = join ",", @figures;
		}
	    }
	    $doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
	}
	close INFO;
    }

    foreach $field (keys(%$metadata)) {
	# $metadata->{$field} may be an array reference
	if (ref ($metadata->{$field}) eq "ARRAY") {
	    map { 
		$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_); 
	    } @{$metadata->{$field}};
	} else {
	    $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field}); 
	}
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

1;