###########################################################################
#
# ProtemixPlug.pm --
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2002 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package ProtemixPlug;

use HTMLPlug;
use util;

# Establish the inheritance chain at compile time: ProtemixPlug is-a HTMLPlug.
BEGIN {
    our @ISA = qw(HTMLPlug);
}

use XML::Parser;

# Constructor.  Builds the underlying HTMLPlug object (passing the class name
# and all remaining arguments through unchanged), seeds the plugin-specific
# fields and returns the object blessed into $class.
sub new {
    my ($class, @plugin_args) = @_;

    # Deliberately NOT a lexical: the XML parser callbacks in this package
    # (StartTag, EndTag, Text) reach the plugin object through this
    # package-global $self.
    $self = HTMLPlug->new($class, @plugin_args);

    # Plugin defaults.  'no_metadata' and 'nolinks' are presumably options
    # consumed by HTMLPlug -- verify against that module.  The remaining
    # three fields hold the state used while parsing a meta.xml file.
    my %defaults = (
	'no_metadata'      => 1,
	'nolinks'          => 1,
	'section_metadata' => {},
	'Page'             => "TopLevel",
	'metadata_name'    => "",
    );
    foreach my $field (keys %defaults) {
	$self->{$field} = $defaults{$field};
    }

    bless $self, $class;
    return $self;
}

# Build a single document from one article directory, driven by the
# directory's meta.xml file.
#
# Arguments (after the invocant):
#   $pluginfo, $metadata, $maxdocs - accepted but not used by this method
#   $base_dir, $file               - joined with util::filename_cat to give the
#                                    full path (when $base_dir contains a word
#                                    character)
#   $processor                     - supplies the 'OIDtype' and receives the
#                                    finished document via ->process
#
# Returns:
#   0     when the file's basename looks like a page/article level html or pdf
#         file (digits, optional "-all" or "-digits", then .htm/.html/.pdf);
#         those files are consumed here when their directory's meta.xml is read
#   undef when the file name does not end in "meta.xml"
#   1     once the document has been built and handed to the processor
#
# Dies if the article directory cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    my $basename = File::Basename::basename($filename);
    return 0 if $basename =~ /^\d+(-(all|\d+))?\.(html?|pdf)/;
    return undef unless $filename =~ /meta\.xml$/;

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    # process the meta.xml file and set top level metadata
    my $parser = XML::Parser->new('Style' => 'Stream');
    $self->{'section_metadata'} = {};
    $self->{'Page'} = "TopLevel";
    {
	# The Stream-style handlers (StartTag, EndTag, Text) find the plugin
	# through the package-global $self.  Point it at this invocant for
	# the duration of the parse so the metadata is collected in the same
	# object that was just reset above and is read back below.
	local $ProtemixPlug::self = $self;
	$parser->parsefile($filename);
    }
    my $top_metadata = $self->{'section_metadata'}->{'TopLevel'} || {};
    foreach my $key (keys %$top_metadata) {
	$doc_obj->add_utf8_metadata ($topsection, $key, $top_metadata->{$key});
    }

    my $dir = File::Basename::dirname($filename);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "ProtemixPlug: processing $dir\n";


    # associate article level pdf file, expected at <dir>/<dirname>-all.pdf
    my ($pdffile) = $dir =~ /([^\/\\]+)$/;
    $pdffile = &util::filename_cat($dir, $pdffile);
    $pdffile .= "-all.pdf";
    if (-e $pdffile) {
	$doc_obj->associate_file($pdffile, "article.pdf", undef, $topsection);
	# the article pdf belongs to the document as a whole, so its metadata
	# goes on the top section (no page section exists yet at this point)
	$doc_obj->add_utf8_metadata ($topsection, "pdf",  "article.pdf");
    } else {
	print STDERR "ProtemixPlug: Error: $pdffile does not exist\n";
    }

    # read in directory and process individual files
    opendir(my $dirhandle, $dir)
	or die "ProtemixPlug: Error: cannot open directory $dir: $!\n";
    my @files = readdir $dirhandle;
    closedir $dirhandle;


    # we rely on the files being named in such a way that they'll be read
    # in the correct order
    # NOTE(review): readdir does not promise any particular order; if page
    # order matters on all platforms an explicit sort suited to the naming
    # scheme may be needed here -- confirm against real collections.
    my $count = 1;
    foreach my $thisfile (@files) {
	if ($thisfile =~ /^(.*?)\.html?$/i) {
	    # file name without extension; only needed by the (currently
	    # disabled) section level metadata code below
	    my $filesuf = $1;
	    $thisfile = &util::filename_cat($dir, $thisfile);
	    my ($language, $encoding) = $self->textcat_get_language_encoding ($thisfile);
	    # read in file ($text will be in utf8)
	    my $text = "";
	    $self->read_file ($thisfile, $encoding, $language, \$text);
	    if (!length ($text)) {
		print STDERR "ProtemixPlug: Warning: $thisfile has no text\n";
	    }

	    # each html page becomes a new last child of the top section
	    my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

	    # process HTML file with HTMLPlug
	    $self->process_section (\$text, '', $thisfile, $doc_obj, $cursection);

	    # associate PDF page level pdf file (same name, .pdf extension)
	    my $pagepdf = $thisfile;
	    $pagepdf =~ s/\.html?$/\.pdf/;
	    if (-e $pagepdf) {
		$doc_obj->associate_file($pagepdf, "page$count.pdf", undef, $cursection);
		$doc_obj->add_utf8_metadata ($cursection, "pdf",  "page$count.pdf");
		$doc_obj->add_utf8_metadata ($cursection, "Title", $count);
	    } else {
		print STDERR "ProtemixPlug: Warning: no pdf file for $thisfile\n";
	    }

	    # add any section level metadata we have for this page (set from within the meta.xml file)

# currently commented out as we're not using Class1, Class2, and Class3 metadata yet
#	    if (defined ($self->{'section_metadata'}->{$filesuf})) {
#		foreach my $key (keys %{$self->{'section_metadata'}->{$filesuf}}) {
#		    $doc_obj->add_utf8_metadata ($cursection, $key, $self->{'section_metadata'}->{$filesuf}->{$key});
#		}
#	    }

	    $count ++;
	}
    }

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}


# XML::Parser Stream-style start-tag handler.  The element's attributes are
# read from %_; the plugin object is reached through the package-global $self.
#
#   <Page filename="...">   makes that filename the current page
#   <Metadata name="...">   starts collecting text for that metadata name
#                           under the current page, beginning from ""
sub StartTag {
    my (undef, $tag) = @_;

    if ($tag eq "Page") {
	$self->{'Page'} = $_{'filename'};

    } elsif ($tag eq "Metadata") {
	my $page = $self->{'Page'};
	my $name = $_{'name'};

	# first metadata item seen for this page: create its table
	$self->{'section_metadata'}->{$page} = {}
	    unless defined $self->{'section_metadata'}->{$page};

	$self->{'metadata_name'} = $name;
	$self->{'section_metadata'}->{$page}->{$name} = "";
    }
}

# XML::Parser Stream-style end-tag handler.  The plugin object is reached
# through the package-global $self.
#
#   </Page>       metadata that follows belongs to the document as a whole
#                 again, so the current page goes back to "TopLevel"
#   </Metadata>   stop collecting character data
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
	# must be an assignment: a comparison here would leave the closed
	# page current, and later metadata would be filed under it
	$self->{'Page'} = "TopLevel";

    } elsif ($element eq "Metadata") {
	$self->{'metadata_name'} = "";
    }
}

# XML::Parser Stream-style character-data handler.  The text arrives in $_;
# it is appended to the metadata value currently being collected (if any) for
# the current page.  Text outside a Metadata element is ignored.
sub Text {
    my $name = $self->{'metadata_name'};
    if ($name ne "") {
	my $page = $self->{'Page'};
	$self->{'section_metadata'}->{$page}->{$name} .= $_;
    }
}

# Override of read_file that hands the work straight to BasPlug::read_file
# and then leaves the text alone: character entities are deliberately NOT
# converted for now, as mgpp appears to be broken.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # The entity conversion below is kept for reference only (disabled):
#    $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go;
#    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
#    $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go;

    # last statement, so its result is also this sub's return value
    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
}


1;