###########################################################################
#
# BibTexPlug.pm - a plugin for bibliography records in BibTex format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright 2000 Gordon W. Paynter
# Copyright 1999-2000 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# BibTexPlug reads bibliography files in BibTex format.
#
# by Gordon W. Paynter (gwp@cs.waikato.ac.nz), November 2000
# Based on ReferPlug.  See ReferPlug for genealogy.
#
# BibTexPlug creates a document object for every reference in the file.
# It is a subclass of SplitPlug, so if there are multiple records, all
# are read.


package BibTexPlug;

use SplitPlug;


# BibTexPlug is a sub-class of SplitPlug.  The parent class is registered
# at compile time so that inherited methods resolve as soon as the
# package has been loaded.
BEGIN {
    @ISA = qw(SplitPlug);
}

# Default expression selecting the files this plugin handles: any file
# name ending in ".bib", compared without regard to case.
sub get_default_process_exp {
    my $bib_suffix_exp = '(?i)\.bib$';
    return $bib_suffix_exp;
}

# Default expression used to split the input into records: the text is
# cut at any run of newlines that is immediately followed by "@".  The
# "@" itself is only looked at, not consumed, so it stays at the start
# of the following record.
sub get_default_split_exp {
    my $record_boundary_exp = '\n+(?=@)';
    return $record_boundary_exp;
}

# The process function reads a single bibliographic record and stores
# it as a new document.
#
# Arguments (after $self):
#   $textref  - reference to the text of one BibTex record
#   $pluginfo, $base_dir, $metadata - not used by this function
#   $file     - file name, used only in the progress message
#   $doc_obj  - document object that receives the metadata and text
#
# Returns undef if the text is not a BibTex record, 0 if it is an
# @String entry (which we ignore), and 1 once the record has been
# stored in $doc_obj.
#
# As a side effect $self->{'key'} is set to the BibTex key of the record
# (or left as "default" if no key was found); set_OID reads it later.

sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'} || 0;

    $self->{'key'} = "default";

    # Check that we're dealing with a valid BibTex record
    return undef unless ($$textref =~ /^\@\w+\{.*\}/s);

    # Ignore things we can't use.  BibTex entry types are not case
    # sensitive, so "@string" has to be caught as well as "@String".
    return 0 if ($$textref =~ /^\@String/i);

    # Report that we're processing the file
    print $outhandle "BibTexPlug: processing $file\n"
        if ($verbosity > 1);

    # All metadata and text is attached to the top section of the document
    my $cursection = $doc_obj->get_top_section();

    # This hash translates BibTex field names into metadata names.  The
    # BibTex names are taken from the "Local Guide to Latex" Graeme
    # McKinstry.  Metadata names are consistent with ReferPlug.

    my %field = (
        'address'     => 'PublisherAddress',
        'author'      => 'Creator',
        'booktitle'   => 'Booktitle',
        'chapter'     => 'Chapter',
        'edition'     => 'Edition',
        'editor'      => 'Editor',
        'institution' => 'Publisher',
        'journal'     => 'Journal',
        'month'       => 'Month',
        'number'      => 'Number',
        'pages'       => 'Pages',
        'publisher'   => 'Publisher',
        'school'      => 'Publisher',
        'title'       => 'Title',
        'volume'      => 'Volume',
        'year'        => 'Date',

        'keywords'    => 'Keywords',
        'abstract'    => 'Abstract',
        'copyright'   => 'Copyright',
    );

    # Raw field values keyed by lower-cased BibTex field name, the entry
    # type from the first line, and the normalised text of the record.
    my %metadata;
    my $EntryType;
    my $text = '';

    # Make sure the text has exactly one entry per line: a comma at the
    # end of a line marks the end of a field, every other line break is
    # just whitespace inside a field.
    my $lines = $$textref;
    $lines =~ s/,\s*\n/=====/g;
    $lines =~ s/\s+/ /g;
    $lines =~ s/\s*=====\s*/\n/g;
    my @lines = split(/\n+/, $lines);

    # Read and process each line in the bib file.
    foreach my $index (0 .. $#lines) {

        # Add each line.  Most lines consist of a field identifier and
        # then data, and we simply store them, though we treat some
        # of the fields a bit differently.
        my $line = $lines[$index];
        $line =~ s/\s+/ /g;
        $text .= "$line\n";

        # The first line is special, it contains the reference type and OID
        if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
            my $EntryID;
            ($EntryType, $EntryID) = ($1, $2);
            print $outhandle "** $EntryType - \"$EntryID\" \n"
                if ($verbosity >= 4);
            $self->{'key'} = $EntryID;
            next;
        }
        if ($line =~ /\@/ && $verbosity >= 2) {
            print $outhandle "bibtexplug: suspect line in bibtex file: $line\n";
            print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n";
        }

        # When the last field has no trailing comma, the brace closing
        # the whole record ends up on the same line as that field.  An
        # unbalanced closing brace on the last line is therefore the end
        # of the record, not field data, and is dropped before parsing.
        # ($text above has already received the unmodified line.)
        if ($index == $#lines && ($line =~ tr/}//) > ($line =~ tr/{//)) {
            $line =~ s/\s*\}\s*$//;
        }

        # otherwise, parse the metadata out of this line; BibTex does not
        # require whitespace around the "="
        next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
        my $id = lc($1);

        # Add this line of metadata
        $metadata{$id} .= "$2\n";
    }

    # Add the Entry type as metadata (it is unknown if the first line of
    # the record could not be parsed)
    $doc_obj->add_metadata ($cursection, "EntryType", $EntryType)
        if (defined $EntryType);

    # Add the various fields as metadata
    foreach my $id (sort keys %metadata) {

        next unless (defined $field{$id});

        my $name = $field{$id};
        my $value = $metadata{$id};

        # Get rid of silly Latex stuff
        if ($value =~ /\"(.*)\"/) {
            $value = $1;
        }
        if ($value =~ /\{(.*)\}/) {
            $value = $1;
        }

        # Add the field as metadata.  $value itself stays unconverted so
        # that the keyword and author handling below splits the original
        # text and converts each piece exactly once.
        $doc_obj->add_metadata ($cursection, $name, text_into_html($value));

        # Several special operations on metadata follow

        # Add individual keywords.
        # The full set of keywords has been added above as "Keywords".
        # However, we also want to add them as individual "Keyword"
        # metadata elements.
        if ($id eq "keywords") {
            my @keywordlist = split(/,/, $value);
            foreach my $keyword (@keywordlist) {
                $keyword = lc($keyword);
                $keyword =~ s/\s*$//;
                $keyword =~ s/^\s*//;
                next unless ($keyword =~ /\w/);
                $doc_obj->add_metadata ($cursection, "Keyword",
                                        text_into_html($keyword));
            }
        }

        # Add individual authors
        # The author metadata is stored as one "Creator" entry, but we
        # also want to split it into several individual "Author" fields in
        # "Lastname, Firstnames" format so we can browse it.
        if ($id eq "author") {

            # Names are separated by commas or by the word "and".  The
            # word boundaries stop names that merely contain "and"
            # (Anderson, Sandra) from being cut in half.
            my @authorlist = split(/,|\band\b/, $value);
            foreach my $author (@authorlist) {

                # Reformat the author name
                my @words = split(' ', $author);
                next unless (@words);
                my $lastname = pop @words;
                my $firstname = join(" ", @words);
                my $fullname = $lastname . ", " . $firstname;

                # Add each name that has both parts to the set of Authors
                if ($fullname =~ /\w+, \w+/) {
                    $doc_obj->add_metadata ($cursection, "Author",
                                            text_into_html($fullname));
                }
            }
        }

        # Books and Journals are additionally marked for display purposes
        if ($id eq "booktitle") {
            $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
        } elsif ($id eq "journal") {
            $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
        }

    }

    # Add the text in BibTex format (all fields)
    if ($text =~ /\w/) {
        $doc_obj->add_text ($cursection, text_into_html($text));
    }

    return 1;
}


# Convert a text string into HTML.
#
# The HTML is going to be inserted into a GML file, so we have to be
# careful not to use symbols like ">" (we use "&gt;" instead).
#
# The conversion, in order:
#   - escapes &, <, > and " as HTML entities;
#   - replaces ', +, ( and ) with spaces;
#   - turns email addresses and http/https URLs into links;
#   - collapses runs of spaces and trims leading and trailing whitespace;
#   - puts a <BR> tag after every newline, and turns two <BR> tags
#     separated only by whitespace into a single <P> tag.
#
# Takes the text as its only argument and returns the converted string.

sub text_into_html {
    my ($text) = @_;

    # Convert problem characters into HTML symbols.  The ampersand has to
    # go first, otherwise the entities produced below would be escaped
    # a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    # These characters are simply blanked out
    $text =~ tr/'+()/ /;

    # convert email addresses and URLs into links
    $text =~ s{([\w.\-]+\@[\w.\-]+)}{<a href="mailto:$1">$1</a>}g;
    $text =~ s{(https?://[\w.\-]+[/\w.\-]*)}{<a href="$1">$1</a>}g;

    # Clean up whitespace and convert \n characters to <BR> or <P>
    $text =~ s/ +/ /g;
    $text =~ s/\s*$//;
    $text =~ s/^\s*//;
    $text =~ s/\n/\n<BR>/g;
    $text =~ s/<BR>\s*<BR>/<P>/g;

    return $text;
}

# Set the object identifier of a document.
#
# If process() found a BibTex key it is stored in $self->{'key'} and is
# passed to the document as its identifier.  While the key still holds
# the placeholder "default", the document's set_OID is called without
# arguments instead.  $id and $segment_number are accepted but not used.
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $key = $self->{'key'};

    return $doc_obj->set_OID() if ($key eq "default");
    return $doc_obj->set_OID($key);
}

1;