###########################################################################
#
# BibTexPlug.pm - a plugin for bibliography records in BibTex format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright 2000 Gordon W. Paynter
# Copyright 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# BibTexPlug reads bibliography files in BibTex format.
#
# by Gordon W. Paynter (gwp@cs.waikato.ac.nz), November 2000
# Based on ReferPlug.  See ReferPlug for geneology.
#
# BibTexPlug creates a document object for every reference a the file.
# It is a subclass of SplitPlug, so if there are multiple records, all
# are read.
#
# Modified Dec 2001 by John McPherson:
#  *  some modifications submitted by Sergey Yevtushenko
#                      <sergey@intellektik.informatik.tu-darmstadt.de>
#  *  some non-ascii char support (ie mostly Latin)
#  *  The raw ascii bibtex entry is stored as "BibTex" metadata.

package BibTexPlug;

use SplitPlug;

# BibTexPlug is a sub-class of BasPlug.
sub BEGIN {
    @ISA = ('SplitPlug');
}

# This plugin processes files with the suffix ".bib"
sub get_default_process_exp {
    return q^(?i)\.bib$^;
}

# This plugin splits the input text at blank lines
sub get_default_split_exp {
    return q^\n+(?=@)^;
}


# The process function reads a single bibliographic record and stores
# it as a new document.

sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->{'key'} = "default";

    # Check that we're dealing with a valid BibTex record
    return undef unless ($$textref =~ /^@\w+\{.*\}/s);

    # Ignore things we can't use
    return 0 if ($$textref =~ /^\@String/);

    # Report that we're processing the file
    print $outhandle "BibTexPlug: processing $file\n"
	if ($self->{'verbosity'}) > 1;


    # This hash translates BibTex field names into metadata names.  The
    # BibTex names are taken from the "Local Guide to Latex" Graeme
    # McKinstry.  Metadata names are consistent with ReferPlug.

    my %field = (
		 'address', 'PublisherAddress',
		 'author', 'Creator',
		  
		 'booktitle', 'Booktitle',
		 'chapter', 'Chapter',
		 'edition', 'Edition',
		 'editor', 'Editor', 
		 'institution', 'Publisher',
		 'journal', 'Journal',
		 'month', 'Month',
		 'number', 'Number',
		 'pages', 'Pages',
		 'publisher', 'Publisher',
		 'school', 'Publisher',
		 'title', 'Title',
		 'volume', 'Volume',
		 'year', 'Date',

		 'keywords', 'Keywords',
		 'abstract', 'Abstract',
		 'copyright', 'Copyright'
);

    # Metadata fields 
    my %metadata;
    my ($EntryType, $EntryID, $Creator, $Keywords, $text);

    my $verbosity = $self->{'verbosity'};
    $verbosity = 0 unless $verbosity;

    my $lines=$$textref;

    # Make sure the text has exactly one entry per line

    $lines  =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry
    $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
    $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing "
    $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside  quoted strings, 
                                 #that continues for several lines
    $lines =~ s/\s+/ /g;
    $lines =~ s/\s*=====\s*/\n/g;

    my @all_lines = split(/\n+/, $lines);

    # Read and process each line in the bib file.
    my ($entryname, $name, $value, $line);
    foreach $line (@all_lines) {

	# Add each line.  Most lines consist of a field identifer and
	# then data, and we simply store them, though we treat some
	# of the fields a bit differently.

	$line =~ s/\s+/ /g;
	$text .= "$line\n";

	print "Processing line = $line \n" if $verbosity>=4;

	# The first line is special, it contains the reference type and OID
	if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
	    $EntryType = $1;
	    $EntryID = $2;
	    print "** $EntryType - \"$EntryID\" \n"
		if ($verbosity >= 4);
	    $self->{'key'} = $EntryID;
	    next;
	}
	if ($line =~ /\@/) {
	    print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
		if ($verbosity >= 2);
	    print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
		if ($verbosity >= 2);
	}
	
	# otherwise, parse the metadata out of this line
	next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
	$entryname = lc($1);
	$value = $2;
	# tidy up, removing " at start and end	
	$value =~ s/^"//;
	$value =~ s/(",)\s*$//;
	$value = &process_latex($value);

	# Add this line of metadata
	$metadata{$entryname} .= "$value\n";
	   
    }

    # Add the Entry type as metadata
    $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);

    # Add the various field as metadata
    foreach my $entryname (keys %metadata) {
	next unless (defined $field{$entryname});
	next unless (defined $metadata{$entryname});	
	
	$name = $field{$entryname};
	$value = $metadata{$entryname};

	# Add the various fields as metadata	
 	my $html_value = &text_into_html($value);
	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value); 

	# Several special operatons on metadata follow
	
	# Add individual keywords.
	# The full set of keywords will be added, in due course, as "Keywords".
	# However, we also want to add them as individual "Keyword" metadata elements.
	if ($entryname eq "keywords") {
	    my @keywordlist = split(/,/, $value);
	    foreach my $k (@keywordlist) {
		$k = lc($k); 
		$k =~ s/\s*$//; 
		$k =~ s/^\s*//; 
		if ($k =~ /\w/) {
		    $k = &text_into_html($k);
		    $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
		}
	    } 
	}
	
	# Add individual authors
	# The author metadata will be stored as one "Creator" entry, but we
	# also want to split it into several individual "Author" fields in
	# "Lastename, Firstnames" format so we can browse it.
	if ($entryname eq "author") { #added also comparison with editor
	   
	    # und here for german language...
	    # don't use brackets in pattern, else the matched bit becomes
	    # an element in the list!
	    my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value); 
	    foreach $a (@authorlist) {
		$a =~ s/\s*$//; 
		$a =~ s/^\s*//; 
		# Reformat and add author name
		next if $a=~ /^\s*$/;
		my @words = split(/ /, $a);
		my $lastname = pop @words;
		my $firstname = join(" ",  @words);

		my $fullname = $lastname . ", " . $firstname;

		# Add each name to set of Authors
		# force utf8 pragma so that \w matches in this scope
		use utf8;
		if ($fullname =~ /\w+, \w+/) {
		   $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
		}
	    }
	}

	# Books and Journals are additionally marked for display purposes
	if ($entryname eq "booktitle") {
	    $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
	} elsif ($entryname eq "journal") {
	    $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1); 
	}

    }

    # Add the text in BibTex format (all fields)
    if ($text =~ /\w/) {
	$text = &text_into_html($text);
	$doc_obj->add_utf8_text ($cursection, $text);
	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
    }

    return 1;
}


# convert email addresses and URLs into links
sub convert_urls_into_links{
   my ($text) = @_;
 
   $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
   $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;

   return $text;
}

# Clean up whitespace and convert \n charaters to <BR> or <P>
sub clean_up_whitespaces{
   my ($text) = @_;

   $text =~ s/%%%%%/<BR> <BR>/g;
   $text =~ s/ +/ /g;
   $text =~ s/\s*$//; 
   $text =~ s/^\s*//; 
   $text =~ s/\n/\n<BR>/g;
   $text =~ s/<BR>\s*<BR>/<P>/g;

   return $text;
}


sub convert_problem_characters_without_ampersand{
    my ($text) = @_;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    
    $text =~ s/\'\'/\"/g; #Latex -specific conversion
    $text =~ s/\`\`/\"/g; #Latex -specific conversion


    $text =~ s/\"/&quot;/g;
    $text =~ s/\'/&#8217;/g;
    $text =~ s/\`/&#8216;/g;
    $text =~ s/\+/ /g;
    $text =~ s/\(/ /g;
    $text =~ s/\)/ /g;

    $text =~ s/\\/\\\\/g;

    $text =~ s/\./\\\./g;

    return $text;
}

# Convert a text string into HTML.

# The HTML is going to be inserted into a GML file, so we have to be
# careful not to use symbols like ">", which occurs frequently in email
# messages (and use &gt instead.

# This function also turns URLs and email addresses into links, and
# replaces carriage returns with <BR> tags (and multiple carriage returns
# with <P> tags).

sub text_into_html {
    my ($text) = @_;

    # Convert problem characters into HTML symbols
    $text =~ s/&/&amp;/g;

    $text = &convert_problem_characters_without_ampersand( $text );

    # convert email addresses and URLs into links
    $text = &convert_urls_into_links( $text );

    $text = &clean_up_whitespaces( $text );

    return $text;
}


# Convert accented characters, remove { }, interprete some commands....
# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
sub process_latex {
    my ($text) = @_;
      
    # note - this is really ugly, but it works. There may be a prettier way
    # of mapping latex accented chars to utf8, but we just brute force it here.
    # Also, this isn't complete - not every single possible accented letter
    # is in here yet, but most of the common ones are.

    my %utf8_chars =
	(
	 # acutes
	 '\'a' => chr(0xc3).chr(0xa1),
	 '\'c' => chr(0xc4).chr(0x87),
	 '\'e' => chr(0xc3).chr(0xa9),
	 '\'i' => chr(0xc3).chr(0xad),
	 '\'l' => chr(0xc3).chr(0xba),
	 '\'n' => chr(0xc3).chr(0x84),
	 '\'o' => chr(0xc3).chr(0xb3),
	 '\'r' => chr(0xc5).chr(0x95),
	 '\'s' => chr(0xc5).chr(0x9b),
	 '\'u' => chr(0xc3).chr(0xba),
	 '\'y' => chr(0xc3).chr(0xbd),
	 '\'z' => chr(0xc5).chr(0xba),
	 # graves
	 '`a' => chr(0xc3).chr(0xa0),
	 '`A' => chr(0xc3).chr(0x80),
	 '`e' => chr(0xc3).chr(0xa8),
	 '`E' => chr(0xc3).chr(0x88),
	 '`i' => chr(0xc3).chr(0xac),
	 '`I' => chr(0xc3).chr(0x8c),
	 '`o' => chr(0xc3).chr(0xb2),
	 '`O' => chr(0xc3).chr(0x92),
	 '`u' => chr(0xc3).chr(0xb9),
	 '`U' => chr(0xc3).chr(0x99),
	 # circumflex
	 '^a' => chr(0xc3).chr(0xa2),
	 '^A' => chr(0xc3).chr(0x82),
	 '^c' => chr(0xc4).chr(0x89),
	 '^C' => chr(0xc4).chr(0x88),
	 '^e' => chr(0xc3).chr(0xaa),
	 '^E' => chr(0xc3).chr(0x8a),
	 '^g' => chr(0xc4).chr(0x9d),
	 '^G' => chr(0xc4).chr(0x9c),
	 '^h' => chr(0xc4).chr(0xa5),
	 '^H' => chr(0xc4).chr(0xa4),
	 '^i' => chr(0xc3).chr(0xae),
	 '^I' => chr(0xc3).chr(0x8e),
	 '^j' => chr(0xc4).chr(0xb5),
	 '^J' => chr(0xc4).chr(0xb4),
	 '^o' => chr(0xc3).chr(0xb4),
	 '^O' => chr(0xc3).chr(0x94),
	 '^s' => chr(0xc5).chr(0x9d),
	 '^S' => chr(0xc5).chr(0x9c),
	 '^u' => chr(0xc3).chr(0xa2),
	 '^U' => chr(0xc3).chr(0xbb),
	 '^w' => chr(0xc5).chr(0xb5),
	 '^W' => chr(0xc5).chr(0xb4),
	 '^y' => chr(0xc5).chr(0xb7),
	 '^Y' => chr(0xc5).chr(0xb6),
	 
	 # diaeresis
	 '"a' => chr(0xc3).chr(0xa4),
	 '"A' => chr(0xc3).chr(0x84),
	 '"e' => chr(0xc3).chr(0xab),
	 '"E' => chr(0xc3).chr(0x8b),
	 '"\\\\i' => chr(0xc3).chr(0xaf),
	 '"\\\\I' => chr(0xc3).chr(0x8f),
	 '"o' => chr(0xc3).chr(0xb6),
	 '"O' => chr(0xc3).chr(0x96),
	 '"u' => chr(0xc3).chr(0xbc),
	 '"U' => chr(0xc3).chr(0x9c),
	 '"y' => chr(0xc3).chr(0xbf),
	 '"Y' => chr(0xc3).chr(0xb8),
	 # tilde
	 # caron - handled specially
#	   ',s' => chr(0xc5).chr(0xa1),
#	   ',S' => chr(0xc5).chr(0xa5),
	 # breve
	 # double acute
	 # ring
	 # dot
	 # macron
	 '=a' => chr(0xc4).chr(0x81),
	 '=A' => chr(0xc4).chr(0x80),
	 '=e' => chr(0xc4).chr(0x93),
	 '=E' => chr(0xc4).chr(0x92),
	 '=i' => chr(0xc4).chr(0xab),
	 '=I' => chr(0xc4).chr(0xaa),
	 '=o' => chr(0xc4).chr(0x8d),
	 '=O' => chr(0xc4).chr(0x8c),
	 '=u' => chr(0xc4).chr(0xab),
	 '=U' => chr(0xc4).chr(0xaa),
	 
	 # stroke - handled specially - see below
	 
	 # cedilla - handled specially
	 
	 );
    
# these are one letter latex commands - we make sure they're not a longer
# command name. eg {\d} is d+stroke, so careful of \d
    my %special_utf8_chars = 
	(
	 # caron
	 'v n' => chr(0xc5).chr(0x88),
	 'v N' => chr(0xc5).chr(0x87),
	 'v s' => chr(0xc5).chr(0xa1),
	 'v S' => chr(0xc5).chr(0xa5),
	 # cedilla
	 'c c' => chr(0xc3).chr(0xa7),
	 'c C' => chr(0xc3).chr(0x87),
	 'c g' => chr(0xc4).chr(0xa3),
	 'c G' => chr(0xc4).chr(0xa2),
	 'c k' => chr(0xc4).chr(0xb7),
	 'c K' => chr(0xc4).chr(0xb6),
	 'c l' => chr(0xc4).chr(0xbc),
	 'c L' => chr(0xc4).chr(0xbb),
	 'c n' => chr(0xc5).chr(0x86),
	 'c N' => chr(0xc5).chr(0x85),
	 'c r' => chr(0xc5).chr(0x97),
	 'c R' => chr(0xc5).chr(0x96),
	 'c s' => chr(0xc5).chr(0x9f),
	 'c S' => chr(0xc5).chr(0x9e),
	 'c t' => chr(0xc5).chr(0xa3),
	 'c T' => chr(0xc5).chr(0xa2),
	 # double acute / Hungarian accent
	 'H O' => chr(0xc5).chr(0x90),
	 'H o' => chr(0xc5).chr(0x91),
	 'H U' => chr(0xc5).chr(0xb0),
	 'H u' => chr(0xc5).chr(0xb1),
	 
	 # stroke 
	 'd' => chr(0xc4).chr(0x91),
	 'D' => chr(0xc4).chr(0x90),
	 'h' => chr(0xc4).chr(0xa7),
#	 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
	 'l' => chr(0xc5).chr(0x82),
	 'L' => chr(0xc5).chr(0x81),
	 'o' => chr(0xc3).chr(0xb8),
	 'O' => chr(0xc3).chr(0x98),
	 't' => chr(0xc5).chr(0xa7),
	 'T' => chr(0xc5).chr(0xa6),
	 # german ss/szlig/sharp s
	 'ss' => chr(0xc3).chr(0x9f),
	 );
    
    # convert latex-style accented characters.
    # remove space (if any) between \ and letter to accent (eg {\' a})

    $text =~ s@(\\[`'="])\s(\w)@$1$2@g;

    # remove {} around a single character (eg \'{e})
    $text =~ s@(\\[`'="]){(\w)}@$1$2@;

    # remove {} around a single character for special 1 letter commands -
    # need to insert a space. Eg \v{s}  ->  {\v s}
    $text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;

    # this is slow (go through whole hash for each substitution!) so
    # only do if the text contains a '\' character. 
    if ($text =~ m|\\|) {
      for $latex_code (keys %utf8_chars) {
	  $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
      }

      # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
      # only do the change if immediately followed by a space, }, {, or \
      for $latex_code (keys %special_utf8_chars) {
	  $text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
      }
    }

    # remove latex groupings { } (but not \{ or \} )
    # note - need it like this for first char match - eg {xx}{yy}
    while ($text =~ s@([^\\]){([^}]*?[^\\])}@$1$2@g) {}

    # remove latex commands
    $text =~ s@\\\w+{(.*)}@$1@g;

    # maths mode $...$ - this is not interpreted in any way at the moment...
    $text =~ s@\$(.*)\$@$1@g;

    # quoted { } chars
    $text =~ s@\\{@{@g;
    $text =~ s@\\}@}@g;

    return $text;
}


sub set_OID {
    my $self = shift (@_);
    my ($doc_obj, $id, $segment_number) = @_;
    
    if ( $self->{'key'} eq "default") {
	$doc_obj->set_OID();
    } else {
	$doc_obj->set_OID($self->{'key'});
    }
}

1;