###########################################################################
#
# BibTexPlug.pm - a plugin for bibliography records in BibTex format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 2000 Gordon W. Paynter
# Copyright 1999-2000 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# BibTexPlug reads bibliography files in BibTex format.
#
# by Gordon W. Paynter (gwp@cs.waikato.ac.nz), November 2000
# Based on ReferPlug. See ReferPlug for genealogy.
#
# BibTexPlug creates a document object for every reference in the file.
# It is a subclass of SplitPlug, so if there are multiple records, all
# are read.
package BibTexPlug;
use SplitPlug;
# BibTexPlug is a sub-class of SplitPlug.
BEGIN {
    # Set up inheritance at compile time: the only parent class is
    # SplitPlug.
    @ISA = qw(SplitPlug);
}
# This plugin processes files with the suffix ".bib"
sub get_default_process_exp {
    # Regular expression source (as a string): a case-insensitive
    # match on file names that end in ".bib".
    my $bib_suffix_exp = '(?i)\.bib$';
    return $bib_suffix_exp;
}
# This plugin splits the input text at newlines that are followed by "@"
sub get_default_split_exp {
    # Regular expression source (as a string): one or more newlines
    # that are immediately followed by an "@" character.  The "@" is
    # only looked at, not consumed.
    my $record_break_exp = '\n+(?=@)';
    return $record_break_exp;
}
# The process function reads a single bibliographic record and stores
# it as a new document.
#
# Arguments (after $self):
#   $textref  - reference to the text of one BibTex record
#   $pluginfo - accepted but not used here
#   $base_dir - accepted but not used here
#   $file     - file name, only used in the progress message
#   $metadata - accepted but not used here
#   $doc_obj  - document object that receives the metadata and text
#
# Returns undef if the text does not look like a BibTex record, 0 for
# @String entries (which are ignored) and 1 once the record is stored.
# As a side effect $self->{'key'} is set to the BibTex citation key, or
# left as "default" when no key could be found.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'} || 0;

    $self->{'key'} = "default";

    # Check that we're dealing with a valid BibTex record
    return undef unless ($$textref =~ /^\@\w+\{.*\}/s);

    # Ignore things we can't use
    return 0 if ($$textref =~ /^\@String/);

    # Report that we're processing the file
    print $outhandle "BibTexPlug: processing $file\n"
        if ($verbosity > 1);

    # Everything is attached to the document's top-level section.
    my $cursection = $doc_obj->get_top_section();

    # This hash translates BibTex field names into metadata names.  The
    # BibTex names are taken from the "Local Guide to Latex" Graeme
    # McKinstry.  Metadata names are consistent with ReferPlug.
    my %field = (
        'address'     => 'PublisherAddress',
        'author'      => 'Creator',
        'booktitle'   => 'Booktitle',
        'chapter'     => 'Chapter',
        'edition'     => 'Edition',
        'editor'      => 'Editor',
        'institution' => 'Publisher',
        'journal'     => 'Journal',
        'month'       => 'Month',
        'number'      => 'Number',
        'pages'       => 'Pages',
        'publisher'   => 'Publisher',
        'school'      => 'Publisher',
        'title'       => 'Title',
        'volume'      => 'Volume',
        'year'        => 'Date',
        'keywords'    => 'Keywords',
        'abstract'    => 'Abstract',
        'copyright'   => 'Copyright',
    );

    # Make sure the text has exactly one entry per line: a comma at the
    # end of a line marks the end of an entry, every other run of
    # whitespace (including newlines) is folded into a single space.
    my $lines = $$textref;
    $lines =~ s/,\s*\n/=====/g;
    $lines =~ s/\s+/ /g;
    $lines =~ s/\s*=====\s*/\n/g;
    my @lines = split(/\n+/, $lines);

    # Raw field values keyed by lower-cased BibTex field name
    my %metadata;
    my $EntryType;
    my $text = "";

    # Read and process each line of the record.
    foreach my $line (@lines) {

        # Add each line.  Most lines consist of a field identifier and
        # then data, and we simply store them, though we treat some
        # of the fields a bit differently.
        $line =~ s/\s+/ /g;
        $text .= "$line\n";

        # The first line is special, it contains the reference type and OID
        if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
            $EntryType = $1;
            my $EntryID = $2;
            print $outhandle "** $EntryType - \"$EntryID\" \n"
                if ($verbosity >= 4);
            $self->{'key'} = $EntryID;
            next;
        }

        # Any other "@" may mean the header pattern above missed the
        # start of a record (it may also just be an email address).
        if (($line =~ /\@/) && ($verbosity >= 2)) {
            print $outhandle "bibtexplug: suspect line in bibtex file: $line\n";
            print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n";
        }

        # otherwise, parse the metadata out of this line
        next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
        my $id = lc($1);
        my $value = $2;

        # When the last field has no trailing comma, the brace that
        # closes the whole record ends up on its line ("year = 2000 }").
        # Drop that brace so it does not become part of the value.
        my $opened = ($value =~ tr/{//);
        my $closed = ($value =~ tr/}//);
        $value =~ s/\s*\}\s*$// if ($closed > $opened);

        # Add this line of metadata
        $metadata{$id} .= "$value\n";
    }

    # Add the Entry type as metadata (only if the header line was found)
    $doc_obj->add_metadata ($cursection, "EntryType", $EntryType)
        if (defined $EntryType);

    # Add the various fields as metadata.  Keys are sorted so that the
    # metadata is always added in the same order.
    foreach my $id (sort keys %metadata) {
        next unless (defined $field{$id});

        my $name = $field{$id};
        my $value = $metadata{$id};

        # Get rid of silly Latex stuff: keep only what is inside the
        # outermost quotes and/or braces.
        if ($value =~ /\"(.*)\"/) {
            $value = $1;
        }
        if ($value =~ /\{(.*)\}/) {
            $value = $1;
        }

        # Keep the unescaped value for the per-keyword and per-author
        # splitting below; running text_into_html over text it has
        # already processed would encode the entities a second time.
        my $raw = $value;

        # Add the whole field as metadata
        $value = text_into_html($value);
        $doc_obj->add_metadata ($cursection, $name, $value);

        # Several special operations on metadata follow

        # Add individual keywords.
        # The full set of keywords has been added above as "Keywords".
        # We also add them as individual lower-case "Keyword" elements.
        if ($id eq "keywords") {
            my @keywordlist = split(/,/, $raw);
            foreach my $k (@keywordlist) {
                $k = lc($k);
                $k =~ s/\s*$//;
                $k =~ s/^\s*//;
                next unless ($k =~ /\w/);
                $doc_obj->add_metadata ($cursection, "Keyword",
                                        text_into_html($k));
            }
        }

        # Add individual authors.
        # The author metadata has been stored as one "Creator" entry, but
        # we also split it into several individual "Author" fields in
        # "Lastname, Firstnames" format so we can browse it.
        if ($id eq "author") {
            # Split at commas and at the whole word "and" only, so that
            # names which merely contain "and" (e.g. "Sandra") survive.
            my @authorlist = split(/\s*(?:,|\band\b)\s*/, $raw);
            foreach my $author (@authorlist) {
                $author =~ s/\s*$//;
                $author =~ s/^\s*//;

                # Reformat the name: the last word is the last name
                my @words = split(/ /, $author);
                my $lastname = pop @words;
                next unless (defined $lastname);
                my $firstname = join(" ", @words);
                my $fullname = $lastname . ", " . $firstname;

                # Add each name to the set of Authors (names without
                # both a last and a first part are skipped)
                if ($fullname =~ /\w+, \w+/) {
                    $doc_obj->add_metadata ($cursection, "Author",
                                            text_into_html($fullname));
                }
            }
        }

        # Books and Journals are additionally marked for display purposes
        if ($id eq "booktitle") {
            $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
        } elsif ($id eq "journal") {
            $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
        }
    }

    # Add the text in BibTex format (all fields)
    if ($text =~ /\w/) {
        $text = text_into_html($text);
        $doc_obj->add_text ($cursection, $text);
    }

    return 1;
}
# Convert a text string into HTML.
# The HTML is going to be inserted into a GML file, so we have to be
# careful not to use symbols like ">", which occurs frequently in email
# messages (and use "&gt;" instead).
# This function also turns URLs and email addresses into links, and
# replaces carriage returns with <BR> tags (and multiple carriage returns
# with <P> tags).
sub text_into_html {
    # Takes one plain-text string and returns the HTML version of it.
    my ($text) = @_;

    # Convert problem characters into HTML symbols.  "&" has to go first
    # so the entities produced by the later lines are not re-encoded.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;

    # These characters are simply replaced by a space
    $text =~ s/\'/ /g;
    $text =~ s/\+/ /g;
    $text =~ s/\(/ /g;
    $text =~ s/\)/ /g;

    # convert email addresses and URLs into links
    $text =~ s{([\w.\-]+\@[\w.\-]+)}{<a href="mailto:$1">$1</a>}g;
    $text =~ s{(https?://[\w.\-]+[/\w.\-]*)}{<a href="$1">$1</a>}g;

    # Clean up whitespace and convert \n characters to <BR> or <P>:
    # a single newline becomes <BR>, an empty line becomes <P>.
    $text =~ s/ +/ /g;
    $text =~ s/\s*$//;
    $text =~ s/^\s*//;
    $text =~ s/\n/\n<BR>/g;
    $text =~ s/<BR>\s*<BR>/<P>/g;

    return $text;
}
sub set_OID {
    # Further arguments ($id, $segment_number) are accepted but unused.
    my ($self, $doc_obj) = @_;

    # While the key is still the "default" placeholder, call the
    # document's set_OID without an argument; otherwise pass the key
    # stored in $self->{'key'} as the identifier.
    return $doc_obj->set_OID()
        if ($self->{'key'} eq "default");

    return $doc_obj->set_OID($self->{'key'});
}
# A Perl module file has to finish with a true value.
1;