###########################################################################
#
# BibTexPlug.pm - a plugin for bibliography records in BibTex format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 2000 Gordon W. Paynter
# Copyright 1999-2000 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# BibTexPlug reads bibliography files in BibTex format.
#
# by Gordon W. Paynter (gwp@cs.waikato.ac.nz), November 2000
# Based on ReferPlug.  See ReferPlug for genealogy.
#
# BibTexPlug creates a document object for every reference in the file.
# It is a subclass of SplitPlug, so if there are multiple records, all
# are read.

package BibTexPlug;

use SplitPlug;

# Inherit from SplitPlug.  The assignment happens at compile time so the
# parent class is in place before any method of this package is resolved.
BEGIN {
    @ISA = qw(SplitPlug);
}

# Default expression selecting the files this plugin handles: any name
# ending in ".bib", compared case-insensitively.
sub get_default_process_exp {
    return '(?i)\.bib$';
}

# Default expression used to split a file into records: a run of newlines
# that is immediately followed by "@" (the "@" itself is not consumed).
sub get_default_split_exp {
    return '\n+(?=@)';
}

# The process function reads a single bibliographic record and stores
# it as a new document.
# process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj)
#
#   $textref  - reference to the text of one BibTex record
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object that receives the metadata and text
#   ($pluginfo, $base_dir and $metadata are accepted but not used here.)
#
# Side effects: sets $self->{'key'} to the record's citation key (or leaves
# it as "default" when none is found); set_OID reads it afterwards.
#
# Returns undef when the text does not look like a BibTex record, 0 when
# the record is a BibTex directive that carries no reference (@string,
# @preamble, @comment), and 1 once the record has been stored.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    # Read the verbosity once from the plugin object.  The diagnostics
    # further down used to test an undeclared $verbosity global, which was
    # always undefined, so they could never fire.
    my $verbosity = $self->{'verbosity'} || 0;

    # Section argument handed to every add_metadata/add_text call.  It was
    # an undeclared (hence undefined) global; it is kept undefined so the
    # document object receives exactly what it received before.
    # NOTE(review): presumably this addresses the top-level section of the
    # document - confirm against the document object's API.
    my $cursection;

    $self->{'key'} = "default";

    # Check that we're dealing with a valid BibTex record.
    # (Explicit undef is kept: callers may distinguish it from 0.)
    return undef unless ($$textref =~ /^\@\w+\{.*\}/s);

    # Ignore things we can't use.  BibTex entry types are case-insensitive,
    # so "@string" must be skipped as well as "@String".
    return 0 if ($$textref =~ /^\@(?:string|preamble|comment)\b/i);

    # Report that we're processing the file
    print $outhandle "BibTexPlug: processing $file\n"
        if ($verbosity > 1);

    # This hash translates BibTex field names into metadata names.  The
    # BibTex names are taken from the "Local Guide to Latex" by Graeme
    # McKinstry.  Metadata names are consistent with ReferPlug.
    my %field = (
        'address'     => 'PublisherAddress',
        'author'      => 'Creator',
        'booktitle'   => 'Booktitle',
        'chapter'     => 'Chapter',
        'edition'     => 'Edition',
        'editor'      => 'Editor',
        'institution' => 'Publisher',
        'journal'     => 'Journal',
        'month'       => 'Month',
        'number'      => 'Number',
        'pages'       => 'Pages',
        'publisher'   => 'Publisher',
        'school'      => 'Publisher',
        'title'       => 'Title',
        'volume'      => 'Volume',
        'year'        => 'Date',
        'keywords'    => 'Keywords',
        'abstract'    => 'Abstract',
        'copyright'   => 'Copyright',
    );

    # Raw field values keyed by lower-cased BibTex field name
    my %metadata;
    my ($EntryType, $EntryID, $text);

    # Make sure the text has exactly one entry per line: a comma at the end
    # of a line marks the end of a field, every other line break is folded
    # into a single space.
    my $lines = $$textref;
    $lines =~ s/,\s*\n/=====/g;
    $lines =~ s/\s+/ /g;
    $lines =~ s/\s*=====\s*/\n/g;
    my @lines = split(/\n+/, $lines);

    # Read and process each line in the bib file.
    foreach my $line (@lines) {

        # Add each line.  Most lines consist of a field identifier and
        # then data, and we simply store them, though we treat some
        # of the fields a bit differently.
        $line =~ s/\s+/ /g;
        $text .= "$line\n";

        # The first line is special, it contains the reference type and OID
        if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
            $EntryType = $1;
            $EntryID   = $2;
            print $outhandle "** $EntryType - \"$EntryID\" \n"
                if ($verbosity >= 4);
            $self->{'key'} = $EntryID;
            next;
        }

        # An "@" anywhere else may mean two records were not split apart
        if ($line =~ /\@/) {
            print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
                if ($verbosity >= 2);
            print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
                if ($verbosity >= 2);
        }

        # Otherwise, parse the metadata out of this line.  Whitespace
        # around "=" is optional so that compact 'title="..."' fields are
        # picked up too.
        next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
        my $id    = lc($1);
        my $value = $2;

        # Add this line of metadata
        $metadata{$id} .= "$value\n";
    }

    # Add the Entry type as metadata
    $doc_obj->add_metadata ($cursection, "EntryType", $EntryType);

    # Add the various fields as metadata
    foreach my $id (keys %metadata) {

        next unless (defined $field{$id});
        next unless (defined $metadata{$id});

        my $name  = $field{$id};
        my $value = $metadata{$id};

        # Get rid of silly Latex stuff: keep only what is inside the
        # outermost quotes and/or braces
        if ($value =~ /\"(.*)\"/) {
            $value = $1;
        }
        if ($value =~ /\{(.*)\}/) {
            $value = $1;
        }

        # Add the field itself
        $value = &text_into_html($value);
        $doc_obj->add_metadata ($cursection, $name, $value);

        # Several special operations on metadata follow

        # Add individual keywords.
        # The full set of keywords has been added above as "Keywords".
        # However, we also want to add them as individual "Keyword"
        # metadata elements.
        if ($id eq "keywords") {
            my @keywordlist = split(/,/, $value);
            foreach my $k (@keywordlist) {
                $k = lc($k);
                $k =~ s/\s*$//;
                $k =~ s/^\s*//;
                if ($k =~ /\w/) {
                    $k = &text_into_html($k);
                    $doc_obj->add_metadata ($cursection, "Keyword", $k);
                }
            }
        }

        # Add individual authors.
        # The author metadata is stored as one "Creator" entry, but we
        # also want to split it into several individual "Author" fields in
        # "Lastname, Firstnames" format so we can browse it.
        if ($id eq "author") {
            # Split on commas and on the whole word "and" only; a bare
            # /and/ also matched inside names such as "Sandra" or
            # "Anderson" and tore them apart.
            my @authorlist = split(/,|\band\b/, $value);
            foreach my $author (@authorlist) {
                $author =~ s/\s*$//;
                $author =~ s/^\s*//;
                next unless ($author =~ /\w/);

                # Reformat and add author name
                my @words     = split(/ /, $author);
                my $lastname  = pop @words;
                my $firstname = join(" ", @words);
                my $fullname  = $lastname . ", " . $firstname;

                # Add each name to set of Authors; names consisting of a
                # single word are not recorded
                if ($fullname =~ /\w+, \w+/) {
                    $fullname = &text_into_html($fullname);
                    $doc_obj->add_metadata ($cursection, "Author", $fullname);
                }
            }
        }

        # Books and Journals are additionally marked for display purposes
        if ($id eq "booktitle") {
            $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
        } elsif ($id eq "journal") {
            $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
        }
    }

    # Add the text in BibTex format (all fields)
    if ($text =~ /\w/) {
        $text = &text_into_html($text);
        $doc_obj->add_text ($cursection, $text);
    }

    return 1;
}


# Convert a text string into HTML.
# The HTML is going to be inserted into a GML file, so we have to be
# careful not to use symbols like ">", which occurs frequently in email
# messages (and use &gt; instead).
# This function also turns URLs and email addresses into links, and
# replaces carriage returns with
# <BR> tags (and multiple carriage returns
# with <P> tags).
#
# text_into_html($text)
#   Takes a plain text string and returns it as an HTML fragment:
#     - "&", "<", ">" and '"' become character entities ("&" first, so the
#       entities produced for the others are not escaped a second time);
#     - "'", "+", "(" and ")" are replaced by a space;
#     - email addresses and http/https URLs are wrapped in <a> links;
#     - runs of spaces are collapsed, leading and trailing whitespace is
#       removed, each newline gains a <BR>, and consecutive <BR>s are
#       merged into a <P>.
#   An undefined argument yields the empty string.
sub text_into_html {
    my ($text) = @_;
    return '' unless (defined $text);

    # Convert problem characters into HTML symbols
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/\"/&quot;/g;
    $text =~ s/\'/ /g;
    $text =~ s/\+/ /g;
    $text =~ s/\(/ /g;
    $text =~ s/\)/ /g;

    # Convert email addresses and URLs into links
    $text =~ s{([\w\d.\-]+\@[\w\d.\-]+)}{<a href="mailto:$1">$1</a>}g;
    $text =~ s{(https?://[\w\d.\-]+[/\w\d.\-]*)}{<a href="$1">$1</a>}g;

    # Clean up whitespace and convert \n characters to <BR> or <P>
    $text =~ s/ +/ /g;
    $text =~ s/\s*$//;
    $text =~ s/^\s*//;
    $text =~ s/\n/\n<BR>/g;
    $text =~ s/<BR>\s*<BR>/<P>/g;

    return $text;
}


# set_OID($doc_obj, $id, $segment_number)
#   Assigns the document's object identifier.  When process() found a
#   citation key it is stored in $self->{'key'} and is passed to the
#   document object; while the key is still "default" the document
#   object's set_OID is called with no argument.
#   $id and $segment_number are accepted but not used here.
sub set_OID {
    my $self = shift (@_);
    my ($doc_obj, $id, $segment_number) = @_;

    if ($self->{'key'} eq "default") {
        $doc_obj->set_OID();
    } else {
        $doc_obj->set_OID($self->{'key'});
    }
}

1;