Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 2484

Last change on this file since 2484 was 2484, checked in by say1, 23 years ago
Changed SplitPlug to allow control over the OID. Changed BibTexPlug to be more permissive in the Bibtex format it accepts. Changed BibTexPlug to use the BibTex key as the OID.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.9 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright 2000 Gordon W. Paynter
	10	# Copyright 1999-2000 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28
	29	# BibTexPlug reads bibliography files in BibTex format.
	30	#
	31	# by Gordon W. Paynter ([email protected]), November 2000
	32	# Based on ReferPlug. See ReferPlug for geneology.
	33	#
	34	# BibTexPlug creates a document object for every reference a the file.
	35	# It is a subclass of SplitPlug, so if there are multiple records, all
	36	# are read.
	37
	38
	39	package BibTexPlug;
	40
	41	use SplitPlug;
	42
	43
	44	# BibTexPlug is a sub-class of BasPlug.
	45	sub BEGIN {
	46	@ISA = ('SplitPlug');
	47	}
	48
	49	# This plugin processes files with the suffix ".bib"
	50	sub get_default_process_exp {
	51	return q^(?i)\.bib$^;
	52	}
	53
	54	# This plugin splits the input text at blank lines
	55	sub get_default_split_exp {
	56	return q^\n+(?=@)^;
	57	}
	58
	59	# The process function reads a single bibliographic record and stores
	60	# it as a new document.
	61
	62	sub process {
	63	my $self = shift (@_);
	64	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
	65	my $outhandle = $self->{'outhandle'};
	66
[2484]	67	$self->{'key'} = "default";
	68
[1676]	69	# Check that we're dealing with a valid BibTex record
	70	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
	71
	72	# Ignore things we can't use
	73	return 0 if ($$textref =~ /^\@String/);
	74
	75	# Report that we're processing the file
	76	print $outhandle "BibTexPlug: processing $file\n"
	77	if ($self->{'verbosity'}) > 1;
	78
	79
[1711]	80	# This hash translates BibTex field names into metadata names. The
[1676]	81	# BibTex names are taken from the "Local Guide to Latex" Graeme
	82	# McKinstry. Metadata names are consistabnt with ReferPlug.
	83
	84	my %field = (
	85	'address', 'PublisherAddress',
	86	'author', 'Creator',
	87	'booktitle', 'Booktitle',
	88	'chapter', 'Chapter',
	89	'edition', 'Edition',
	90	'editor', 'Editor',
	91	'institution', 'Publisher',
	92	'journal', 'Journal',
	93	'month', 'Month',
	94	'number', 'Number',
	95	'pages', 'Pages',
	96	'publisher', 'Publisher',
	97	'school', 'Publisher',
	98	'title', 'Title',
	99	'volume', 'Volume',
	100	'year', 'Date',
	101
	102	'keywords', 'Keywords',
	103	'abstract', 'Abstract',
	104	'copyright', 'Copyright');
	105
	106	# Metadata fields
	107	my %metadata;
	108	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
	109
	110	# Make sure the text has exactly one entry per line
	111	my $lines = $$textref;
	112	$lines =~ s/,\s*\n/=====/g;
	113	$lines =~ s/\s+/ /g;
	114	$lines =~ s/\s=====\s/\n/g;
	115	my @lines = split(/\n+/, $lines);
	116
	117	# Read and process each line in the bib file.
	118	my ($id, $name, $value, $line);
	119	foreach $line (@lines) {
	120
	121	# Add each line. Most lines consist of a field identifer and
	122	# then data, and we simply store them, though we treat some
	123	# of the fields a bit differently.
	124
	125	$line =~ s/\s+/ /g;
	126	$text .= "$line\n";
	127
[2484]	128
[1676]	129	# The first line is special, it contains the reference type and OID
[2484]	130	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
[1676]	131	$EntryType = $1;
	132	$EntryID = $2;
[2484]	133	print "** $EntryType - \"$EntryID\" \n"
	134	if ($verbosity >= 4);
	135	$self->{'key'} = $EntryID;
[1676]	136	next;
	137	}
[2484]	138	if ($line =~ /\@/) {
	139	print "bibtexplug: suspect line in bibtex file: $line\n"
	140	if ($verbosity >= 2);
	141	print "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
	142	if ($verbosity >= 2);
	143	}
[1676]	144
	145	# otherwise, parse the metadata out of this line
	146	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
	147	$id = lc($1);
	148	$value = $2;
	149
	150	# Add this line of metadata
	151	$metadata{$id} .= "$value\n";
	152	}
	153
[1677]	154	# Add the Entry type as metadata
	155	$doc_obj->add_metadata ($cursection, "EntryType", $EntryType);
[1676]	156
	157	# Add the various field as metadata
	158	foreach my $id (keys %metadata) {
	159
	160	next unless (defined $field{$id});
	161	next unless (defined $metadata{$id});
	162
	163	$name = $field{$id};
	164	$value = $metadata{$id};
	165
	166	# Get rid of silly Latex stuff
	167	if ($value =~ /\"(.*)\"/) {
	168	$value = $1;
	169	}
	170	if ($value =~ /\{(.*)\}/) {
	171	$value = $1;
	172	}
	173
	174	# Add the various field as metadata
	175	$value = &text_into_html($value);
	176	$doc_obj->add_metadata ($cursection, $name, $value);
	177
	178	# Several special operatons on metadata follow
	179
	180	# Add individual keywords.
	181	# The full set of keywords will be added, in due course, as "Keywords".
	182	# However, we also want to add them as individual "Keyword" metadata elements.
	183	if ($id eq "keywords") {
	184	my @keywordlist = split(/,/, $value);
	185	foreach my $k (@keywordlist) {
	186	$k = lc($k);
	187	$k =~ s/\s*$//;
	188	$k =~ s/^\s*//;
	189	if ($k =~ /\w/) {
	190	$k = &text_into_html($k);
	191	$doc_obj->add_metadata ($cursection, "Keyword", $k);
	192	}
	193	}
	194	}
	195
	196	# Add individual authors
	197	# The author metadata will be stored as one "Creator" entry, but we
	198	# also want to split it into several individual "Author" fields in
	199	# "Lastename, Firstnames" format so we can browse it.
	200	if ($id eq "author") {
	201
	202	my @authorlist = split(/(,\|and)/, $value);
	203	foreach $a (@authorlist) {
	204	$a =~ s/\s*$//;
	205	$a =~ s/^\s*//;
	206
	207	# Reformat and add author name
	208	my @words = split(/ /, $a);
	209	my $lastname = pop @words;
	210	my $firstname = join(" ", @words);
	211	my $fullname = $lastname . ", " . $firstname;
	212
	213	# Add each name to set of Authors
	214	if ($fullname =~ /\w+, \w+/) {
	215	$fullname = &text_into_html($fullname);
	216	$doc_obj->add_metadata ($cursection, "Author", $fullname);
	217	}
	218	}
	219	}
	220
	221	# Books and Journals are additionally marked for display purposes
	222	if ($id eq "booktitle") {
	223	$doc_obj->add_metadata($cursection, "BookConfOnly", 1);
	224	} elsif ($id eq "journal") {
	225	$doc_obj->add_metadata($cursection, "JournalsOnly", 1);
	226	}
	227
	228	}
	229
	230	# Add the text in BibTex format (all fields)
	231	if ($text =~ /\w/) {
	232	$text = &text_into_html($text);
	233	$doc_obj->add_text ($cursection, $text);
	234	}
	235
[1677]	236	return 1;
[1676]	237	}
	238
[1677]	239
[1676]	240	# Convert a text string into HTML.
	241
[1677]	242	# The HTML is going to be inserted into a GML file, so we have to be
	243	# careful not to use symbols like ">", which ocurs frequently in email
	244	# messages (and use &gt instead.
	245
	246	# This function also turns URLs and email addresses into links, and
	247	# replaces carriage returns with <BR> tags (and multiple carriage returns
	248	# with <P> tags).
	249
	250
[1676]	251	sub text_into_html {
	252	my ($text) = @_;
	253
	254
	255	# Convert problem charaters into HTML symbols
	256	$text =~ s/&/&/g;
	257	$text =~ s/</</g;
	258	$text =~ s/>/>/g;
	259	$text =~ s/\"/"/g;
	260	$text =~ s/\'/ /g;
	261	$text =~ s/\+/ /g;
	262	$text =~ s/\(/ /g;
	263	$text =~ s/\)/ /g;
	264
	265	# convert email addresses and URLs into links
	266	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
	267	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
	268
	269	# Clean up whitespace and convert \n charaters to <BR> or <P>
	270	$text =~ s/ +/ /g;
	271	$text =~ s/\s*$//;
	272	$text =~ s/^\s*//;
	273	$text =~ s/\n/\n<BR>/g;
	274	$text =~ s/<BR>\s*<BR>/<P>/g;
	275
	276	return $text;
	277	}
	278
[2484]	279	sub set_OID {
	280	my $self = shift (@_);
	281	my ($doc_obj, $id, $segment_number) = @_;
	282
	283	if ( $self->{'key'} eq "default") {
	284	$doc_obj->set_OID();
	285	} else {
	286	$doc_obj->set_OID($self->{'key'});
	287	}
	288	}
[1676]	289
[2484]	290	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: