source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 2484

Last change on this file since 2484 was 2484, checked in by say1, 23 years ago

Changed SplitPlug to allow control over the OID. Changed BibTexPlug to be more permissive in the Bibtex format it accepts. Changed BibTexPlug to use the BibTex key as the OID.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.9 KB
RevLine 
[1676]1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37
38
39package BibTexPlug;
40
41use SplitPlug;
42
43
44# BibTexPlug is a sub-class of BasPlug.
45sub BEGIN {
46 @ISA = ('SplitPlug');
47}
48
49# This plugin processes files with the suffix ".bib"
50sub get_default_process_exp {
51 return q^(?i)\.bib$^;
52}
53
54# This plugin splits the input text at blank lines
55sub get_default_split_exp {
56 return q^\n+(?=@)^;
57}
58
59# The process function reads a single bibliographic record and stores
60# it as a new document.
61
62sub process {
63 my $self = shift (@_);
64 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
65 my $outhandle = $self->{'outhandle'};
66
[2484]67 $self->{'key'} = "default";
68
[1676]69 # Check that we're dealing with a valid BibTex record
70 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
71
72 # Ignore things we can't use
73 return 0 if ($$textref =~ /^\@String/);
74
75 # Report that we're processing the file
76 print $outhandle "BibTexPlug: processing $file\n"
77 if ($self->{'verbosity'}) > 1;
78
79
[1711]80 # This hash translates BibTex field names into metadata names. The
[1676]81 # BibTex names are taken from the "Local Guide to Latex" Graeme
82 # McKinstry. Metadata names are consistabnt with ReferPlug.
83
84 my %field = (
85 'address', 'PublisherAddress',
86 'author', 'Creator',
87 'booktitle', 'Booktitle',
88 'chapter', 'Chapter',
89 'edition', 'Edition',
90 'editor', 'Editor',
91 'institution', 'Publisher',
92 'journal', 'Journal',
93 'month', 'Month',
94 'number', 'Number',
95 'pages', 'Pages',
96 'publisher', 'Publisher',
97 'school', 'Publisher',
98 'title', 'Title',
99 'volume', 'Volume',
100 'year', 'Date',
101
102 'keywords', 'Keywords',
103 'abstract', 'Abstract',
104 'copyright', 'Copyright');
105
106 # Metadata fields
107 my %metadata;
108 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
109
110 # Make sure the text has exactly one entry per line
111 my $lines = $$textref;
112 $lines =~ s/,\s*\n/=====/g;
113 $lines =~ s/\s+/ /g;
114 $lines =~ s/\s*=====\s*/\n/g;
115 my @lines = split(/\n+/, $lines);
116
117 # Read and process each line in the bib file.
118 my ($id, $name, $value, $line);
119 foreach $line (@lines) {
120
121 # Add each line. Most lines consist of a field identifer and
122 # then data, and we simply store them, though we treat some
123 # of the fields a bit differently.
124
125 $line =~ s/\s+/ /g;
126 $text .= "$line\n";
127
[2484]128
[1676]129 # The first line is special, it contains the reference type and OID
[2484]130 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
[1676]131 $EntryType = $1;
132 $EntryID = $2;
[2484]133 print "** $EntryType - \"$EntryID\" \n"
134 if ($verbosity >= 4);
135 $self->{'key'} = $EntryID;
[1676]136 next;
137 }
[2484]138 if ($line =~ /\@/) {
139 print "bibtexplug: suspect line in bibtex file: $line\n"
140 if ($verbosity >= 2);
141 print "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
142 if ($verbosity >= 2);
143 }
[1676]144
145 # otherwise, parse the metadata out of this line
146 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
147 $id = lc($1);
148 $value = $2;
149
150 # Add this line of metadata
151 $metadata{$id} .= "$value\n";
152 }
153
[1677]154 # Add the Entry type as metadata
155 $doc_obj->add_metadata ($cursection, "EntryType", $EntryType);
[1676]156
157 # Add the various field as metadata
158 foreach my $id (keys %metadata) {
159
160 next unless (defined $field{$id});
161 next unless (defined $metadata{$id});
162
163 $name = $field{$id};
164 $value = $metadata{$id};
165
166 # Get rid of silly Latex stuff
167 if ($value =~ /\"(.*)\"/) {
168 $value = $1;
169 }
170 if ($value =~ /\{(.*)\}/) {
171 $value = $1;
172 }
173
174 # Add the various field as metadata
175 $value = &text_into_html($value);
176 $doc_obj->add_metadata ($cursection, $name, $value);
177
178 # Several special operatons on metadata follow
179
180 # Add individual keywords.
181 # The full set of keywords will be added, in due course, as "Keywords".
182 # However, we also want to add them as individual "Keyword" metadata elements.
183 if ($id eq "keywords") {
184 my @keywordlist = split(/,/, $value);
185 foreach my $k (@keywordlist) {
186 $k = lc($k);
187 $k =~ s/\s*$//;
188 $k =~ s/^\s*//;
189 if ($k =~ /\w/) {
190 $k = &text_into_html($k);
191 $doc_obj->add_metadata ($cursection, "Keyword", $k);
192 }
193 }
194 }
195
196 # Add individual authors
197 # The author metadata will be stored as one "Creator" entry, but we
198 # also want to split it into several individual "Author" fields in
199 # "Lastename, Firstnames" format so we can browse it.
200 if ($id eq "author") {
201
202 my @authorlist = split(/(,|and)/, $value);
203 foreach $a (@authorlist) {
204 $a =~ s/\s*$//;
205 $a =~ s/^\s*//;
206
207 # Reformat and add author name
208 my @words = split(/ /, $a);
209 my $lastname = pop @words;
210 my $firstname = join(" ", @words);
211 my $fullname = $lastname . ", " . $firstname;
212
213 # Add each name to set of Authors
214 if ($fullname =~ /\w+, \w+/) {
215 $fullname = &text_into_html($fullname);
216 $doc_obj->add_metadata ($cursection, "Author", $fullname);
217 }
218 }
219 }
220
221 # Books and Journals are additionally marked for display purposes
222 if ($id eq "booktitle") {
223 $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
224 } elsif ($id eq "journal") {
225 $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
226 }
227
228 }
229
230 # Add the text in BibTex format (all fields)
231 if ($text =~ /\w/) {
232 $text = &text_into_html($text);
233 $doc_obj->add_text ($cursection, $text);
234 }
235
[1677]236 return 1;
[1676]237}
238
[1677]239
[1676]240# Convert a text string into HTML.
241
[1677]242# The HTML is going to be inserted into a GML file, so we have to be
243# careful not to use symbols like ">", which ocurs frequently in email
244# messages (and use &gt instead.
245
246# This function also turns URLs and email addresses into links, and
247# replaces carriage returns with <BR> tags (and multiple carriage returns
248# with <P> tags).
249
250
[1676]251sub text_into_html {
252 my ($text) = @_;
253
254
255 # Convert problem charaters into HTML symbols
256 $text =~ s/&/&amp;/g;
257 $text =~ s/</&lt;/g;
258 $text =~ s/>/&gt;/g;
259 $text =~ s/\"/&quot;/g;
260 $text =~ s/\'/ /g;
261 $text =~ s/\+/ /g;
262 $text =~ s/\(/ /g;
263 $text =~ s/\)/ /g;
264
265 # convert email addresses and URLs into links
266 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
267 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
268
269 # Clean up whitespace and convert \n charaters to <BR> or <P>
270 $text =~ s/ +/ /g;
271 $text =~ s/\s*$//;
272 $text =~ s/^\s*//;
273 $text =~ s/\n/\n<BR>/g;
274 $text =~ s/<BR>\s*<BR>/<P>/g;
275
276 return $text;
277}
278
[2484]279sub set_OID {
280 my $self = shift (@_);
281 my ($doc_obj, $id, $segment_number) = @_;
282
283 if ( $self->{'key'} eq "default") {
284 $doc_obj->set_OID();
285 } else {
286 $doc_obj->set_OID($self->{'key'});
287 }
288}
[1676]289
[2484]2901;
Note: See TracBrowser for help on using the repository browser.