source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 1711

Last change on this file since 1711 was 1711, checked in by say1, 23 years ago

fixed minor spelling mistake

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37
38
39package BibTexPlug;
40
41use SplitPlug;
42
43
44# BibTexPlug is a sub-class of BasPlug.
45sub BEGIN {
46 @ISA = ('SplitPlug');
47}
48
49# This plugin processes files with the suffix ".bib"
50sub get_default_process_exp {
51 return q^(?i)\.bib$^;
52}
53
54# This plugin splits the input text at blank lines
55sub get_default_split_exp {
56 return q^\n+(?=@)^;
57}
58
59
60# The process function reads a single bibliographic record and stores
61# it as a new document.
62
63sub process {
64 my $self = shift (@_);
65 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
66 my $outhandle = $self->{'outhandle'};
67
68 # Check that we're dealing with a valid BibTex record
69 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
70
71 # Ignore things we can't use
72 return 0 if ($$textref =~ /^\@String/);
73
74 # Report that we're processing the file
75 print $outhandle "BibTexPlug: processing $file\n"
76 if ($self->{'verbosity'}) > 1;
77
78
79 # This hash translates BibTex field names into metadata names. The
80 # BibTex names are taken from the "Local Guide to Latex" Graeme
81 # McKinstry. Metadata names are consistabnt with ReferPlug.
82
83 my %field = (
84 'address', 'PublisherAddress',
85 'author', 'Creator',
86 'booktitle', 'Booktitle',
87 'chapter', 'Chapter',
88 'edition', 'Edition',
89 'editor', 'Editor',
90 'institution', 'Publisher',
91 'journal', 'Journal',
92 'month', 'Month',
93 'number', 'Number',
94 'pages', 'Pages',
95 'publisher', 'Publisher',
96 'school', 'Publisher',
97 'title', 'Title',
98 'volume', 'Volume',
99 'year', 'Date',
100
101 'keywords', 'Keywords',
102 'abstract', 'Abstract',
103 'copyright', 'Copyright');
104
105 # Metadata fields
106 my %metadata;
107 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
108
109 # Make sure the text has exactly one entry per line
110 my $lines = $$textref;
111 $lines =~ s/,\s*\n/=====/g;
112 $lines =~ s/\s+/ /g;
113 $lines =~ s/\s*=====\s*/\n/g;
114 my @lines = split(/\n+/, $lines);
115
116 # Read and process each line in the bib file.
117 my ($id, $name, $value, $line);
118 foreach $line (@lines) {
119
120 # Add each line. Most lines consist of a field identifer and
121 # then data, and we simply store them, though we treat some
122 # of the fields a bit differently.
123
124 $line =~ s/\s+/ /g;
125 $text .= "$line\n";
126
127 # The first line is special, it contains the reference type and OID
128 if ($line =~ /\@(\w+)\{([\w\d]*),/) {
129 $EntryType = $1;
130 $EntryID = $2;
131 print "** $EntryType - $EntryID \n";
132 next;
133 }
134
135 # otherwise, parse the metadata out of this line
136 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
137 $id = lc($1);
138 $value = $2;
139
140 # Add this line of metadata
141 $metadata{$id} .= "$value\n";
142 }
143
144 # Add the Entry type as metadata
145 $doc_obj->add_metadata ($cursection, "EntryType", $EntryType);
146
147 # Add the various field as metadata
148 foreach my $id (keys %metadata) {
149
150 next unless (defined $field{$id});
151 next unless (defined $metadata{$id});
152
153 $name = $field{$id};
154 $value = $metadata{$id};
155
156 # Get rid of silly Latex stuff
157 if ($value =~ /\"(.*)\"/) {
158 $value = $1;
159 }
160 if ($value =~ /\{(.*)\}/) {
161 $value = $1;
162 }
163
164 # Add the various field as metadata
165 $value = &text_into_html($value);
166 $doc_obj->add_metadata ($cursection, $name, $value);
167
168 # Several special operatons on metadata follow
169
170 # Add individual keywords.
171 # The full set of keywords will be added, in due course, as "Keywords".
172 # However, we also want to add them as individual "Keyword" metadata elements.
173 if ($id eq "keywords") {
174 my @keywordlist = split(/,/, $value);
175 foreach my $k (@keywordlist) {
176 $k = lc($k);
177 $k =~ s/\s*$//;
178 $k =~ s/^\s*//;
179 if ($k =~ /\w/) {
180 $k = &text_into_html($k);
181 $doc_obj->add_metadata ($cursection, "Keyword", $k);
182 }
183 }
184 }
185
186 # Add individual authors
187 # The author metadata will be stored as one "Creator" entry, but we
188 # also want to split it into several individual "Author" fields in
189 # "Lastename, Firstnames" format so we can browse it.
190 if ($id eq "author") {
191
192 my @authorlist = split(/(,|and)/, $value);
193 foreach $a (@authorlist) {
194 $a =~ s/\s*$//;
195 $a =~ s/^\s*//;
196
197 # Reformat and add author name
198 my @words = split(/ /, $a);
199 my $lastname = pop @words;
200 my $firstname = join(" ", @words);
201 my $fullname = $lastname . ", " . $firstname;
202
203 # Add each name to set of Authors
204 if ($fullname =~ /\w+, \w+/) {
205 $fullname = &text_into_html($fullname);
206 $doc_obj->add_metadata ($cursection, "Author", $fullname);
207 }
208 }
209 }
210
211 # Books and Journals are additionally marked for display purposes
212 if ($id eq "booktitle") {
213 $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
214 } elsif ($id eq "journal") {
215 $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
216 }
217
218 }
219
220 # Add the text in BibTex format (all fields)
221 if ($text =~ /\w/) {
222 $text = &text_into_html($text);
223 $doc_obj->add_text ($cursection, $text);
224 }
225
226 return 1;
227}
228
2291;
230
231
232# Convert a text string into HTML.
233
234# The HTML is going to be inserted into a GML file, so we have to be
235# careful not to use symbols like ">", which ocurs frequently in email
236# messages (and use &gt instead.
237
238# This function also turns URLs and email addresses into links, and
239# replaces carriage returns with <BR> tags (and multiple carriage returns
240# with <P> tags).
241
242
243sub text_into_html {
244 my ($text) = @_;
245
246
247 # Convert problem charaters into HTML symbols
248 $text =~ s/&/&amp;/g;
249 $text =~ s/</&lt;/g;
250 $text =~ s/>/&gt;/g;
251 $text =~ s/\"/&quot;/g;
252 $text =~ s/\'/ /g;
253 $text =~ s/\+/ /g;
254 $text =~ s/\(/ /g;
255 $text =~ s/\)/ /g;
256
257 # convert email addresses and URLs into links
258 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
259 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
260
261 # Clean up whitespace and convert \n charaters to <BR> or <P>
262 $text =~ s/ +/ /g;
263 $text =~ s/\s*$//;
264 $text =~ s/^\s*//;
265 $text =~ s/\n/\n<BR>/g;
266 $text =~ s/<BR>\s*<BR>/<P>/g;
267
268 return $text;
269}
270
271
Note: See TracBrowser for help on using the repository browser.