source: trunk/gsdl/perllib/plugins/ReferPlug.pm@ 1991

Last change on this file since 1991 was 1676, checked in by paynter, 24 years ago

Plugins for processing files of bibliography records in BibTex and Refer
format. SplitPlug is a plugin for splitting one text file into many
Greenstone documents. ReferPlug and BibTexPlug (which both inherit from
SplitPlug) are for processing individual Refer and BibTex records
respectively.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1###########################################################################
2#
3# ReferPlug.pm - a plugin for bibliography records in Refer format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# ReferPlug reads bibliography files in Refer format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32#
33# Loosely based on hcibib2Plug by Steve Jones ([email protected]).
34# Which was based on EMAILPlug by Gordon Paynter ([email protected]).
35# Which was based on old versions of HTMLplug and HCIBIBPlug by Stefan
36# Boddie and others -- it's hard to tell what came from where, now.
37#
38#
39# ReferPlug creates a document object for every reference in the file.
40# It is a subclass of SplitPlug, so if there are multiple records, all
41# are read.
42#
43# Document text:
44# The document text consists of the reference in Refer format
45#
46# Metadata:
47# $Creator %A Author name
48# $Title %T Title of article or book
49# $Journal %J Title of Journal
50# $Booktitle %B Title of book containing the publication
51# $Report %R Type of Report, paper or thesis
52# $Volume %V Volume Number of Journal
53# $Number %N Number of Journal within Volume
54# $Editor %E Editor name
55# $Pages %P Page Number of article
56# $Publisher %I Name of Publisher
57# $PublisherAddress %C Publisher's address
58# $Date %D Date of publication
59# $Keywords %K Keywords associated with publication
60# $Abstract %X Abstract of publication
61# $Copyright %* Copyright information for the article
62#
63
64
65package ReferPlug;
66
67use SplitPlug;
68
69
# ReferPlug is a sub-class of SplitPlug; the inheritance chain is set up
# at compile time.
BEGIN {
    our @ISA = qw(SplitPlug);
}
74
# Default pattern (as a string) selecting the files this plugin handles:
# any file name ending in ".bib", matched case-insensitively.
sub get_default_process_exp {
    my $bib_suffix = '(?i)\.bib$';
    return $bib_suffix;
}
79
# Default pattern (as a string) used to split the input text into
# records: a newline, optional whitespace, and another newline, i.e. a
# blank line.
sub get_default_split_exp {
    my $blank_line = '\n\s*\n';
    return $blank_line;
}
84
85
# The process function reads a single bibliographic record and stores
# it in the supplied document object.
#
# Arguments (after $self):
#   $textref  - reference to the text of one Refer record
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name; only used in the progress message
#   $doc_obj  - document object that receives the metadata and the text
#
# Returns undef when the text does not begin with a "%" field (so it is
# not treated as a Refer record), and 1 once the record has been added.

sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    # Check that we're dealing with a valid Refer file
    return undef unless ($$textref =~ /^\s*%/);

    # Report that we're processing the file
    print $outhandle "ReferPlug: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    # All metadata and text is attached to the top-level section.
    # NOTE(review): assumes the Greenstone document object provides
    # get_top_section(); previously an undeclared, undefined variable
    # was passed as the section -- confirm against doc.pm.
    my $cursection = $doc_obj->get_top_section();

    # Refer field letter => metadata name
    my %field = ('H' => 'Header',
                 'A' => 'Creator',
                 'T' => 'Title',
                 'J' => 'Journal',
                 'B' => 'Booktitle',
                 'R' => 'Report',
                 'V' => 'Volume',
                 'N' => 'Number',
                 'E' => 'Editor',
                 'P' => 'Pages',
                 'I' => 'Publisher',
                 'C' => 'PublisherAddress',
                 'D' => 'Date',
                 'O' => 'OtherInformation',
                 'K' => 'Keywords',
                 'X' => 'Abstract',
                 '*' => 'Copyright');

    # Raw field data keyed by field letter, one value per line
    my %metadata;
    my $text = "";
    my @lines = split(/\n+/, $$textref);

    # Read and process each line of the record.
    # Each record consists of a set of metadata items, one to each line,
    # with the Refer key followed by a space then the associated data.
    foreach my $line (@lines) {

        # Normalise whitespace.  Leading blanks are removed as well so
        # that an indented field line, which the validity check above
        # accepts, is still recognised as a field below.
        $line =~ s/\s+/ /g;
        $line =~ s/^ | $//g;
        $text .= "$line\n";

        # Lines that do not start with a field key are kept in the
        # document text but contribute no metadata.
        next unless ($line =~ /^%[A-Z\*]/);
        my $id = substr($line, 1, 1);
        $line =~ s/^%. ?//;     # the data may be empty, e.g. a bare "%A"

        # Add individual authors in "Lastname, Firstname" format.
        # (The full set of authors will be added below as "Creator".)
        if ($id eq "A" && $line =~ /\w/) {
            my @words = split(/ /, $line);
            my $lastname = pop @words;
            my $firstname = join(" ", @words);

            # A single-word name has no first name, so no comma either.
            my $fullname = length($firstname)
                ? "$lastname, $firstname"
                : $lastname;

            $doc_obj->add_metadata ($cursection, "Author",
                                    text_into_html($fullname));
        }

        # Add individual keywords, lower-cased and trimmed.
        # (The full set of keywords will be added below as "Keywords".)
        if ($id eq "K") {
            foreach my $k (split(/,/, $line)) {
                $k = lc($k);
                $k =~ s/^\s+|\s+$//g;
                next unless ($k =~ /\w/);
                $doc_obj->add_metadata ($cursection, "Keyword",
                                        text_into_html($k));
            }
        }

        # Add this line of metadata
        $metadata{$id} .= "$line\n";
    }

    # Add the various fields as metadata.  The keys are sorted so the
    # order of the add_metadata calls is the same on every run.
    foreach my $f (sort keys %metadata) {

        next unless (defined $field{$f});

        my $name = $field{$f};
        my $value = $metadata{$f};

        # The Creator metadata is found by concatenating authors:
        # "A", "A and B", "A, B and C".
        if ($f eq "A") {
            my @authorlist = grep { /\w/ } split(/\n/, $value);
            if (scalar @authorlist) {
                my $lastauthor = pop @authorlist;
                my $creator = (scalar @authorlist)
                    ? join(", ", @authorlist) . " and $lastauthor"
                    : $lastauthor;
                $doc_obj->add_metadata ($cursection, "Creator",
                                        text_into_html($creator));
            }
        }

        # The rest are added in a standard way
        else {
            $doc_obj->add_metadata ($cursection, $name,
                                    text_into_html($value));
        }

        # Books and Journals are additionally marked for display purposes
        if ($f eq "B") {
            $doc_obj->add_metadata ($cursection, "BookConfOnly", 1);
        } elsif ($f eq "J") {
            $doc_obj->add_metadata ($cursection, "JournalsOnly", 1);
        }
    }

    # Add the text in refer format (all fields)
    if ($text =~ /\w/) {
        $doc_obj->add_text ($cursection, text_into_html($text));
    }

    return 1; # processed the file
}
234
2351;
#
# Convert a text string into HTML.
#
# The HTML is going to be inserted into a GML file, so we have to be
# careful not to use symbols like ">" (and use &gt; instead).
#
# Takes one string and returns the converted string:
#   - & < > " are replaced by HTML entities;
#   - ' + ( ) are replaced by spaces;
#   - email addresses and http/https URLs become hyperlinks (a trailing
#     "." is treated as sentence punctuation and left outside the link);
#   - runs of spaces are collapsed and surrounding whitespace removed;
#   - newlines become <BR> tags (and blank lines become <P> tags).
#

sub text_into_html {
    my ($text) = @_;

    # Convert problem characters into HTML symbols.  "&" must be handled
    # first so the entities produced afterwards are not escaped again.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    # These characters are blanked out rather than escaped.
    $text =~ tr/'+()/    /;

    # Convert email addresses and URLs into links.  The negative
    # look-behind stops a match from ending in "." so that the full stop
    # of a sentence is not swallowed into the link target.
    $text =~ s{([\w.\-]+\@[\w.\-]+(?<!\.))}{<a href="mailto:$1">$1</a>}g;
    $text =~ s{(https?://[\w.\-]+[/\w.\-]*(?<!\.))}{<a href="$1">$1</a>}g;

    # Clean up whitespace and convert \n characters to <BR> or <P>
    $text =~ s/ +/ /g;
    $text =~ s/\s+\z//;
    $text =~ s/\A\s+//;
    $text =~ s/\n/\n<BR>/g;
    $text =~ s/<BR>\s*<BR>/<P>/g;

    return $text;
}
276
277
Note: See TracBrowser for help on using the repository browser.