source: trunk/gsdl/perllib/plugins/ReferPlug.pm@ 3540

Last change on this file since 3540 was 3540, checked in by kjdon, 22 years ago

added John T's changes into CVS - added info to enable retrieval of usage info in xml

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# ReferPlug.pm - a plugin for bibliography records in Refer format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2000 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# ReferPlug reads bibliography files in Refer format.
29#
30# by Gordon W. Paynter ([email protected]), November 2000
31#
32# Loosely based on hcibib2Plug by Steve Jones ([email protected]).
33# Which was based on EMAILPlug by Gordon Paynter ([email protected]).
34# Which was based on old versions of HTMLplug and HCIBIBPlugby by Stefan
35# Boddie and others -- it's hard to tell what came from where, now.
36#
37#
38# ReferPlug creates a document object for every reference in the file.
39# It is a subclass of SplitPlug, so if there are multiple records, all
40# are read.
41#
42# Document text:
43# The document text consists of the reference in Refer format
44#
45# Metadata:
46# $Creator %A Author name
47# $Title %T Title of article of book
48# $Journal %J Title of Journal
49# $Booktitle %B Title of book containing the publication
50# $Report %R Type of Report, paper or thesis
51# $Volume %V Volume Number of Journal
52# $Number %N Number of Journal within Volume
53# $Editor %E Editor name
54# $Pages %P Page Number of article
55# $Publisher %I Name of Publisher
56# $Publisheraddr %C Publisher's address
57# $Date %D Date of publication
58# $Keywords %K Keywords associated with publication
59# $Abstract %X Abstract of publication
60# $Copyright %* Copyright information for the article
61#
62
63# 12/05/02 Added usage datastructure - John Thompson
64
65package ReferPlug;
66
67use SplitPlug;
68
69# ReferPlug is a sub-class of BasPlug.
70sub BEGIN {
71 @ISA = ('SplitPlug');
72}
73
74my $arguments = [ { 'name' => "process_exp",
75 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
76 'type' => "string",
77 'deft' => q^(?i)\.bib$^,
78 'reqd' => "no" } ];
79
80my $options = { 'name' => "ReferPlug",
81 'desc' => "ReferPlug reads bibliography files in Refer format.\nBy Gordon W. Paynter (gwp\@cs.waikato.ac.nz), November 2000\n\nLoosely based on hcibib2Plug by Steve Jones (stevej\@cs.waikato.ac.nz). Which was based on EMAILPlug by Gordon Paynter (gwp\@cs.waikato.ac.nz). Which was based on old versions of HTMLplug and HCIBIBPlugby by Stefan Boddie and others -- it's hard to tell what came from where, now.\n\nReferPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.\n\nDocument text:\n\tThe document text consists of the reference in Refer format.\nMetadata:\n\t\$Creator \%A Author name\n\t\$Title \%T Title of article of book\n\t\$Journal \%J Title of Journal\n\t\$Booktitle \%B Title of book containing the publication\n\t\$Report \%R Type of Report, paper or thesis\n\t\$Volume \%V Volume Number of Journal\n\t\$Number \%N Number of Journal within Volume\n\t\$Editor \%E Editor name\n\t\$Pages \%P Page Number of article\n\t\$Publisher \%I Name of Publisher\n\t\$Publisheraddr \%C Publisher's address\n\t\$Date \%D Date of publication\n\t\$Keywords \%K Keywords associated with publication\n\t\$Abstract \%X Abstract of publication\n\t\$Copyright\t\%* Copyright information for the article",
82 'inherits' => "yes",
83 'args' => $arguments };
84
85# This plugin processes files with the suffix ".bib"
86sub get_default_process_exp {
87 return q^(?i)\.bib$^;
88}
89
90# This plugin splits the input text at blank lines
91sub get_default_split_exp {
92 return q^\n\s*\n^;
93}
94
95sub new {
96 my $class = shift (@_);
97 my $self = new SplitPlug ($class, @_);
98
99 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
100 my $option_list = $self->{'option_list'};
101 push( @{$option_list}, $options );
102
103 return bless $self, $class;
104}
105
106# The process function reads a single bibliogrphic record and stores
107# it as a new document.
108
109sub process {
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid Refer file
115 return undef unless ($$textref =~ /^\s*%/);
116
117 # Report that we're processing the file
118 print $outhandle "ReferPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121 my %field = ('H', 'Header',
122 'A', 'Creator',
123 'T', 'Title',
124 'J', 'Journal',
125 'B', 'Booktitle',
126 'R', 'Report',
127 'V', 'Volume',
128 'N', 'Number',
129 'E', 'Editor',
130 'P', 'Pages',
131 'I', 'Publisher',
132 'C', 'PublisherAddress',
133 'D', 'Date',
134 'O', 'OtherInformation',
135 'K', 'Keywords',
136 'X', 'Abstract',
137 '*', 'Copyright');
138
139 # Metadata fields
140 my %metadata;
141 my ($id, $Creator, $Keywords, $text);
142 my @lines = split(/\n+/, $$textref);
143
144
145 # Read and process each line in the bib file.
146 # Each file consists of a set of metadata items, one to each line
147 # with the Refer key followed by a space then the associated data
148 foreach my $line (@lines) {
149
150 # Add each line. Most lines consist of a field identifer and
151 # then data, and we simply store them, though we treat some
152 # of the fields a bit differently.
153
154 $line =~ s/\s+/ /g;
155 $text .= "$line\n";
156 $ReferFormat .= "$line\n";
157
158 next unless ($line =~ /^%[A-Z\*]/);
159 $id = substr($line,1,1);
160 $line =~ s/^%. //;
161
162 # Add individual authors in "Lastname, Firstname" format.
163 # (The full set of authors will be added below as "Creator".)
164 if ($id eq "A") {
165
166 # Reformat and add author name
167 my @words = split(/ /, $line);
168 my $lastname = pop @words;
169 my $firstname = join(" ", @words);
170 my $fullname = $lastname . ", " . $firstname;
171
172 # Add each name to set of Authors
173 if ($fullname =~ /\w/) {
174 $fullname = &text_into_html($fullname);
175 $doc_obj->add_metadata ($cursection, "Author", $fullname);
176 }
177 }
178
179 # Add individual keywords.
180 # (The full set of authors will be added below as "Keywords".)
181 if ($id eq "K") {
182 my @keywordlist = split(/,/, $line);
183 foreach my $k (@keywordlist) {
184 $k = lc($k);
185 $k =~ s/\s*$//;
186 $k =~ s/^\s*//;
187 if ($k =~ /\w/) {
188 $k = &text_into_html($k);
189 $doc_obj->add_metadata ($cursection, "Keyword", $k);
190 }
191 }
192 }
193
194 # Add this line of metadata
195 $metadata{$id} .= "$line\n";
196 }
197
198
199
200 # Add the various field as metadata
201 my ($f, $name, $value);
202 foreach $f (keys %metadata) {
203
204 next unless (defined $field{$f});
205 next unless (defined $metadata{$f});
206
207 $name = $field{$f};
208 $value = $metadata{$f};
209
210 # Add the various field as metadata
211
212 # The Creator metadata is found by concatenating authors.
213 if ($f eq "A") {
214
215 my @authorlist = split(/\n/, $value);
216 my $lastauthor = pop @authorlist;
217 my $Creator = "";
218 if (scalar @authorlist) {
219 $Creator = join(", ", @authorlist) . "and $lastauthor";
220 } else {
221 $Creator = $lastauthor;
222 }
223
224 if ($Creator =~ /\w/) {
225 $Creator = &text_into_html($Creator);
226 $doc_obj->add_metadata ($cursection, "Creator", $Creator);
227 }
228 }
229
230 # The rest are added in a standard way
231 else {
232 $value = &text_into_html($value);
233 $doc_obj->add_metadata ($cursection, $name, $value);
234 }
235
236 # Books and Journals are additionally marked for display purposes
237 if ($f eq "B") {
238 $doc_obj->add_metadata($cursection, "BookConfOnly", 1);
239 } elsif ($f eq "J") {
240 $doc_obj->add_metadata($cursection, "JournalsOnly", 1);
241 }
242
243
244 }
245
246 # Add the text in refer format(all fields)
247 if ($text =~ /\w/) {
248 $text = &text_into_html($text);
249 $doc_obj->add_text ($cursection, $text);
250 }
251
252 return 1; # processed the file
253}
254
2551;
256#
257# Convert a text string into HTML.
258#
259# The HTML is going to be inserted into a GML file, so
260# we have to be careful not to use symbols like ">",
261# which ocurs frequently in email messages (and use
262# &gt instead.
263#
264# This function also turns links and email addresses into hyperlinks,
265# and replaces carriage returns with <BR> tags (and multiple carriage
266# returns with <P> tags).
267#
268
269sub text_into_html {
270 my ($text) = @_;
271
272
273 # Convert problem charaters into HTML symbols
274 $text =~ s/&/&amp;/g;
275 $text =~ s/</&lt;/g;
276 $text =~ s/>/&gt;/g;
277 $text =~ s/\"/&quot;/g;
278 $text =~ s/\'/ /g;
279 $text =~ s/\+/ /g;
280 $text =~ s/\(/ /g;
281 $text =~ s/\)/ /g;
282
283 # convert email addresses and URLs into links
284 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
285 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
286
287 # Clean up whitespace and convert \n charaters to <BR> or <P>
288 $text =~ s/ +/ /g;
289 $text =~ s/\s*$//;
290 $text =~ s/^\s*//;
291 $text =~ s/\n/\n<BR>/g;
292 $text =~ s/<BR>\s*<BR>/<P>/g;
293
294 return $text;
295}
296
297
Note: See TracBrowser for help on using the repository browser.