source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 2901

Last change on this file since 2901 was 2901, checked in by jrm21, 22 years ago

We now interprete some latex commands in the input, mostly to do with accents.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.4 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53# This plugin processes files with the suffix ".bib"
54sub get_default_process_exp {
55 return q^(?i)\.bib$^;
56}
57
58# This plugin splits the input text at blank lines
59sub get_default_split_exp {
60 return q^\n+(?=@)^;
61}
62
63
64
65# The process function reads a single bibliographic record and stores
66# it as a new document.
67
68sub process {
69 my $self = shift (@_);
70 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71 my $outhandle = $self->{'outhandle'};
72
73 $self->{'key'} = "default";
74
75 # Check that we're dealing with a valid BibTex record
76 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78 # Ignore things we can't use
79 return 0 if ($$textref =~ /^\@String/);
80
81 # Report that we're processing the file
82 print $outhandle "BibTexPlug: processing $file\n"
83 if ($self->{'verbosity'}) > 1;
84
85
86 # This hash translates BibTex field names into metadata names. The
87 # BibTex names are taken from the "Local Guide to Latex" Graeme
88 # McKinstry. Metadata names are consistent with ReferPlug.
89
90 my %field = (
91 'address', 'PublisherAddress',
92 'author', 'Creator',
93
94 'booktitle', 'Booktitle',
95 'chapter', 'Chapter',
96 'edition', 'Edition',
97 'editor', 'Editor',
98 'institution', 'Publisher',
99 'journal', 'Journal',
100 'month', 'Month',
101 'number', 'Number',
102 'pages', 'Pages',
103 'publisher', 'Publisher',
104 'school', 'Publisher',
105 'title', 'Title',
106 'volume', 'Volume',
107 'year', 'Date',
108
109 'keywords', 'Keywords',
110 'abstract', 'Abstract',
111 'copyright', 'Copyright'
112);
113
114 # Metadata fields
115 my %metadata;
116 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118 my $verbosity = $self->{'verbosity'};
119 $verbosity = 0 unless $verbosity;
120
121 my $lines=$$textref;
122
123 # Make sure the text has exactly one entry per line
124
125 $lines =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry
126 $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
127 $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing "
128 $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings,
129 #that continues for several lines
130 $lines =~ s/\s+/ /g;
131 $lines =~ s/\s*=====\s*/\n/g;
132
133 my @all_lines = split(/\n+/, $lines);
134
135 # Read and process each line in the bib file.
136 my ($entryname, $name, $value, $line);
137 foreach $line (@all_lines) {
138
139 # Add each line. Most lines consist of a field identifer and
140 # then data, and we simply store them, though we treat some
141 # of the fields a bit differently.
142
143 $line =~ s/\s+/ /g;
144 $text .= "$line\n";
145
146 print "Processing line = $line \n" if $verbosity>=4;
147
148 # The first line is special, it contains the reference type and OID
149 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
150 $EntryType = $1;
151 $EntryID = $2;
152 print "** $EntryType - \"$EntryID\" \n"
153 if ($verbosity >= 4);
154 $self->{'key'} = $EntryID;
155 next;
156 }
157 if ($line =~ /\@/) {
158 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
159 if ($verbosity >= 2);
160 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
161 if ($verbosity >= 2);
162 }
163
164 # otherwise, parse the metadata out of this line
165 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
166 $entryname = lc($1);
167 $value = $2;
168 # tidy up, removing " at start and end
169 $value =~ s/^"//;
170 $value =~ s/(",)\s*$//;
171 $value = &process_latex($value);
172
173 # Add this line of metadata
174 $metadata{$entryname} .= "$value\n";
175
176 }
177
178 # Add the Entry type as metadata
179 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
180
181 # Add the various field as metadata
182 foreach my $entryname (keys %metadata) {
183 next unless (defined $field{$entryname});
184 next unless (defined $metadata{$entryname});
185
186 $name = $field{$entryname};
187 $value = $metadata{$entryname};
188
189 # Add the various fields as metadata
190 my $html_value = &text_into_html($value);
191 $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
192
193 # Several special operatons on metadata follow
194
195 # Add individual keywords.
196 # The full set of keywords will be added, in due course, as "Keywords".
197 # However, we also want to add them as individual "Keyword" metadata elements.
198 if ($entryname eq "keywords") {
199 my @keywordlist = split(/,/, $value);
200 foreach my $k (@keywordlist) {
201 $k = lc($k);
202 $k =~ s/\s*$//;
203 $k =~ s/^\s*//;
204 if ($k =~ /\w/) {
205 $k = &text_into_html($k);
206 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
207 }
208 }
209 }
210
211 # Add individual authors
212 # The author metadata will be stored as one "Creator" entry, but we
213 # also want to split it into several individual "Author" fields in
214 # "Lastename, Firstnames" format so we can browse it.
215 if ($entryname eq "author") { #added also comparison with editor
216
217 # und here for german language...
218 # don't use brackets in pattern, else the matched bit becomes
219 # an element in the list!
220 my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
221 foreach $a (@authorlist) {
222 $a =~ s/\s*$//;
223 $a =~ s/^\s*//;
224 # Reformat and add author name
225 next if $a=~ /^\s*$/;
226 my @words = split(/ /, $a);
227 my $lastname = pop @words;
228 my $firstname = join(" ", @words);
229
230 my $fullname = $lastname . ", " . $firstname;
231
232 # Add each name to set of Authors
233 # force utf8 pragma so that \w matches in this scope
234 use utf8;
235 if ($fullname =~ /\w+, \w+/) {
236 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
237 }
238 }
239 }
240
241 # Books and Journals are additionally marked for display purposes
242 if ($entryname eq "booktitle") {
243 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
244 } elsif ($entryname eq "journal") {
245 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
246 }
247
248 }
249
250 # Add the text in BibTex format (all fields)
251 if ($text =~ /\w/) {
252 $text = &text_into_html($text);
253 $doc_obj->add_utf8_text ($cursection, $text);
254 $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
255 }
256
257 return 1;
258}
259
260
261
262
263# convert email addresses and URLs into links
264sub convert_urls_into_links{
265 my ($text) = @_;
266
267 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
268 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
269
270 return $text;
271}
272
273# Clean up whitespace and convert \n charaters to <BR> or <P>
274sub clean_up_whitespaces{
275 my ($text) = @_;
276
277 $text =~ s/%%%%%/<BR> <BR>/g;
278 $text =~ s/ +/ /g;
279 $text =~ s/\s*$//;
280 $text =~ s/^\s*//;
281 $text =~ s/\n/\n<BR>/g;
282 $text =~ s/<BR>\s*<BR>/<P>/g;
283
284 return $text;
285}
286
287
288sub convert_problem_characters_without_ampersand{
289 my ($text) = @_;
290 $text =~ s/</&lt;/g;
291 $text =~ s/>/&gt;/g;
292
293 $text =~ s/\'\'/\"/g; #Latex -specific conversion
294 $text =~ s/\`\`/\"/g; #Latex -specific conversion
295
296
297 $text =~ s/\"/&quot;/g;
298 $text =~ s/\'/&#8217;/g;
299 $text =~ s/\`/&#8216;/g;
300 $text =~ s/\+/ /g;
301 $text =~ s/\(/ /g;
302 $text =~ s/\)/ /g;
303
304 $text =~ s/\\/\\\\/g;
305
306 $text =~ s/\./\\\./g;
307
308 return $text;
309}
310
311# Convert a text string into HTML.
312
313# The HTML is going to be inserted into a GML file, so we have to be
314# careful not to use symbols like ">", which occurs frequently in email
315# messages (and use &gt instead.
316
317# This function also turns URLs and email addresses into links, and
318# replaces carriage returns with <BR> tags (and multiple carriage returns
319# with <P> tags).
320
321sub text_into_html {
322 my ($text) = @_;
323
324 # Convert problem characters into HTML symbols
325 $text =~ s/&/&amp;/g;
326
327 $text = &convert_problem_characters_without_ampersand( $text );
328
329 # convert email addresses and URLs into links
330 $text = &convert_urls_into_links( $text );
331
332 $text = &clean_up_whitespaces( $text );
333
334 return $text;
335}
336
337
338
339
340# Convert accented characters, remove { }, interprete some commands....
341# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
342sub process_latex {
343 my ($text) = @_;
344
345 # note - this is really ugly, but it works. There may be a prettier way
346 # of mapping latex accented chars to utf8, but we just brute force it here.
347 # Also, this isn't complete - not every single possible accented letter
348 # is in here yet, but most of the common ones are.
349
350 my %utf8_chars =
351 (
352 # acutes
353 '\'a' => chr(0xc3).chr(0xa1),
354 '\'c' => chr(0xc4).chr(0x87),
355 '\'e' => chr(0xc3).chr(0xa9),
356 '\'i' => chr(0xc3).chr(0xad),
357 '\'l' => chr(0xc3).chr(0xba),
358 '\'n' => chr(0xc3).chr(0x84),
359 '\'o' => chr(0xc3).chr(0xb3),
360 '\'r' => chr(0xc5).chr(0x95),
361 '\'s' => chr(0xc5).chr(0x9b),
362 '\'u' => chr(0xc3).chr(0xba),
363 '\'y' => chr(0xc3).chr(0xbd),
364 '\'z' => chr(0xc5).chr(0xba),
365 # graves
366 '`a' => chr(0xc3).chr(0xa0),
367 '`A' => chr(0xc3).chr(0x80),
368 '`e' => chr(0xc3).chr(0xa8),
369 '`E' => chr(0xc3).chr(0x88),
370 '`i' => chr(0xc3).chr(0xac),
371 '`I' => chr(0xc3).chr(0x8c),
372 '`o' => chr(0xc3).chr(0xb2),
373 '`O' => chr(0xc3).chr(0x92),
374 '`u' => chr(0xc3).chr(0xb9),
375 '`U' => chr(0xc3).chr(0x99),
376 # circumflex
377 '^a' => chr(0xc3).chr(0xa2),
378 '^A' => chr(0xc3).chr(0x82),
379 '^c' => chr(0xc4).chr(0x89),
380 '^C' => chr(0xc4).chr(0x88),
381 '^e' => chr(0xc3).chr(0xaa),
382 '^E' => chr(0xc3).chr(0x8a),
383 '^g' => chr(0xc4).chr(0x9d),
384 '^G' => chr(0xc4).chr(0x9c),
385 '^h' => chr(0xc4).chr(0xa5),
386 '^H' => chr(0xc4).chr(0xa4),
387 '^i' => chr(0xc3).chr(0xae),
388 '^I' => chr(0xc3).chr(0x8e),
389 '^j' => chr(0xc4).chr(0xb5),
390 '^J' => chr(0xc4).chr(0xb4),
391 '^o' => chr(0xc3).chr(0xb4),
392 '^O' => chr(0xc3).chr(0x94),
393 '^s' => chr(0xc5).chr(0x9d),
394 '^S' => chr(0xc5).chr(0x9c),
395 '^u' => chr(0xc3).chr(0xa2),
396 '^U' => chr(0xc3).chr(0xbb),
397 '^w' => chr(0xc5).chr(0xb5),
398 '^W' => chr(0xc5).chr(0xb4),
399 '^y' => chr(0xc5).chr(0xb7),
400 '^Y' => chr(0xc5).chr(0xb6),
401
402 # diaeresis
403 '"a' => chr(0xc3).chr(0xa4),
404 '"A' => chr(0xc3).chr(0x84),
405 '"e' => chr(0xc3).chr(0xab),
406 '"E' => chr(0xc3).chr(0x8b),
407 '"\\\\i' => chr(0xc3).chr(0xaf),
408 '"\\\\I' => chr(0xc3).chr(0x8f),
409 '"o' => chr(0xc3).chr(0xb6),
410 '"O' => chr(0xc3).chr(0x96),
411 '"u' => chr(0xc3).chr(0xbc),
412 '"U' => chr(0xc3).chr(0x9c),
413 '"y' => chr(0xc3).chr(0xbf),
414 '"Y' => chr(0xc3).chr(0xb8),
415 # tilde
416 # caron - handled specially
417# ',s' => chr(0xc5).chr(0xa1),
418# ',S' => chr(0xc5).chr(0xa5),
419 # breve
420 # double acute
421 # ring
422 # dot
423 # macron
424 '=a' => chr(0xc4).chr(0x81),
425 '=A' => chr(0xc4).chr(0x80),
426 '=e' => chr(0xc4).chr(0x93),
427 '=E' => chr(0xc4).chr(0x92),
428 '=i' => chr(0xc4).chr(0xab),
429 '=I' => chr(0xc4).chr(0xaa),
430 '=o' => chr(0xc4).chr(0x8d),
431 '=O' => chr(0xc4).chr(0x8c),
432 '=u' => chr(0xc4).chr(0xab),
433 '=U' => chr(0xc4).chr(0xaa),
434
435 # stroke - handled specially - see below
436
437 # cedilla - handled specially
438
439 );
440
441# these are one letter latex commands - we make sure they're not a longer
442# command name. eg {\d} is d+stroke, so careful of \d
443 my %special_utf8_chars =
444 (
445 # caron
446 'v n' => chr(0xc5).chr(0x88),
447 'v N' => chr(0xc5).chr(0x87),
448 'v s' => chr(0xc5).chr(0xa1),
449 'v S' => chr(0xc5).chr(0xa5),
450 # cedilla
451 'c c' => chr(0xc3).chr(0xa7),
452 'c C' => chr(0xc3).chr(0x87),
453 'c g' => chr(0xc4).chr(0xa3),
454 'c G' => chr(0xc4).chr(0xa2),
455 'c k' => chr(0xc4).chr(0xb7),
456 'c K' => chr(0xc4).chr(0xb6),
457 'c l' => chr(0xc4).chr(0xbc),
458 'c L' => chr(0xc4).chr(0xbb),
459 'c n' => chr(0xc5).chr(0x86),
460 'c N' => chr(0xc5).chr(0x85),
461 'c r' => chr(0xc5).chr(0x97),
462 'c R' => chr(0xc5).chr(0x96),
463 'c s' => chr(0xc5).chr(0x9f),
464 'c S' => chr(0xc5).chr(0x9e),
465 'c t' => chr(0xc5).chr(0xa3),
466 'c T' => chr(0xc5).chr(0xa2),
467 # double acute / Hungarian accent
468 'H O' => chr(0xc5).chr(0x90),
469 'H o' => chr(0xc5).chr(0x91),
470 'H U' => chr(0xc5).chr(0xb0),
471 'H u' => chr(0xc5).chr(0xb1),
472
473 # stroke
474 'd' => chr(0xc4).chr(0x91),
475 'D' => chr(0xc4).chr(0x90),
476 'h' => chr(0xc4).chr(0xa7),
477# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
478 'l' => chr(0xc5).chr(0x82),
479 'L' => chr(0xc5).chr(0x81),
480 'o' => chr(0xc3).chr(0xb8),
481 'O' => chr(0xc3).chr(0x98),
482 't' => chr(0xc5).chr(0xa7),
483 'T' => chr(0xc5).chr(0xa6),
484 # german ss/szlig/sharp s
485 'ss' => chr(0xc3).chr(0x9f),
486 );
487
488 # convert latex-style accented characters.
489 # remove space (if any) between \ and letter to accent (eg {\' a})
490
491 $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
492
493 # remove {} around a single character (eg \'{e})
494 $text =~ s@(\\[`'="]){(\w)}@$1$2@;
495
496 # remove {} around a single character for special 1 letter commands -
497 # need to insert a space. Eg \v{s} -> {\v s}
498 $text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
499
500 # this is slow (go through whole hash for each substitution!) so
501 # only do if the text contains a '\' character.
502 if ($text =~ m|\\|) {
503 for $latex_code (keys %utf8_chars) {
504 $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
505 }
506
507 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
508 # only do the change if immediately followed by a space, }, {, or \
509 for $latex_code (keys %special_utf8_chars) {
510 $text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
511 }
512 }
513
514 # remove latex groupings { } (but not \{ or \} )
515 # note - need it like this for first char match - eg {xx}{yy}
516 while ($text =~ s@([^\\]){([^}]*?[^\\])}@$1$2@g) {}
517
518 # remove latex commands
519 $text =~ s@\\\w+{(.*)}@$1@g;
520
521 # maths mode $...$ - this is not interpreted in any way at the moment...
522 $text =~ s@\$(.*)\$@$1@g;
523
524 # quoted { } chars
525 $text =~ s@\\{@{@g;
526 $text =~ s@\\}@}@g;
527
528 return $text;
529}
530
531
532sub set_OID {
533 my $self = shift (@_);
534 my ($doc_obj, $id, $segment_number) = @_;
535
536 if ( $self->{'key'} eq "default") {
537 $doc_obj->set_OID();
538 } else {
539 $doc_obj->set_OID($self->{'key'});
540 }
541}
542
5431;
Note: See TracBrowser for help on using the repository browser.