source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3156

Last change on this file since 3156 was 3156, checked in by jrm21, 20 years ago

Added a few extra accented characters, and recognise some bibtex-specific macros.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter (gwp@cs.waikato.ac.nz), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <sergey@intellektik.informatik.tu-darmstadt.de>
41# * some non-ascii char support (ie mostly Latin)
42# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53# This plugin processes files with the suffix ".bib"
54sub get_default_process_exp {
55 return q^(?i)\.bib$^;
56}
57
58# This plugin splits the input text at blank lines
59sub get_default_split_exp {
60 return q^\n+(?=@)^;
61}
62
63
64
65# The process function reads a single bibliographic record and stores
66# it as a new document.
67
68sub process {
69 my $self = shift (@_);
70 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71 my $outhandle = $self->{'outhandle'};
72
73 $self->{'key'} = "default";
74
75 # Check that we're dealing with a valid BibTex record
76 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78 # Ignore things we can't use
79 return 0 if ($$textref =~ /^\@String/);
80
81 # Report that we're processing the file
82 print $outhandle "BibTexPlug: processing $file\n"
83 if ($self->{'verbosity'}) > 1;
84
85
86 # This hash translates BibTex field names into metadata names. The
87 # BibTex names are taken from the "Local Guide to Latex" Graeme
88 # McKinstry. Metadata names are consistent with ReferPlug.
89
90 my %field = (
91 'address', 'PublisherAddress',
92 'author', 'Creator',
93
94 'booktitle', 'Booktitle',
95 'chapter', 'Chapter',
96 'edition', 'Edition',
97 'editor', 'Editor',
98 'institution', 'Publisher',
99 'journal', 'Journal',
100 'month', 'Month',
101 'number', 'Number',
102 'pages', 'Pages',
103 'publisher', 'Publisher',
104 'school', 'Publisher',
105 'title', 'Title',
106 'volume', 'Volume',
107 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109 'keywords', 'Keywords',
110 'abstract', 'Abstract',
111 'copyright', 'Copyright'
112);
113
114 # Metadata fields
115 my %metadata;
116 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118 my $verbosity = $self->{'verbosity'};
119 $verbosity = 0 unless $verbosity;
120
121 # Make sure the text has exactly one entry per line -
122 # append line to previous if it doesn't start with " <key> = "
123
124 my @input_lines=split('\n', $$textref);
125 my @all_lines;
126 my $entry_line=shift @input_lines;
127 foreach my $input_line (@input_lines) {
128 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
129 # this is a new key
130 push(@all_lines, $entry_line);
131 $entry_line=$input_line;
132 } else {
133 # this is a continuation of previous line
134 $entry_line .= $input_line;
135 }
136
137 }
138 # add final line, removing trailing '}'
139 $entry_line =~ s/\}\s*$//;
140 push(@all_lines, $entry_line);
141 push(@all_lines, "}");
142
143
144
145 # Read and process each line in the bib file.
146 my ($entryname, $name, $value, $line);
147 foreach $line (@all_lines) {
148
149 # Add each line. Most lines consist of a field identifer and
150 # then data, and we simply store them, though we treat some
151 # of the fields a bit differently.
152
153 $line =~ s/\s+/ /g;
154 $text .= "$line\n";
155
156 print "Processing line = $line \n" if $verbosity>=4;
157
158 # The first line is special, it contains the reference type and OID
159 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
160 $EntryType = $1;
161 $EntryID = $2;
162 print "** $EntryType - \"$EntryID\" \n"
163 if ($verbosity >= 4);
164 $self->{'key'} = $EntryID;
165 next;
166 }
167 if ($line =~ /\@/) {
168 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169 if ($verbosity >= 2);
170 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171 if ($verbosity >= 2);
172 }
173
174 # otherwise, parse the metadata out of this line
175 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
176 $entryname = lc($1);
177 $value = $2;
178 $value =~ s/,?\s*$//; # remove trailing comma and space
179 if ($value =~ /^"/ && $value =~ /"$/) {
180 # remove surrounding " marks
181 $value =~ s/^"//; $value =~ s/"$//;
182 }
183 $value = &process_latex($value);
184
185 # Add this line of metadata
186 $metadata{$entryname} .= "$value\n";
187
188 }
189
190 # Add the Entry type as metadata
191 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193 # Add the various field as metadata
194 foreach my $entryname (keys %metadata) {
195 next unless (defined $field{$entryname});
196 next unless (defined $metadata{$entryname});
197
198 $name = $field{$entryname};
199 $value = $metadata{$entryname};
200
201 if ($name =~ /^Month/) {
202 $value=expand_month($value);
203 }
204 # Add the various fields as metadata
205 my $html_value = &text_into_html($value);
206 $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
207
208 # Several special operatons on metadata follow
209
210 # Add individual keywords.
211 # The full set of keywords will be added, in due course, as "Keywords".
212 # However, we also want to add them as individual "Keyword" metadata elements.
213 if ($entryname eq "keywords") {
214 my @keywordlist = split(/,/, $value);
215 foreach my $k (@keywordlist) {
216 $k = lc($k);
217 $k =~ s/\s*$//;
218 $k =~ s/^\s*//;
219 if ($k =~ /\w/) {
220 $k = &text_into_html($k);
221 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
222 }
223 }
224 }
225
226 # Add individual authors
227 # The author metadata will be stored as one "Creator" entry, but we
228 # also want to split it into several individual "Author" fields in
229 # "Lastname, Firstnames" format so we can browse it.
230 if ($entryname eq "author") { #added also comparison with editor
231
232 # und here for german language...
233 # don't use brackets in pattern, else the matched bit becomes
234 # an element in the list!
235 my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
236 foreach $a (@authorlist) {
237 $a =~ s/\s*$//;
238 $a =~ s/^\s*//;
239 # Reformat and add author name
240 next if $a=~ /^\s*$/;
241 my @words = split(/ /, $a);
242 my $lastname = pop @words;
243 my $firstname = join(" ", @words);
244
245 my $fullname = $lastname . ", " . $firstname;
246
247 # Add each name to set of Authors
248 # force utf8 pragma so that \w matches in this scope
249 use utf8;
250 if ($fullname =~ /\w+, \w+/) {
251 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
252 } else {
253 }
254 }
255 }
256
257 # Books and Journals are additionally marked for display purposes
258 if ($entryname eq "booktitle") {
259 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
260 } elsif ($entryname eq "journal") {
261 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
262 }
263
264 }
265
266 # Add the text in BibTex format (all fields)
267 if ($text =~ /\w/) {
268 $text = &text_into_html($text);
269 $doc_obj->add_utf8_text ($cursection, $text);
270 $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
271 }
272
273 return 1;
274}
275
276
277
278
279# convert email addresses and URLs into links
280sub convert_urls_into_links{
281 my ($text) = @_;
282
283 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
284 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
285
286 return $text;
287}
288
289# Clean up whitespace and convert \n charaters to <BR> or <P>
290sub clean_up_whitespaces{
291 my ($text) = @_;
292
293 $text =~ s/%%%%%/<BR> <BR>/g;
294 $text =~ s/ +/ /g;
295 $text =~ s/\s*$//;
296 $text =~ s/^\s*//;
297 $text =~ s/\n/\n<BR>/g;
298 $text =~ s/<BR>\s*<BR>/<P>/g;
299
300 return $text;
301}
302
303
304sub convert_problem_characters_without_ampersand{
305 my ($text) = @_;
306 $text =~ s/</&lt;/g;
307 $text =~ s/>/&gt;/g;
308
309 $text =~ s/\'\'/\"/g; #Latex -specific conversion
310 $text =~ s/\`\`/\"/g; #Latex -specific conversion
311
312
313 $text =~ s/\"/&quot;/g;
314 $text =~ s/\'/&#8217;/g;
315 $text =~ s/\`/&#8216;/g;
316 $text =~ s/\+/ /g;
317 $text =~ s/\(/ /g;
318 $text =~ s/\)/ /g;
319
320 $text =~ s/\\/\\\\/g;
321
322 $text =~ s/\./\\\./g;
323
324 return $text;
325}
326
327# Convert a text string into HTML.
328
329# The HTML is going to be inserted into a GML file, so we have to be
330# careful not to use symbols like ">", which occurs frequently in email
331# messages (and use &gt instead.
332
333# This function also turns URLs and email addresses into links, and
334# replaces carriage returns with <BR> tags (and multiple carriage returns
335# with <P> tags).
336
337sub text_into_html {
338 my ($text) = @_;
339
340 # Convert problem characters into HTML symbols
341 $text =~ s/&/&amp;/g;
342
343 $text = &convert_problem_characters_without_ampersand( $text );
344
345 # convert email addresses and URLs into links
346 $text = &convert_urls_into_links( $text );
347
348 $text = &clean_up_whitespaces( $text );
349
350 return $text;
351}
352
353
354sub expand_month {
355 my $text=shift;
356
357 # bibtex style files expand abbreviations for months.
358 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
359 $text =~ s/jan/_textmonth01_/g;
360 $text =~ s/feb/_textmonth02_/g;
361 $text =~ s/mar/_textmonth03_/g;
362 $text =~ s/apr/_textmonth04_/g;
363 $text =~ s/may/_textmonth05_/g;
364 $text =~ s/jun/_textmonth06_/g;
365 $text =~ s/jul/_textmonth07_/g;
366 $text =~ s/aug/_textmonth08_/g;
367 $text =~ s/sep/_textmonth09_/g;
368 $text =~ s/oct/_textmonth10_/g;
369 $text =~ s/nov/_textmonth11_/g;
370 $text =~ s/dec/_textmonth12_/g;
371
372 return $text;
373}
374
375
376# Convert accented characters, remove { }, interprete some commands....
377# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
378sub process_latex {
379 my ($text) = @_;
380
381 # note - this is really ugly, but it works. There may be a prettier way
382 # of mapping latex accented chars to utf8, but we just brute force it here.
383 # Also, this isn't complete - not every single possible accented letter
384 # is in here yet, but most of the common ones are.
385
386 my %utf8_chars =
387 (
388 # acutes
389 '\'a' => chr(0xc3).chr(0xa1),
390 '\'c' => chr(0xc4).chr(0x87),
391 '\'e' => chr(0xc3).chr(0xa9),
392 '\'i' => chr(0xc3).chr(0xad),
393 '\'l' => chr(0xc3).chr(0xba),
394 '\'n' => chr(0xc3).chr(0x84),
395 '\'o' => chr(0xc3).chr(0xb3),
396 '\'r' => chr(0xc5).chr(0x95),
397 '\'s' => chr(0xc5).chr(0x9b),
398 '\'u' => chr(0xc3).chr(0xba),
399 '\'y' => chr(0xc3).chr(0xbd),
400 '\'z' => chr(0xc5).chr(0xba),
401 # graves
402 '`a' => chr(0xc3).chr(0xa0),
403 '`A' => chr(0xc3).chr(0x80),
404 '`e' => chr(0xc3).chr(0xa8),
405 '`E' => chr(0xc3).chr(0x88),
406 '`i' => chr(0xc3).chr(0xac),
407 '`I' => chr(0xc3).chr(0x8c),
408 '`o' => chr(0xc3).chr(0xb2),
409 '`O' => chr(0xc3).chr(0x92),
410 '`u' => chr(0xc3).chr(0xb9),
411 '`U' => chr(0xc3).chr(0x99),
412 # circumflex
413 '^a' => chr(0xc3).chr(0xa2),
414 '^A' => chr(0xc3).chr(0x82),
415 '^c' => chr(0xc4).chr(0x89),
416 '^C' => chr(0xc4).chr(0x88),
417 '^e' => chr(0xc3).chr(0xaa),
418 '^E' => chr(0xc3).chr(0x8a),
419 '^g' => chr(0xc4).chr(0x9d),
420 '^G' => chr(0xc4).chr(0x9c),
421 '^h' => chr(0xc4).chr(0xa5),
422 '^H' => chr(0xc4).chr(0xa4),
423 '^i' => chr(0xc3).chr(0xae),
424 '^I' => chr(0xc3).chr(0x8e),
425 '^j' => chr(0xc4).chr(0xb5),
426 '^J' => chr(0xc4).chr(0xb4),
427 '^o' => chr(0xc3).chr(0xb4),
428 '^O' => chr(0xc3).chr(0x94),
429 '^s' => chr(0xc5).chr(0x9d),
430 '^S' => chr(0xc5).chr(0x9c),
431 '^u' => chr(0xc3).chr(0xa2),
432 '^U' => chr(0xc3).chr(0xbb),
433 '^w' => chr(0xc5).chr(0xb5),
434 '^W' => chr(0xc5).chr(0xb4),
435 '^y' => chr(0xc5).chr(0xb7),
436 '^Y' => chr(0xc5).chr(0xb6),
437
438 # diaeresis
439 '"a' => chr(0xc3).chr(0xa4),
440 '"A' => chr(0xc3).chr(0x84),
441 '"e' => chr(0xc3).chr(0xab),
442 '"E' => chr(0xc3).chr(0x8b),
443 '"\\\\i' => chr(0xc3).chr(0xaf),
444 '"\\\\I' => chr(0xc3).chr(0x8f),
445 '"o' => chr(0xc3).chr(0xb6),
446 '"O' => chr(0xc3).chr(0x96),
447 '"u' => chr(0xc3).chr(0xbc),
448 '"U' => chr(0xc3).chr(0x9c),
449 '"y' => chr(0xc3).chr(0xbf),
450 '"Y' => chr(0xc3).chr(0xb8),
451 # tilde
452 '~A' => chr(0xc3).chr(0x83),
453 '~N' => chr(0xc3).chr(0x91),
454 '~O' => chr(0xc3).chr(0x95),
455 '~a' => chr(0xc3).chr(0xa3),
456 '~n' => chr(0xc3).chr(0xb1),
457 '~o' => chr(0xc3).chr(0xb5),
458 # caron - handled specially
459# ',s' => chr(0xc5).chr(0xa1),
460# ',S' => chr(0xc5).chr(0xa5),
461 # double acute
462 # ring
463 # dot
464 '\.c' => chr(0xc4).chr(0x8b),
465 '\.C' => chr(0xc4).chr(0x8a),
466 '\.e' => chr(0xc4).chr(0x97),
467 '\.E' => chr(0xc4).chr(0x96),
468 '\.g' => chr(0xc4).chr(0xa1),
469 '\.G' => chr(0xc4).chr(0xa0),
470 '\.I' => chr(0xc4).chr(0xb0),
471 '\.z' => chr(0xc5).chr(0xbc),
472 '\.Z' => chr(0xc5).chr(0xbb),
473 # macron
474 '=a' => chr(0xc4).chr(0x81),
475 '=A' => chr(0xc4).chr(0x80),
476 '=e' => chr(0xc4).chr(0x93),
477 '=E' => chr(0xc4).chr(0x92),
478 '=i' => chr(0xc4).chr(0xab),
479 '=I' => chr(0xc4).chr(0xaa),
480 '=o' => chr(0xc4).chr(0x8d),
481 '=O' => chr(0xc4).chr(0x8c),
482 '=u' => chr(0xc4).chr(0xab),
483 '=U' => chr(0xc4).chr(0xaa),
484
485 # stroke - handled specially - see below
486
487 # cedilla - handled specially
488
489 );
490
491# these are one letter latex commands - we make sure they're not a longer
492# command name. eg {\d} is d+stroke, so careful of \d
493 my %special_utf8_chars =
494 (
495 # breve
496 'u g' => chr(0xc4).chr(0x9f),
497 'u G' => chr(0xc4).chr(0x9e),
498 'u i' => chr(0xc4).chr(0xad),
499 'u I' => chr(0xc4).chr(0xac),
500 'u o' => chr(0xc5).chr(0x8f),
501 'u O' => chr(0xc5).chr(0x8e),
502 'u u' => chr(0xc5).chr(0xad),
503 'u U' => chr(0xc5).chr(0xac),
504 # caron
505 'v c' => chr(0xc4).chr(0x8d),
506 'v C' => chr(0xc4).chr(0x8c),
507 'v n' => chr(0xc5).chr(0x88),
508 'v N' => chr(0xc5).chr(0x87),
509 'v s' => chr(0xc5).chr(0xa1),
510 'v S' => chr(0xc5).chr(0xa5),
511 'v z' => chr(0xc5).chr(0xbe),
512 'v Z' => chr(0xc5).chr(0xbd),
513 # cedilla
514 'c c' => chr(0xc3).chr(0xa7),
515 'c C' => chr(0xc3).chr(0x87),
516 'c g' => chr(0xc4).chr(0xa3),
517 'c G' => chr(0xc4).chr(0xa2),
518 'c k' => chr(0xc4).chr(0xb7),
519 'c K' => chr(0xc4).chr(0xb6),
520 'c l' => chr(0xc4).chr(0xbc),
521 'c L' => chr(0xc4).chr(0xbb),
522 'c n' => chr(0xc5).chr(0x86),
523 'c N' => chr(0xc5).chr(0x85),
524 'c r' => chr(0xc5).chr(0x97),
525 'c R' => chr(0xc5).chr(0x96),
526 'c s' => chr(0xc5).chr(0x9f),
527 'c S' => chr(0xc5).chr(0x9e),
528 'c t' => chr(0xc5).chr(0xa3),
529 'c T' => chr(0xc5).chr(0xa2),
530 # double acute / Hungarian accent
531 'H O' => chr(0xc5).chr(0x90),
532 'H o' => chr(0xc5).chr(0x91),
533 'H U' => chr(0xc5).chr(0xb0),
534 'H u' => chr(0xc5).chr(0xb1),
535
536 # stroke
537 'd' => chr(0xc4).chr(0x91),
538 'D' => chr(0xc4).chr(0x90),
539 'h' => chr(0xc4).chr(0xa7),
540# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
541 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
542 'l' => chr(0xc5).chr(0x82),
543 'L' => chr(0xc5).chr(0x81),
544 'o' => chr(0xc3).chr(0xb8),
545 'O' => chr(0xc3).chr(0x98),
546 't' => chr(0xc5).chr(0xa7),
547 'T' => chr(0xc5).chr(0xa6),
548 # german ss/szlig/sharp s
549 'ss' => chr(0xc3).chr(0x9f),
550 );
551
552 # convert latex-style accented characters.
553 # remove space (if any) between \ and letter to accent (eg {\' a})
554
555 $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
556
557 # remove {} around a single character (eg \'{e})
558 $text =~ s@(\\[`'="\.]){(\w)}@{$1$2}@g;
559
560 # remove {} around a single character for special 1 letter commands -
561 # need to insert a space. Eg \v{s} -> {\v s}
562 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
563 # this is slow (go through whole hash for each substitution!) so
564 # only do if the text contains a '\' character.
565 if ($text =~ m|\\|) {
566 for $latex_code (keys %utf8_chars) {
567 $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
568 }
569
570 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
571 # only do the change if immediately followed by a space, }, {, or \
572 for $latex_code (keys %special_utf8_chars) {
573 $text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g;
574 }
575 }
576 # remove latex commands
577 # commands with optional arguments...
578 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
579 # $text =~ s@\\noopsort{[^}]+\}@@g;
580 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
581
582 # remove latex groupings { } (but not \{ or \} )
583 while ($text =~ s/([^\\])[\{\}]/$1/g) {;}
584 $text =~ s/^\{//; # remove { if first char
585
586 # maths mode $...$ - this is not interpreted in any way at the moment...
587 $text =~ s@\$(.*)\$@$1@g;
588
589 # latex characters
590 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
591 $text =~ s/([^\\])~+/$1/g; # non-breaking space "~"
592 # optional break "\-"
593 if ($text =~ m/\#/) { # concat macros (bibtex)
594 # the non-macro bits have quotes around them - we just remove quotes
595 $text =~ s/[\"\#]//g;
596 }
597 # quoted { } chars
598 $text =~ s@\\{@{@g;
599 $text =~ s@\\}@}@g;
600
601 return $text;
602}
603
604
605sub set_OID {
606 my $self = shift (@_);
607 my ($doc_obj, $id, $segment_number) = @_;
608
609 if ( $self->{'key'} eq "default") {
610 $doc_obj->set_OID();
611 } else {
612 $doc_obj->set_OID($self->{'key'});
613 }
614}
615
6161;
Note: See TracBrowser for help on using the repository browser.