source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3112

Last change on this file since 3112 was 3112, checked in by jrm21, 22 years ago

minor changes to formatted values (eg if enclosed in { and } ) and added a
few extra accented characters.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53# This plugin processes files with the suffix ".bib"
54sub get_default_process_exp {
55 return q^(?i)\.bib$^;
56}
57
58# This plugin splits the input text at blank lines
59sub get_default_split_exp {
60 return q^\n+(?=@)^;
61}
62
63
64
65# The process function reads a single bibliographic record and stores
66# it as a new document.
67
68sub process {
69 my $self = shift (@_);
70 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71 my $outhandle = $self->{'outhandle'};
72
73 $self->{'key'} = "default";
74
75 # Check that we're dealing with a valid BibTex record
76 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78 # Ignore things we can't use
79 return 0 if ($$textref =~ /^\@String/);
80
81 # Report that we're processing the file
82 print $outhandle "BibTexPlug: processing $file\n"
83 if ($self->{'verbosity'}) > 1;
84
85
86 # This hash translates BibTex field names into metadata names. The
87 # BibTex names are taken from the "Local Guide to Latex" Graeme
88 # McKinstry. Metadata names are consistent with ReferPlug.
89
90 my %field = (
91 'address', 'PublisherAddress',
92 'author', 'Creator',
93
94 'booktitle', 'Booktitle',
95 'chapter', 'Chapter',
96 'edition', 'Edition',
97 'editor', 'Editor',
98 'institution', 'Publisher',
99 'journal', 'Journal',
100 'month', 'Month',
101 'number', 'Number',
102 'pages', 'Pages',
103 'publisher', 'Publisher',
104 'school', 'Publisher',
105 'title', 'Title',
106 'volume', 'Volume',
107 'year', 'Date',
108
109 'keywords', 'Keywords',
110 'abstract', 'Abstract',
111 'copyright', 'Copyright'
112);
113
114 # Metadata fields
115 my %metadata;
116 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118 my $verbosity = $self->{'verbosity'};
119 $verbosity = 0 unless $verbosity;
120
121 my $lines=$$textref;
122
123 # Make sure the text has exactly one entry per line
124
125 $lines =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry
126 $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
127 $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing "
128 $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings,
129 #that continues for several lines
130 $lines =~ s/\s+/ /g;
131 $lines =~ s/\s*=====\s*/\n/g;
132
133 my @all_lines = split(/\n+/, $lines);
134
135 # Read and process each line in the bib file.
136 my ($entryname, $name, $value, $line);
137 foreach $line (@all_lines) {
138
139 # Add each line. Most lines consist of a field identifer and
140 # then data, and we simply store them, though we treat some
141 # of the fields a bit differently.
142
143 $line =~ s/\s+/ /g;
144 $text .= "$line\n";
145
146 print "Processing line = $line \n" if $verbosity>=4;
147
148 # The first line is special, it contains the reference type and OID
149 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
150 $EntryType = $1;
151 $EntryID = $2;
152 print "** $EntryType - \"$EntryID\" \n"
153 if ($verbosity >= 4);
154 $self->{'key'} = $EntryID;
155 next;
156 }
157 if ($line =~ /\@/) {
158 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
159 if ($verbosity >= 2);
160 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
161 if ($verbosity >= 2);
162 }
163
164 # otherwise, parse the metadata out of this line
165 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
166 $entryname = lc($1);
167 $value = $2;
168 $value =~ s/,?\s*$//; # remove trailing comma and space
169 if ($value =~ /^"/ && $value =~ /"$/) {
170 # remove surrounding " marks
171 $value =~ s/^"//; $value =~ s/"$//;
172 }
173 $value = &process_latex($value);
174
175 # Add this line of metadata
176 $metadata{$entryname} .= "$value\n";
177
178 }
179
180 # Add the Entry type as metadata
181 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
182
183 # Add the various field as metadata
184 foreach my $entryname (keys %metadata) {
185 next unless (defined $field{$entryname});
186 next unless (defined $metadata{$entryname});
187
188 $name = $field{$entryname};
189 $value = $metadata{$entryname};
190
191 # Add the various fields as metadata
192 my $html_value = &text_into_html($value);
193 $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
194
195 # Several special operatons on metadata follow
196
197 # Add individual keywords.
198 # The full set of keywords will be added, in due course, as "Keywords".
199 # However, we also want to add them as individual "Keyword" metadata elements.
200 if ($entryname eq "keywords") {
201 my @keywordlist = split(/,/, $value);
202 foreach my $k (@keywordlist) {
203 $k = lc($k);
204 $k =~ s/\s*$//;
205 $k =~ s/^\s*//;
206 if ($k =~ /\w/) {
207 $k = &text_into_html($k);
208 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
209 }
210 }
211 }
212
213 # Add individual authors
214 # The author metadata will be stored as one "Creator" entry, but we
215 # also want to split it into several individual "Author" fields in
216 # "Lastname, Firstnames" format so we can browse it.
217 if ($entryname eq "author") { #added also comparison with editor
218
219 # und here for german language...
220 # don't use brackets in pattern, else the matched bit becomes
221 # an element in the list!
222 my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value);
223 foreach $a (@authorlist) {
224 $a =~ s/\s*$//;
225 $a =~ s/^\s*//;
226 # Reformat and add author name
227 next if $a=~ /^\s*$/;
228 my @words = split(/ /, $a);
229 my $lastname = pop @words;
230 my $firstname = join(" ", @words);
231
232 my $fullname = $lastname . ", " . $firstname;
233
234 # Add each name to set of Authors
235 # force utf8 pragma so that \w matches in this scope
236 use utf8;
237 if ($fullname =~ /\w+, \w+/) {
238 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
239 }
240 }
241 }
242
243 # Books and Journals are additionally marked for display purposes
244 if ($entryname eq "booktitle") {
245 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
246 } elsif ($entryname eq "journal") {
247 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
248 }
249
250 }
251
252 # Add the text in BibTex format (all fields)
253 if ($text =~ /\w/) {
254 $text = &text_into_html($text);
255 $doc_obj->add_utf8_text ($cursection, $text);
256 $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
257 }
258
259 return 1;
260}
261
262
263
264
265# convert email addresses and URLs into links
266sub convert_urls_into_links{
267 my ($text) = @_;
268
269 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
270 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
271
272 return $text;
273}
274
275# Clean up whitespace and convert \n charaters to <BR> or <P>
276sub clean_up_whitespaces{
277 my ($text) = @_;
278
279 $text =~ s/%%%%%/<BR> <BR>/g;
280 $text =~ s/ +/ /g;
281 $text =~ s/\s*$//;
282 $text =~ s/^\s*//;
283 $text =~ s/\n/\n<BR>/g;
284 $text =~ s/<BR>\s*<BR>/<P>/g;
285
286 return $text;
287}
288
289
290sub convert_problem_characters_without_ampersand{
291 my ($text) = @_;
292 $text =~ s/</&lt;/g;
293 $text =~ s/>/&gt;/g;
294
295 $text =~ s/\'\'/\"/g; #Latex -specific conversion
296 $text =~ s/\`\`/\"/g; #Latex -specific conversion
297
298
299 $text =~ s/\"/&quot;/g;
300 $text =~ s/\'/&#8217;/g;
301 $text =~ s/\`/&#8216;/g;
302 $text =~ s/\+/ /g;
303 $text =~ s/\(/ /g;
304 $text =~ s/\)/ /g;
305
306 $text =~ s/\\/\\\\/g;
307
308 $text =~ s/\./\\\./g;
309
310 return $text;
311}
312
313# Convert a text string into HTML.
314
315# The HTML is going to be inserted into a GML file, so we have to be
316# careful not to use symbols like ">", which occurs frequently in email
317# messages (and use &gt instead.
318
319# This function also turns URLs and email addresses into links, and
320# replaces carriage returns with <BR> tags (and multiple carriage returns
321# with <P> tags).
322
323sub text_into_html {
324 my ($text) = @_;
325
326 # Convert problem characters into HTML symbols
327 $text =~ s/&/&amp;/g;
328
329 $text = &convert_problem_characters_without_ampersand( $text );
330
331 # convert email addresses and URLs into links
332 $text = &convert_urls_into_links( $text );
333
334 $text = &clean_up_whitespaces( $text );
335
336 return $text;
337}
338
339
340
341
342# Convert accented characters, remove { }, interprete some commands....
343# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
344sub process_latex {
345 my ($text) = @_;
346
347 # note - this is really ugly, but it works. There may be a prettier way
348 # of mapping latex accented chars to utf8, but we just brute force it here.
349 # Also, this isn't complete - not every single possible accented letter
350 # is in here yet, but most of the common ones are.
351
352 my %utf8_chars =
353 (
354 # acutes
355 '\'a' => chr(0xc3).chr(0xa1),
356 '\'c' => chr(0xc4).chr(0x87),
357 '\'e' => chr(0xc3).chr(0xa9),
358 '\'i' => chr(0xc3).chr(0xad),
359 '\'l' => chr(0xc3).chr(0xba),
360 '\'n' => chr(0xc3).chr(0x84),
361 '\'o' => chr(0xc3).chr(0xb3),
362 '\'r' => chr(0xc5).chr(0x95),
363 '\'s' => chr(0xc5).chr(0x9b),
364 '\'u' => chr(0xc3).chr(0xba),
365 '\'y' => chr(0xc3).chr(0xbd),
366 '\'z' => chr(0xc5).chr(0xba),
367 # graves
368 '`a' => chr(0xc3).chr(0xa0),
369 '`A' => chr(0xc3).chr(0x80),
370 '`e' => chr(0xc3).chr(0xa8),
371 '`E' => chr(0xc3).chr(0x88),
372 '`i' => chr(0xc3).chr(0xac),
373 '`I' => chr(0xc3).chr(0x8c),
374 '`o' => chr(0xc3).chr(0xb2),
375 '`O' => chr(0xc3).chr(0x92),
376 '`u' => chr(0xc3).chr(0xb9),
377 '`U' => chr(0xc3).chr(0x99),
378 # circumflex
379 '^a' => chr(0xc3).chr(0xa2),
380 '^A' => chr(0xc3).chr(0x82),
381 '^c' => chr(0xc4).chr(0x89),
382 '^C' => chr(0xc4).chr(0x88),
383 '^e' => chr(0xc3).chr(0xaa),
384 '^E' => chr(0xc3).chr(0x8a),
385 '^g' => chr(0xc4).chr(0x9d),
386 '^G' => chr(0xc4).chr(0x9c),
387 '^h' => chr(0xc4).chr(0xa5),
388 '^H' => chr(0xc4).chr(0xa4),
389 '^i' => chr(0xc3).chr(0xae),
390 '^I' => chr(0xc3).chr(0x8e),
391 '^j' => chr(0xc4).chr(0xb5),
392 '^J' => chr(0xc4).chr(0xb4),
393 '^o' => chr(0xc3).chr(0xb4),
394 '^O' => chr(0xc3).chr(0x94),
395 '^s' => chr(0xc5).chr(0x9d),
396 '^S' => chr(0xc5).chr(0x9c),
397 '^u' => chr(0xc3).chr(0xa2),
398 '^U' => chr(0xc3).chr(0xbb),
399 '^w' => chr(0xc5).chr(0xb5),
400 '^W' => chr(0xc5).chr(0xb4),
401 '^y' => chr(0xc5).chr(0xb7),
402 '^Y' => chr(0xc5).chr(0xb6),
403
404 # diaeresis
405 '"a' => chr(0xc3).chr(0xa4),
406 '"A' => chr(0xc3).chr(0x84),
407 '"e' => chr(0xc3).chr(0xab),
408 '"E' => chr(0xc3).chr(0x8b),
409 '"\\\\i' => chr(0xc3).chr(0xaf),
410 '"\\\\I' => chr(0xc3).chr(0x8f),
411 '"o' => chr(0xc3).chr(0xb6),
412 '"O' => chr(0xc3).chr(0x96),
413 '"u' => chr(0xc3).chr(0xbc),
414 '"U' => chr(0xc3).chr(0x9c),
415 '"y' => chr(0xc3).chr(0xbf),
416 '"Y' => chr(0xc3).chr(0xb8),
417 # tilde
418 # caron - handled specially
419# ',s' => chr(0xc5).chr(0xa1),
420# ',S' => chr(0xc5).chr(0xa5),
421 # breve
422 # double acute
423 # ring
424 # dot
425 '\.c' => chr(0xc4).chr(0x8b),
426 '\.C' => chr(0xc4).chr(0x8a),
427 '\.e' => chr(0xc4).chr(0x97),
428 '\.E' => chr(0xc4).chr(0x96),
429 '\.g' => chr(0xc4).chr(0xa1),
430 '\.G' => chr(0xc4).chr(0xa0),
431 '\.I' => chr(0xc4).chr(0xb0),
432 '\.z' => chr(0xc5).chr(0xbc),
433 '\.Z' => chr(0xc5).chr(0xbb),
434 # macron
435 '=a' => chr(0xc4).chr(0x81),
436 '=A' => chr(0xc4).chr(0x80),
437 '=e' => chr(0xc4).chr(0x93),
438 '=E' => chr(0xc4).chr(0x92),
439 '=i' => chr(0xc4).chr(0xab),
440 '=I' => chr(0xc4).chr(0xaa),
441 '=o' => chr(0xc4).chr(0x8d),
442 '=O' => chr(0xc4).chr(0x8c),
443 '=u' => chr(0xc4).chr(0xab),
444 '=U' => chr(0xc4).chr(0xaa),
445
446 # stroke - handled specially - see below
447
448 # cedilla - handled specially
449
450 );
451
452# these are one letter latex commands - we make sure they're not a longer
453# command name. eg {\d} is d+stroke, so careful of \d
454 my %special_utf8_chars =
455 (
456 # caron
457 'v n' => chr(0xc5).chr(0x88),
458 'v N' => chr(0xc5).chr(0x87),
459 'v s' => chr(0xc5).chr(0xa1),
460 'v S' => chr(0xc5).chr(0xa5),
461 # cedilla
462 'c c' => chr(0xc3).chr(0xa7),
463 'c C' => chr(0xc3).chr(0x87),
464 'c g' => chr(0xc4).chr(0xa3),
465 'c G' => chr(0xc4).chr(0xa2),
466 'c k' => chr(0xc4).chr(0xb7),
467 'c K' => chr(0xc4).chr(0xb6),
468 'c l' => chr(0xc4).chr(0xbc),
469 'c L' => chr(0xc4).chr(0xbb),
470 'c n' => chr(0xc5).chr(0x86),
471 'c N' => chr(0xc5).chr(0x85),
472 'c r' => chr(0xc5).chr(0x97),
473 'c R' => chr(0xc5).chr(0x96),
474 'c s' => chr(0xc5).chr(0x9f),
475 'c S' => chr(0xc5).chr(0x9e),
476 'c t' => chr(0xc5).chr(0xa3),
477 'c T' => chr(0xc5).chr(0xa2),
478 # double acute / Hungarian accent
479 'H O' => chr(0xc5).chr(0x90),
480 'H o' => chr(0xc5).chr(0x91),
481 'H U' => chr(0xc5).chr(0xb0),
482 'H u' => chr(0xc5).chr(0xb1),
483
484 # stroke
485 'd' => chr(0xc4).chr(0x91),
486 'D' => chr(0xc4).chr(0x90),
487 'h' => chr(0xc4).chr(0xa7),
488# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
489 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
490 'l' => chr(0xc5).chr(0x82),
491 'L' => chr(0xc5).chr(0x81),
492 'o' => chr(0xc3).chr(0xb8),
493 'O' => chr(0xc3).chr(0x98),
494 't' => chr(0xc5).chr(0xa7),
495 'T' => chr(0xc5).chr(0xa6),
496 # german ss/szlig/sharp s
497 'ss' => chr(0xc3).chr(0x9f),
498 );
499
500 # convert latex-style accented characters.
501 # remove space (if any) between \ and letter to accent (eg {\' a})
502
503 $text =~ s@(\\[`'="])\s(\w)@$1$2@g;
504
505 # remove {} around a single character (eg \'{e})
506 $text =~ s@(\\[`'="\.]){(\w)}@$1$2@g;
507
508 # remove {} around a single character for special 1 letter commands -
509 # need to insert a space. Eg \v{s} -> {\v s}
510 $text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
511
512 # this is slow (go through whole hash for each substitution!) so
513 # only do if the text contains a '\' character.
514 if ($text =~ m|\\|) {
515 for $latex_code (keys %utf8_chars) {
516 $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
517 }
518
519 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
520 # only do the change if immediately followed by a space, }, {, or \
521 for $latex_code (keys %special_utf8_chars) {
522 $text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
523 }
524 }
525
526 # remove latex commands
527 $text =~ s@\\\w+{([^}]*)}@$1@g;
528
529 # remove latex groupings { } (but not \{ or \} )
530 $text =~ s/([^\\])\{/$1/g;
531 $text =~ s/([^\\])\}/$1/g;
532 $text =~ s/^{//; # remove { if first char
533
534 # maths mode $...$ - this is not interpreted in any way at the moment...
535 $text =~ s@\$(.*)\$@$1@g;
536
537 # quoted { } chars
538 $text =~ s@\\{@{@g;
539 $text =~ s@\\}@}@g;
540
541 return $text;
542}
543
544
545sub set_OID {
546 my $self = shift (@_);
547 my ($doc_obj, $id, $segment_number) = @_;
548
549 if ( $self->{'key'} eq "default") {
550 $doc_obj->set_OID();
551 } else {
552 $doc_obj->set_OID($self->{'key'});
553 }
554}
555
5561;
Note: See TracBrowser for help on using the repository browser.