Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3156

Last change on this file since 3156 was 3156, checked in by jrm21, 22 years ago
Added a few extra accented characters, and recognise some bibtex-specific macros.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.6 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	# Make sure the text has exactly one entry per line -
122	# append line to previous if it doesn't start with " <key> = "
123
124	my @input_lines=split('\n', $$textref);
125	my @all_lines;
126	my $entry_line=shift @input_lines;
127	foreach my $input_line (@input_lines) {
128	if ($input_line =~ m/^\s\w+\s=\s*/) {
129	# this is a new key
130	push(@all_lines, $entry_line);
131	$entry_line=$input_line;
132	} else {
133	# this is a continuation of previous line
134	$entry_line .= $input_line;
135	}
136
137	}
138	# add final line, removing trailing '}'
139	$entry_line =~ s/\}\s*$//;
140	push(@all_lines, $entry_line);
141	push(@all_lines, "}");
142
143
144
145	# Read and process each line in the bib file.
146	my ($entryname, $name, $value, $line);
147	foreach $line (@all_lines) {
148
149	# Add each line. Most lines consist of a field identifer and
150	# then data, and we simply store them, though we treat some
151	# of the fields a bit differently.
152
153	$line =~ s/\s+/ /g;
154	$text .= "$line\n";
155
156	print "Processing line = $line \n" if $verbosity>=4;
157
158	# The first line is special, it contains the reference type and OID
159	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
160	$EntryType = $1;
161	$EntryID = $2;
162	print "** $EntryType - \"$EntryID\" \n"
163	if ($verbosity >= 4);
164	$self->{'key'} = $EntryID;
165	next;
166	}
167	if ($line =~ /\@/) {
168	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169	if ($verbosity >= 2);
170	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171	if ($verbosity >= 2);
172	}
173
174	# otherwise, parse the metadata out of this line
175	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
176	$entryname = lc($1);
177	$value = $2;
178	$value =~ s/,?\s*$//; # remove trailing comma and space
179	if ($value =~ /^"/ && $value =~ /"$/) {
180	# remove surrounding " marks
181	$value =~ s/^"//; $value =~ s/"$//;
182	}
183	$value = &process_latex($value);
184
185	# Add this line of metadata
186	$metadata{$entryname} .= "$value\n";
187
188	}
189
190	# Add the Entry type as metadata
191	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193	# Add the various field as metadata
194	foreach my $entryname (keys %metadata) {
195	next unless (defined $field{$entryname});
196	next unless (defined $metadata{$entryname});
197
198	$name = $field{$entryname};
199	$value = $metadata{$entryname};
200
201	if ($name =~ /^Month/) {
202	$value=expand_month($value);
203	}
204	# Add the various fields as metadata
205	my $html_value = &text_into_html($value);
206	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
207
208	# Several special operatons on metadata follow
209
210	# Add individual keywords.
211	# The full set of keywords will be added, in due course, as "Keywords".
212	# However, we also want to add them as individual "Keyword" metadata elements.
213	if ($entryname eq "keywords") {
214	my @keywordlist = split(/,/, $value);
215	foreach my $k (@keywordlist) {
216	$k = lc($k);
217	$k =~ s/\s*$//;
218	$k =~ s/^\s*//;
219	if ($k =~ /\w/) {
220	$k = &text_into_html($k);
221	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
222	}
223	}
224	}
225
226	# Add individual authors
227	# The author metadata will be stored as one "Creator" entry, but we
228	# also want to split it into several individual "Author" fields in
229	# "Lastname, Firstnames" format so we can browse it.
230	if ($entryname eq "author") { #added also comparison with editor
231
232	# und here for german language...
233	# don't use brackets in pattern, else the matched bit becomes
234	# an element in the list!
235	my @authorlist = split(/,\|\s+and\s+\|\s+und\s+/, $value);
236	foreach $a (@authorlist) {
237	$a =~ s/\s*$//;
238	$a =~ s/^\s*//;
239	# Reformat and add author name
240	next if $a=~ /^\s*$/;
241	my @words = split(/ /, $a);
242	my $lastname = pop @words;
243	my $firstname = join(" ", @words);
244
245	my $fullname = $lastname . ", " . $firstname;
246
247	# Add each name to set of Authors
248	# force utf8 pragma so that \w matches in this scope
249	use utf8;
250	if ($fullname =~ /\w+, \w+/) {
251	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
252	} else {
253	}
254	}
255	}
256
257	# Books and Journals are additionally marked for display purposes
258	if ($entryname eq "booktitle") {
259	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
260	} elsif ($entryname eq "journal") {
261	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
262	}
263
264	}
265
266	# Add the text in BibTex format (all fields)
267	if ($text =~ /\w/) {
268	$text = &text_into_html($text);
269	$doc_obj->add_utf8_text ($cursection, $text);
270	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
271	}
272
273	return 1;
274	}
275
276
277
278
279	# convert email addresses and URLs into links
280	sub convert_urls_into_links{
281	my ($text) = @_;
282
283	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
284	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
285
286	return $text;
287	}
288
289	# Clean up whitespace and convert \n charaters to <BR> or <P>
290	sub clean_up_whitespaces{
291	my ($text) = @_;
292
293	$text =~ s/%%%%%/<BR> <BR>/g;
294	$text =~ s/ +/ /g;
295	$text =~ s/\s*$//;
296	$text =~ s/^\s*//;
297	$text =~ s/\n/\n<BR>/g;
298	$text =~ s/<BR>\s*<BR>/<P>/g;
299
300	return $text;
301	}
302
303
304	sub convert_problem_characters_without_ampersand{
305	my ($text) = @_;
306	$text =~ s/</</g;
307	$text =~ s/>/>/g;
308
309	$text =~ s/\'\'/\"/g; #Latex -specific conversion
310	$text =~ s/\`\`/\"/g; #Latex -specific conversion
311
312
313	$text =~ s/\"/"/g;
314	$text =~ s/\'/’/g;
315	$text =~ s/\`/‘/g;
316	$text =~ s/\+/ /g;
317	$text =~ s/\(/ /g;
318	$text =~ s/\)/ /g;
319
320	$text =~ s/\\/\\\\/g;
321
322	$text =~ s/\./\\\./g;
323
324	return $text;
325	}
326
327	# Convert a text string into HTML.
328
329	# The HTML is going to be inserted into a GML file, so we have to be
330	# careful not to use symbols like ">", which occurs frequently in email
331	# messages (and use &gt instead.
332
333	# This function also turns URLs and email addresses into links, and
334	# replaces carriage returns with <BR> tags (and multiple carriage returns
335	# with <P> tags).
336
337	sub text_into_html {
338	my ($text) = @_;
339
340	# Convert problem characters into HTML symbols
341	$text =~ s/&/&/g;
342
343	$text = &convert_problem_characters_without_ampersand( $text );
344
345	# convert email addresses and URLs into links
346	$text = &convert_urls_into_links( $text );
347
348	$text = &clean_up_whitespaces( $text );
349
350	return $text;
351	}
352
353
354	sub expand_month {
355	my $text=shift;
356
357	# bibtex style files expand abbreviations for months.
358	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
359	$text =~ s/jan/_textmonth01_/g;
360	$text =~ s/feb/_textmonth02_/g;
361	$text =~ s/mar/_textmonth03_/g;
362	$text =~ s/apr/_textmonth04_/g;
363	$text =~ s/may/_textmonth05_/g;
364	$text =~ s/jun/_textmonth06_/g;
365	$text =~ s/jul/_textmonth07_/g;
366	$text =~ s/aug/_textmonth08_/g;
367	$text =~ s/sep/_textmonth09_/g;
368	$text =~ s/oct/_textmonth10_/g;
369	$text =~ s/nov/_textmonth11_/g;
370	$text =~ s/dec/_textmonth12_/g;
371
372	return $text;
373	}
374
375
376	# Convert accented characters, remove { }, interprete some commands....
377	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
378	sub process_latex {
379	my ($text) = @_;
380
381	# note - this is really ugly, but it works. There may be a prettier way
382	# of mapping latex accented chars to utf8, but we just brute force it here.
383	# Also, this isn't complete - not every single possible accented letter
384	# is in here yet, but most of the common ones are.
385
386	my %utf8_chars =
387	(
388	# acutes
389	'\'a' => chr(0xc3).chr(0xa1),
390	'\'c' => chr(0xc4).chr(0x87),
391	'\'e' => chr(0xc3).chr(0xa9),
392	'\'i' => chr(0xc3).chr(0xad),
393	'\'l' => chr(0xc3).chr(0xba),
394	'\'n' => chr(0xc3).chr(0x84),
395	'\'o' => chr(0xc3).chr(0xb3),
396	'\'r' => chr(0xc5).chr(0x95),
397	'\'s' => chr(0xc5).chr(0x9b),
398	'\'u' => chr(0xc3).chr(0xba),
399	'\'y' => chr(0xc3).chr(0xbd),
400	'\'z' => chr(0xc5).chr(0xba),
401	# graves
402	'`a' => chr(0xc3).chr(0xa0),
403	'`A' => chr(0xc3).chr(0x80),
404	'`e' => chr(0xc3).chr(0xa8),
405	'`E' => chr(0xc3).chr(0x88),
406	'`i' => chr(0xc3).chr(0xac),
407	'`I' => chr(0xc3).chr(0x8c),
408	'`o' => chr(0xc3).chr(0xb2),
409	'`O' => chr(0xc3).chr(0x92),
410	'`u' => chr(0xc3).chr(0xb9),
411	'`U' => chr(0xc3).chr(0x99),
412	# circumflex
413	'^a' => chr(0xc3).chr(0xa2),
414	'^A' => chr(0xc3).chr(0x82),
415	'^c' => chr(0xc4).chr(0x89),
416	'^C' => chr(0xc4).chr(0x88),
417	'^e' => chr(0xc3).chr(0xaa),
418	'^E' => chr(0xc3).chr(0x8a),
419	'^g' => chr(0xc4).chr(0x9d),
420	'^G' => chr(0xc4).chr(0x9c),
421	'^h' => chr(0xc4).chr(0xa5),
422	'^H' => chr(0xc4).chr(0xa4),
423	'^i' => chr(0xc3).chr(0xae),
424	'^I' => chr(0xc3).chr(0x8e),
425	'^j' => chr(0xc4).chr(0xb5),
426	'^J' => chr(0xc4).chr(0xb4),
427	'^o' => chr(0xc3).chr(0xb4),
428	'^O' => chr(0xc3).chr(0x94),
429	'^s' => chr(0xc5).chr(0x9d),
430	'^S' => chr(0xc5).chr(0x9c),
431	'^u' => chr(0xc3).chr(0xa2),
432	'^U' => chr(0xc3).chr(0xbb),
433	'^w' => chr(0xc5).chr(0xb5),
434	'^W' => chr(0xc5).chr(0xb4),
435	'^y' => chr(0xc5).chr(0xb7),
436	'^Y' => chr(0xc5).chr(0xb6),
437
438	# diaeresis
439	'"a' => chr(0xc3).chr(0xa4),
440	'"A' => chr(0xc3).chr(0x84),
441	'"e' => chr(0xc3).chr(0xab),
442	'"E' => chr(0xc3).chr(0x8b),
443	'"\\\\i' => chr(0xc3).chr(0xaf),
444	'"\\\\I' => chr(0xc3).chr(0x8f),
445	'"o' => chr(0xc3).chr(0xb6),
446	'"O' => chr(0xc3).chr(0x96),
447	'"u' => chr(0xc3).chr(0xbc),
448	'"U' => chr(0xc3).chr(0x9c),
449	'"y' => chr(0xc3).chr(0xbf),
450	'"Y' => chr(0xc3).chr(0xb8),
451	# tilde
452	'~A' => chr(0xc3).chr(0x83),
453	'~N' => chr(0xc3).chr(0x91),
454	'~O' => chr(0xc3).chr(0x95),
455	'~a' => chr(0xc3).chr(0xa3),
456	'~n' => chr(0xc3).chr(0xb1),
457	'~o' => chr(0xc3).chr(0xb5),
458	# caron - handled specially
459	# ',s' => chr(0xc5).chr(0xa1),
460	# ',S' => chr(0xc5).chr(0xa5),
461	# double acute
462	# ring
463	# dot
464	'\.c' => chr(0xc4).chr(0x8b),
465	'\.C' => chr(0xc4).chr(0x8a),
466	'\.e' => chr(0xc4).chr(0x97),
467	'\.E' => chr(0xc4).chr(0x96),
468	'\.g' => chr(0xc4).chr(0xa1),
469	'\.G' => chr(0xc4).chr(0xa0),
470	'\.I' => chr(0xc4).chr(0xb0),
471	'\.z' => chr(0xc5).chr(0xbc),
472	'\.Z' => chr(0xc5).chr(0xbb),
473	# macron
474	'=a' => chr(0xc4).chr(0x81),
475	'=A' => chr(0xc4).chr(0x80),
476	'=e' => chr(0xc4).chr(0x93),
477	'=E' => chr(0xc4).chr(0x92),
478	'=i' => chr(0xc4).chr(0xab),
479	'=I' => chr(0xc4).chr(0xaa),
480	'=o' => chr(0xc4).chr(0x8d),
481	'=O' => chr(0xc4).chr(0x8c),
482	'=u' => chr(0xc4).chr(0xab),
483	'=U' => chr(0xc4).chr(0xaa),
484
485	# stroke - handled specially - see below
486
487	# cedilla - handled specially
488
489	);
490
491	# these are one letter latex commands - we make sure they're not a longer
492	# command name. eg {\d} is d+stroke, so careful of \d
493	my %special_utf8_chars =
494	(
495	# breve
496	'u g' => chr(0xc4).chr(0x9f),
497	'u G' => chr(0xc4).chr(0x9e),
498	'u i' => chr(0xc4).chr(0xad),
499	'u I' => chr(0xc4).chr(0xac),
500	'u o' => chr(0xc5).chr(0x8f),
501	'u O' => chr(0xc5).chr(0x8e),
502	'u u' => chr(0xc5).chr(0xad),
503	'u U' => chr(0xc5).chr(0xac),
504	# caron
505	'v c' => chr(0xc4).chr(0x8d),
506	'v C' => chr(0xc4).chr(0x8c),
507	'v n' => chr(0xc5).chr(0x88),
508	'v N' => chr(0xc5).chr(0x87),
509	'v s' => chr(0xc5).chr(0xa1),
510	'v S' => chr(0xc5).chr(0xa5),
511	'v z' => chr(0xc5).chr(0xbe),
512	'v Z' => chr(0xc5).chr(0xbd),
513	# cedilla
514	'c c' => chr(0xc3).chr(0xa7),
515	'c C' => chr(0xc3).chr(0x87),
516	'c g' => chr(0xc4).chr(0xa3),
517	'c G' => chr(0xc4).chr(0xa2),
518	'c k' => chr(0xc4).chr(0xb7),
519	'c K' => chr(0xc4).chr(0xb6),
520	'c l' => chr(0xc4).chr(0xbc),
521	'c L' => chr(0xc4).chr(0xbb),
522	'c n' => chr(0xc5).chr(0x86),
523	'c N' => chr(0xc5).chr(0x85),
524	'c r' => chr(0xc5).chr(0x97),
525	'c R' => chr(0xc5).chr(0x96),
526	'c s' => chr(0xc5).chr(0x9f),
527	'c S' => chr(0xc5).chr(0x9e),
528	'c t' => chr(0xc5).chr(0xa3),
529	'c T' => chr(0xc5).chr(0xa2),
530	# double acute / Hungarian accent
531	'H O' => chr(0xc5).chr(0x90),
532	'H o' => chr(0xc5).chr(0x91),
533	'H U' => chr(0xc5).chr(0xb0),
534	'H u' => chr(0xc5).chr(0xb1),
535
536	# stroke
537	'd' => chr(0xc4).chr(0x91),
538	'D' => chr(0xc4).chr(0x90),
539	'h' => chr(0xc4).chr(0xa7),
540	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
541	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
542	'l' => chr(0xc5).chr(0x82),
543	'L' => chr(0xc5).chr(0x81),
544	'o' => chr(0xc3).chr(0xb8),
545	'O' => chr(0xc3).chr(0x98),
546	't' => chr(0xc5).chr(0xa7),
547	'T' => chr(0xc5).chr(0xa6),
548	# german ss/szlig/sharp s
549	'ss' => chr(0xc3).chr(0x9f),
550	);
551
552	# convert latex-style accented characters.
553	# remove space (if any) between \ and letter to accent (eg {\' a})
554
555	$text =~ s@(\\[`'="])\s(\w)@$1$2@g;
556
557	# remove {} around a single character (eg \'{e})
558	$text =~ s@(\\[`'="\.]){(\w)}@{$1$2}@g;
559
560	# remove {} around a single character for special 1 letter commands -
561	# need to insert a space. Eg \v{s} -> {\v s}
562	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
563	# this is slow (go through whole hash for each substitution!) so
564	# only do if the text contains a '\' character.
565	if ($text =~ m\|\\\|) {
566	for $latex_code (keys %utf8_chars) {
567	$text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
568	}
569
570	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
571	# only do the change if immediately followed by a space, }, {, or \
572	for $latex_code (keys %special_utf8_chars) {
573	$text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g;
574	}
575	}
576	# remove latex commands
577	# commands with optional arguments...
578	$text =~ s@\\\w+(\[.?\])?\s@@g;
579	# $text =~ s@\\noopsort{[^}]+\}@@g;
580	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
581
582	# remove latex groupings { } (but not \{ or \} )
583	while ($text =~ s/([^\\])[\{\}]/$1/g) {;}
584	$text =~ s/^\{//; # remove { if first char
585
586	# maths mode $...$ - this is not interpreted in any way at the moment...
587	$text =~ s@\$(.*)\$@$1@g;
588
589	# latex characters
590	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
591	$text =~ s/([^\\])~+/$1/g; # non-breaking space "~"
592	# optional break "\-"
593	if ($text =~ m/\#/) { # concat macros (bibtex)
594	# the non-macro bits have quotes around them - we just remove quotes
595	$text =~ s/[\"\#]//g;
596	}
597	# quoted { } chars
598	$text =~ s@\\{@{@g;
599	$text =~ s@\\}@}@g;
600
601	return $text;
602	}
603
604
605	sub set_OID {
606	my $self = shift (@_);
607	my ($doc_obj, $id, $segment_number) = @_;
608
609	if ( $self->{'key'} eq "default") {
610	$doc_obj->set_OID();
611	} else {
612	$doc_obj->set_OID($self->{'key'});
613	}
614	}
615
616	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: