Context Navigation

BibTexPlug.pm@ 3142

Last change on this file since 3142 was 3142, checked in by jrm21, 22 years ago

1) We can't use "Date" for the year metadata, as greenstone assumes Date is YYYYMMDD, so we now use "Year" instead.

2) Tidied up the single-entry-per-line code, as it was missing some entries.

3) bibtex allows 3 letter month abbreviations, so we know expand those.

4) some more tidying up of latex (and bibtex) commands.

Property svn:keywords set to Author Date Id Revision

File size: 17.1 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	# Make sure the text has exactly one entry per line -
122	# append line to previous if it doesn't start with " <key> = "
123
124	my @input_lines=split('\n', $$textref);
125	my @all_lines;
126	my $entry_line=shift @input_lines;
127	foreach my $input_line (@input_lines) {
128	if ($input_line =~ m/^\s\w+\s=\s*/) {
129	# this is a new key
130	push(@all_lines, $entry_line);
131	$entry_line=$input_line;
132	} else {
133	# this is a continuation of previous line
134	$entry_line .= $input_line;
135	}
136
137	}
138	# add final line, removing trailing '}'
139	$entry_line =~ s/\}\s*$//;
140	push(@all_lines, $entry_line);
141	push(@all_lines, "}");
142
143
144
145	# Read and process each line in the bib file.
146	my ($entryname, $name, $value, $line);
147	foreach $line (@all_lines) {
148
149	# Add each line. Most lines consist of a field identifer and
150	# then data, and we simply store them, though we treat some
151	# of the fields a bit differently.
152
153	$line =~ s/\s+/ /g;
154	$text .= "$line\n";
155
156	print "Processing line = $line \n" if $verbosity>=4;
157
158	# The first line is special, it contains the reference type and OID
159	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
160	$EntryType = $1;
161	$EntryID = $2;
162	print "** $EntryType - \"$EntryID\" \n"
163	if ($verbosity >= 4);
164	$self->{'key'} = $EntryID;
165	next;
166	}
167	if ($line =~ /\@/) {
168	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169	if ($verbosity >= 2);
170	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171	if ($verbosity >= 2);
172	}
173
174	# otherwise, parse the metadata out of this line
175	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
176	$entryname = lc($1);
177	$value = $2;
178	$value =~ s/,?\s*$//; # remove trailing comma and space
179	if ($value =~ /^"/ && $value =~ /"$/) {
180	# remove surrounding " marks
181	$value =~ s/^"//; $value =~ s/"$//;
182	}
183	$value = &process_latex($value);
184
185	# Add this line of metadata
186	$metadata{$entryname} .= "$value\n";
187
188	}
189
190	# Add the Entry type as metadata
191	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193	# Add the various field as metadata
194	foreach my $entryname (keys %metadata) {
195	next unless (defined $field{$entryname});
196	next unless (defined $metadata{$entryname});
197
198	$name = $field{$entryname};
199	$value = $metadata{$entryname};
200
201	if ($name =~ /^Month/) {
202	$value=expand_month($value);
203	}
204	# Add the various fields as metadata
205	my $html_value = &text_into_html($value);
206	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
207
208	# Several special operatons on metadata follow
209
210	# Add individual keywords.
211	# The full set of keywords will be added, in due course, as "Keywords".
212	# However, we also want to add them as individual "Keyword" metadata elements.
213	if ($entryname eq "keywords") {
214	my @keywordlist = split(/,/, $value);
215	foreach my $k (@keywordlist) {
216	$k = lc($k);
217	$k =~ s/\s*$//;
218	$k =~ s/^\s*//;
219	if ($k =~ /\w/) {
220	$k = &text_into_html($k);
221	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
222	}
223	}
224	}
225
226	# Add individual authors
227	# The author metadata will be stored as one "Creator" entry, but we
228	# also want to split it into several individual "Author" fields in
229	# "Lastname, Firstnames" format so we can browse it.
230	if ($entryname eq "author") { #added also comparison with editor
231
232	# und here for german language...
233	# don't use brackets in pattern, else the matched bit becomes
234	# an element in the list!
235	my @authorlist = split(/,\|\s+and\s+\|\s+und\s+/, $value);
236	foreach $a (@authorlist) {
237	$a =~ s/\s*$//;
238	$a =~ s/^\s*//;
239	# Reformat and add author name
240	next if $a=~ /^\s*$/;
241	my @words = split(/ /, $a);
242	my $lastname = pop @words;
243	my $firstname = join(" ", @words);
244
245	my $fullname = $lastname . ", " . $firstname;
246
247	# Add each name to set of Authors
248	# force utf8 pragma so that \w matches in this scope
249	use utf8;
250	if ($fullname =~ /\w+, \w+/) {
251	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
252	}
253	}
254	}
255
256	# Books and Journals are additionally marked for display purposes
257	if ($entryname eq "booktitle") {
258	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
259	} elsif ($entryname eq "journal") {
260	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
261	}
262
263	}
264
265	# Add the text in BibTex format (all fields)
266	if ($text =~ /\w/) {
267	$text = &text_into_html($text);
268	$doc_obj->add_utf8_text ($cursection, $text);
269	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
270	}
271
272	return 1;
273	}
274
275
276
277
278	# convert email addresses and URLs into links
279	sub convert_urls_into_links{
280	my ($text) = @_;
281
282	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
283	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
284
285	return $text;
286	}
287
288	# Clean up whitespace and convert \n charaters to <BR> or <P>
289	sub clean_up_whitespaces{
290	my ($text) = @_;
291
292	$text =~ s/%%%%%/<BR> <BR>/g;
293	$text =~ s/ +/ /g;
294	$text =~ s/\s*$//;
295	$text =~ s/^\s*//;
296	$text =~ s/\n/\n<BR>/g;
297	$text =~ s/<BR>\s*<BR>/<P>/g;
298
299	return $text;
300	}
301
302
303	sub convert_problem_characters_without_ampersand{
304	my ($text) = @_;
305	$text =~ s/</</g;
306	$text =~ s/>/>/g;
307
308	$text =~ s/\'\'/\"/g; #Latex -specific conversion
309	$text =~ s/\`\`/\"/g; #Latex -specific conversion
310
311
312	$text =~ s/\"/"/g;
313	$text =~ s/\'/’/g;
314	$text =~ s/\`/‘/g;
315	$text =~ s/\+/ /g;
316	$text =~ s/\(/ /g;
317	$text =~ s/\)/ /g;
318
319	$text =~ s/\\/\\\\/g;
320
321	$text =~ s/\./\\\./g;
322
323	return $text;
324	}
325
326	# Convert a text string into HTML.
327
328	# The HTML is going to be inserted into a GML file, so we have to be
329	# careful not to use symbols like ">", which occurs frequently in email
330	# messages (and use &gt instead.
331
332	# This function also turns URLs and email addresses into links, and
333	# replaces carriage returns with <BR> tags (and multiple carriage returns
334	# with <P> tags).
335
336	sub text_into_html {
337	my ($text) = @_;
338
339	# Convert problem characters into HTML symbols
340	$text =~ s/&/&/g;
341
342	$text = &convert_problem_characters_without_ampersand( $text );
343
344	# convert email addresses and URLs into links
345	$text = &convert_urls_into_links( $text );
346
347	$text = &clean_up_whitespaces( $text );
348
349	return $text;
350	}
351
352
353	sub expand_month {
354	my $text=shift;
355
356	# bibtex style files expand abbreviations for months.
357	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
358	$text =~ s/jan/_textmonth01_/g;
359	$text =~ s/feb/_textmonth02_/g;
360	$text =~ s/mar/_textmonth03_/g;
361	$text =~ s/apr/_textmonth04_/g;
362	$text =~ s/may/_textmonth05_/g;
363	$text =~ s/jun/_textmonth06_/g;
364	$text =~ s/jul/_textmonth07_/g;
365	$text =~ s/aug/_textmonth08_/g;
366	$text =~ s/sep/_textmonth09_/g;
367	$text =~ s/oct/_textmonth10_/g;
368	$text =~ s/nov/_textmonth11_/g;
369	$text =~ s/dec/_textmonth12_/g;
370
371	return $text;
372	}
373
374
375	# Convert accented characters, remove { }, interprete some commands....
376	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
377	sub process_latex {
378	my ($text) = @_;
379
380	# note - this is really ugly, but it works. There may be a prettier way
381	# of mapping latex accented chars to utf8, but we just brute force it here.
382	# Also, this isn't complete - not every single possible accented letter
383	# is in here yet, but most of the common ones are.
384
385	my %utf8_chars =
386	(
387	# acutes
388	'\'a' => chr(0xc3).chr(0xa1),
389	'\'c' => chr(0xc4).chr(0x87),
390	'\'e' => chr(0xc3).chr(0xa9),
391	'\'i' => chr(0xc3).chr(0xad),
392	'\'l' => chr(0xc3).chr(0xba),
393	'\'n' => chr(0xc3).chr(0x84),
394	'\'o' => chr(0xc3).chr(0xb3),
395	'\'r' => chr(0xc5).chr(0x95),
396	'\'s' => chr(0xc5).chr(0x9b),
397	'\'u' => chr(0xc3).chr(0xba),
398	'\'y' => chr(0xc3).chr(0xbd),
399	'\'z' => chr(0xc5).chr(0xba),
400	# graves
401	'`a' => chr(0xc3).chr(0xa0),
402	'`A' => chr(0xc3).chr(0x80),
403	'`e' => chr(0xc3).chr(0xa8),
404	'`E' => chr(0xc3).chr(0x88),
405	'`i' => chr(0xc3).chr(0xac),
406	'`I' => chr(0xc3).chr(0x8c),
407	'`o' => chr(0xc3).chr(0xb2),
408	'`O' => chr(0xc3).chr(0x92),
409	'`u' => chr(0xc3).chr(0xb9),
410	'`U' => chr(0xc3).chr(0x99),
411	# circumflex
412	'^a' => chr(0xc3).chr(0xa2),
413	'^A' => chr(0xc3).chr(0x82),
414	'^c' => chr(0xc4).chr(0x89),
415	'^C' => chr(0xc4).chr(0x88),
416	'^e' => chr(0xc3).chr(0xaa),
417	'^E' => chr(0xc3).chr(0x8a),
418	'^g' => chr(0xc4).chr(0x9d),
419	'^G' => chr(0xc4).chr(0x9c),
420	'^h' => chr(0xc4).chr(0xa5),
421	'^H' => chr(0xc4).chr(0xa4),
422	'^i' => chr(0xc3).chr(0xae),
423	'^I' => chr(0xc3).chr(0x8e),
424	'^j' => chr(0xc4).chr(0xb5),
425	'^J' => chr(0xc4).chr(0xb4),
426	'^o' => chr(0xc3).chr(0xb4),
427	'^O' => chr(0xc3).chr(0x94),
428	'^s' => chr(0xc5).chr(0x9d),
429	'^S' => chr(0xc5).chr(0x9c),
430	'^u' => chr(0xc3).chr(0xa2),
431	'^U' => chr(0xc3).chr(0xbb),
432	'^w' => chr(0xc5).chr(0xb5),
433	'^W' => chr(0xc5).chr(0xb4),
434	'^y' => chr(0xc5).chr(0xb7),
435	'^Y' => chr(0xc5).chr(0xb6),
436
437	# diaeresis
438	'"a' => chr(0xc3).chr(0xa4),
439	'"A' => chr(0xc3).chr(0x84),
440	'"e' => chr(0xc3).chr(0xab),
441	'"E' => chr(0xc3).chr(0x8b),
442	'"\\\\i' => chr(0xc3).chr(0xaf),
443	'"\\\\I' => chr(0xc3).chr(0x8f),
444	'"o' => chr(0xc3).chr(0xb6),
445	'"O' => chr(0xc3).chr(0x96),
446	'"u' => chr(0xc3).chr(0xbc),
447	'"U' => chr(0xc3).chr(0x9c),
448	'"y' => chr(0xc3).chr(0xbf),
449	'"Y' => chr(0xc3).chr(0xb8),
450	# tilde
451	'~A' => chr(0xc3).chr(0x83),
452	'~N' => chr(0xc3).chr(0x91),
453	'~O' => chr(0xc3).chr(0x95),
454	'~a' => chr(0xc3).chr(0xa3),
455	'~n' => chr(0xc3).chr(0xb1),
456	'~o' => chr(0xc3).chr(0xb5),
457	# caron - handled specially
458	# ',s' => chr(0xc5).chr(0xa1),
459	# ',S' => chr(0xc5).chr(0xa5),
460	# breve
461	# double acute
462	# ring
463	# dot
464	'\.c' => chr(0xc4).chr(0x8b),
465	'\.C' => chr(0xc4).chr(0x8a),
466	'\.e' => chr(0xc4).chr(0x97),
467	'\.E' => chr(0xc4).chr(0x96),
468	'\.g' => chr(0xc4).chr(0xa1),
469	'\.G' => chr(0xc4).chr(0xa0),
470	'\.I' => chr(0xc4).chr(0xb0),
471	'\.z' => chr(0xc5).chr(0xbc),
472	'\.Z' => chr(0xc5).chr(0xbb),
473	# macron
474	'=a' => chr(0xc4).chr(0x81),
475	'=A' => chr(0xc4).chr(0x80),
476	'=e' => chr(0xc4).chr(0x93),
477	'=E' => chr(0xc4).chr(0x92),
478	'=i' => chr(0xc4).chr(0xab),
479	'=I' => chr(0xc4).chr(0xaa),
480	'=o' => chr(0xc4).chr(0x8d),
481	'=O' => chr(0xc4).chr(0x8c),
482	'=u' => chr(0xc4).chr(0xab),
483	'=U' => chr(0xc4).chr(0xaa),
484
485	# stroke - handled specially - see below
486
487	# cedilla - handled specially
488
489	);
490
491	# these are one letter latex commands - we make sure they're not a longer
492	# command name. eg {\d} is d+stroke, so careful of \d
493	my %special_utf8_chars =
494	(
495	# caron
496	'v n' => chr(0xc5).chr(0x88),
497	'v N' => chr(0xc5).chr(0x87),
498	'v s' => chr(0xc5).chr(0xa1),
499	'v S' => chr(0xc5).chr(0xa5),
500	# cedilla
501	'c c' => chr(0xc3).chr(0xa7),
502	'c C' => chr(0xc3).chr(0x87),
503	'c g' => chr(0xc4).chr(0xa3),
504	'c G' => chr(0xc4).chr(0xa2),
505	'c k' => chr(0xc4).chr(0xb7),
506	'c K' => chr(0xc4).chr(0xb6),
507	'c l' => chr(0xc4).chr(0xbc),
508	'c L' => chr(0xc4).chr(0xbb),
509	'c n' => chr(0xc5).chr(0x86),
510	'c N' => chr(0xc5).chr(0x85),
511	'c r' => chr(0xc5).chr(0x97),
512	'c R' => chr(0xc5).chr(0x96),
513	'c s' => chr(0xc5).chr(0x9f),
514	'c S' => chr(0xc5).chr(0x9e),
515	'c t' => chr(0xc5).chr(0xa3),
516	'c T' => chr(0xc5).chr(0xa2),
517	# double acute / Hungarian accent
518	'H O' => chr(0xc5).chr(0x90),
519	'H o' => chr(0xc5).chr(0x91),
520	'H U' => chr(0xc5).chr(0xb0),
521	'H u' => chr(0xc5).chr(0xb1),
522
523	# stroke
524	'd' => chr(0xc4).chr(0x91),
525	'D' => chr(0xc4).chr(0x90),
526	'h' => chr(0xc4).chr(0xa7),
527	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
528	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
529	'l' => chr(0xc5).chr(0x82),
530	'L' => chr(0xc5).chr(0x81),
531	'o' => chr(0xc3).chr(0xb8),
532	'O' => chr(0xc3).chr(0x98),
533	't' => chr(0xc5).chr(0xa7),
534	'T' => chr(0xc5).chr(0xa6),
535	# german ss/szlig/sharp s
536	'ss' => chr(0xc3).chr(0x9f),
537	);
538
539	# convert latex-style accented characters.
540	# remove space (if any) between \ and letter to accent (eg {\' a})
541
542	$text =~ s@(\\[`'="])\s(\w)@$1$2@g;
543
544	# remove {} around a single character (eg \'{e})
545	$text =~ s@(\\[`'="\.]){(\w)}@$1$2@g;
546
547	# remove {} around a single character for special 1 letter commands -
548	# need to insert a space. Eg \v{s} -> {\v s}
549	$text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
550
551	# this is slow (go through whole hash for each substitution!) so
552	# only do if the text contains a '\' character.
553	if ($text =~ m\|\\\|) {
554	for $latex_code (keys %utf8_chars) {
555	$text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
556	}
557
558	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
559	# only do the change if immediately followed by a space, }, {, or \
560	for $latex_code (keys %special_utf8_chars) {
561	$text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g;
562	}
563	}
564	# remove latex commands
565	$text =~ s@\\noopsort{[^}]+\}@@g;
566	$text =~ s@\\\w+{([^}]*)}@$1@g; # all other commands
567
568	# remove latex groupings { } (but not \{ or \} )
569	while ($text =~ s/([^\\])[\{\}]/$1/g) {;}
570	$text =~ s/^\{//; # remove { if first char
571
572	# maths mode $...$ - this is not interpreted in any way at the moment...
573	$text =~ s@\$(.*)\$@$1@g;
574
575	# latex characters
576	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
577	$text =~ s/([^\\])~+/$1/g; # non-breaking space "~"
578	# optional break "\-"
579	if ($text =~ m/\#/) { # concat macros (bibtex)
580	# the non-macro bits have quotes around them - we just remove quotes
581	$text =~ s/[\"\#]//g;
582	}
583	# quoted { } chars
584	$text =~ s@\\{@{@g;
585	$text =~ s@\\}@}@g;
586
587	return $text;
588	}
589
590
591	sub set_OID {
592	my $self = shift (@_);
593	my ($doc_obj, $id, $segment_number) = @_;
594
595	if ( $self->{'key'} eq "default") {
596	$doc_obj->set_OID();
597	} else {
598	$doc_obj->set_OID($self->{'key'});
599	}
600	}
601
602	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3142

Download in other formats: