Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3112

Last change on this file since 3112 was 3112, checked in by jrm21, 22 years ago
minor changes to formatted values (eg if enclosed in { and } ) and added a few extra accented characters.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.9 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Date',
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	my $lines=$$textref;
122
123	# Make sure the text has exactly one entry per line
124
125	$lines =~ s/^\s(\@[^,]+,)\s\n/$1=====/; #splitting key in entry
126	$lines =~ s/([\"\}]\s,)\s\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
127	$lines =~ s/(\d+\s\,)\s\n/$1=====/g; #for the case, when we have number entry without closing "
128	$lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings,
129	#that continues for several lines
130	$lines =~ s/\s+/ /g;
131	$lines =~ s/\s=====\s/\n/g;
132
133	my @all_lines = split(/\n+/, $lines);
134
135	# Read and process each line in the bib file.
136	my ($entryname, $name, $value, $line);
137	foreach $line (@all_lines) {
138
139	# Add each line. Most lines consist of a field identifer and
140	# then data, and we simply store them, though we treat some
141	# of the fields a bit differently.
142
143	$line =~ s/\s+/ /g;
144	$text .= "$line\n";
145
146	print "Processing line = $line \n" if $verbosity>=4;
147
148	# The first line is special, it contains the reference type and OID
149	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
150	$EntryType = $1;
151	$EntryID = $2;
152	print "** $EntryType - \"$EntryID\" \n"
153	if ($verbosity >= 4);
154	$self->{'key'} = $EntryID;
155	next;
156	}
157	if ($line =~ /\@/) {
158	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
159	if ($verbosity >= 2);
160	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
161	if ($verbosity >= 2);
162	}
163
164	# otherwise, parse the metadata out of this line
165	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
166	$entryname = lc($1);
167	$value = $2;
168	$value =~ s/,?\s*$//; # remove trailing comma and space
169	if ($value =~ /^"/ && $value =~ /"$/) {
170	# remove surrounding " marks
171	$value =~ s/^"//; $value =~ s/"$//;
172	}
173	$value = &process_latex($value);
174
175	# Add this line of metadata
176	$metadata{$entryname} .= "$value\n";
177
178	}
179
180	# Add the Entry type as metadata
181	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
182
183	# Add the various field as metadata
184	foreach my $entryname (keys %metadata) {
185	next unless (defined $field{$entryname});
186	next unless (defined $metadata{$entryname});
187
188	$name = $field{$entryname};
189	$value = $metadata{$entryname};
190
191	# Add the various fields as metadata
192	my $html_value = &text_into_html($value);
193	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
194
195	# Several special operatons on metadata follow
196
197	# Add individual keywords.
198	# The full set of keywords will be added, in due course, as "Keywords".
199	# However, we also want to add them as individual "Keyword" metadata elements.
200	if ($entryname eq "keywords") {
201	my @keywordlist = split(/,/, $value);
202	foreach my $k (@keywordlist) {
203	$k = lc($k);
204	$k =~ s/\s*$//;
205	$k =~ s/^\s*//;
206	if ($k =~ /\w/) {
207	$k = &text_into_html($k);
208	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
209	}
210	}
211	}
212
213	# Add individual authors
214	# The author metadata will be stored as one "Creator" entry, but we
215	# also want to split it into several individual "Author" fields in
216	# "Lastname, Firstnames" format so we can browse it.
217	if ($entryname eq "author") { #added also comparison with editor
218
219	# und here for german language...
220	# don't use brackets in pattern, else the matched bit becomes
221	# an element in the list!
222	my @authorlist = split(/,\|\s+and\s+\|\s+und\s+/, $value);
223	foreach $a (@authorlist) {
224	$a =~ s/\s*$//;
225	$a =~ s/^\s*//;
226	# Reformat and add author name
227	next if $a=~ /^\s*$/;
228	my @words = split(/ /, $a);
229	my $lastname = pop @words;
230	my $firstname = join(" ", @words);
231
232	my $fullname = $lastname . ", " . $firstname;
233
234	# Add each name to set of Authors
235	# force utf8 pragma so that \w matches in this scope
236	use utf8;
237	if ($fullname =~ /\w+, \w+/) {
238	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
239	}
240	}
241	}
242
243	# Books and Journals are additionally marked for display purposes
244	if ($entryname eq "booktitle") {
245	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
246	} elsif ($entryname eq "journal") {
247	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
248	}
249
250	}
251
252	# Add the text in BibTex format (all fields)
253	if ($text =~ /\w/) {
254	$text = &text_into_html($text);
255	$doc_obj->add_utf8_text ($cursection, $text);
256	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
257	}
258
259	return 1;
260	}
261
262
263
264
265	# convert email addresses and URLs into links
266	sub convert_urls_into_links{
267	my ($text) = @_;
268
269	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
270	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
271
272	return $text;
273	}
274
275	# Clean up whitespace and convert \n charaters to <BR> or <P>
276	sub clean_up_whitespaces{
277	my ($text) = @_;
278
279	$text =~ s/%%%%%/<BR> <BR>/g;
280	$text =~ s/ +/ /g;
281	$text =~ s/\s*$//;
282	$text =~ s/^\s*//;
283	$text =~ s/\n/\n<BR>/g;
284	$text =~ s/<BR>\s*<BR>/<P>/g;
285
286	return $text;
287	}
288
289
290	sub convert_problem_characters_without_ampersand{
291	my ($text) = @_;
292	$text =~ s/</</g;
293	$text =~ s/>/>/g;
294
295	$text =~ s/\'\'/\"/g; #Latex -specific conversion
296	$text =~ s/\`\`/\"/g; #Latex -specific conversion
297
298
299	$text =~ s/\"/"/g;
300	$text =~ s/\'/’/g;
301	$text =~ s/\`/‘/g;
302	$text =~ s/\+/ /g;
303	$text =~ s/\(/ /g;
304	$text =~ s/\)/ /g;
305
306	$text =~ s/\\/\\\\/g;
307
308	$text =~ s/\./\\\./g;
309
310	return $text;
311	}
312
313	# Convert a text string into HTML.
314
315	# The HTML is going to be inserted into a GML file, so we have to be
316	# careful not to use symbols like ">", which occurs frequently in email
317	# messages (and use &gt instead.
318
319	# This function also turns URLs and email addresses into links, and
320	# replaces carriage returns with <BR> tags (and multiple carriage returns
321	# with <P> tags).
322
323	sub text_into_html {
324	my ($text) = @_;
325
326	# Convert problem characters into HTML symbols
327	$text =~ s/&/&/g;
328
329	$text = &convert_problem_characters_without_ampersand( $text );
330
331	# convert email addresses and URLs into links
332	$text = &convert_urls_into_links( $text );
333
334	$text = &clean_up_whitespaces( $text );
335
336	return $text;
337	}
338
339
340
341
342	# Convert accented characters, remove { }, interprete some commands....
343	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
344	sub process_latex {
345	my ($text) = @_;
346
347	# note - this is really ugly, but it works. There may be a prettier way
348	# of mapping latex accented chars to utf8, but we just brute force it here.
349	# Also, this isn't complete - not every single possible accented letter
350	# is in here yet, but most of the common ones are.
351
352	my %utf8_chars =
353	(
354	# acutes
355	'\'a' => chr(0xc3).chr(0xa1),
356	'\'c' => chr(0xc4).chr(0x87),
357	'\'e' => chr(0xc3).chr(0xa9),
358	'\'i' => chr(0xc3).chr(0xad),
359	'\'l' => chr(0xc3).chr(0xba),
360	'\'n' => chr(0xc3).chr(0x84),
361	'\'o' => chr(0xc3).chr(0xb3),
362	'\'r' => chr(0xc5).chr(0x95),
363	'\'s' => chr(0xc5).chr(0x9b),
364	'\'u' => chr(0xc3).chr(0xba),
365	'\'y' => chr(0xc3).chr(0xbd),
366	'\'z' => chr(0xc5).chr(0xba),
367	# graves
368	'`a' => chr(0xc3).chr(0xa0),
369	'`A' => chr(0xc3).chr(0x80),
370	'`e' => chr(0xc3).chr(0xa8),
371	'`E' => chr(0xc3).chr(0x88),
372	'`i' => chr(0xc3).chr(0xac),
373	'`I' => chr(0xc3).chr(0x8c),
374	'`o' => chr(0xc3).chr(0xb2),
375	'`O' => chr(0xc3).chr(0x92),
376	'`u' => chr(0xc3).chr(0xb9),
377	'`U' => chr(0xc3).chr(0x99),
378	# circumflex
379	'^a' => chr(0xc3).chr(0xa2),
380	'^A' => chr(0xc3).chr(0x82),
381	'^c' => chr(0xc4).chr(0x89),
382	'^C' => chr(0xc4).chr(0x88),
383	'^e' => chr(0xc3).chr(0xaa),
384	'^E' => chr(0xc3).chr(0x8a),
385	'^g' => chr(0xc4).chr(0x9d),
386	'^G' => chr(0xc4).chr(0x9c),
387	'^h' => chr(0xc4).chr(0xa5),
388	'^H' => chr(0xc4).chr(0xa4),
389	'^i' => chr(0xc3).chr(0xae),
390	'^I' => chr(0xc3).chr(0x8e),
391	'^j' => chr(0xc4).chr(0xb5),
392	'^J' => chr(0xc4).chr(0xb4),
393	'^o' => chr(0xc3).chr(0xb4),
394	'^O' => chr(0xc3).chr(0x94),
395	'^s' => chr(0xc5).chr(0x9d),
396	'^S' => chr(0xc5).chr(0x9c),
397	'^u' => chr(0xc3).chr(0xa2),
398	'^U' => chr(0xc3).chr(0xbb),
399	'^w' => chr(0xc5).chr(0xb5),
400	'^W' => chr(0xc5).chr(0xb4),
401	'^y' => chr(0xc5).chr(0xb7),
402	'^Y' => chr(0xc5).chr(0xb6),
403
404	# diaeresis
405	'"a' => chr(0xc3).chr(0xa4),
406	'"A' => chr(0xc3).chr(0x84),
407	'"e' => chr(0xc3).chr(0xab),
408	'"E' => chr(0xc3).chr(0x8b),
409	'"\\\\i' => chr(0xc3).chr(0xaf),
410	'"\\\\I' => chr(0xc3).chr(0x8f),
411	'"o' => chr(0xc3).chr(0xb6),
412	'"O' => chr(0xc3).chr(0x96),
413	'"u' => chr(0xc3).chr(0xbc),
414	'"U' => chr(0xc3).chr(0x9c),
415	'"y' => chr(0xc3).chr(0xbf),
416	'"Y' => chr(0xc3).chr(0xb8),
417	# tilde
418	# caron - handled specially
419	# ',s' => chr(0xc5).chr(0xa1),
420	# ',S' => chr(0xc5).chr(0xa5),
421	# breve
422	# double acute
423	# ring
424	# dot
425	'\.c' => chr(0xc4).chr(0x8b),
426	'\.C' => chr(0xc4).chr(0x8a),
427	'\.e' => chr(0xc4).chr(0x97),
428	'\.E' => chr(0xc4).chr(0x96),
429	'\.g' => chr(0xc4).chr(0xa1),
430	'\.G' => chr(0xc4).chr(0xa0),
431	'\.I' => chr(0xc4).chr(0xb0),
432	'\.z' => chr(0xc5).chr(0xbc),
433	'\.Z' => chr(0xc5).chr(0xbb),
434	# macron
435	'=a' => chr(0xc4).chr(0x81),
436	'=A' => chr(0xc4).chr(0x80),
437	'=e' => chr(0xc4).chr(0x93),
438	'=E' => chr(0xc4).chr(0x92),
439	'=i' => chr(0xc4).chr(0xab),
440	'=I' => chr(0xc4).chr(0xaa),
441	'=o' => chr(0xc4).chr(0x8d),
442	'=O' => chr(0xc4).chr(0x8c),
443	'=u' => chr(0xc4).chr(0xab),
444	'=U' => chr(0xc4).chr(0xaa),
445
446	# stroke - handled specially - see below
447
448	# cedilla - handled specially
449
450	);
451
452	# these are one letter latex commands - we make sure they're not a longer
453	# command name. eg {\d} is d+stroke, so careful of \d
454	my %special_utf8_chars =
455	(
456	# caron
457	'v n' => chr(0xc5).chr(0x88),
458	'v N' => chr(0xc5).chr(0x87),
459	'v s' => chr(0xc5).chr(0xa1),
460	'v S' => chr(0xc5).chr(0xa5),
461	# cedilla
462	'c c' => chr(0xc3).chr(0xa7),
463	'c C' => chr(0xc3).chr(0x87),
464	'c g' => chr(0xc4).chr(0xa3),
465	'c G' => chr(0xc4).chr(0xa2),
466	'c k' => chr(0xc4).chr(0xb7),
467	'c K' => chr(0xc4).chr(0xb6),
468	'c l' => chr(0xc4).chr(0xbc),
469	'c L' => chr(0xc4).chr(0xbb),
470	'c n' => chr(0xc5).chr(0x86),
471	'c N' => chr(0xc5).chr(0x85),
472	'c r' => chr(0xc5).chr(0x97),
473	'c R' => chr(0xc5).chr(0x96),
474	'c s' => chr(0xc5).chr(0x9f),
475	'c S' => chr(0xc5).chr(0x9e),
476	'c t' => chr(0xc5).chr(0xa3),
477	'c T' => chr(0xc5).chr(0xa2),
478	# double acute / Hungarian accent
479	'H O' => chr(0xc5).chr(0x90),
480	'H o' => chr(0xc5).chr(0x91),
481	'H U' => chr(0xc5).chr(0xb0),
482	'H u' => chr(0xc5).chr(0xb1),
483
484	# stroke
485	'd' => chr(0xc4).chr(0x91),
486	'D' => chr(0xc4).chr(0x90),
487	'h' => chr(0xc4).chr(0xa7),
488	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
489	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
490	'l' => chr(0xc5).chr(0x82),
491	'L' => chr(0xc5).chr(0x81),
492	'o' => chr(0xc3).chr(0xb8),
493	'O' => chr(0xc3).chr(0x98),
494	't' => chr(0xc5).chr(0xa7),
495	'T' => chr(0xc5).chr(0xa6),
496	# german ss/szlig/sharp s
497	'ss' => chr(0xc3).chr(0x9f),
498	);
499
500	# convert latex-style accented characters.
501	# remove space (if any) between \ and letter to accent (eg {\' a})
502
503	$text =~ s@(\\[`'="])\s(\w)@$1$2@g;
504
505	# remove {} around a single character (eg \'{e})
506	$text =~ s@(\\[`'="\.]){(\w)}@$1$2@g;
507
508	# remove {} around a single character for special 1 letter commands -
509	# need to insert a space. Eg \v{s} -> {\v s}
510	$text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
511
512	# this is slow (go through whole hash for each substitution!) so
513	# only do if the text contains a '\' character.
514	if ($text =~ m\|\\\|) {
515	for $latex_code (keys %utf8_chars) {
516	$text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
517	}
518
519	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
520	# only do the change if immediately followed by a space, }, {, or \
521	for $latex_code (keys %special_utf8_chars) {
522	$text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
523	}
524	}
525
526	# remove latex commands
527	$text =~ s@\\\w+{([^}]*)}@$1@g;
528
529	# remove latex groupings { } (but not \{ or \} )
530	$text =~ s/([^\\])\{/$1/g;
531	$text =~ s/([^\\])\}/$1/g;
532	$text =~ s/^{//; # remove { if first char
533
534	# maths mode $...$ - this is not interpreted in any way at the moment...
535	$text =~ s@\$(.*)\$@$1@g;
536
537	# quoted { } chars
538	$text =~ s@\\{@{@g;
539	$text =~ s@\\}@}@g;
540
541	return $text;
542	}
543
544
545	sub set_OID {
546	my $self = shift (@_);
547	my ($doc_obj, $id, $segment_number) = @_;
548
549	if ( $self->{'key'} eq "default") {
550	$doc_obj->set_OID();
551	} else {
552	$doc_obj->set_OID($self->{'key'});
553	}
554	}
555
556	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: