Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 2901

Last change on this file since 2901 was 2901, checked in by jrm21, 22 years ago
We now interprete some latex commands in the input, mostly to do with accents.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.4 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Date',
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	my $lines=$$textref;
122
123	# Make sure the text has exactly one entry per line
124
125	$lines =~ s/^\s(\@[^,]+,)\s\n/$1=====/; #splitting key in entry
126	$lines =~ s/([\"\}]\s,)\s\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or })
127	$lines =~ s/(\d+\s\,)\s\n/$1=====/g; #for the case, when we have number entry without closing "
128	$lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings,
129	#that continues for several lines
130	$lines =~ s/\s+/ /g;
131	$lines =~ s/\s=====\s/\n/g;
132
133	my @all_lines = split(/\n+/, $lines);
134
135	# Read and process each line in the bib file.
136	my ($entryname, $name, $value, $line);
137	foreach $line (@all_lines) {
138
139	# Add each line. Most lines consist of a field identifer and
140	# then data, and we simply store them, though we treat some
141	# of the fields a bit differently.
142
143	$line =~ s/\s+/ /g;
144	$text .= "$line\n";
145
146	print "Processing line = $line \n" if $verbosity>=4;
147
148	# The first line is special, it contains the reference type and OID
149	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
150	$EntryType = $1;
151	$EntryID = $2;
152	print "** $EntryType - \"$EntryID\" \n"
153	if ($verbosity >= 4);
154	$self->{'key'} = $EntryID;
155	next;
156	}
157	if ($line =~ /\@/) {
158	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
159	if ($verbosity >= 2);
160	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
161	if ($verbosity >= 2);
162	}
163
164	# otherwise, parse the metadata out of this line
165	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
166	$entryname = lc($1);
167	$value = $2;
168	# tidy up, removing " at start and end
169	$value =~ s/^"//;
170	$value =~ s/(",)\s*$//;
171	$value = &process_latex($value);
172
173	# Add this line of metadata
174	$metadata{$entryname} .= "$value\n";
175
176	}
177
178	# Add the Entry type as metadata
179	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
180
181	# Add the various field as metadata
182	foreach my $entryname (keys %metadata) {
183	next unless (defined $field{$entryname});
184	next unless (defined $metadata{$entryname});
185
186	$name = $field{$entryname};
187	$value = $metadata{$entryname};
188
189	# Add the various fields as metadata
190	my $html_value = &text_into_html($value);
191	$doc_obj->add_utf8_metadata ($cursection, $name, $html_value);
192
193	# Several special operatons on metadata follow
194
195	# Add individual keywords.
196	# The full set of keywords will be added, in due course, as "Keywords".
197	# However, we also want to add them as individual "Keyword" metadata elements.
198	if ($entryname eq "keywords") {
199	my @keywordlist = split(/,/, $value);
200	foreach my $k (@keywordlist) {
201	$k = lc($k);
202	$k =~ s/\s*$//;
203	$k =~ s/^\s*//;
204	if ($k =~ /\w/) {
205	$k = &text_into_html($k);
206	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
207	}
208	}
209	}
210
211	# Add individual authors
212	# The author metadata will be stored as one "Creator" entry, but we
213	# also want to split it into several individual "Author" fields in
214	# "Lastename, Firstnames" format so we can browse it.
215	if ($entryname eq "author") { #added also comparison with editor
216
217	# und here for german language...
218	# don't use brackets in pattern, else the matched bit becomes
219	# an element in the list!
220	my @authorlist = split(/,\|\s+and\s+\|\s+und\s+/, $value);
221	foreach $a (@authorlist) {
222	$a =~ s/\s*$//;
223	$a =~ s/^\s*//;
224	# Reformat and add author name
225	next if $a=~ /^\s*$/;
226	my @words = split(/ /, $a);
227	my $lastname = pop @words;
228	my $firstname = join(" ", @words);
229
230	my $fullname = $lastname . ", " . $firstname;
231
232	# Add each name to set of Authors
233	# force utf8 pragma so that \w matches in this scope
234	use utf8;
235	if ($fullname =~ /\w+, \w+/) {
236	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
237	}
238	}
239	}
240
241	# Books and Journals are additionally marked for display purposes
242	if ($entryname eq "booktitle") {
243	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
244	} elsif ($entryname eq "journal") {
245	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
246	}
247
248	}
249
250	# Add the text in BibTex format (all fields)
251	if ($text =~ /\w/) {
252	$text = &text_into_html($text);
253	$doc_obj->add_utf8_text ($cursection, $text);
254	$doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
255	}
256
257	return 1;
258	}
259
260
261
262
263	# convert email addresses and URLs into links
264	sub convert_urls_into_links{
265	my ($text) = @_;
266
267	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
268	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
269
270	return $text;
271	}
272
273	# Clean up whitespace and convert \n charaters to <BR> or <P>
274	sub clean_up_whitespaces{
275	my ($text) = @_;
276
277	$text =~ s/%%%%%/<BR> <BR>/g;
278	$text =~ s/ +/ /g;
279	$text =~ s/\s*$//;
280	$text =~ s/^\s*//;
281	$text =~ s/\n/\n<BR>/g;
282	$text =~ s/<BR>\s*<BR>/<P>/g;
283
284	return $text;
285	}
286
287
288	sub convert_problem_characters_without_ampersand{
289	my ($text) = @_;
290	$text =~ s/</</g;
291	$text =~ s/>/>/g;
292
293	$text =~ s/\'\'/\"/g; #Latex -specific conversion
294	$text =~ s/\`\`/\"/g; #Latex -specific conversion
295
296
297	$text =~ s/\"/"/g;
298	$text =~ s/\'/’/g;
299	$text =~ s/\`/‘/g;
300	$text =~ s/\+/ /g;
301	$text =~ s/\(/ /g;
302	$text =~ s/\)/ /g;
303
304	$text =~ s/\\/\\\\/g;
305
306	$text =~ s/\./\\\./g;
307
308	return $text;
309	}
310
311	# Convert a text string into HTML.
312
313	# The HTML is going to be inserted into a GML file, so we have to be
314	# careful not to use symbols like ">", which occurs frequently in email
315	# messages (and use &gt instead.
316
317	# This function also turns URLs and email addresses into links, and
318	# replaces carriage returns with <BR> tags (and multiple carriage returns
319	# with <P> tags).
320
321	sub text_into_html {
322	my ($text) = @_;
323
324	# Convert problem characters into HTML symbols
325	$text =~ s/&/&/g;
326
327	$text = &convert_problem_characters_without_ampersand( $text );
328
329	# convert email addresses and URLs into links
330	$text = &convert_urls_into_links( $text );
331
332	$text = &clean_up_whitespaces( $text );
333
334	return $text;
335	}
336
337
338
339
340	# Convert accented characters, remove { }, interprete some commands....
341	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
342	sub process_latex {
343	my ($text) = @_;
344
345	# note - this is really ugly, but it works. There may be a prettier way
346	# of mapping latex accented chars to utf8, but we just brute force it here.
347	# Also, this isn't complete - not every single possible accented letter
348	# is in here yet, but most of the common ones are.
349
350	my %utf8_chars =
351	(
352	# acutes
353	'\'a' => chr(0xc3).chr(0xa1),
354	'\'c' => chr(0xc4).chr(0x87),
355	'\'e' => chr(0xc3).chr(0xa9),
356	'\'i' => chr(0xc3).chr(0xad),
357	'\'l' => chr(0xc3).chr(0xba),
358	'\'n' => chr(0xc3).chr(0x84),
359	'\'o' => chr(0xc3).chr(0xb3),
360	'\'r' => chr(0xc5).chr(0x95),
361	'\'s' => chr(0xc5).chr(0x9b),
362	'\'u' => chr(0xc3).chr(0xba),
363	'\'y' => chr(0xc3).chr(0xbd),
364	'\'z' => chr(0xc5).chr(0xba),
365	# graves
366	'`a' => chr(0xc3).chr(0xa0),
367	'`A' => chr(0xc3).chr(0x80),
368	'`e' => chr(0xc3).chr(0xa8),
369	'`E' => chr(0xc3).chr(0x88),
370	'`i' => chr(0xc3).chr(0xac),
371	'`I' => chr(0xc3).chr(0x8c),
372	'`o' => chr(0xc3).chr(0xb2),
373	'`O' => chr(0xc3).chr(0x92),
374	'`u' => chr(0xc3).chr(0xb9),
375	'`U' => chr(0xc3).chr(0x99),
376	# circumflex
377	'^a' => chr(0xc3).chr(0xa2),
378	'^A' => chr(0xc3).chr(0x82),
379	'^c' => chr(0xc4).chr(0x89),
380	'^C' => chr(0xc4).chr(0x88),
381	'^e' => chr(0xc3).chr(0xaa),
382	'^E' => chr(0xc3).chr(0x8a),
383	'^g' => chr(0xc4).chr(0x9d),
384	'^G' => chr(0xc4).chr(0x9c),
385	'^h' => chr(0xc4).chr(0xa5),
386	'^H' => chr(0xc4).chr(0xa4),
387	'^i' => chr(0xc3).chr(0xae),
388	'^I' => chr(0xc3).chr(0x8e),
389	'^j' => chr(0xc4).chr(0xb5),
390	'^J' => chr(0xc4).chr(0xb4),
391	'^o' => chr(0xc3).chr(0xb4),
392	'^O' => chr(0xc3).chr(0x94),
393	'^s' => chr(0xc5).chr(0x9d),
394	'^S' => chr(0xc5).chr(0x9c),
395	'^u' => chr(0xc3).chr(0xa2),
396	'^U' => chr(0xc3).chr(0xbb),
397	'^w' => chr(0xc5).chr(0xb5),
398	'^W' => chr(0xc5).chr(0xb4),
399	'^y' => chr(0xc5).chr(0xb7),
400	'^Y' => chr(0xc5).chr(0xb6),
401
402	# diaeresis
403	'"a' => chr(0xc3).chr(0xa4),
404	'"A' => chr(0xc3).chr(0x84),
405	'"e' => chr(0xc3).chr(0xab),
406	'"E' => chr(0xc3).chr(0x8b),
407	'"\\\\i' => chr(0xc3).chr(0xaf),
408	'"\\\\I' => chr(0xc3).chr(0x8f),
409	'"o' => chr(0xc3).chr(0xb6),
410	'"O' => chr(0xc3).chr(0x96),
411	'"u' => chr(0xc3).chr(0xbc),
412	'"U' => chr(0xc3).chr(0x9c),
413	'"y' => chr(0xc3).chr(0xbf),
414	'"Y' => chr(0xc3).chr(0xb8),
415	# tilde
416	# caron - handled specially
417	# ',s' => chr(0xc5).chr(0xa1),
418	# ',S' => chr(0xc5).chr(0xa5),
419	# breve
420	# double acute
421	# ring
422	# dot
423	# macron
424	'=a' => chr(0xc4).chr(0x81),
425	'=A' => chr(0xc4).chr(0x80),
426	'=e' => chr(0xc4).chr(0x93),
427	'=E' => chr(0xc4).chr(0x92),
428	'=i' => chr(0xc4).chr(0xab),
429	'=I' => chr(0xc4).chr(0xaa),
430	'=o' => chr(0xc4).chr(0x8d),
431	'=O' => chr(0xc4).chr(0x8c),
432	'=u' => chr(0xc4).chr(0xab),
433	'=U' => chr(0xc4).chr(0xaa),
434
435	# stroke - handled specially - see below
436
437	# cedilla - handled specially
438
439	);
440
441	# these are one letter latex commands - we make sure they're not a longer
442	# command name. eg {\d} is d+stroke, so careful of \d
443	my %special_utf8_chars =
444	(
445	# caron
446	'v n' => chr(0xc5).chr(0x88),
447	'v N' => chr(0xc5).chr(0x87),
448	'v s' => chr(0xc5).chr(0xa1),
449	'v S' => chr(0xc5).chr(0xa5),
450	# cedilla
451	'c c' => chr(0xc3).chr(0xa7),
452	'c C' => chr(0xc3).chr(0x87),
453	'c g' => chr(0xc4).chr(0xa3),
454	'c G' => chr(0xc4).chr(0xa2),
455	'c k' => chr(0xc4).chr(0xb7),
456	'c K' => chr(0xc4).chr(0xb6),
457	'c l' => chr(0xc4).chr(0xbc),
458	'c L' => chr(0xc4).chr(0xbb),
459	'c n' => chr(0xc5).chr(0x86),
460	'c N' => chr(0xc5).chr(0x85),
461	'c r' => chr(0xc5).chr(0x97),
462	'c R' => chr(0xc5).chr(0x96),
463	'c s' => chr(0xc5).chr(0x9f),
464	'c S' => chr(0xc5).chr(0x9e),
465	'c t' => chr(0xc5).chr(0xa3),
466	'c T' => chr(0xc5).chr(0xa2),
467	# double acute / Hungarian accent
468	'H O' => chr(0xc5).chr(0x90),
469	'H o' => chr(0xc5).chr(0x91),
470	'H U' => chr(0xc5).chr(0xb0),
471	'H u' => chr(0xc5).chr(0xb1),
472
473	# stroke
474	'd' => chr(0xc4).chr(0x91),
475	'D' => chr(0xc4).chr(0x90),
476	'h' => chr(0xc4).chr(0xa7),
477	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
478	'l' => chr(0xc5).chr(0x82),
479	'L' => chr(0xc5).chr(0x81),
480	'o' => chr(0xc3).chr(0xb8),
481	'O' => chr(0xc3).chr(0x98),
482	't' => chr(0xc5).chr(0xa7),
483	'T' => chr(0xc5).chr(0xa6),
484	# german ss/szlig/sharp s
485	'ss' => chr(0xc3).chr(0x9f),
486	);
487
488	# convert latex-style accented characters.
489	# remove space (if any) between \ and letter to accent (eg {\' a})
490
491	$text =~ s@(\\[`'="])\s(\w)@$1$2@g;
492
493	# remove {} around a single character (eg \'{e})
494	$text =~ s@(\\[`'="]){(\w)}@$1$2@;
495
496	# remove {} around a single character for special 1 letter commands -
497	# need to insert a space. Eg \v{s} -> {\v s}
498	$text =~ s@(\\[vcH]){(\w)}@{$1 $2}@;
499
500	# this is slow (go through whole hash for each substitution!) so
501	# only do if the text contains a '\' character.
502	if ($text =~ m\|\\\|) {
503	for $latex_code (keys %utf8_chars) {
504	$text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g;
505	}
506
507	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
508	# only do the change if immediately followed by a space, }, {, or \
509	for $latex_code (keys %special_utf8_chars) {
510	$text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g;
511	}
512	}
513
514	# remove latex groupings { } (but not \{ or \} )
515	# note - need it like this for first char match - eg {xx}{yy}
516	while ($text =~ s@([^\\]){([^}]*?[^\\])}@$1$2@g) {}
517
518	# remove latex commands
519	$text =~ s@\\\w+{(.*)}@$1@g;
520
521	# maths mode $...$ - this is not interpreted in any way at the moment...
522	$text =~ s@\$(.*)\$@$1@g;
523
524	# quoted { } chars
525	$text =~ s@\\{@{@g;
526	$text =~ s@\\}@}@g;
527
528	return $text;
529	}
530
531
532	sub set_OID {
533	my $self = shift (@_);
534	my ($doc_obj, $id, $segment_number) = @_;
535
536	if ( $self->{'key'} eq "default") {
537	$doc_obj->set_OID();
538	} else {
539	$doc_obj->set_OID($self->{'key'});
540	}
541	}
542
543	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: