Context Navigation

BibTexPlug.pm@ 3249

Last change on this file since 3249 was 3249, checked in by jrm21, 22 years ago

1) add a space when joining consecutive lines, just in case.

2) Don't use ',' to separate author names.

3) Proper name parsing: first, von, last, jr. And we modify it slightly for
the Creator metadata so it's a nice list with only one "and".

4) Proper Date metadata in the greenstone Date format yyyymmdd so that the
receptionist doesn't print out corrupted strings.

5) Don't create BibTex metadata, as it is exactly the same as the [Text].

6) Modified latex accent parsing, so it is faster - only substitute found
accents instead of old brute force of whole hash.

Property svn:keywords set to Author Date Id Revision

File size: 21.7 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	# Make sure the text has exactly one entry per line -
122	# append line to previous if it doesn't start with " <key> = "
123
124	my @input_lines=split('\n', $$textref);
125	my @all_lines;
126	my $entry_line=shift @input_lines;
127	foreach my $input_line (@input_lines) {
128	if ($input_line =~ m/^\s\w+\s=\s*/) {
129	# this is a new key
130	push(@all_lines, $entry_line);
131	$entry_line=$input_line;
132	} else {
133	# this is a continuation of previous line
134	$entry_line .= " " . $input_line;
135	}
136
137	}
138	# add final line, removing trailing '}'
139	$entry_line =~ s/\}\s*$//;
140	push(@all_lines, $entry_line);
141	push(@all_lines, "}");
142
143
144
145	# Read and process each line in the bib file.
146	my ($entryname, $name, $value, $line);
147	foreach $line (@all_lines) {
148
149	# Add each line. Most lines consist of a field identifer and
150	# then data, and we simply store them, though we treat some
151	# of the fields a bit differently.
152
153	$line =~ s/\s+/ /g;
154	$text .= "$line\n";
155
156	print "Processing line = $line \n" if $verbosity>=4;
157
158	# The first line is special, it contains the reference type and OID
159	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
160	$EntryType = $1;
161	$EntryID = $2;
162	print "** $EntryType - \"$EntryID\" \n"
163	if ($verbosity >= 4);
164	$self->{'key'} = $EntryID;
165	next;
166	}
167	if ($line =~ /\@/) {
168	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169	if ($verbosity >= 2);
170	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171	if ($verbosity >= 2);
172	}
173
174	# otherwise, parse the metadata out of this line
175	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
176	$entryname = lc($1);
177	$value = $2;
178	$value =~ s/,?\s*$//; # remove trailing comma and space
179	if ($value =~ /^"/ && $value =~ /"$/) {
180	# remove surrounding " marks
181	$value =~ s/^"//; $value =~ s/"$//;
182	}
183	$value = &process_latex($value);
184
185	# Add this line of metadata
186	$metadata{$entryname} .= "$value\n";
187
188	}
189
190	# Add the Entry type as metadata
191	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193	# Add the various field as metadata
194	foreach my $entryname (keys %metadata) {
195	next unless (defined $field{$entryname});
196	next unless (defined $metadata{$entryname});
197
198	$name = $field{$entryname};
199	$value = $metadata{$entryname};
200
201	if ($name =~ /^Month/) {
202	$value=expand_month($value);
203	}
204
205	# Several special operatons on metadata follow
206
207	# Add individual keywords.
208	# The full set of keywords will be added, in due course, as "Keywords".
209	# However, we also want to add them as individual "Keyword" metadata elements.
210	if ($entryname eq "keywords") {
211	my @keywordlist = split(/,/, $value);
212	foreach my $k (@keywordlist) {
213	$k = lc($k);
214	$k =~ s/\s*$//;
215	$k =~ s/^\s*//;
216	if ($k =~ /\w/) {
217	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
218	}
219	}
220	}
221
222	# Add individual authors
223	# The author metadata will be stored as one "Creator" entry, but we
224	# also want to split it into several individual "Author" fields in
225	# "Lastname, Firstnames" format so we can browse it.
226	if ($entryname eq "author") { #added also comparison with editor
227
228	# take care of "et al."...
229	$value =~ s/(\s+et\.?\s+al\.?)\s*$//;
230	my $etal=$1;
231	$etal="" if (!defined ($etal));
232	# und here for german language...
233	# don't use brackets in pattern, else the matched bit becomes
234	# an element in the list!
235	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
236	my @formattedlist = ();
237	foreach $a (@authorlist) {
238	$a =~ s/\s*$//;
239	$a =~ s/^\s*//;
240	# Reformat and add author name
241	next if $a=~ /^\s*$/;
242
243	# names are "First von Last", "von Last, First"
244	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
245	my $first="";
246	my $vonlast="";
247	my $jr="";
248
249	if ($a =~ /,/) {
250	my @parts=split(/,\s*/, $a);
251	$first = pop @parts;
252	if (scalar(@parts) == 2) {
253	$jr = pop @parts;
254	}
255	$vonlast=shift @parts;
256	if (scalar(@parts) > 0) {
257	print $outhandle "BibTexPlug: couldn't parse name $a\n";
258	# but we continue anyway...
259	}
260	} else { # First von Last
261	my @words = split(/ /, $a);
262	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
263	$first .= " " . shift (@words);
264	}
265	$first =~ s/^\s//;
266	$vonlast = join (' ', @words); # whatever's left...
267	}
268	my $von="";
269	my $last="";
270	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
271	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
272	$von = $1;
273	if (!defined ($von)) {
274	# some non-English names do start with lowercase
275	# eg "Marie desJardins". Also we can get typos...
276	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
277	$von="";
278	if ($vonlast =~ /^[a-z]+$/) {
279	# if it's all lowercase, uppercase 1st.
280	$vonlast =~ s/^(.)/\u$1/;
281
282	}
283	}
284	$von =~ s/\s*$//;
285	$last=$vonlast;
286	} else {
287	$last=$vonlast;
288	}
289	my $wholename="$first $von $last $jr";
290	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
291	push (@formattedlist, $wholename);
292	my $fullname = "$last";
293	$fullname .= " $jr" if ($jr);
294	$fullname .= ", $first";
295	$fullname .= " $von" if ($von);
296
297	# Add each name to set of Authors
298	# force utf8 pragma so that \w matches in this scope
299	use utf8;
300	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
301	}
302
303	# Only want at most one "and" in the Creator field
304	if (scalar(@formattedlist) > 2) {
305	my $lastauthor=pop @formattedlist;
306	$value=join(', ', @formattedlist);
307	$value.=" and $lastauthor";
308	} else { # 1 or 2 authors...
309	$value=join(" and ",@formattedlist);
310	}
311	$value.=$etal; # if there was "et al."
312	}
313
314	# Books and Journals are additionally marked for display purposes
315	if ($entryname eq "booktitle") {
316	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
317	} elsif ($entryname eq "journal") {
318	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
319	}
320
321
322	# Add the various fields as metadata
323	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
324	}
325
326	# Add Date (yyyymmdd) metadata
327	if (defined ($metadata{'year'}) ) {
328	my $date=$metadata{'year'};
329	chomp $date;
330	my $month=$metadata{'month'};
331	if (defined($month)) {
332	# month is currently 3 letter code or a range...
333	$month = expand_month($month);
334	# take the first month found... might not find one!
335	$month =~ m/_textmonth(\d\d)_/;
336	$month = $1;
337	}
338	if (!defined($month)) {
339	$month="00";
340	}
341	$date .= "${month}00";
342	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
343	}
344
345	# # Add the text in BibTex format (all fields)
346	if ($text =~ /\w/) {
347
348	$text =~ s@&@&@g;
349	$text =~ s@<@<@g;
350	$text =~ s@>@>@g;
351	$text =~ s@\n@<br/>\n@g;
352	$text =~ s@\\@\\\\@g;
353
354	# Not really required...
355	# $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
356	$doc_obj->add_utf8_text ($cursection, $text);
357	}
358
359	return 1;
360	}
361
362
363
364
365	# convert email addresses and URLs into links
366	sub convert_urls_into_links{
367	my ($text) = @_;
368
369	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
370	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
371
372	return $text;
373	}
374
375	# Clean up whitespace and convert \n charaters to <BR> or <P>
376	sub clean_up_whitespaces{
377	my ($text) = @_;
378
379	$text =~ s/%%%%%/<BR> <BR>/g;
380	$text =~ s/ +/ /g;
381	$text =~ s/\s*$//;
382	$text =~ s/^\s*//;
383	$text =~ s/\n/\n<BR>/g;
384	$text =~ s/<BR>\s*<BR>/<P>/g;
385
386	return $text;
387	}
388
389
390	sub convert_problem_characters_without_ampersand{
391	my ($text) = @_;
392	$text =~ s/</</g;
393	$text =~ s/>/>/g;
394
395	$text =~ s/\'\'/\"/g; #Latex -specific conversion
396	$text =~ s/\`\`/\"/g; #Latex -specific conversion
397
398	$text =~ s/\"/"/g;
399	$text =~ s/\'/’/g;
400	$text =~ s/\`/‘/g;
401
402	# $text =~ s/\+/ /g;
403	# $text =~ s/\(/ /g;
404	# $text =~ s/\)/ /g;
405
406	$text =~ s/\\/\\\\/g;
407
408	# $text =~ s/\./\\\./g;
409
410	return $text;
411	}
412
413	# Convert a text string into HTML.
414
415	# The HTML is going to be inserted into a GML file, so we have to be
416	# careful not to use symbols like ">", which occurs frequently in email
417	# messages (and use &gt instead.
418
419	# This function also turns URLs and email addresses into links, and
420	# replaces carriage returns with <BR> tags (and multiple carriage returns
421	# with <P> tags).
422
423	sub text_into_html {
424	my ($text) = @_;
425
426	# Convert problem characters into HTML symbols
427	$text =~ s/&/&/g;
428
429	$text = &convert_problem_characters_without_ampersand( $text );
430
431	# convert email addresses and URLs into links
432	$text = &convert_urls_into_links( $text );
433
434	$text = &clean_up_whitespaces( $text );
435
436	return $text;
437	}
438
439
440	sub expand_month {
441	my $text=shift;
442
443	# bibtex style files expand abbreviations for months.
444	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
445	$text =~ s/jan/_textmonth01_/g;
446	$text =~ s/feb/_textmonth02_/g;
447	$text =~ s/mar/_textmonth03_/g;
448	$text =~ s/apr/_textmonth04_/g;
449	$text =~ s/may/_textmonth05_/g;
450	$text =~ s/jun/_textmonth06_/g;
451	$text =~ s/jul/_textmonth07_/g;
452	$text =~ s/aug/_textmonth08_/g;
453	$text =~ s/sep/_textmonth09_/g;
454	$text =~ s/oct/_textmonth10_/g;
455	$text =~ s/nov/_textmonth11_/g;
456	$text =~ s/dec/_textmonth12_/g;
457
458	return $text;
459	}
460
461
462	# Convert accented characters, remove { }, interprete some commands....
463	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
464	sub process_latex {
465	my ($text) = @_;
466
467	# note - this is really ugly, but it works. There may be a prettier way
468	# of mapping latex accented chars to utf8, but we just brute force it here.
469	# Also, this isn't complete - not every single possible accented letter
470	# is in here yet, but most of the common ones are.
471
472	my %utf8_chars =
473	(
474	# acutes
475	'\'a' => chr(0xc3).chr(0xa1),
476	'\'c' => chr(0xc4).chr(0x87),
477	'\'e' => chr(0xc3).chr(0xa9),
478	'\'i' => chr(0xc3).chr(0xad),
479	'\'l' => chr(0xc3).chr(0xba),
480	'\'n' => chr(0xc3).chr(0x84),
481	'\'o' => chr(0xc3).chr(0xb3),
482	'\'r' => chr(0xc5).chr(0x95),
483	'\'s' => chr(0xc5).chr(0x9b),
484	'\'u' => chr(0xc3).chr(0xba),
485	'\'y' => chr(0xc3).chr(0xbd),
486	'\'z' => chr(0xc5).chr(0xba),
487	# graves
488	'`a' => chr(0xc3).chr(0xa0),
489	'`A' => chr(0xc3).chr(0x80),
490	'`e' => chr(0xc3).chr(0xa8),
491	'`E' => chr(0xc3).chr(0x88),
492	'`i' => chr(0xc3).chr(0xac),
493	'`I' => chr(0xc3).chr(0x8c),
494	'`o' => chr(0xc3).chr(0xb2),
495	'`O' => chr(0xc3).chr(0x92),
496	'`u' => chr(0xc3).chr(0xb9),
497	'`U' => chr(0xc3).chr(0x99),
498	# circumflex
499	'^a' => chr(0xc3).chr(0xa2),
500	'^A' => chr(0xc3).chr(0x82),
501	'^c' => chr(0xc4).chr(0x89),
502	'^C' => chr(0xc4).chr(0x88),
503	'^e' => chr(0xc3).chr(0xaa),
504	'^E' => chr(0xc3).chr(0x8a),
505	'^g' => chr(0xc4).chr(0x9d),
506	'^G' => chr(0xc4).chr(0x9c),
507	'^h' => chr(0xc4).chr(0xa5),
508	'^H' => chr(0xc4).chr(0xa4),
509	'^i' => chr(0xc3).chr(0xae),
510	'^I' => chr(0xc3).chr(0x8e),
511	'^j' => chr(0xc4).chr(0xb5),
512	'^J' => chr(0xc4).chr(0xb4),
513	'^o' => chr(0xc3).chr(0xb4),
514	'^O' => chr(0xc3).chr(0x94),
515	'^s' => chr(0xc5).chr(0x9d),
516	'^S' => chr(0xc5).chr(0x9c),
517	'^u' => chr(0xc3).chr(0xa2),
518	'^U' => chr(0xc3).chr(0xbb),
519	'^w' => chr(0xc5).chr(0xb5),
520	'^W' => chr(0xc5).chr(0xb4),
521	'^y' => chr(0xc5).chr(0xb7),
522	'^Y' => chr(0xc5).chr(0xb6),
523
524	# diaeresis
525	'"a' => chr(0xc3).chr(0xa4),
526	'"A' => chr(0xc3).chr(0x84),
527	'"e' => chr(0xc3).chr(0xab),
528	'"E' => chr(0xc3).chr(0x8b),
529	'"\\\\i' => chr(0xc3).chr(0xaf),
530	'"\\\\I' => chr(0xc3).chr(0x8f),
531	'"o' => chr(0xc3).chr(0xb6),
532	'"O' => chr(0xc3).chr(0x96),
533	'"u' => chr(0xc3).chr(0xbc),
534	'"U' => chr(0xc3).chr(0x9c),
535	'"y' => chr(0xc3).chr(0xbf),
536	'"Y' => chr(0xc3).chr(0xb8),
537	# tilde
538	'~A' => chr(0xc3).chr(0x83),
539	'~N' => chr(0xc3).chr(0x91),
540	'~O' => chr(0xc3).chr(0x95),
541	'~a' => chr(0xc3).chr(0xa3),
542	'~n' => chr(0xc3).chr(0xb1),
543	'~o' => chr(0xc3).chr(0xb5),
544	# caron - handled specially
545	# double acute
546	# ring
547	# dot
548	'.c' => chr(0xc4).chr(0x8b),
549	'.C' => chr(0xc4).chr(0x8a),
550	'.e' => chr(0xc4).chr(0x97),
551	'.E' => chr(0xc4).chr(0x96),
552	'.g' => chr(0xc4).chr(0xa1),
553	'.G' => chr(0xc4).chr(0xa0),
554	'.I' => chr(0xc4).chr(0xb0),
555	'.z' => chr(0xc5).chr(0xbc),
556	'.Z' => chr(0xc5).chr(0xbb),
557	# macron
558	'=a' => chr(0xc4).chr(0x81),
559	'=A' => chr(0xc4).chr(0x80),
560	'=e' => chr(0xc4).chr(0x93),
561	'=E' => chr(0xc4).chr(0x92),
562	'=i' => chr(0xc4).chr(0xab),
563	'=I' => chr(0xc4).chr(0xaa),
564	'=o' => chr(0xc4).chr(0x8d),
565	'=O' => chr(0xc4).chr(0x8c),
566	'=u' => chr(0xc4).chr(0xab),
567	'=U' => chr(0xc4).chr(0xaa),
568
569	# stroke - handled specially - see below
570
571	# cedilla - handled specially
572	);
573
574	# these are one letter latex commands - we make sure they're not a longer
575	# command name. eg {\d} is d+stroke, so careful of \d
576	my %special_utf8_chars =
577	(
578	# breve
579	'u g' => chr(0xc4).chr(0x9f),
580	'u G' => chr(0xc4).chr(0x9e),
581	'u i' => chr(0xc4).chr(0xad),
582	'u I' => chr(0xc4).chr(0xac),
583	'u o' => chr(0xc5).chr(0x8f),
584	'u O' => chr(0xc5).chr(0x8e),
585	'u u' => chr(0xc5).chr(0xad),
586	'u U' => chr(0xc5).chr(0xac),
587	# caron
588	'v c' => chr(0xc4).chr(0x8d),
589	'v C' => chr(0xc4).chr(0x8c),
590	'v n' => chr(0xc5).chr(0x88),
591	'v N' => chr(0xc5).chr(0x87),
592	'v s' => chr(0xc5).chr(0xa1),
593	'v S' => chr(0xc5).chr(0xa5),
594	'v z' => chr(0xc5).chr(0xbe),
595	'v Z' => chr(0xc5).chr(0xbd),
596	# cedilla
597	'c c' => chr(0xc3).chr(0xa7),
598	'c C' => chr(0xc3).chr(0x87),
599	'c g' => chr(0xc4).chr(0xa3),
600	'c G' => chr(0xc4).chr(0xa2),
601	'c k' => chr(0xc4).chr(0xb7),
602	'c K' => chr(0xc4).chr(0xb6),
603	'c l' => chr(0xc4).chr(0xbc),
604	'c L' => chr(0xc4).chr(0xbb),
605	'c n' => chr(0xc5).chr(0x86),
606	'c N' => chr(0xc5).chr(0x85),
607	'c r' => chr(0xc5).chr(0x97),
608	'c R' => chr(0xc5).chr(0x96),
609	'c s' => chr(0xc5).chr(0x9f),
610	'c S' => chr(0xc5).chr(0x9e),
611	'c t' => chr(0xc5).chr(0xa3),
612	'c T' => chr(0xc5).chr(0xa2),
613	# double acute / Hungarian accent
614	'H O' => chr(0xc5).chr(0x90),
615	'H o' => chr(0xc5).chr(0x91),
616	'H U' => chr(0xc5).chr(0xb0),
617	'H u' => chr(0xc5).chr(0xb1),
618
619	# stroke
620	'd' => chr(0xc4).chr(0x91),
621	'D' => chr(0xc4).chr(0x90),
622	'h' => chr(0xc4).chr(0xa7),
623	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
624	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
625	'l' => chr(0xc5).chr(0x82),
626	'L' => chr(0xc5).chr(0x81),
627	'o' => chr(0xc3).chr(0xb8),
628	'O' => chr(0xc3).chr(0x98),
629	't' => chr(0xc5).chr(0xa7),
630	'T' => chr(0xc5).chr(0xa6),
631	# german ss/szlig/sharp s
632	'ss' => chr(0xc3).chr(0x9f),
633	);
634
635	# convert latex-style accented characters.
636
637	# remove space (if any) between \ and letter to accent (eg {\' a})
638
639	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
640
641	# remove {} around a single character (eg \'{e})
642	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
643
644	# \, is another way of doing cedilla \c
645	$text =~ s@\\,(.)@\\c $1@g;
646
647	# remove {} around a single character for special 1 letter commands -
648	# need to insert a space. Eg \v{s} -> {\v s}
649	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
650
651	# only do if the text contains a '\' character.
652	if ($text =~ m\|\\\|) {
653	# "normal" accents - ie non-alpha latex tag
654	while ($text =~ m@\\([`'="^~\.])([\w])@) {
655	my $tex="$1$2"; my $char="$2";
656	my $replacement=$utf8_chars{$tex};
657	if (!defined($replacement)) {
658	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
659	$replacement=$char;
660	}
661	$text =~ s/\\$tex/$replacement/g;
662	}
663
664	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
665	# only do the change if immediately followed by a space, }, {, or \
666	# one letter accents ( + ss)
667	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
668	my $tex=$1;
669	my $replacement=$special_utf8_chars{$tex};
670	if (!defined($replacement)) {
671	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
672	$replacement=$tex;
673	}
674	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
675
676	}
677
678	# one letter latex accent commands that affect following letter
679	while ($text =~ m@\\([uvcH]) ([\w])@) {
680	my $tex="$1 $2"; my $char="$2";
681	my $replacement=$special_utf8_chars{$tex};
682	if (!defined($replacement)) {
683	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
684	$replacement=$char;
685	}
686	$text =~ s/\\$tex/$replacement/g;
687	}
688	}
689
690	# escape html-sensitive characters
691	$text =~ s@&@&@g;
692	$text =~ s@<@<@g;
693	$text =~ s@>@>@g;
694	$text =~ s/''/"/g; # Latex-specific
695	$text =~ s/``/"/g; # Latex-specific
696	# greenstone-specific
697	$text =~ s@\[@&\#91;@g;
698	$text =~ s@\]@&\#93;@g;
699
700	# remove latex commands
701
702	# explicitly recognised commands
703	$text =~ s@\\ldots@…@g;
704
705	# maths mode
706	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
707
708	# remove all other commands with optional arguments...
709	$text =~ s@\\\w+(\[.?\])?\s@@g;
710	# $text =~ s@\\noopsort{[^}]+\}@@g;
711	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
712
713	# remove latex groupings { } (but not \{ or \} )
714	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
715	$text =~ s/^\{//; # remove { if first char
716
717	# latex characters
718	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
719	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
720	# optional break "\-"
721	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
722	# the non-macro bits have quotes around them - we just remove quotes
723	$text =~ s/([^&])[\"\#]/$1/g;
724	}
725	# dashes. Convert (m\|n)-dash into single dash for html.
726	$text =~ s@\-\-+@\-@g;
727
728	# quoted { } chars
729	$text =~ s@\\{@{@g;
730	$text =~ s@\\}@}@g;
731
732	# finally to protect against macro language...
733	$text =~ s@\\@\\\\@g;
734
735	return $text;
736	}
737
738
739	sub process_latex_math {
740	my $text = shift;
741
742	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
743	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
744	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
745	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
746	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
747
748	# put all other command names in italics
749	$text =~ s@\\([\w]+)@<i>$1</i>@g;
750
751	return $text;
752	}
753
754	sub set_OID {
755	my $self = shift (@_);
756	my ($doc_obj, $id, $segment_number) = @_;
757
758	if ( $self->{'key'} eq "default") {
759	$doc_obj->set_OID();
760	} else {
761	$doc_obj->set_OID($self->{'key'});
762	}
763	}
764
765	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3249

Download in other formats: