Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 9582

Last change on this file since 9582 was 9582, checked in by jrm21, 19 years ago

couple of fixes:

" and others" in the author field => 'et. al.'
make whitespace optional around the = sign

use the booktitle or journal name if the entry has no 'title'

Property svn:keywords set to Author Date Id Revision

File size: 23.0 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "{BasPlug.process_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_process_exp() },
59	{ 'name' => "split_exp",
60	'desc' => "{SplitPlug.split_exp}",
61	'type' => "regexp",
62	'deft' => &get_default_split_exp(),
63	'reqd' => "no" }
64	];
65
66	my $options = { 'name' => "BibTexPlug",
67	'desc' => "{BibTexPlug.desc}",
68	'abstract' => "no",
69	'inherits' => "yes",
70	'explodes' => "yes",
71	'args' => $arguments };
72
73	# This plugin processes files with the suffix ".bib"
74	sub get_default_process_exp {
75	return q^(?i)\.bib$^;
76	}
77
78	# This plugin splits the input text at blank lines
79	sub get_default_split_exp {
80	return q^\n+(?=@)^;
81	}
82	sub new {
83	my $class = shift (@_);
84	my $self = new SplitPlug ($class, @_);
85	$self->{'plugin_type'} = "BibTexPlug";
86	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
87	my $option_list = $self->{'option_list'};
88	push( @{$option_list}, $options );
89	return bless $self, $class;
90	}
91
92
93
94	# The process function reads a single bibliographic record and stores
95	# it as a new document.
96
97	sub process {
98	my $self = shift (@_);
99	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
100	my $outhandle = $self->{'outhandle'};
101
102	$self->{'key'} = "default";
103
104	# Check that we're dealing with a valid BibTex record
105	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
106
107	# Ignore things we can't use
108	return 0 if ($$textref =~ /^\@String/);
109
110	# Report that we're processing the file
111	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
112	print $outhandle "BibTexPlug: processing $file\n"
113	if ($self->{'verbosity'}) > 1;
114
115
116	# This hash translates BibTex field names into metadata names. The
117	# BibTex names are taken from the "Local Guide to Latex" Graeme
118	# McKinstry. Metadata names are consistent with ReferPlug.
119
120	# The author metadata will be stored as one "Creator" entry, but will
121	# also be split up into several individual "Author" fields.
122
123	my %field = (
124	'address', 'PublisherAddress',
125	'author', 'Creator',
126
127	'booktitle', 'Booktitle',
128	'chapter', 'Chapter',
129	'edition', 'Edition',
130	'editor', 'Editor',
131	'institution', 'Publisher',
132	'journal', 'Journal',
133	'month', 'Month',
134	'number', 'Number',
135	'pages', 'Pages',
136	'publisher', 'Publisher',
137	'school', 'Publisher',
138	'title', 'Title',
139	'volume', 'Volume',
140	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
141
142	'keywords', 'Keywords',
143	'abstract', 'Abstract',
144	'copyright', 'Copyright',
145	'note', 'Note',
146	);
147
148	# Metadata fields
149	my %metadata;
150	my ($EntryType, $Creator, $Keywords, $text);
151
152	my $verbosity = $self->{'verbosity'};
153	$verbosity = 0 unless $verbosity;
154
155	# Make sure the text has exactly one entry per line -
156	# append line to previous if it doesn't start with " <key> = "
157
158	my @input_lines=split(/\r?\n/, $$textref);
159	my @all_lines;
160	my $entry_line=shift @input_lines;
161	foreach my $input_line (@input_lines) {
162	if ($input_line =~ m/^\s\w+\s=\s*/) {
163	# this is a new key
164	push(@all_lines, $entry_line);
165	$entry_line=$input_line;
166	} else {
167	# this is a continuation of previous line
168	$entry_line .= " " . $input_line;
169	}
170
171	}
172	# add final line, removing trailing '}'
173	$entry_line =~ s/\}\s*$//;
174	push(@all_lines, $entry_line);
175	push(@all_lines, "}");
176
177	# Read and process each line in the bib file.
178	my ($entryname, $name, $value, $line);
179	foreach $line (@all_lines) {
180
181	# Add each line. Most lines consist of a field identifer and
182	# then data, and we simply store them, though we treat some
183	# of the fields a bit differently.
184
185	$line =~ s/\s+/ /g;
186	$text .= "$line\n";
187
188
189	print "Processing line = $line \n" if $verbosity>=4;
190
191	# The first line is special, it contains the reference type and OID
192	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
193	$EntryType = $1;
194	my $EntryID = (defined $2) ? $2 : "default";
195	print "** $EntryType - \"$EntryID\" \n"
196	if ($verbosity >= 4);
197
198	next;
199	}
200	if ($line =~ /\@/) {
201	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
202	if ($verbosity >= 2);
203	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
204	if ($verbosity >= 2);
205	}
206
207	# otherwise, parse the metadata out of this line
208	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
209	$entryname = lc($1);
210	$value = $2;
211	$value =~ s/,?\s*$//; # remove trailing comma and space
212	if ($value =~ /^"/ && $value =~ /"$/) {
213	# remove surrounding " marks
214	$value =~ s/^"//; $value =~ s/"$//;
215	}
216	$value = &process_latex($value);
217
218	# Add this line of metadata
219	$metadata{$entryname} .= "$value";
220
221	}
222
223	# Add the Entry type as metadata
224	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
225
226	#Add the fileformat as metadata
227	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
228
229
230	# Add the various field as metadata
231	foreach my $entryname (keys %metadata) {
232	next unless (defined $field{$entryname});
233	next unless (defined $metadata{$entryname});
234
235	$name = $field{$entryname};
236	$value = $metadata{$entryname};
237
238	if ($name =~ /^Month/) {
239	$value=expand_month($value);
240	}
241
242	# Several special operatons on metadata follow
243
244	# Add individual keywords.
245	# The full set of keywords will be added, in due course, as "Keywords".
246	# However, we also want to add them as individual "Keyword" metadata elements.
247	if ($entryname eq "keywords") {
248	my @keywordlist = split(/,/, $value);
249	foreach my $k (@keywordlist) {
250	$k = lc($k);
251	$k =~ s/\s*$//;
252	$k =~ s/^\s*//;
253	if ($k =~ /\w/) {
254	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
255	}
256	}
257	}
258
259	# Add individual authors
260	# The author metadata will be stored as one "Creator" entry, but we
261	# also want to split it into several individual "Author" fields in
262	# "Lastname, Firstnames" format so we can browse it.
263	if ($entryname eq "author") { #added also comparison with editor
264
265	# take care of "et al."...
266	my $etal='';
267	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
268	$etal=' <em>et. al.</em>';
269	}
270	# und here for german language...
271	# don't use brackets in pattern, else the matched bit becomes
272	# an element in the list!
273	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
274	my @formattedlist = ();
275	foreach $a (@authorlist) {
276	$a =~ s/\s*$//;
277	$a =~ s/^\s*//;
278	# Reformat and add author name
279	next if $a=~ /^\s*$/;
280
281	# names are "First von Last", "von Last, First"
282	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
283	my $first="";
284	my $vonlast="";
285	my $jr="";
286
287	if ($a =~ /,/) {
288	my @parts=split(/,\s*/, $a);
289	$first = pop @parts;
290	if (scalar(@parts) == 2) {
291	$jr = pop @parts;
292	}
293	$vonlast=shift @parts;
294	if (scalar(@parts) > 0) {
295	print $outhandle "BibTexPlug: couldn't parse name $a\n";
296	# but we continue anyway...
297	}
298	} else { # First von Last
299	my @words = split(/ /, $a);
300	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
301	$first .= " " . shift (@words);
302	}
303	$first =~ s/^\s//;
304	$vonlast = join (' ', @words); # whatever's left...
305	}
306	my $von="";
307	my $last="";
308	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
309	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
310	$von = $1;
311	if (!defined ($von)) {
312	# some non-English names do start with lowercase
313	# eg "Marie desJardins". Also we can get typos...
314	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
315	$von="";
316	if ($vonlast =~ /^[a-z]+$/) {
317	# if it's all lowercase, uppercase 1st.
318	$vonlast =~ s/^(.)/\u$1/;
319
320	}
321	}
322	$von =~ s/\s*$//;
323	$last=$vonlast;
324	} else {
325	$last=$vonlast;
326	}
327	my $wholename="$first $von $last $jr";
328	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
329	$wholename =~ s/ $//;
330	push (@formattedlist, $wholename);
331	my $fullname = "$last";
332	$fullname .= " $jr" if ($jr);
333	$fullname .= ", $first";
334	$fullname .= " $von" if ($von);
335
336	# Add each name to set of Authors
337	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
338	}
339
340	# Only want at most one "and" in the Creator field
341	if (scalar(@formattedlist) > 2) {
342	my $lastauthor=pop @formattedlist;
343	$value=join(', ', @formattedlist);
344	$value.=" and $lastauthor";
345	} else { # 1 or 2 authors...
346	$value=join(" and ",@formattedlist);
347	}
348	$value.=$etal; # if there was "et al."
349	}
350
351	# Books and Journals are additionally marked for display purposes
352	if ($entryname eq "booktitle") {
353	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
354	} elsif ($entryname eq "journal") {
355	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
356	}
357
358	# Add the various fields as metadata
359	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
360	}
361
362	# for books and journals...
363	if (!exists $metadata{'title'}) {
364	my $name=$field{'title'}; # get Greenstone metadata name
365	my $value;
366	if (exists $metadata{'booktitle'}) {
367	$value=$metadata{'booktitle'};
368	} elsif (exists $metadata{'journal'}) {
369	$value=$metadata{'journal'};
370	}
371	if ($value) {
372	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
373	}
374	}
375
376	# Add Date (yyyymmdd) metadata
377	if (defined ($metadata{'year'}) ) {
378	my $date=$metadata{'year'};
379	chomp $date;
380	my $month=$metadata{'month'};
381	if (defined($month)) {
382	# month is currently 3 letter code or a range...
383	$month = expand_month($month);
384	# take the first month found... might not find one!
385	$month =~ m/_textmonth(\d\d)_/;
386	$month = $1;
387	}
388	if (!defined($month)) {
389	$month="00";
390	}
391	$date .= "${month}00";
392	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
393	}
394
395	# # Add the text in BibTex format (all fields)
396	if ($text =~ /\w/) {
397
398	$text =~ s@&@&@g;
399	$text =~ s@<@<@g;
400	$text =~ s@>@>@g;
401	$text =~ s@\n@<br/>\n@g;
402	$text =~ s@\\@\\\\@g;
403
404	$doc_obj->add_utf8_text ($cursection, $text);
405	}
406
407	return 1;
408	}
409
410
411
412
413	# convert email addresses and URLs into links
414	sub convert_urls_into_links{
415	my ($text) = @_;
416
417	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
418	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
419
420	return $text;
421	}
422
423	# Clean up whitespace and convert \n charaters to <BR> or <P>
424	sub clean_up_whitespaces{
425	my ($text) = @_;
426
427	$text =~ s/%%%%%/<BR> <BR>/g;
428	$text =~ s/ +/ /g;
429	$text =~ s/\s*$//;
430	$text =~ s/^\s*//;
431	$text =~ s/\n/\n<BR>/g;
432	$text =~ s/<BR>\s*<BR>/<P>/g;
433
434	return $text;
435	}
436
437
438	sub convert_problem_characters_without_ampersand{
439	my ($text) = @_;
440	$text =~ s/</</g;
441	$text =~ s/>/>/g;
442
443	$text =~ s/\'\'/\"/g; #Latex -specific conversion
444	$text =~ s/\`\`/\"/g; #Latex -specific conversion
445
446	$text =~ s/\"/"/g;
447	$text =~ s/\'/’/g;
448	$text =~ s/\`/‘/g;
449
450	# $text =~ s/\+/ /g;
451	# $text =~ s/\(/ /g;
452	# $text =~ s/\)/ /g;
453
454	$text =~ s/\\/\\\\/g;
455
456	# $text =~ s/\./\\\./g;
457
458	return $text;
459	}
460
461	# Convert a text string into HTML.
462
463	# The HTML is going to be inserted into a GML file, so we have to be
464	# careful not to use symbols like ">", which occurs frequently in email
465	# messages (and use &gt instead.
466
467	# This function also turns URLs and email addresses into links, and
468	# replaces carriage returns with <BR> tags (and multiple carriage returns
469	# with <P> tags).
470
471	sub text_into_html {
472	my ($text) = @_;
473
474	# Convert problem characters into HTML symbols
475	$text =~ s/&/&/g;
476
477	$text = &convert_problem_characters_without_ampersand( $text );
478
479	# convert email addresses and URLs into links
480	$text = &convert_urls_into_links( $text );
481
482	$text = &clean_up_whitespaces( $text );
483
484	return $text;
485	}
486
487
488	sub expand_month {
489	my $text=shift;
490
491	# bibtex style files expand abbreviations for months.
492	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
493	$text =~ s/jan/_textmonth01_/g;
494	$text =~ s/feb/_textmonth02_/g;
495	$text =~ s/mar/_textmonth03_/g;
496	$text =~ s/apr/_textmonth04_/g;
497	$text =~ s/may/_textmonth05_/g;
498	$text =~ s/jun/_textmonth06_/g;
499	$text =~ s/jul/_textmonth07_/g;
500	$text =~ s/aug/_textmonth08_/g;
501	$text =~ s/sep/_textmonth09_/g;
502	$text =~ s/oct/_textmonth10_/g;
503	$text =~ s/nov/_textmonth11_/g;
504	$text =~ s/dec/_textmonth12_/g;
505
506	return $text;
507	}
508
509
510	# Convert accented characters, remove { }, interprete some commands....
511	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
512	sub process_latex {
513	my ($text) = @_;
514
515	# note - this is really ugly, but it works. There may be a prettier way
516	# of mapping latex accented chars to utf8, but we just brute force it here.
517	# Also, this isn't complete - not every single possible accented letter
518	# is in here yet, but most of the common ones are.
519
520	my %utf8_chars =
521	(
522	# acutes
523	'\'a' => chr(0xc3).chr(0xa1),
524	'\'c' => chr(0xc4).chr(0x87),
525	'\'e' => chr(0xc3).chr(0xa9),
526	'\'i' => chr(0xc3).chr(0xad),
527	'\'l' => chr(0xc3).chr(0xba),
528	'\'n' => chr(0xc3).chr(0x84),
529	'\'o' => chr(0xc3).chr(0xb3),
530	'\'r' => chr(0xc5).chr(0x95),
531	'\'s' => chr(0xc5).chr(0x9b),
532	'\'u' => chr(0xc3).chr(0xba),
533	'\'y' => chr(0xc3).chr(0xbd),
534	'\'z' => chr(0xc5).chr(0xba),
535	# graves
536	'`a' => chr(0xc3).chr(0xa0),
537	'`A' => chr(0xc3).chr(0x80),
538	'`e' => chr(0xc3).chr(0xa8),
539	'`E' => chr(0xc3).chr(0x88),
540	'`i' => chr(0xc3).chr(0xac),
541	'`I' => chr(0xc3).chr(0x8c),
542	'`o' => chr(0xc3).chr(0xb2),
543	'`O' => chr(0xc3).chr(0x92),
544	'`u' => chr(0xc3).chr(0xb9),
545	'`U' => chr(0xc3).chr(0x99),
546	# circumflex
547	'^a' => chr(0xc3).chr(0xa2),
548	'^A' => chr(0xc3).chr(0x82),
549	'^c' => chr(0xc4).chr(0x89),
550	'^C' => chr(0xc4).chr(0x88),
551	'^e' => chr(0xc3).chr(0xaa),
552	'^E' => chr(0xc3).chr(0x8a),
553	'^g' => chr(0xc4).chr(0x9d),
554	'^G' => chr(0xc4).chr(0x9c),
555	'^h' => chr(0xc4).chr(0xa5),
556	'^H' => chr(0xc4).chr(0xa4),
557	'^i' => chr(0xc3).chr(0xae),
558	'^I' => chr(0xc3).chr(0x8e),
559	'^j' => chr(0xc4).chr(0xb5),
560	'^J' => chr(0xc4).chr(0xb4),
561	'^o' => chr(0xc3).chr(0xb4),
562	'^O' => chr(0xc3).chr(0x94),
563	'^s' => chr(0xc5).chr(0x9d),
564	'^S' => chr(0xc5).chr(0x9c),
565	'^u' => chr(0xc3).chr(0xa2),
566	'^U' => chr(0xc3).chr(0xbb),
567	'^w' => chr(0xc5).chr(0xb5),
568	'^W' => chr(0xc5).chr(0xb4),
569	'^y' => chr(0xc5).chr(0xb7),
570	'^Y' => chr(0xc5).chr(0xb6),
571
572	# diaeresis
573	'"a' => chr(0xc3).chr(0xa4),
574	'"A' => chr(0xc3).chr(0x84),
575	'"e' => chr(0xc3).chr(0xab),
576	'"E' => chr(0xc3).chr(0x8b),
577	'"\\\\i' => chr(0xc3).chr(0xaf),
578	'"\\\\I' => chr(0xc3).chr(0x8f),
579	'"o' => chr(0xc3).chr(0xb6),
580	'"O' => chr(0xc3).chr(0x96),
581	'"u' => chr(0xc3).chr(0xbc),
582	'"U' => chr(0xc3).chr(0x9c),
583	'"y' => chr(0xc3).chr(0xbf),
584	'"Y' => chr(0xc3).chr(0xb8),
585	# tilde
586	'~A' => chr(0xc3).chr(0x83),
587	'~N' => chr(0xc3).chr(0x91),
588	'~O' => chr(0xc3).chr(0x95),
589	'~a' => chr(0xc3).chr(0xa3),
590	'~n' => chr(0xc3).chr(0xb1),
591	'~o' => chr(0xc3).chr(0xb5),
592	# caron - handled specially
593	# double acute
594	# ring
595	# dot
596	'.c' => chr(0xc4).chr(0x8b),
597	'.C' => chr(0xc4).chr(0x8a),
598	'.e' => chr(0xc4).chr(0x97),
599	'.E' => chr(0xc4).chr(0x96),
600	'.g' => chr(0xc4).chr(0xa1),
601	'.G' => chr(0xc4).chr(0xa0),
602	'.I' => chr(0xc4).chr(0xb0),
603	'.z' => chr(0xc5).chr(0xbc),
604	'.Z' => chr(0xc5).chr(0xbb),
605	# macron
606	'=a' => chr(0xc4).chr(0x81),
607	'=A' => chr(0xc4).chr(0x80),
608	'=e' => chr(0xc4).chr(0x93),
609	'=E' => chr(0xc4).chr(0x92),
610	'=i' => chr(0xc4).chr(0xab),
611	'=I' => chr(0xc4).chr(0xaa),
612	'=o' => chr(0xc4).chr(0x8d),
613	'=O' => chr(0xc4).chr(0x8c),
614	'=u' => chr(0xc4).chr(0xab),
615	'=U' => chr(0xc4).chr(0xaa),
616
617	# stroke - handled specially - see below
618
619	# cedilla - handled specially
620	);
621
622	# these are one letter latex commands - we make sure they're not a longer
623	# command name. eg {\d} is d+stroke, so careful of \d
624	my %special_utf8_chars =
625	(
626	# breve
627	'u g' => chr(0xc4).chr(0x9f),
628	'u G' => chr(0xc4).chr(0x9e),
629	'u i' => chr(0xc4).chr(0xad),
630	'u I' => chr(0xc4).chr(0xac),
631	'u o' => chr(0xc5).chr(0x8f),
632	'u O' => chr(0xc5).chr(0x8e),
633	'u u' => chr(0xc5).chr(0xad),
634	'u U' => chr(0xc5).chr(0xac),
635	# caron
636	'v c' => chr(0xc4).chr(0x8d),
637	'v C' => chr(0xc4).chr(0x8c),
638	'v n' => chr(0xc5).chr(0x88),
639	'v N' => chr(0xc5).chr(0x87),
640	'v s' => chr(0xc5).chr(0xa1),
641	'v S' => chr(0xc5).chr(0xa5),
642	'v z' => chr(0xc5).chr(0xbe),
643	'v Z' => chr(0xc5).chr(0xbd),
644	# cedilla
645	'c c' => chr(0xc3).chr(0xa7),
646	'c C' => chr(0xc3).chr(0x87),
647	'c g' => chr(0xc4).chr(0xa3),
648	'c G' => chr(0xc4).chr(0xa2),
649	'c k' => chr(0xc4).chr(0xb7),
650	'c K' => chr(0xc4).chr(0xb6),
651	'c l' => chr(0xc4).chr(0xbc),
652	'c L' => chr(0xc4).chr(0xbb),
653	'c n' => chr(0xc5).chr(0x86),
654	'c N' => chr(0xc5).chr(0x85),
655	'c r' => chr(0xc5).chr(0x97),
656	'c R' => chr(0xc5).chr(0x96),
657	'c s' => chr(0xc5).chr(0x9f),
658	'c S' => chr(0xc5).chr(0x9e),
659	'c t' => chr(0xc5).chr(0xa3),
660	'c T' => chr(0xc5).chr(0xa2),
661	# double acute / Hungarian accent
662	'H O' => chr(0xc5).chr(0x90),
663	'H o' => chr(0xc5).chr(0x91),
664	'H U' => chr(0xc5).chr(0xb0),
665	'H u' => chr(0xc5).chr(0xb1),
666
667	# stroke
668	'd' => chr(0xc4).chr(0x91),
669	'D' => chr(0xc4).chr(0x90),
670	'h' => chr(0xc4).chr(0xa7),
671	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
672	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
673	'l' => chr(0xc5).chr(0x82),
674	'L' => chr(0xc5).chr(0x81),
675	'o' => chr(0xc3).chr(0xb8),
676	'O' => chr(0xc3).chr(0x98),
677	't' => chr(0xc5).chr(0xa7),
678	'T' => chr(0xc5).chr(0xa6),
679	# german ss/szlig/sharp s
680	'ss' => chr(0xc3).chr(0x9f),
681	);
682
683	# convert latex-style accented characters.
684
685	# remove space (if any) between \ and letter to accent (eg {\' a})
686
687	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
688
689	# remove {} around a single character (eg \'{e})
690	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
691
692	# \, is another way of doing cedilla \c
693	$text =~ s@\\,(.)@\\c $1@g;
694
695	# remove {} around a single character for special 1 letter commands -
696	# need to insert a space. Eg \v{s} -> {\v s}
697	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
698
699	# only do if the text contains a '\' character.
700	if ($text =~ m\|\\\|) {
701	# "normal" accents - ie non-alpha latex tag
702	while ($text =~ m@\\([`'="^~\.])([\w])@) {
703	my $tex="$1$2"; my $char="$2";
704	my $replacement=$utf8_chars{$tex};
705	if (!defined($replacement)) {
706	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
707	$replacement=$char;
708	}
709	$text =~ s/\\\Q$tex/$replacement/g;
710	}
711
712	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
713	# only do the change if immediately followed by a space, }, {, or \
714	# one letter accents ( + ss)
715	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
716	my $tex=$1;
717	my $replacement=$special_utf8_chars{$tex};
718	if (!defined($replacement)) {
719	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
720	$replacement=$tex;
721	}
722	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
723
724	}
725
726	# one letter latex accent commands that affect following letter
727	while ($text =~ m@\\([uvcH]) ([\w])@) {
728	my $tex="$1 $2"; my $char="$2";
729	my $replacement=$special_utf8_chars{$tex};
730	if (!defined($replacement)) {
731	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
732	$replacement=$char;
733	}
734	$text =~ s/\\$tex/$replacement/g;
735	}
736	}
737
738	# escape html-sensitive characters
739	$text =~ s@&@&@g;
740	$text =~ s@<@<@g;
741	$text =~ s@>@>@g;
742	$text =~ s/''/"/g; # Latex-specific
743	$text =~ s/``/"/g; # Latex-specific
744	# greenstone-specific
745	$text =~ s@\[@&\#91;@g;
746	$text =~ s@\]@&\#93;@g;
747
748	# remove latex commands
749
750	# explicitly recognised commands
751	$text =~ s@\\ldots@…@g;
752
753	# maths mode
754	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
755
756	# remove all other commands with optional arguments...
757	$text =~ s@\\\w+(\[.?\])?\s@@g;
758	# $text =~ s@\\noopsort{[^}]+\}@@g;
759	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
760
761	# remove latex groupings { } (but not \{ or \} )
762	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
763	$text =~ s/^\{//; # remove { if first char
764
765	# latex characters
766	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
767	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
768	# optional break "\-"
769	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
770	# the non-macro bits have quotes around them - we just remove quotes
771	$text =~ s/([^&])[\"\#]/$1/g;
772	}
773	# dashes. Convert (m\|n)-dash into single dash for html.
774	$text =~ s@\-\-+@\-@g;
775
776	# quoted { } chars
777	$text =~ s@\\{@{@g;
778	$text =~ s@\\}@}@g;
779
780	# finally to protect against macro language...
781	$text =~ s@\\@\\\\@g;
782
783	return $text;
784	}
785
786
787	sub process_latex_math {
788	my $text = shift;
789
790	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
791	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
792	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
793	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
794	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
795
796	# put all other command names in italics
797	$text =~ s@\\([\w]+)@<i>$1</i>@g;
798
799	return $text;
800	}
801
802	sub set_OID {
803	my $self = shift (@_);
804	my ($doc_obj, $id, $segment_number) = @_;
805
806	if ( $self->{'key'} eq "default") {
807	$doc_obj->set_OID("$id\_$segment_number");
808	} else {
809	$doc_obj->set_OID($self->{'key'});
810	}
811	}
812
813	1;
814
815

Note: See TracBrowser for help on using the repository browser.

Download in other formats: