Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 23.3 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47	use strict;
48	no strict 'refs'; # allow filehandles to be variables and viceversa
49
50	# BibTexPlug is a sub-class of BasPlug.
51	sub BEGIN {
52	@BibTexPlug::ISA = ('SplitPlug');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasPlug.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() },
61	{ 'name' => "split_exp",
62	'desc' => "{SplitPlug.split_exp}",
63	'type' => "regexp",
64	'deft' => &get_default_split_exp(),
65	'reqd' => "no" }
66	];
67
68	my $options = { 'name' => "BibTexPlug",
69	'desc' => "{BibTexPlug.desc}",
70	'abstract' => "no",
71	'inherits' => "yes",
72	'explodes' => "yes",
73	'args' => $arguments };
74
75	# This plugin processes files with the suffix ".bib"
76	sub get_default_process_exp {
77	return q^(?i)\.bib$^;
78	}
79
80	# This plugin splits the input text at blank lines
81	sub get_default_split_exp {
82	return q^\n+(?=@)^;
83	}
84	sub new {
85	my ($class) = shift (@_);
86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87	push(@$pluginlist, $class);
88
89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
93
94	return bless $self, $class;
95	}
96
97
98
99	# The process function reads a single bibliographic record and stores
100	# it as a new document.
101
102	sub process {
103	my $self = shift (@_);
104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105	my $outhandle = $self->{'outhandle'};
106
107	my $cursection = $doc_obj->get_top_section();
108	$self->{'key'} = "default";
109
110	# Check that we're dealing with a valid BibTex record
111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113	# Ignore things we can't use
114	return 0 if ($$textref =~ /^\@String/);
115
116	# Report that we're processing the file
117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118	print $outhandle "BibTexPlug: processing $file\n"
119	if ($self->{'verbosity'}) > 1;
120
121
122	# This hash translates BibTex field names into metadata names. The
123	# BibTex names are taken from the "Local Guide to Latex" Graeme
124	# McKinstry. Metadata names are consistent with ReferPlug.
125
126	# The author metadata will be stored as one "Creator" entry, but will
127	# also be split up into several individual "Author" fields.
128
129	my %field = (
130	'address', 'PublisherAddress',
131	'author', 'Creator',
132
133	'booktitle', 'Booktitle',
134	'chapter', 'Chapter',
135	'edition', 'Edition',
136	'editor', 'Editor',
137	'institution', 'Publisher',
138	'journal', 'Journal',
139	'month', 'Month',
140	'number', 'Number',
141	'pages', 'Pages',
142	'publisher', 'Publisher',
143	'school', 'Publisher',
144	'title', 'Title',
145	'volume', 'Volume',
146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148	'keywords', 'Keywords',
149	'abstract', 'Abstract',
150	'copyright', 'Copyright',
151	'note', 'Note',
152	);
153
154	# Metadata fields
155	my %metadata;
156	my ($EntryType, $Creator, $Keywords, $text);
157
158	my $verbosity = $self->{'verbosity'};
159	$verbosity = 0 unless $verbosity;
160
161	# Make sure the text has exactly one entry per line -
162	# append line to previous if it doesn't start with " <key> = "
163
164	my @input_lines=split(/\r?\n/, $$textref);
165	my @all_lines;
166	my $entry_line=shift @input_lines;
167	foreach my $input_line (@input_lines) {
168	if ($input_line =~ m/^\s\w+\s=\s*/) {
169	# this is a new key
170	push(@all_lines, $entry_line);
171	$entry_line=$input_line;
172	} else {
173	# this is a continuation of previous line
174	$entry_line .= " " . $input_line;
175	}
176
177	}
178	# add final line, removing trailing '}'
179	$entry_line =~ s/\}\s*$//;
180	push(@all_lines, $entry_line);
181	push(@all_lines, "}");
182
183	# Read and process each line in the bib file.
184	my ($entryname, $name, $value, $line);
185	foreach $line (@all_lines) {
186
187	# Add each line. Most lines consist of a field identifer and
188	# then data, and we simply store them, though we treat some
189	# of the fields a bit differently.
190
191	$line =~ s/\s+/ /g;
192	$text .= "$line\n";
193
194
195	print "Processing line = $line \n" if $verbosity>=4;
196
197	# The first line is special, it contains the reference type and OID
198	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
199	$EntryType = $1;
200	my $EntryID = (defined $2) ? $2 : "default";
201	print "** $EntryType - \"$EntryID\" \n"
202	if ($verbosity >= 4);
203
204	next;
205	}
206	if ($line =~ /\@/) {
207	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
208	if ($verbosity >= 2);
209	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
210	if ($verbosity >= 2);
211	}
212
213	# otherwise, parse the metadata out of this line
214	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
215	$entryname = lc($1);
216	$value = $2;
217	$value =~ s/,?\s*$//; # remove trailing comma and space
218	if ($value =~ /^"/ && $value =~ /"$/) {
219	# remove surrounding " marks
220	$value =~ s/^"//; $value =~ s/"$//;
221	}
222	$value = &process_latex($value);
223
224	# Add this line of metadata
225	$metadata{$entryname} .= "$value";
226
227	}
228
229	# Add the Entry type as metadata
230	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
231
232	#Add the fileformat as metadata
233	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
234
235
236	# Add the various field as metadata
237	foreach my $entryname (keys %metadata) {
238	next unless (defined $field{$entryname});
239	next unless (defined $metadata{$entryname});
240
241	$name = $field{$entryname};
242	$value = $metadata{$entryname};
243
244	if ($name =~ /^Month/) {
245	$value=expand_month($value);
246	}
247
248	# Several special operatons on metadata follow
249
250	# Add individual keywords.
251	# The full set of keywords will be added, in due course, as "Keywords".
252	# However, we also want to add them as individual "Keyword" metadata elements.
253	if ($entryname eq "keywords") {
254	my @keywordlist = split(/,/, $value);
255	foreach my $k (@keywordlist) {
256	$k = lc($k);
257	$k =~ s/\s*$//;
258	$k =~ s/^\s*//;
259	if ($k =~ /\w/) {
260	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
261	}
262	}
263	}
264
265	# Add individual authors
266	# The author metadata will be stored as one "Creator" entry, but we
267	# also want to split it into several individual "Author" fields in
268	# "Lastname, Firstnames" format so we can browse it.
269	if ($entryname eq "author") { #added also comparison with editor
270
271	# take care of "et al."...
272	my $etal='';
273	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
274	$etal=' <em>et. al.</em>';
275	}
276	# und here for german language...
277	# don't use brackets in pattern, else the matched bit becomes
278	# an element in the list!
279	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
280	my @formattedlist = ();
281	foreach $a (@authorlist) {
282	$a =~ s/\s*$//;
283	$a =~ s/^\s*//;
284	# Reformat and add author name
285	next if $a=~ /^\s*$/;
286
287	# names are "First von Last", "von Last, First"
288	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
289	my $first="";
290	my $vonlast="";
291	my $jr="";
292
293	if ($a =~ /,/) {
294	my @parts=split(/,\s*/, $a);
295	$first = pop @parts;
296	if (scalar(@parts) == 2) {
297	$jr = pop @parts;
298	}
299	$vonlast=shift @parts;
300	if (scalar(@parts) > 0) {
301	print $outhandle "BibTexPlug: couldn't parse name $a\n";
302	# but we continue anyway...
303	}
304	} else { # First von Last
305	my @words = split(/ /, $a);
306	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
307	$first .= " " . shift (@words);
308	}
309	$first =~ s/^\s//;
310	$vonlast = join (' ', @words); # whatever's left...
311	}
312	my $von="";
313	my $last="";
314	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
315	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
316	$von = $1;
317	if (!defined ($von)) {
318	# some non-English names do start with lowercase
319	# eg "Marie desJardins". Also we can get typos...
320	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
321	$von="";
322	if ($vonlast =~ /^[a-z]+$/) {
323	# if it's all lowercase, uppercase 1st.
324	$vonlast =~ s/^(.)/\u$1/;
325
326	}
327	}
328	$von =~ s/\s*$//;
329	$last=$vonlast;
330	} else {
331	$last=$vonlast;
332	}
333	my $wholename="$first $von $last $jr";
334	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
335	$wholename =~ s/ $//;
336	push (@formattedlist, $wholename);
337	my $fullname = "$last";
338	$fullname .= " $jr" if ($jr);
339	$fullname .= ", $first";
340	$fullname .= " $von" if ($von);
341
342	# Add each name to set of Authors
343	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
344	}
345
346	# Only want at most one "and" in the Creator field
347	if (scalar(@formattedlist) > 2) {
348	my $lastauthor=pop @formattedlist;
349	$value=join(', ', @formattedlist);
350	$value.=" and $lastauthor";
351	} else { # 1 or 2 authors...
352	$value=join(" and ",@formattedlist);
353	}
354	$value.=$etal; # if there was "et al."
355	}
356
357	# Books and Journals are additionally marked for display purposes
358	if ($entryname eq "booktitle") {
359	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
360	} elsif ($entryname eq "journal") {
361	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
362	}
363
364	# Add the various fields as metadata
365	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
366	}
367
368	# for books and journals...
369	if (!exists $metadata{'title'}) {
370	my $name=$field{'title'}; # get Greenstone metadata name
371	my $value;
372	if (exists $metadata{'booktitle'}) {
373	$value=$metadata{'booktitle'};
374	} elsif (exists $metadata{'journal'}) {
375	$value=$metadata{'journal'};
376	}
377	if ($value) {
378	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
379	}
380	}
381
382	# Add Date (yyyymmdd) metadata
383	if (defined ($metadata{'year'}) ) {
384	my $date=$metadata{'year'};
385	chomp $date;
386	my $month=$metadata{'month'};
387	if (defined($month)) {
388	# month is currently 3 letter code or a range...
389	$month = expand_month($month);
390	# take the first month found... might not find one!
391	$month =~ m/_textmonth(\d\d)_/;
392	$month = $1;
393	}
394	if (!defined($month)) {
395	$month="00";
396	}
397	$date .= "${month}00";
398	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
399	}
400
401	# # Add the text in BibTex format (all fields)
402	if ($text =~ /\w/) {
403
404	$text =~ s@&@&@g;
405	$text =~ s@<@<@g;
406	$text =~ s@>@>@g;
407	$text =~ s@\n@<br/>\n@g;
408	$text =~ s@\\@\\\\@g;
409
410	$doc_obj->add_utf8_text ($cursection, $text);
411	}
412
413	return 1;
414	}
415
416
417
418
419	# convert email addresses and URLs into links
420	sub convert_urls_into_links{
421	my ($text) = @_;
422
423	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
424	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
425
426	return $text;
427	}
428
429	# Clean up whitespace and convert \n charaters to <BR> or <P>
430	sub clean_up_whitespaces{
431	my ($text) = @_;
432
433	$text =~ s/%%%%%/<BR> <BR>/g;
434	$text =~ s/ +/ /g;
435	$text =~ s/\s*$//;
436	$text =~ s/^\s*//;
437	$text =~ s/\n/\n<BR>/g;
438	$text =~ s/<BR>\s*<BR>/<P>/g;
439
440	return $text;
441	}
442
443
444	sub convert_problem_characters_without_ampersand{
445	my ($text) = @_;
446	$text =~ s/</</g;
447	$text =~ s/>/>/g;
448
449	$text =~ s/\'\'/\"/g; #Latex -specific conversion
450	$text =~ s/\`\`/\"/g; #Latex -specific conversion
451
452	$text =~ s/\"/"/g;
453	$text =~ s/\'/’/g;
454	$text =~ s/\`/‘/g;
455
456	# $text =~ s/\+/ /g;
457	# $text =~ s/\(/ /g;
458	# $text =~ s/\)/ /g;
459
460	$text =~ s/\\/\\\\/g;
461
462	# $text =~ s/\./\\\./g;
463
464	return $text;
465	}
466
467	# Convert a text string into HTML.
468
469	# The HTML is going to be inserted into a GML file, so we have to be
470	# careful not to use symbols like ">", which occurs frequently in email
471	# messages (and use &gt instead.
472
473	# This function also turns URLs and email addresses into links, and
474	# replaces carriage returns with <BR> tags (and multiple carriage returns
475	# with <P> tags).
476
477	sub text_into_html {
478	my ($text) = @_;
479
480	# Convert problem characters into HTML symbols
481	$text =~ s/&/&/g;
482
483	$text = &convert_problem_characters_without_ampersand( $text );
484
485	# convert email addresses and URLs into links
486	$text = &convert_urls_into_links( $text );
487
488	$text = &clean_up_whitespaces( $text );
489
490	return $text;
491	}
492
493
494	sub expand_month {
495	my $text=shift;
496
497	# bibtex style files expand abbreviations for months.
498	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
499	$text =~ s/jan/_textmonth01_/g;
500	$text =~ s/feb/_textmonth02_/g;
501	$text =~ s/mar/_textmonth03_/g;
502	$text =~ s/apr/_textmonth04_/g;
503	$text =~ s/may/_textmonth05_/g;
504	$text =~ s/jun/_textmonth06_/g;
505	$text =~ s/jul/_textmonth07_/g;
506	$text =~ s/aug/_textmonth08_/g;
507	$text =~ s/sep/_textmonth09_/g;
508	$text =~ s/oct/_textmonth10_/g;
509	$text =~ s/nov/_textmonth11_/g;
510	$text =~ s/dec/_textmonth12_/g;
511
512	return $text;
513	}
514
515
516	# Convert accented characters, remove { }, interprete some commands....
517	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
518	sub process_latex {
519	my ($text) = @_;
520
521	# note - this is really ugly, but it works. There may be a prettier way
522	# of mapping latex accented chars to utf8, but we just brute force it here.
523	# Also, this isn't complete - not every single possible accented letter
524	# is in here yet, but most of the common ones are.
525
526	my %utf8_chars =
527	(
528	# acutes
529	'\'a' => chr(0xc3).chr(0xa1),
530	'\'c' => chr(0xc4).chr(0x87),
531	'\'e' => chr(0xc3).chr(0xa9),
532	'\'i' => chr(0xc3).chr(0xad),
533	'\'l' => chr(0xc3).chr(0xba),
534	'\'n' => chr(0xc3).chr(0x84),
535	'\'o' => chr(0xc3).chr(0xb3),
536	'\'r' => chr(0xc5).chr(0x95),
537	'\'s' => chr(0xc5).chr(0x9b),
538	'\'u' => chr(0xc3).chr(0xba),
539	'\'y' => chr(0xc3).chr(0xbd),
540	'\'z' => chr(0xc5).chr(0xba),
541	# graves
542	'`a' => chr(0xc3).chr(0xa0),
543	'`A' => chr(0xc3).chr(0x80),
544	'`e' => chr(0xc3).chr(0xa8),
545	'`E' => chr(0xc3).chr(0x88),
546	'`i' => chr(0xc3).chr(0xac),
547	'`I' => chr(0xc3).chr(0x8c),
548	'`o' => chr(0xc3).chr(0xb2),
549	'`O' => chr(0xc3).chr(0x92),
550	'`u' => chr(0xc3).chr(0xb9),
551	'`U' => chr(0xc3).chr(0x99),
552	# circumflex
553	'^a' => chr(0xc3).chr(0xa2),
554	'^A' => chr(0xc3).chr(0x82),
555	'^c' => chr(0xc4).chr(0x89),
556	'^C' => chr(0xc4).chr(0x88),
557	'^e' => chr(0xc3).chr(0xaa),
558	'^E' => chr(0xc3).chr(0x8a),
559	'^g' => chr(0xc4).chr(0x9d),
560	'^G' => chr(0xc4).chr(0x9c),
561	'^h' => chr(0xc4).chr(0xa5),
562	'^H' => chr(0xc4).chr(0xa4),
563	'^i' => chr(0xc3).chr(0xae),
564	'^I' => chr(0xc3).chr(0x8e),
565	'^j' => chr(0xc4).chr(0xb5),
566	'^J' => chr(0xc4).chr(0xb4),
567	'^o' => chr(0xc3).chr(0xb4),
568	'^O' => chr(0xc3).chr(0x94),
569	'^s' => chr(0xc5).chr(0x9d),
570	'^S' => chr(0xc5).chr(0x9c),
571	'^u' => chr(0xc3).chr(0xa2),
572	'^U' => chr(0xc3).chr(0xbb),
573	'^w' => chr(0xc5).chr(0xb5),
574	'^W' => chr(0xc5).chr(0xb4),
575	'^y' => chr(0xc5).chr(0xb7),
576	'^Y' => chr(0xc5).chr(0xb6),
577
578	# diaeresis
579	'"a' => chr(0xc3).chr(0xa4),
580	'"A' => chr(0xc3).chr(0x84),
581	'"e' => chr(0xc3).chr(0xab),
582	'"E' => chr(0xc3).chr(0x8b),
583	'"\\\\i' => chr(0xc3).chr(0xaf),
584	'"\\\\I' => chr(0xc3).chr(0x8f),
585	'"o' => chr(0xc3).chr(0xb6),
586	'"O' => chr(0xc3).chr(0x96),
587	'"u' => chr(0xc3).chr(0xbc),
588	'"U' => chr(0xc3).chr(0x9c),
589	'"y' => chr(0xc3).chr(0xbf),
590	'"Y' => chr(0xc3).chr(0xb8),
591	# tilde
592	'~A' => chr(0xc3).chr(0x83),
593	'~N' => chr(0xc3).chr(0x91),
594	'~O' => chr(0xc3).chr(0x95),
595	'~a' => chr(0xc3).chr(0xa3),
596	'~n' => chr(0xc3).chr(0xb1),
597	'~o' => chr(0xc3).chr(0xb5),
598	# caron - handled specially
599	# double acute
600	# ring
601	# dot
602	'.c' => chr(0xc4).chr(0x8b),
603	'.C' => chr(0xc4).chr(0x8a),
604	'.e' => chr(0xc4).chr(0x97),
605	'.E' => chr(0xc4).chr(0x96),
606	'.g' => chr(0xc4).chr(0xa1),
607	'.G' => chr(0xc4).chr(0xa0),
608	'.I' => chr(0xc4).chr(0xb0),
609	'.z' => chr(0xc5).chr(0xbc),
610	'.Z' => chr(0xc5).chr(0xbb),
611	# macron
612	'=a' => chr(0xc4).chr(0x81),
613	'=A' => chr(0xc4).chr(0x80),
614	'=e' => chr(0xc4).chr(0x93),
615	'=E' => chr(0xc4).chr(0x92),
616	'=i' => chr(0xc4).chr(0xab),
617	'=I' => chr(0xc4).chr(0xaa),
618	'=o' => chr(0xc4).chr(0x8d),
619	'=O' => chr(0xc4).chr(0x8c),
620	'=u' => chr(0xc4).chr(0xab),
621	'=U' => chr(0xc4).chr(0xaa),
622
623	# stroke - handled specially - see below
624
625	# cedilla - handled specially
626	);
627
628	# these are one letter latex commands - we make sure they're not a longer
629	# command name. eg {\d} is d+stroke, so careful of \d
630	my %special_utf8_chars =
631	(
632	# breve
633	'u g' => chr(0xc4).chr(0x9f),
634	'u G' => chr(0xc4).chr(0x9e),
635	'u i' => chr(0xc4).chr(0xad),
636	'u I' => chr(0xc4).chr(0xac),
637	'u o' => chr(0xc5).chr(0x8f),
638	'u O' => chr(0xc5).chr(0x8e),
639	'u u' => chr(0xc5).chr(0xad),
640	'u U' => chr(0xc5).chr(0xac),
641	# caron
642	'v c' => chr(0xc4).chr(0x8d),
643	'v C' => chr(0xc4).chr(0x8c),
644	'v n' => chr(0xc5).chr(0x88),
645	'v N' => chr(0xc5).chr(0x87),
646	'v s' => chr(0xc5).chr(0xa1),
647	'v S' => chr(0xc5).chr(0xa5),
648	'v z' => chr(0xc5).chr(0xbe),
649	'v Z' => chr(0xc5).chr(0xbd),
650	# cedilla
651	'c c' => chr(0xc3).chr(0xa7),
652	'c C' => chr(0xc3).chr(0x87),
653	'c g' => chr(0xc4).chr(0xa3),
654	'c G' => chr(0xc4).chr(0xa2),
655	'c k' => chr(0xc4).chr(0xb7),
656	'c K' => chr(0xc4).chr(0xb6),
657	'c l' => chr(0xc4).chr(0xbc),
658	'c L' => chr(0xc4).chr(0xbb),
659	'c n' => chr(0xc5).chr(0x86),
660	'c N' => chr(0xc5).chr(0x85),
661	'c r' => chr(0xc5).chr(0x97),
662	'c R' => chr(0xc5).chr(0x96),
663	'c s' => chr(0xc5).chr(0x9f),
664	'c S' => chr(0xc5).chr(0x9e),
665	'c t' => chr(0xc5).chr(0xa3),
666	'c T' => chr(0xc5).chr(0xa2),
667	# double acute / Hungarian accent
668	'H O' => chr(0xc5).chr(0x90),
669	'H o' => chr(0xc5).chr(0x91),
670	'H U' => chr(0xc5).chr(0xb0),
671	'H u' => chr(0xc5).chr(0xb1),
672
673	# stroke
674	'd' => chr(0xc4).chr(0x91),
675	'D' => chr(0xc4).chr(0x90),
676	'h' => chr(0xc4).chr(0xa7),
677	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
678	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
679	'l' => chr(0xc5).chr(0x82),
680	'L' => chr(0xc5).chr(0x81),
681	'o' => chr(0xc3).chr(0xb8),
682	'O' => chr(0xc3).chr(0x98),
683	't' => chr(0xc5).chr(0xa7),
684	'T' => chr(0xc5).chr(0xa6),
685	# german ss/szlig/sharp s
686	'ss' => chr(0xc3).chr(0x9f),
687	);
688
689	# convert latex-style accented characters.
690
691	# remove space (if any) between \ and letter to accent (eg {\' a})
692
693	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
694
695	# remove {} around a single character (eg \'{e})
696	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
697
698	# \, is another way of doing cedilla \c
699	$text =~ s@\\,(.)@\\c $1@g;
700
701	# remove {} around a single character for special 1 letter commands -
702	# need to insert a space. Eg \v{s} -> {\v s}
703	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
704
705	# only do if the text contains a '\' character.
706	if ($text =~ m\|\\\|) {
707	# "normal" accents - ie non-alpha latex tag
708	while ($text =~ m@\\([`'="^~\.])([\w])@) {
709	my $tex="$1$2"; my $char="$2";
710	my $replacement=$utf8_chars{$tex};
711	if (!defined($replacement)) {
712	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
713	$replacement=$char;
714	}
715	$text =~ s/\\\Q$tex/$replacement/g;
716	}
717
718	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
719	# only do the change if immediately followed by a space, }, {, or \
720	# one letter accents ( + ss)
721	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
722	my $tex=$1;
723	my $replacement=$special_utf8_chars{$tex};
724	if (!defined($replacement)) {
725	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
726	$replacement=$tex;
727	}
728	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
729
730	}
731
732	# one letter latex accent commands that affect following letter
733	while ($text =~ m@\\([uvcH]) ([\w])@) {
734	my $tex="$1 $2"; my $char="$2";
735	my $replacement=$special_utf8_chars{$tex};
736	if (!defined($replacement)) {
737	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
738	$replacement=$char;
739	}
740	$text =~ s/\\$tex/$replacement/g;
741	}
742	}
743
744	# escape html-sensitive characters
745	$text =~ s@&@&@g;
746	$text =~ s@<@<@g;
747	$text =~ s@>@>@g;
748	$text =~ s/''/"/g; # Latex-specific
749	$text =~ s/``/"/g; # Latex-specific
750	# greenstone-specific
751	$text =~ s@\[@&\#91;@g;
752	$text =~ s@\]@&\#93;@g;
753
754	# remove latex commands
755
756	# explicitly recognised commands
757	$text =~ s@\\ldots@…@g;
758
759	# maths mode
760	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
761
762	# remove all other commands with optional arguments...
763	$text =~ s@\\\w+(\[.?\])?\s@@g;
764	# $text =~ s@\\noopsort{[^}]+\}@@g;
765	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
766
767	# remove latex groupings { } (but not \{ or \} )
768	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
769	$text =~ s/^\{//; # remove { if first char
770
771	# latex characters
772	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
773	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
774	# optional break "\-"
775	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
776	# the non-macro bits have quotes around them - we just remove quotes
777	$text =~ s/([^&])[\"\#]/$1/g;
778	}
779	# dashes. Convert (m\|n)-dash into single dash for html.
780	$text =~ s@\-\-+@\-@g;
781
782	# quoted { } chars
783	$text =~ s@\\{@{@g;
784	$text =~ s@\\}@}@g;
785
786	# finally to protect against macro language...
787	$text =~ s@\\@\\\\@g;
788
789	return $text;
790	}
791
792
793	sub process_latex_math {
794	my $text = shift;
795
796	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
797	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
798	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
799	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
800	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
801
802	# put all other command names in italics
803	$text =~ s@\\([\w]+)@<i>$1</i>@g;
804
805	return $text;
806	}
807
808	sub set_OID {
809	my $self = shift (@_);
810	my ($doc_obj, $id, $segment_number) = @_;
811
812	if ( $self->{'key'} eq "default") {
813	$doc_obj->set_OID("$id\_$segment_number");
814	} else {
815	$doc_obj->set_OID($self->{'key'});
816	}
817	}
818
819	1;
820
821

Note: See TracBrowser for help on using the repository browser.

Download in other formats: