Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13416

Last change on this file since 13416 was 13416, checked in by kjdon, 17 years ago
remove surrounding {} as well as surrounding "" from value
Property svn:keywords set to `Author Date Id Revision`
File size: 23.6 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47	use strict;
48	no strict 'refs'; # allow filehandles to be variables and viceversa
49
50	# BibTexPlug is a sub-class of BasPlug.
51	sub BEGIN {
52	@BibTexPlug::ISA = ('SplitPlug');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasPlug.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() },
61	{ 'name' => "split_exp",
62	'desc' => "{SplitPlug.split_exp}",
63	'type' => "regexp",
64	'deft' => &get_default_split_exp(),
65	'reqd' => "no" }
66	];
67
68	my $options = { 'name' => "BibTexPlug",
69	'desc' => "{BibTexPlug.desc}",
70	'abstract' => "no",
71	'inherits' => "yes",
72	'explodes' => "yes",
73	'args' => $arguments };
74
75	# This plugin processes files with the suffix ".bib"
76	sub get_default_process_exp {
77	return q^(?i)\.bib$^;
78	}
79
80	# This plugin splits the input text at blank lines
81	sub get_default_split_exp {
82	return q^\n+(?=@)^;
83	}
84	sub new {
85	my ($class) = shift (@_);
86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87	push(@$pluginlist, $class);
88
89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
93
94	return bless $self, $class;
95	}
96
97
98
99	# The process function reads a single bibliographic record and stores
100	# it as a new document.
101
102	sub process {
103	my $self = shift (@_);
104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105	my $outhandle = $self->{'outhandle'};
106
107	my $cursection = $doc_obj->get_top_section();
108	$self->{'key'} = "default";
109
110	# Check that we're dealing with a valid BibTex record
111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113	# Ignore things we can't use
114	return 0 if ($$textref =~ /^\@String/);
115
116	# Report that we're processing the file
117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118	print $outhandle "BibTexPlug: processing $file\n"
119	if ($self->{'verbosity'}) > 1;
120
121
122	# This hash translates BibTex field names into metadata names. The
123	# BibTex names are taken from the "Local Guide to Latex" Graeme
124	# McKinstry. Metadata names are consistent with ReferPlug.
125
126	# The author metadata will be stored as one "Creator" entry, but will
127	# also be split up into several individual "Author" fields.
128
129	my %field = (
130	'address', 'PublisherAddress',
131	'author', 'Creator',
132
133	'booktitle', 'Booktitle',
134	'chapter', 'Chapter',
135	'edition', 'Edition',
136	'editor', 'Editor',
137	'institution', 'Publisher',
138	'journal', 'Journal',
139	'month', 'Month',
140	'number', 'Number',
141	'pages', 'Pages',
142	'publisher', 'Publisher',
143	'school', 'Publisher',
144	'title', 'Title',
145	'volume', 'Volume',
146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148	'keywords', 'Keywords',
149	'abstract', 'Abstract',
150	'copyright', 'Copyright',
151	'note', 'Note',
152	'url', 'URL',
153	);
154
155	# Metadata fields
156	my %metadata;
157	my ($EntryType, $Creator, $Keywords, $text);
158
159	my $verbosity = $self->{'verbosity'};
160	$verbosity = 0 unless $verbosity;
161
162	# Make sure the text has exactly one entry per line -
163	# append line to previous if it doesn't start with " <key> = "
164
165	my @input_lines=split(/\r?\n/, $$textref);
166	my @all_lines;
167	my $entry_line=shift @input_lines;
168	foreach my $input_line (@input_lines) {
169	if ($input_line =~ m/^\s\w+\s=\s*/) {
170	# this is a new key
171	push(@all_lines, $entry_line);
172	$entry_line=$input_line;
173	} else {
174	# this is a continuation of previous line
175	$entry_line .= " " . $input_line;
176	}
177
178	}
179	# add final line, removing trailing '}'
180	$entry_line =~ s/\}\s*$//;
181	push(@all_lines, $entry_line);
182	push(@all_lines, "}");
183
184	# Read and process each line in the bib file.
185	my ($entryname, $name, $value, $line);
186	foreach $line (@all_lines) {
187
188	# Add each line. Most lines consist of a field identifer and
189	# then data, and we simply store them, though we treat some
190	# of the fields a bit differently.
191
192	$line =~ s/\s+/ /g;
193	$text .= "$line\n";
194
195
196	print "Processing line = $line \n" if $verbosity>=4;
197
198	# The first line is special, it contains the reference type and OID
199	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
200	$EntryType = $1;
201	my $EntryID = (defined $2) ? $2 : "default";
202	print "** $EntryType - \"$EntryID\" \n"
203	if ($verbosity >= 4);
204
205	next;
206	}
207	if ($line =~ /\@/) {
208	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
209	if ($verbosity >= 2);
210	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
211	if ($verbosity >= 2);
212	}
213
214	# otherwise, parse the metadata out of this line
215	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
216	$entryname = lc($1);
217	$value = $2;
218	$value =~ s/,?\s*$//; # remove trailing comma and space
219	if ($value =~ /^"/ && $value =~ /"$/) {
220	# remove surrounding " marks
221	$value =~ s/^"//; $value =~ s/"$//;
222	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
223	# remove surrounding {} marks
224	$value =~ s/^\{//; $value =~ s/\}$//;
225	}
226	# special case for year - we only want a 4 digit year, not
227	# "circa 2004" etc
228	if ($entryname=~ /^year$/i) {
229	if ($value=~ /(\d{4})/) {
230	$value=$1;
231	$metadata{$entryname} .= "$value";
232	}
233	}
234	else {
235
236	$value = &process_latex($value);
237	# Add this line of metadata
238	$metadata{$entryname} .= "$value";
239	}
240	}
241
242	# Add the Entry type as metadata
243	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
244
245	#Add the fileformat as metadata
246	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
247
248
249	# Add the various field as metadata
250	foreach my $entryname (keys %metadata) {
251	next unless (defined $field{$entryname});
252	next unless (defined $metadata{$entryname});
253
254	$name = $field{$entryname};
255	$value = $metadata{$entryname};
256
257	if ($name =~ /^Month/) {
258	$value=expand_month($value);
259	}
260
261	# Several special operatons on metadata follow
262
263	# Add individual keywords.
264	# The full set of keywords will be added, in due course, as "Keywords".
265	# However, we also want to add them as individual "Keyword" metadata elements.
266	if ($entryname eq "keywords") {
267	my @keywordlist = split(/,/, $value);
268	foreach my $k (@keywordlist) {
269	$k = lc($k);
270	$k =~ s/\s*$//;
271	$k =~ s/^\s*//;
272	if ($k =~ /\w/) {
273	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
274	}
275	}
276	}
277
278	# Add individual authors
279	# The author metadata will be stored as one "Creator" entry, but we
280	# also want to split it into several individual "Author" fields in
281	# "Lastname, Firstnames" format so we can browse it.
282	if ($entryname eq "author") { #added also comparison with editor
283
284	# take care of "et al."...
285	my $etal='';
286	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
287	$etal=' <em>et. al.</em>';
288	}
289	# und here for german language...
290	# don't use brackets in pattern, else the matched bit becomes
291	# an element in the list!
292	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
293	my @formattedlist = ();
294	foreach $a (@authorlist) {
295	$a =~ s/\s*$//;
296	$a =~ s/^\s*//;
297	# Reformat and add author name
298	next if $a=~ /^\s*$/;
299
300	# names are "First von Last", "von Last, First"
301	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
302	my $first="";
303	my $vonlast="";
304	my $jr="";
305
306	if ($a =~ /,/) {
307	my @parts=split(/,\s*/, $a);
308	$first = pop @parts;
309	if (scalar(@parts) == 2) {
310	$jr = pop @parts;
311	}
312	$vonlast=shift @parts;
313	if (scalar(@parts) > 0) {
314	print $outhandle "BibTexPlug: couldn't parse name $a\n";
315	# but we continue anyway...
316	}
317	} else { # First von Last
318	my @words = split(/ /, $a);
319	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
320	$first .= " " . shift (@words);
321	}
322	$first =~ s/^\s//;
323	$vonlast = join (' ', @words); # whatever's left...
324	}
325	my $von="";
326	my $last="";
327	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
328	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
329	$von = $1;
330	if (!defined ($von)) {
331	# some non-English names do start with lowercase
332	# eg "Marie desJardins". Also we can get typos...
333	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
334	$von="";
335	if ($vonlast =~ /^[a-z]+$/) {
336	# if it's all lowercase, uppercase 1st.
337	$vonlast =~ s/^(.)/\u$1/;
338
339	}
340	}
341	$von =~ s/\s*$//;
342	$last=$vonlast;
343	} else {
344	$last=$vonlast;
345	}
346	my $wholename="$first $von $last $jr";
347	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
348	$wholename =~ s/ $//;
349	push (@formattedlist, $wholename);
350	my $fullname = "$last";
351	$fullname .= " $jr" if ($jr);
352	$fullname .= ", $first";
353	$fullname .= " $von" if ($von);
354
355	# Add each name to set of Authors
356	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
357	}
358
359	# Only want at most one "and" in the Creator field
360	if (scalar(@formattedlist) > 2) {
361	my $lastauthor=pop @formattedlist;
362	$value=join(', ', @formattedlist);
363	$value.=" and $lastauthor";
364	} else { # 1 or 2 authors...
365	$value=join(" and ",@formattedlist);
366	}
367	$value.=$etal; # if there was "et al."
368	}
369
370	# Books and Journals are additionally marked for display purposes
371	if ($entryname eq "booktitle") {
372	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
373	} elsif ($entryname eq "journal") {
374	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
375	}
376
377	# Add the various fields as metadata
378	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
379
380	}
381
382	# for books and journals...
383	if (!exists $metadata{'title'}) {
384	my $name=$field{'title'}; # get Greenstone metadata name
385	my $value;
386	if (exists $metadata{'booktitle'}) {
387	$value=$metadata{'booktitle'};
388	} elsif (exists $metadata{'journal'}) {
389	$value=$metadata{'journal'};
390	}
391	if ($value) {
392	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
393	}
394	}
395
396	# Add Date (yyyymmdd) metadata
397	if (defined ($metadata{'year'}) ) {
398	my $date=$metadata{'year'};
399	chomp $date;
400	my $month=$metadata{'month'};
401	if (defined($month)) {
402	# month is currently 3 letter code or a range...
403	$month = expand_month($month);
404	# take the first month found... might not find one!
405	$month =~ m/_textmonth(\d\d)_/;
406	$month = $1;
407	}
408	if (!defined($month)) {
409	$month="00";
410	}
411	$date .= "${month}00";
412	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
413	}
414
415	# # Add the text in BibTex format (all fields)
416	if ($text =~ /\w/) {
417
418	$text =~ s@&@&@g;
419	$text =~ s@<@<@g;
420	$text =~ s@>@>@g;
421	$text =~ s@\n@<br/>\n@g;
422	$text =~ s@\\@\\\\@g;
423
424	$doc_obj->add_utf8_text ($cursection, $text);
425	}
426
427	return 1;
428	}
429
430
431
432
433	# convert email addresses and URLs into links
434	sub convert_urls_into_links{
435	my ($text) = @_;
436
437	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
438	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
439
440	return $text;
441	}
442
443	# Clean up whitespace and convert \n charaters to <BR> or <P>
444	sub clean_up_whitespaces{
445	my ($text) = @_;
446
447	$text =~ s/%%%%%/<BR> <BR>/g;
448	$text =~ s/ +/ /g;
449	$text =~ s/\s*$//;
450	$text =~ s/^\s*//;
451	$text =~ s/\n/\n<BR>/g;
452	$text =~ s/<BR>\s*<BR>/<P>/g;
453
454	return $text;
455	}
456
457
458	sub convert_problem_characters_without_ampersand{
459	my ($text) = @_;
460	$text =~ s/</</g;
461	$text =~ s/>/>/g;
462
463	$text =~ s/\'\'/\"/g; #Latex -specific conversion
464	$text =~ s/\`\`/\"/g; #Latex -specific conversion
465
466	$text =~ s/\"/"/g;
467	$text =~ s/\'/’/g;
468	$text =~ s/\`/‘/g;
469
470	# $text =~ s/\+/ /g;
471	# $text =~ s/\(/ /g;
472	# $text =~ s/\)/ /g;
473
474	$text =~ s/\\/\\\\/g;
475
476	# $text =~ s/\./\\\./g;
477
478	return $text;
479	}
480
481	# Convert a text string into HTML.
482
483	# The HTML is going to be inserted into a GML file, so we have to be
484	# careful not to use symbols like ">", which occurs frequently in email
485	# messages (and use &gt instead.
486
487	# This function also turns URLs and email addresses into links, and
488	# replaces carriage returns with <BR> tags (and multiple carriage returns
489	# with <P> tags).
490
491	sub text_into_html {
492	my ($text) = @_;
493
494	# Convert problem characters into HTML symbols
495	$text =~ s/&/&/g;
496
497	$text = &convert_problem_characters_without_ampersand( $text );
498
499	# convert email addresses and URLs into links
500	$text = &convert_urls_into_links( $text );
501
502	$text = &clean_up_whitespaces( $text );
503
504	return $text;
505	}
506
507
508	sub expand_month {
509	my $text=shift;
510
511	# bibtex style files expand abbreviations for months.
512	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
513	$text =~ s/jan/_textmonth01_/g;
514	$text =~ s/feb/_textmonth02_/g;
515	$text =~ s/mar/_textmonth03_/g;
516	$text =~ s/apr/_textmonth04_/g;
517	$text =~ s/may/_textmonth05_/g;
518	$text =~ s/jun/_textmonth06_/g;
519	$text =~ s/jul/_textmonth07_/g;
520	$text =~ s/aug/_textmonth08_/g;
521	$text =~ s/sep/_textmonth09_/g;
522	$text =~ s/oct/_textmonth10_/g;
523	$text =~ s/nov/_textmonth11_/g;
524	$text =~ s/dec/_textmonth12_/g;
525
526	return $text;
527	}
528
529
530	# Convert accented characters, remove { }, interprete some commands....
531	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
532	sub process_latex {
533	my ($text) = @_;
534
535	# note - this is really ugly, but it works. There may be a prettier way
536	# of mapping latex accented chars to utf8, but we just brute force it here.
537	# Also, this isn't complete - not every single possible accented letter
538	# is in here yet, but most of the common ones are.
539
540	my %utf8_chars =
541	(
542	# acutes
543	'\'a' => chr(0xc3).chr(0xa1),
544	'\'c' => chr(0xc4).chr(0x87),
545	'\'e' => chr(0xc3).chr(0xa9),
546	'\'i' => chr(0xc3).chr(0xad),
547	'\'l' => chr(0xc3).chr(0xba),
548	'\'n' => chr(0xc3).chr(0x84),
549	'\'o' => chr(0xc3).chr(0xb3),
550	'\'r' => chr(0xc5).chr(0x95),
551	'\'s' => chr(0xc5).chr(0x9b),
552	'\'u' => chr(0xc3).chr(0xba),
553	'\'y' => chr(0xc3).chr(0xbd),
554	'\'z' => chr(0xc5).chr(0xba),
555	# graves
556	'`a' => chr(0xc3).chr(0xa0),
557	'`A' => chr(0xc3).chr(0x80),
558	'`e' => chr(0xc3).chr(0xa8),
559	'`E' => chr(0xc3).chr(0x88),
560	'`i' => chr(0xc3).chr(0xac),
561	'`I' => chr(0xc3).chr(0x8c),
562	'`o' => chr(0xc3).chr(0xb2),
563	'`O' => chr(0xc3).chr(0x92),
564	'`u' => chr(0xc3).chr(0xb9),
565	'`U' => chr(0xc3).chr(0x99),
566	# circumflex
567	'^a' => chr(0xc3).chr(0xa2),
568	'^A' => chr(0xc3).chr(0x82),
569	'^c' => chr(0xc4).chr(0x89),
570	'^C' => chr(0xc4).chr(0x88),
571	'^e' => chr(0xc3).chr(0xaa),
572	'^E' => chr(0xc3).chr(0x8a),
573	'^g' => chr(0xc4).chr(0x9d),
574	'^G' => chr(0xc4).chr(0x9c),
575	'^h' => chr(0xc4).chr(0xa5),
576	'^H' => chr(0xc4).chr(0xa4),
577	'^i' => chr(0xc3).chr(0xae),
578	'^I' => chr(0xc3).chr(0x8e),
579	'^j' => chr(0xc4).chr(0xb5),
580	'^J' => chr(0xc4).chr(0xb4),
581	'^o' => chr(0xc3).chr(0xb4),
582	'^O' => chr(0xc3).chr(0x94),
583	'^s' => chr(0xc5).chr(0x9d),
584	'^S' => chr(0xc5).chr(0x9c),
585	'^u' => chr(0xc3).chr(0xa2),
586	'^U' => chr(0xc3).chr(0xbb),
587	'^w' => chr(0xc5).chr(0xb5),
588	'^W' => chr(0xc5).chr(0xb4),
589	'^y' => chr(0xc5).chr(0xb7),
590	'^Y' => chr(0xc5).chr(0xb6),
591
592	# diaeresis
593	'"a' => chr(0xc3).chr(0xa4),
594	'"A' => chr(0xc3).chr(0x84),
595	'"e' => chr(0xc3).chr(0xab),
596	'"E' => chr(0xc3).chr(0x8b),
597	'"\\\\i' => chr(0xc3).chr(0xaf),
598	'"\\\\I' => chr(0xc3).chr(0x8f),
599	'"o' => chr(0xc3).chr(0xb6),
600	'"O' => chr(0xc3).chr(0x96),
601	'"u' => chr(0xc3).chr(0xbc),
602	'"U' => chr(0xc3).chr(0x9c),
603	'"y' => chr(0xc3).chr(0xbf),
604	'"Y' => chr(0xc3).chr(0xb8),
605	# tilde
606	'~A' => chr(0xc3).chr(0x83),
607	'~N' => chr(0xc3).chr(0x91),
608	'~O' => chr(0xc3).chr(0x95),
609	'~a' => chr(0xc3).chr(0xa3),
610	'~n' => chr(0xc3).chr(0xb1),
611	'~o' => chr(0xc3).chr(0xb5),
612	# caron - handled specially
613	# double acute
614	# ring
615	# dot
616	'.c' => chr(0xc4).chr(0x8b),
617	'.C' => chr(0xc4).chr(0x8a),
618	'.e' => chr(0xc4).chr(0x97),
619	'.E' => chr(0xc4).chr(0x96),
620	'.g' => chr(0xc4).chr(0xa1),
621	'.G' => chr(0xc4).chr(0xa0),
622	'.I' => chr(0xc4).chr(0xb0),
623	'.z' => chr(0xc5).chr(0xbc),
624	'.Z' => chr(0xc5).chr(0xbb),
625	# macron
626	'=a' => chr(0xc4).chr(0x81),
627	'=A' => chr(0xc4).chr(0x80),
628	'=e' => chr(0xc4).chr(0x93),
629	'=E' => chr(0xc4).chr(0x92),
630	'=i' => chr(0xc4).chr(0xab),
631	'=I' => chr(0xc4).chr(0xaa),
632	'=o' => chr(0xc4).chr(0x8d),
633	'=O' => chr(0xc4).chr(0x8c),
634	'=u' => chr(0xc4).chr(0xab),
635	'=U' => chr(0xc4).chr(0xaa),
636
637	# stroke - handled specially - see below
638
639	# cedilla - handled specially
640	);
641
642	# these are one letter latex commands - we make sure they're not a longer
643	# command name. eg {\d} is d+stroke, so careful of \d
644	my %special_utf8_chars =
645	(
646	# breve
647	'u g' => chr(0xc4).chr(0x9f),
648	'u G' => chr(0xc4).chr(0x9e),
649	'u i' => chr(0xc4).chr(0xad),
650	'u I' => chr(0xc4).chr(0xac),
651	'u o' => chr(0xc5).chr(0x8f),
652	'u O' => chr(0xc5).chr(0x8e),
653	'u u' => chr(0xc5).chr(0xad),
654	'u U' => chr(0xc5).chr(0xac),
655	# caron
656	'v c' => chr(0xc4).chr(0x8d),
657	'v C' => chr(0xc4).chr(0x8c),
658	'v n' => chr(0xc5).chr(0x88),
659	'v N' => chr(0xc5).chr(0x87),
660	'v s' => chr(0xc5).chr(0xa1),
661	'v S' => chr(0xc5).chr(0xa5),
662	'v z' => chr(0xc5).chr(0xbe),
663	'v Z' => chr(0xc5).chr(0xbd),
664	# cedilla
665	'c c' => chr(0xc3).chr(0xa7),
666	'c C' => chr(0xc3).chr(0x87),
667	'c g' => chr(0xc4).chr(0xa3),
668	'c G' => chr(0xc4).chr(0xa2),
669	'c k' => chr(0xc4).chr(0xb7),
670	'c K' => chr(0xc4).chr(0xb6),
671	'c l' => chr(0xc4).chr(0xbc),
672	'c L' => chr(0xc4).chr(0xbb),
673	'c n' => chr(0xc5).chr(0x86),
674	'c N' => chr(0xc5).chr(0x85),
675	'c r' => chr(0xc5).chr(0x97),
676	'c R' => chr(0xc5).chr(0x96),
677	'c s' => chr(0xc5).chr(0x9f),
678	'c S' => chr(0xc5).chr(0x9e),
679	'c t' => chr(0xc5).chr(0xa3),
680	'c T' => chr(0xc5).chr(0xa2),
681	# double acute / Hungarian accent
682	'H O' => chr(0xc5).chr(0x90),
683	'H o' => chr(0xc5).chr(0x91),
684	'H U' => chr(0xc5).chr(0xb0),
685	'H u' => chr(0xc5).chr(0xb1),
686
687	# stroke
688	'd' => chr(0xc4).chr(0x91),
689	'D' => chr(0xc4).chr(0x90),
690	'h' => chr(0xc4).chr(0xa7),
691	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
692	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
693	'l' => chr(0xc5).chr(0x82),
694	'L' => chr(0xc5).chr(0x81),
695	'o' => chr(0xc3).chr(0xb8),
696	'O' => chr(0xc3).chr(0x98),
697	't' => chr(0xc5).chr(0xa7),
698	'T' => chr(0xc5).chr(0xa6),
699	# german ss/szlig/sharp s
700	'ss' => chr(0xc3).chr(0x9f),
701	);
702
703	# convert latex-style accented characters.
704
705	# remove space (if any) between \ and letter to accent (eg {\' a})
706
707	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
708
709	# remove {} around a single character (eg \'{e})
710	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
711
712	# \, is another way of doing cedilla \c
713	$text =~ s@\\,(.)@\\c $1@g;
714
715	# remove {} around a single character for special 1 letter commands -
716	# need to insert a space. Eg \v{s} -> {\v s}
717	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
718
719	# only do if the text contains a '\' character.
720	if ($text =~ m\|\\\|) {
721	# "normal" accents - ie non-alpha latex tag
722	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
723	my $tex="$1$2"; my $char="$2";
724	my $replacement=$utf8_chars{$tex};
725	if (!defined($replacement)) {
726	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
727	$replacement=$char;
728	}
729	$text =~ s/\\\Q$tex/$replacement/g;
730	}
731
732	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
733	# only do the change if immediately followed by a space, }, {, or \
734	# one letter accents ( + ss)
735	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
736	my $tex=$1;
737	my $replacement=$special_utf8_chars{$tex};
738	if (!defined($replacement)) {
739	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
740	$replacement=$tex;
741	}
742	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
743
744	}
745
746	# one letter latex accent commands that affect following letter
747	while ($text =~ m@\\([uvcH]) ([\w])@) {
748	my $tex="$1 $2"; my $char="$2";
749	my $replacement=$special_utf8_chars{$tex};
750	if (!defined($replacement)) {
751	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
752	$replacement=$char;
753	}
754	$text =~ s/\\$tex/$replacement/g;
755	}
756	}
757
758	# escape html-sensitive characters
759	$text =~ s@&@&@g;
760	$text =~ s@<@<@g;
761	$text =~ s@>@>@g;
762	$text =~ s/''/"/g; # Latex-specific
763	$text =~ s/``/"/g; # Latex-specific
764	# greenstone-specific
765	$text =~ s@\[@&\#91;@g;
766	$text =~ s@\]@&\#93;@g;
767
768	# remove latex commands
769
770	# explicitly recognised commands
771	$text =~ s@\\ldots@…@g;
772
773	# maths mode
774	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
775
776	# remove all other commands with optional arguments...
777	$text =~ s@\\\w+(\[.?\])?\s@@g;
778	# $text =~ s@\\noopsort{[^}]+\}@@g;
779	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
780
781	# remove latex groupings { } (but not \{ or \} )
782	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
783	$text =~ s/^\{//; # remove { if first char
784
785	# latex characters
786	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
787	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
788	# optional break "\-"
789	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
790	# the non-macro bits have quotes around them - we just remove quotes
791	$text =~ s/([^&])[\"\#]/$1/g;
792	}
793	# dashes. Convert (m\|n)-dash into single dash for html.
794	$text =~ s@\-\-+@\-@g;
795
796	# quoted { } chars
797	$text =~ s@\\{@{@g;
798	$text =~ s@\\}@}@g;
799
800	# finally to protect against macro language...
801	$text =~ s@\\@\\\\@g;
802
803	return $text;
804	}
805
806
807	sub process_latex_math {
808	my $text = shift;
809
810	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
811	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
812	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
813	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
814	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
815
816	# put all other command names in italics
817	$text =~ s@\\([\w]+)@<i>$1</i>@g;
818
819	return $text;
820	}
821
822	sub set_OID {
823	my $self = shift (@_);
824	my ($doc_obj, $id, $segment_number) = @_;
825
826	if ( $self->{'key'} eq "default") {
827	$doc_obj->set_OID("$id\_$segment_number");
828	} else {
829	$doc_obj->set_OID($self->{'key'});
830	}
831	}
832
833	1;
834
835

Note: See TracBrowser for help on using the repository browser.

Download in other formats: