Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm@ 32122

Last change on this file since 32122 was 32122, checked in by kjdon, 6 years ago
we had [a-z]{2..} which causes an error in later versions of perl. presumably this was meant to be 2 or more occurrences of a-z, so I have changed it to {2,}
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Line
1	###########################################################################
2	#
3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlugin reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlugin creates a document object for every reference a the file.
35	# It is a subclass of SplitTextFile, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlugin;
45
46	use SplitTextFile;
47	use MetadataRead;
48	use strict;
49	no strict 'refs'; # allow filehandles to be variables and viceversa
50
51	# BibTexPlugin is a sub-class of SplitTextFile.
52	sub BEGIN {
53	@BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
54	}
55
56	my $arguments =
57	[ { 'name' => "process_exp",
58	'desc' => "{BaseImporter.process_exp}",
59	'type' => "regexp",
60	'reqd' => "no",
61	'deft' => &get_default_process_exp() },
62	{ 'name' => "split_exp",
63	'desc' => "{SplitTextFile.split_exp}",
64	'type' => "regexp",
65	'deft' => &get_default_split_exp(),
66	'reqd' => "no" }
67	];
68
69	my $options = { 'name' => "BibTexPlugin",
70	'desc' => "{BibTexPlugin.desc}",
71	'abstract' => "no",
72	'inherits' => "yes",
73	'explodes' => "yes",
74	'args' => $arguments };
75
76	# This plugin processes files with the suffix ".bib"
77	sub get_default_process_exp {
78	return q^(?i)\.bib$^;
79	}
80
81	# This plugin splits the input text at blank lines
82	sub get_default_split_exp {
83	return q^\n+(?=@)^;
84	}
85
86	sub new {
87	my ($class) = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
95
96	return bless $self, $class;
97	}
98
99
100
101	# The process function reads a single bibliographic record and stores
102	# it as a new document.
103
104	sub process {
105	my $self = shift (@_);
106	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
107	my $outhandle = $self->{'outhandle'};
108
109	my $cursection = $doc_obj->get_top_section();
110	$self->{'key'} = "default";
111
112	# Check that we're dealing with a valid BibTex record
113	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
114
115	# Ignore things we can't use
116	return 0 if ($$textref =~ /^\@String/);
117
118	# This hash translates BibTex field names into metadata names. The
119	# BibTex names are taken from the "Local Guide to Latex" Graeme
120	# McKinstry. Metadata names are consistent with ReferPlug.
121
122	# The author metadata will be stored as one "Creator" entry, but will
123	# also be split up into several individual "Author" fields.
124
125	my %field = (
126	'address', 'PublisherAddress',
127	'author', 'Creator',
128
129	'booktitle', 'Booktitle',
130	'chapter', 'Chapter',
131	'edition', 'Edition',
132	'editor', 'Editor',
133	'institution', 'Publisher',
134	'journal', 'Journal',
135	'month', 'Month',
136	'number', 'Number',
137	'pages', 'Pages',
138	'publisher', 'Publisher',
139	'school', 'Publisher',
140	'title', 'Title',
141	'volume', 'Volume',
142	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
143
144	'keywords', 'Keywords',
145	'abstract', 'Abstract',
146	'copyright', 'Copyright',
147	'note', 'Note',
148	'url', 'URL',
149	);
150
151	# Metadata fields
152	my %metadata;
153	my ($EntryType, $Creator, $Keywords, $text);
154
155	my $verbosity = $self->{'verbosity'};
156	$verbosity = 0 unless $verbosity;
157
158	# Make sure the text has exactly one entry per line -
159	# append line to previous if it doesn't start with " <key> = "
160
161	my @input_lines=split(/\r?\n/, $$textref);
162	my @all_lines;
163	my $entry_line=shift @input_lines;
164	foreach my $input_line (@input_lines) {
165	if ($input_line =~ m/^\s\w+\s=\s*/) {
166	# this is a new key
167	push(@all_lines, $entry_line);
168	$entry_line=$input_line;
169	} else {
170	# this is a continuation of previous line
171	$entry_line .= " " . $input_line;
172	}
173
174	}
175	# add final line, removing trailing '}'
176	$entry_line =~ s/\}\s*$//;
177	push(@all_lines, $entry_line);
178	push(@all_lines, "}");
179
180	# Read and process each line in the bib file.
181	my ($entryname, $name, $value, $line);
182	foreach $line (@all_lines) {
183
184	# Add each line. Most lines consist of a field identifer and
185	# then data, and we simply store them, though we treat some
186	# of the fields a bit differently.
187
188	$line =~ s/\s+/ /g;
189	$text .= "$line\n";
190
191
192	print "Processing line = $line \n" if $verbosity>=4;
193
194	# The first line is special, it contains the reference type and OID
195	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
196	$EntryType = $1;
197	my $EntryID = (defined $2) ? $2 : "default";
198	print "** $EntryType - \"$EntryID\" \n"
199	if ($verbosity >= 4);
200
201	next;
202	}
203	if ($line =~ /\@/) {
204	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
205	if ($verbosity >= 2);
206	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
207	if ($verbosity >= 2);
208	}
209
210	# otherwise, parse the metadata out of this line
211	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
212	$entryname = lc($1);
213	$value = $2;
214	$value =~ s/,?\s*$//; # remove trailing comma and space
215	if ($value =~ /^"/ && $value =~ /"$/) {
216	# remove surrounding " marks
217	$value =~ s/^"//; $value =~ s/"$//;
218	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
219	# remove surrounding {} marks
220	$value =~ s/^\{//; $value =~ s/\}$//;
221	}
222	# special case for year - we only want a 4 digit year, not
223	# "circa 2004" etc
224	if ($entryname=~ /^year$/i) {
225	if ($value=~ /(\d{4})/) {
226	$value=$1;
227	$metadata{$entryname} .= "$value";
228	}
229	}
230	else {
231
232	$value = &process_latex($value);
233	# Add this line of metadata
234	$metadata{$entryname} .= "$value";
235	}
236	}
237
238	# Add the Entry type as metadata
239	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
240
241	#Add the fileformat as metadata
242	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
243
244
245	# Add the various field as metadata
246	foreach my $entryname (keys %metadata) {
247	next unless (defined $field{$entryname});
248	next unless (defined $metadata{$entryname});
249
250	$name = $field{$entryname};
251	$value = $metadata{$entryname};
252
253	if ($name =~ /^Month/) {
254	$value=expand_month($value);
255	}
256
257	# Several special operatons on metadata follow
258
259	# Add individual keywords.
260	# The full set of keywords will be added, in due course, as "Keywords".
261	# However, we also want to add them as individual "Keyword" metadata elements.
262	if ($entryname eq "keywords") {
263	my @keywordlist = split(/,/, $value);
264	foreach my $k (@keywordlist) {
265	$k = lc($k);
266	$k =~ s/\s*$//;
267	$k =~ s/^\s*//;
268	if ($k =~ /\w/) {
269	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
270	}
271	}
272	}
273
274	# Add individual authors
275	# The author metadata will be stored as one "Creator" entry, but we
276	# also want to split it into several individual "Author" fields in
277	# "Lastname, Firstnames" format so we can browse it.
278	if ($entryname eq "author") { #added also comparison with editor
279
280	# take care of "et al."...
281	my $etal='';
282	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
283	$etal=' <em>et. al.</em>';
284	}
285	# und here for german language...
286	# don't use brackets in pattern, else the matched bit becomes
287	# an element in the list!
288	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
289	my @formattedlist = ();
290	foreach $a (@authorlist) {
291	$a =~ s/\s*$//;
292	$a =~ s/^\s*//;
293	# Reformat and add author name
294	next if $a=~ /^\s*$/;
295
296	# names are "First von Last", "von Last, First"
297	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
298	my $first="";
299	my $vonlast="";
300	my $jr="";
301
302	if ($a =~ /,/) {
303	my @parts=split(/,\s*/, $a);
304	$first = pop @parts;
305	if (scalar(@parts) == 2) {
306	$jr = pop @parts;
307	}
308	$vonlast=shift @parts;
309	if (scalar(@parts) > 0) {
310	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
311	# but we continue anyway...
312	}
313	} else { # First von Last
314	my @words = split(/ /, $a);
315	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
316	$first .= " " . shift (@words);
317	}
318	$first =~ s/^\s//;
319	$vonlast = join (' ', @words); # whatever's left...
320	}
321	my $von="";
322	my $last="";
323	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
324	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
325	$von = $1;
326	if (!defined ($von)) {
327	# some non-English names do start with lowercase
328	# eg "Marie desJardins". Also we can get typos...
329	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
330	$von="";
331	if ($vonlast =~ /^[a-z]+$/) {
332	# if it's all lowercase, uppercase 1st.
333	$vonlast =~ s/^(.)/\u$1/;
334
335	}
336	}
337	$von =~ s/\s*$//;
338	$last=$vonlast;
339	} else {
340	$last=$vonlast;
341	}
342	my $wholename="$first $von $last $jr";
343	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
344	$wholename =~ s/ $//;
345	push (@formattedlist, $wholename);
346	my $fullname = "$last";
347	$fullname .= " $jr" if ($jr);
348	$fullname .= ", $first";
349	$fullname .= " $von" if ($von);
350
351	# Add each name to set of Authors
352	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
353	}
354
355	# Only want at most one "and" in the Creator field
356	if (scalar(@formattedlist) > 2) {
357	my $lastauthor=pop @formattedlist;
358	$value=join(', ', @formattedlist);
359	$value.=" and $lastauthor";
360	} else { # 1 or 2 authors...
361	$value=join(" and ",@formattedlist);
362	}
363	$value.=$etal; # if there was "et al."
364	}
365
366	# Books and Journals are additionally marked for display purposes
367	if ($entryname eq "booktitle") {
368	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
369	} elsif ($entryname eq "journal") {
370	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
371	}
372
373	# Add the various fields as metadata
374	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
375
376	}
377
378	# for books and journals...
379	if (!exists $metadata{'title'}) {
380	my $name=$field{'title'}; # get Greenstone metadata name
381	my $value;
382	if (exists $metadata{'booktitle'}) {
383	$value=$metadata{'booktitle'};
384	} elsif (exists $metadata{'journal'}) {
385	$value=$metadata{'journal'};
386	}
387	if ($value) {
388	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
389	}
390	}
391
392	# Add Date (yyyymmdd) metadata
393	if (defined ($metadata{'year'}) ) {
394	my $date=$metadata{'year'};
395	chomp $date;
396	my $month=$metadata{'month'};
397	if (defined($month)) {
398	# month is currently 3 letter code or a range...
399	$month = expand_month($month);
400	# take the first month found... might not find one!
401	$month =~ m/_textmonth(\d\d)_/;
402	$month = $1;
403	}
404	if (!defined($month)) {
405	$month="00";
406	}
407	$date .= "${month}00";
408	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
409	}
410
411	# # Add the text in BibTex format (all fields)
412	if ($text =~ /\w/) {
413
414	$text =~ s@&@&@g;
415	$text =~ s@<@<@g;
416	$text =~ s@>@>@g;
417	$text =~ s@\n@<br/>\n@g;
418	$text =~ s@\\@\\\\@g;
419
420	$doc_obj->add_utf8_text ($cursection, $text);
421	}
422
423	return 1;
424	}
425
426
427
428
429	# convert email addresses and URLs into links
430	sub convert_urls_into_links{
431	my ($text) = @_;
432
433	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
434	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
435
436	return $text;
437	}
438
439	# Clean up whitespace and convert \n charaters to <BR> or <P>
440	sub clean_up_whitespaces{
441	my ($text) = @_;
442
443	$text =~ s/%%%%%/<BR> <BR>/g;
444	$text =~ s/ +/ /g;
445	$text =~ s/\s*$//;
446	$text =~ s/^\s*//;
447	$text =~ s/\n/\n<BR>/g;
448	$text =~ s/<BR>\s*<BR>/<P>/g;
449
450	return $text;
451	}
452
453
454	sub convert_problem_characters_without_ampersand{
455	my ($text) = @_;
456	$text =~ s/</</g;
457	$text =~ s/>/>/g;
458
459	$text =~ s/\'\'/\"/g; #Latex -specific conversion
460	$text =~ s/\`\`/\"/g; #Latex -specific conversion
461
462	$text =~ s/\"/"/g;
463	$text =~ s/\'/’/g;
464	$text =~ s/\`/‘/g;
465
466	# $text =~ s/\+/ /g;
467	# $text =~ s/\(/ /g;
468	# $text =~ s/\)/ /g;
469
470	$text =~ s/\\/\\\\/g;
471
472	# $text =~ s/\./\\\./g;
473
474	return $text;
475	}
476
477	# Convert a text string into HTML.
478
479	# The HTML is going to be inserted into a GML file, so we have to be
480	# careful not to use symbols like ">", which occurs frequently in email
481	# messages (and use &gt instead.
482
483	# This function also turns URLs and email addresses into links, and
484	# replaces carriage returns with <BR> tags (and multiple carriage returns
485	# with <P> tags).
486
487	sub text_into_html {
488	my ($text) = @_;
489
490	# Convert problem characters into HTML symbols
491	$text =~ s/&/&/g;
492
493	$text = &convert_problem_characters_without_ampersand( $text );
494
495	# convert email addresses and URLs into links
496	$text = &convert_urls_into_links( $text );
497
498	$text = &clean_up_whitespaces( $text );
499
500	return $text;
501	}
502
503
504	sub expand_month {
505	my $text=shift;
506
507	# bibtex style files expand abbreviations for months.
508	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
509	$text =~ s/jan/_textmonth01_/g;
510	$text =~ s/feb/_textmonth02_/g;
511	$text =~ s/mar/_textmonth03_/g;
512	$text =~ s/apr/_textmonth04_/g;
513	$text =~ s/may/_textmonth05_/g;
514	$text =~ s/jun/_textmonth06_/g;
515	$text =~ s/jul/_textmonth07_/g;
516	$text =~ s/aug/_textmonth08_/g;
517	$text =~ s/sep/_textmonth09_/g;
518	$text =~ s/oct/_textmonth10_/g;
519	$text =~ s/nov/_textmonth11_/g;
520	$text =~ s/dec/_textmonth12_/g;
521
522	return $text;
523	}
524
525
526	# Convert accented characters, remove { }, interprete some commands....
527	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
528	sub process_latex {
529	my ($text) = @_;
530
531	# note - this is really ugly, but it works. There may be a prettier way
532	# of mapping latex accented chars to utf8, but we just brute force it here.
533	# Also, this isn't complete - not every single possible accented letter
534	# is in here yet, but most of the common ones are.
535
536	my %utf8_chars =
537	(
538	# acutes
539	'\'a' => chr(0xc3).chr(0xa1),
540	'\'c' => chr(0xc4).chr(0x87),
541	'\'e' => chr(0xc3).chr(0xa9),
542	'\'i' => chr(0xc3).chr(0xad),
543	'\'l' => chr(0xc3).chr(0xba),
544	'\'n' => chr(0xc3).chr(0x84),
545	'\'o' => chr(0xc3).chr(0xb3),
546	'\'r' => chr(0xc5).chr(0x95),
547	'\'s' => chr(0xc5).chr(0x9b),
548	'\'u' => chr(0xc3).chr(0xba),
549	'\'y' => chr(0xc3).chr(0xbd),
550	'\'z' => chr(0xc5).chr(0xba),
551	# graves
552	'`a' => chr(0xc3).chr(0xa0),
553	'`A' => chr(0xc3).chr(0x80),
554	'`e' => chr(0xc3).chr(0xa8),
555	'`E' => chr(0xc3).chr(0x88),
556	'`i' => chr(0xc3).chr(0xac),
557	'`I' => chr(0xc3).chr(0x8c),
558	'`o' => chr(0xc3).chr(0xb2),
559	'`O' => chr(0xc3).chr(0x92),
560	'`u' => chr(0xc3).chr(0xb9),
561	'`U' => chr(0xc3).chr(0x99),
562	# circumflex
563	'^a' => chr(0xc3).chr(0xa2),
564	'^A' => chr(0xc3).chr(0x82),
565	'^c' => chr(0xc4).chr(0x89),
566	'^C' => chr(0xc4).chr(0x88),
567	'^e' => chr(0xc3).chr(0xaa),
568	'^E' => chr(0xc3).chr(0x8a),
569	'^g' => chr(0xc4).chr(0x9d),
570	'^G' => chr(0xc4).chr(0x9c),
571	'^h' => chr(0xc4).chr(0xa5),
572	'^H' => chr(0xc4).chr(0xa4),
573	'^i' => chr(0xc3).chr(0xae),
574	'^I' => chr(0xc3).chr(0x8e),
575	'^j' => chr(0xc4).chr(0xb5),
576	'^J' => chr(0xc4).chr(0xb4),
577	'^o' => chr(0xc3).chr(0xb4),
578	'^O' => chr(0xc3).chr(0x94),
579	'^s' => chr(0xc5).chr(0x9d),
580	'^S' => chr(0xc5).chr(0x9c),
581	'^u' => chr(0xc3).chr(0xa2),
582	'^U' => chr(0xc3).chr(0xbb),
583	'^w' => chr(0xc5).chr(0xb5),
584	'^W' => chr(0xc5).chr(0xb4),
585	'^y' => chr(0xc5).chr(0xb7),
586	'^Y' => chr(0xc5).chr(0xb6),
587
588	# diaeresis
589	'"a' => chr(0xc3).chr(0xa4),
590	'"A' => chr(0xc3).chr(0x84),
591	'"e' => chr(0xc3).chr(0xab),
592	'"E' => chr(0xc3).chr(0x8b),
593	'"\\\\i' => chr(0xc3).chr(0xaf),
594	'"\\\\I' => chr(0xc3).chr(0x8f),
595	'"o' => chr(0xc3).chr(0xb6),
596	'"O' => chr(0xc3).chr(0x96),
597	'"u' => chr(0xc3).chr(0xbc),
598	'"U' => chr(0xc3).chr(0x9c),
599	'"y' => chr(0xc3).chr(0xbf),
600	'"Y' => chr(0xc3).chr(0xb8),
601	# tilde
602	'~A' => chr(0xc3).chr(0x83),
603	'~N' => chr(0xc3).chr(0x91),
604	'~O' => chr(0xc3).chr(0x95),
605	'~a' => chr(0xc3).chr(0xa3),
606	'~n' => chr(0xc3).chr(0xb1),
607	'~o' => chr(0xc3).chr(0xb5),
608	# caron - handled specially
609	# double acute
610	# ring
611	# dot
612	'.c' => chr(0xc4).chr(0x8b),
613	'.C' => chr(0xc4).chr(0x8a),
614	'.e' => chr(0xc4).chr(0x97),
615	'.E' => chr(0xc4).chr(0x96),
616	'.g' => chr(0xc4).chr(0xa1),
617	'.G' => chr(0xc4).chr(0xa0),
618	'.I' => chr(0xc4).chr(0xb0),
619	'.z' => chr(0xc5).chr(0xbc),
620	'.Z' => chr(0xc5).chr(0xbb),
621	# macron
622	'=a' => chr(0xc4).chr(0x81),
623	'=A' => chr(0xc4).chr(0x80),
624	'=e' => chr(0xc4).chr(0x93),
625	'=E' => chr(0xc4).chr(0x92),
626	'=i' => chr(0xc4).chr(0xab),
627	'=I' => chr(0xc4).chr(0xaa),
628	'=o' => chr(0xc4).chr(0x8d),
629	'=O' => chr(0xc4).chr(0x8c),
630	'=u' => chr(0xc4).chr(0xab),
631	'=U' => chr(0xc4).chr(0xaa),
632
633	# stroke - handled specially - see below
634
635	# cedilla - handled specially
636	);
637
638	# these are one letter latex commands - we make sure they're not a longer
639	# command name. eg {\d} is d+stroke, so careful of \d
640	my %special_utf8_chars =
641	(
642	# breve
643	'u g' => chr(0xc4).chr(0x9f),
644	'u G' => chr(0xc4).chr(0x9e),
645	'u i' => chr(0xc4).chr(0xad),
646	'u I' => chr(0xc4).chr(0xac),
647	'u o' => chr(0xc5).chr(0x8f),
648	'u O' => chr(0xc5).chr(0x8e),
649	'u u' => chr(0xc5).chr(0xad),
650	'u U' => chr(0xc5).chr(0xac),
651	# caron
652	'v c' => chr(0xc4).chr(0x8d),
653	'v C' => chr(0xc4).chr(0x8c),
654	'v n' => chr(0xc5).chr(0x88),
655	'v N' => chr(0xc5).chr(0x87),
656	'v s' => chr(0xc5).chr(0xa1),
657	'v S' => chr(0xc5).chr(0xa5),
658	'v z' => chr(0xc5).chr(0xbe),
659	'v Z' => chr(0xc5).chr(0xbd),
660	# cedilla
661	'c c' => chr(0xc3).chr(0xa7),
662	'c C' => chr(0xc3).chr(0x87),
663	'c g' => chr(0xc4).chr(0xa3),
664	'c G' => chr(0xc4).chr(0xa2),
665	'c k' => chr(0xc4).chr(0xb7),
666	'c K' => chr(0xc4).chr(0xb6),
667	'c l' => chr(0xc4).chr(0xbc),
668	'c L' => chr(0xc4).chr(0xbb),
669	'c n' => chr(0xc5).chr(0x86),
670	'c N' => chr(0xc5).chr(0x85),
671	'c r' => chr(0xc5).chr(0x97),
672	'c R' => chr(0xc5).chr(0x96),
673	'c s' => chr(0xc5).chr(0x9f),
674	'c S' => chr(0xc5).chr(0x9e),
675	'c t' => chr(0xc5).chr(0xa3),
676	'c T' => chr(0xc5).chr(0xa2),
677	# double acute / Hungarian accent
678	'H O' => chr(0xc5).chr(0x90),
679	'H o' => chr(0xc5).chr(0x91),
680	'H U' => chr(0xc5).chr(0xb0),
681	'H u' => chr(0xc5).chr(0xb1),
682
683	# stroke
684	'd' => chr(0xc4).chr(0x91),
685	'D' => chr(0xc4).chr(0x90),
686	'h' => chr(0xc4).chr(0xa7),
687	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
688	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
689	'l' => chr(0xc5).chr(0x82),
690	'L' => chr(0xc5).chr(0x81),
691	'o' => chr(0xc3).chr(0xb8),
692	'O' => chr(0xc3).chr(0x98),
693	't' => chr(0xc5).chr(0xa7),
694	'T' => chr(0xc5).chr(0xa6),
695	# german ss/szlig/sharp s
696	'ss' => chr(0xc3).chr(0x9f),
697	);
698
699	# convert latex-style accented characters.
700
701	# remove space (if any) between \ and letter to accent (eg {\' a})
702
703	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
704
705	# remove {} around a single character (eg \'{e})
706	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
707
708	# \, is another way of doing cedilla \c
709	$text =~ s@\\,(.)@\\c $1@g;
710
711	# remove {} around a single character for special 1 letter commands -
712	# need to insert a space. Eg \v{s} -> {\v s}
713	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
714
715	# only do if the text contains a '\' character.
716	if ($text =~ m\|\\\|) {
717	# "normal" accents - ie non-alpha latex tag
718	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
719	my $tex="$1$2"; my $char="$2";
720	my $replacement=$utf8_chars{$tex};
721	if (!defined($replacement)) {
722	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
723	$replacement=$char;
724	}
725	$text =~ s/\\\Q$tex/$replacement/g;
726	}
727
728	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
729	# only do the change if immediately followed by a space, }, {, or \
730	# one letter accents ( + ss)
731	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
732	my $tex=$1;
733	my $replacement=$special_utf8_chars{$tex};
734	if (!defined($replacement)) {
735	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
736	$replacement=$tex;
737	}
738	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
739
740	}
741
742	# one letter latex accent commands that affect following letter
743	while ($text =~ m@\\([uvcH]) ([\w])@) {
744	my $tex="$1 $2"; my $char="$2";
745	my $replacement=$special_utf8_chars{$tex};
746	if (!defined($replacement)) {
747	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
748	$replacement=$char;
749	}
750	$text =~ s/\\$tex/$replacement/g;
751	}
752	}
753
754	# escape html-sensitive characters
755	$text =~ s@&@&@g;
756	$text =~ s@<@<@g;
757	$text =~ s@>@>@g;
758	$text =~ s/''/"/g; # Latex-specific
759	$text =~ s/``/"/g; # Latex-specific
760	# greenstone-specific
761	$text =~ s@\[@&\#91;@g;
762	$text =~ s@\]@&\#93;@g;
763
764	# remove latex commands
765
766	# explicitly recognised commands
767	$text =~ s@\\ldots@…@g;
768
769	# maths mode
770	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
771
772	# remove all other commands with optional arguments...
773	$text =~ s@\\\w+(\[.?\])?\s@@g;
774	# $text =~ s@\\noopsort{[^}]+\}@@g;
775	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
776
777	# remove latex groupings { } (but not \{ or \} )
778	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
779	$text =~ s/^\{//; # remove { if first char
780
781	# latex characters
782	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
783	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
784	# optional break "\-"
785	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
786	# the non-macro bits have quotes around them - we just remove quotes
787	$text =~ s/([^&])[\"\#]/$1/g;
788	}
789	# dashes. Convert (m\|n)-dash into single dash for html.
790	$text =~ s@\-\-+@\-@g;
791
792	# quoted { } chars
793	$text =~ s@\\\{@{@g;
794	$text =~ s@\\}@}@g;
795
796	# finally to protect against macro language...
797	$text =~ s@\\@\\\\@g;
798
799	return $text;
800	}
801
802
803	sub process_latex_math {
804	my $text = shift;
805
806	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
807	$text =~ s@\^\{(.*?)}@<sup>$1</sup>@g; # superscript
808	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
809	$text =~ s@\_\{(.*?)}@<sub>$1</sub>@g; # subscript
810	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
811
812	# put all other command names in italics
813	$text =~ s@\\([\w]+)@<i>$1</i>@g;
814
815	return $text;
816	}
817
818	sub add_OID {
819	my $self = shift (@_);
820	my ($doc_obj, $id, $segment_number) = @_;
821
822	if ( $self->{'key'} eq "default") {
823	$self->SUPER::add_OID(@_);
824	# $doc_obj->set_OID("$id\_$segment_number");
825	} else {
826	$doc_obj->set_OID($self->{'key'});
827	}
828	}
829
830	1;
831
832

Note: See TracBrowser for help on using the repository browser.

Download in other formats: