Context Navigation

source: gsdl/trunk/perllib/plugins/BibTexPlugin.pm@ 15880

Last change on this file since 15880 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 23.6 KB

Line
1	###########################################################################
2	#
3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlugin reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlugin creates a document object for every reference a the file.
35	# It is a subclass of SplitTextFile, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlugin;
45
46	use SplitTextFile;
47	use strict;
48	no strict 'refs'; # allow filehandles to be variables and viceversa
49
50	# BibTexPlugin is a sub-class of SplitTextFile.
51	sub BEGIN {
52	@BibTexPlugin::ISA = ('SplitTextFile');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() },
61	{ 'name' => "split_exp",
62	'desc' => "{SplitTextFile.split_exp}",
63	'type' => "regexp",
64	'deft' => &get_default_split_exp(),
65	'reqd' => "no" }
66	];
67
68	my $options = { 'name' => "BibTexPlugin",
69	'desc' => "{BibTexPlugin.desc}",
70	'abstract' => "no",
71	'inherits' => "yes",
72	'explodes' => "yes",
73	'args' => $arguments };
74
75	# This plugin processes files with the suffix ".bib"
76	sub get_default_process_exp {
77	return q^(?i)\.bib$^;
78	}
79
80	# This plugin splits the input text at blank lines
81	sub get_default_split_exp {
82	return q^\n+(?=@)^;
83	}
84
85	sub new {
86	my ($class) = shift (@_);
87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88	push(@$pluginlist, $class);
89
90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91	push(@{$hashArgOptLists->{"OptList"}},$options);
92
93	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
94
95	return bless $self, $class;
96	}
97
98
99
100	# The process function reads a single bibliographic record and stores
101	# it as a new document.
102
103	sub process {
104	my $self = shift (@_);
105	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
106	my $outhandle = $self->{'outhandle'};
107
108	my $cursection = $doc_obj->get_top_section();
109	$self->{'key'} = "default";
110
111	# Check that we're dealing with a valid BibTex record
112	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
113
114	# Ignore things we can't use
115	return 0 if ($$textref =~ /^\@String/);
116
117	# Report that we're processing the file
118	print STDERR "<Processing n='$file' p='BibTexPlugin'>\n" if ($gli);
119	print $outhandle "BibTexPlugin: processing $file\n"
120	if ($self->{'verbosity'}) > 1;
121
122
123	# This hash translates BibTex field names into metadata names. The
124	# BibTex names are taken from the "Local Guide to Latex" Graeme
125	# McKinstry. Metadata names are consistent with ReferPlug.
126
127	# The author metadata will be stored as one "Creator" entry, but will
128	# also be split up into several individual "Author" fields.
129
130	my %field = (
131	'address', 'PublisherAddress',
132	'author', 'Creator',
133
134	'booktitle', 'Booktitle',
135	'chapter', 'Chapter',
136	'edition', 'Edition',
137	'editor', 'Editor',
138	'institution', 'Publisher',
139	'journal', 'Journal',
140	'month', 'Month',
141	'number', 'Number',
142	'pages', 'Pages',
143	'publisher', 'Publisher',
144	'school', 'Publisher',
145	'title', 'Title',
146	'volume', 'Volume',
147	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
148
149	'keywords', 'Keywords',
150	'abstract', 'Abstract',
151	'copyright', 'Copyright',
152	'note', 'Note',
153	'url', 'URL',
154	);
155
156	# Metadata fields
157	my %metadata;
158	my ($EntryType, $Creator, $Keywords, $text);
159
160	my $verbosity = $self->{'verbosity'};
161	$verbosity = 0 unless $verbosity;
162
163	# Make sure the text has exactly one entry per line -
164	# append line to previous if it doesn't start with " <key> = "
165
166	my @input_lines=split(/\r?\n/, $$textref);
167	my @all_lines;
168	my $entry_line=shift @input_lines;
169	foreach my $input_line (@input_lines) {
170	if ($input_line =~ m/^\s\w+\s=\s*/) {
171	# this is a new key
172	push(@all_lines, $entry_line);
173	$entry_line=$input_line;
174	} else {
175	# this is a continuation of previous line
176	$entry_line .= " " . $input_line;
177	}
178
179	}
180	# add final line, removing trailing '}'
181	$entry_line =~ s/\}\s*$//;
182	push(@all_lines, $entry_line);
183	push(@all_lines, "}");
184
185	# Read and process each line in the bib file.
186	my ($entryname, $name, $value, $line);
187	foreach $line (@all_lines) {
188
189	# Add each line. Most lines consist of a field identifer and
190	# then data, and we simply store them, though we treat some
191	# of the fields a bit differently.
192
193	$line =~ s/\s+/ /g;
194	$text .= "$line\n";
195
196
197	print "Processing line = $line \n" if $verbosity>=4;
198
199	# The first line is special, it contains the reference type and OID
200	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
201	$EntryType = $1;
202	my $EntryID = (defined $2) ? $2 : "default";
203	print "** $EntryType - \"$EntryID\" \n"
204	if ($verbosity >= 4);
205
206	next;
207	}
208	if ($line =~ /\@/) {
209	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
210	if ($verbosity >= 2);
211	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
212	if ($verbosity >= 2);
213	}
214
215	# otherwise, parse the metadata out of this line
216	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
217	$entryname = lc($1);
218	$value = $2;
219	$value =~ s/,?\s*$//; # remove trailing comma and space
220	if ($value =~ /^"/ && $value =~ /"$/) {
221	# remove surrounding " marks
222	$value =~ s/^"//; $value =~ s/"$//;
223	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
224	# remove surrounding {} marks
225	$value =~ s/^\{//; $value =~ s/\}$//;
226	}
227	# special case for year - we only want a 4 digit year, not
228	# "circa 2004" etc
229	if ($entryname=~ /^year$/i) {
230	if ($value=~ /(\d{4})/) {
231	$value=$1;
232	$metadata{$entryname} .= "$value";
233	}
234	}
235	else {
236
237	$value = &process_latex($value);
238	# Add this line of metadata
239	$metadata{$entryname} .= "$value";
240	}
241	}
242
243	# Add the Entry type as metadata
244	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
245
246	#Add the fileformat as metadata
247	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
248
249
250	# Add the various field as metadata
251	foreach my $entryname (keys %metadata) {
252	next unless (defined $field{$entryname});
253	next unless (defined $metadata{$entryname});
254
255	$name = $field{$entryname};
256	$value = $metadata{$entryname};
257
258	if ($name =~ /^Month/) {
259	$value=expand_month($value);
260	}
261
262	# Several special operatons on metadata follow
263
264	# Add individual keywords.
265	# The full set of keywords will be added, in due course, as "Keywords".
266	# However, we also want to add them as individual "Keyword" metadata elements.
267	if ($entryname eq "keywords") {
268	my @keywordlist = split(/,/, $value);
269	foreach my $k (@keywordlist) {
270	$k = lc($k);
271	$k =~ s/\s*$//;
272	$k =~ s/^\s*//;
273	if ($k =~ /\w/) {
274	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
275	}
276	}
277	}
278
279	# Add individual authors
280	# The author metadata will be stored as one "Creator" entry, but we
281	# also want to split it into several individual "Author" fields in
282	# "Lastname, Firstnames" format so we can browse it.
283	if ($entryname eq "author") { #added also comparison with editor
284
285	# take care of "et al."...
286	my $etal='';
287	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
288	$etal=' <em>et. al.</em>';
289	}
290	# und here for german language...
291	# don't use brackets in pattern, else the matched bit becomes
292	# an element in the list!
293	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
294	my @formattedlist = ();
295	foreach $a (@authorlist) {
296	$a =~ s/\s*$//;
297	$a =~ s/^\s*//;
298	# Reformat and add author name
299	next if $a=~ /^\s*$/;
300
301	# names are "First von Last", "von Last, First"
302	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
303	my $first="";
304	my $vonlast="";
305	my $jr="";
306
307	if ($a =~ /,/) {
308	my @parts=split(/,\s*/, $a);
309	$first = pop @parts;
310	if (scalar(@parts) == 2) {
311	$jr = pop @parts;
312	}
313	$vonlast=shift @parts;
314	if (scalar(@parts) > 0) {
315	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
316	# but we continue anyway...
317	}
318	} else { # First von Last
319	my @words = split(/ /, $a);
320	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
321	$first .= " " . shift (@words);
322	}
323	$first =~ s/^\s//;
324	$vonlast = join (' ', @words); # whatever's left...
325	}
326	my $von="";
327	my $last="";
328	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
329	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
330	$von = $1;
331	if (!defined ($von)) {
332	# some non-English names do start with lowercase
333	# eg "Marie desJardins". Also we can get typos...
334	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
335	$von="";
336	if ($vonlast =~ /^[a-z]+$/) {
337	# if it's all lowercase, uppercase 1st.
338	$vonlast =~ s/^(.)/\u$1/;
339
340	}
341	}
342	$von =~ s/\s*$//;
343	$last=$vonlast;
344	} else {
345	$last=$vonlast;
346	}
347	my $wholename="$first $von $last $jr";
348	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
349	$wholename =~ s/ $//;
350	push (@formattedlist, $wholename);
351	my $fullname = "$last";
352	$fullname .= " $jr" if ($jr);
353	$fullname .= ", $first";
354	$fullname .= " $von" if ($von);
355
356	# Add each name to set of Authors
357	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
358	}
359
360	# Only want at most one "and" in the Creator field
361	if (scalar(@formattedlist) > 2) {
362	my $lastauthor=pop @formattedlist;
363	$value=join(', ', @formattedlist);
364	$value.=" and $lastauthor";
365	} else { # 1 or 2 authors...
366	$value=join(" and ",@formattedlist);
367	}
368	$value.=$etal; # if there was "et al."
369	}
370
371	# Books and Journals are additionally marked for display purposes
372	if ($entryname eq "booktitle") {
373	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
374	} elsif ($entryname eq "journal") {
375	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
376	}
377
378	# Add the various fields as metadata
379	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
380
381	}
382
383	# for books and journals...
384	if (!exists $metadata{'title'}) {
385	my $name=$field{'title'}; # get Greenstone metadata name
386	my $value;
387	if (exists $metadata{'booktitle'}) {
388	$value=$metadata{'booktitle'};
389	} elsif (exists $metadata{'journal'}) {
390	$value=$metadata{'journal'};
391	}
392	if ($value) {
393	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
394	}
395	}
396
397	# Add Date (yyyymmdd) metadata
398	if (defined ($metadata{'year'}) ) {
399	my $date=$metadata{'year'};
400	chomp $date;
401	my $month=$metadata{'month'};
402	if (defined($month)) {
403	# month is currently 3 letter code or a range...
404	$month = expand_month($month);
405	# take the first month found... might not find one!
406	$month =~ m/_textmonth(\d\d)_/;
407	$month = $1;
408	}
409	if (!defined($month)) {
410	$month="00";
411	}
412	$date .= "${month}00";
413	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
414	}
415
416	# # Add the text in BibTex format (all fields)
417	if ($text =~ /\w/) {
418
419	$text =~ s@&@&@g;
420	$text =~ s@<@<@g;
421	$text =~ s@>@>@g;
422	$text =~ s@\n@<br/>\n@g;
423	$text =~ s@\\@\\\\@g;
424
425	$doc_obj->add_utf8_text ($cursection, $text);
426	}
427
428	return 1;
429	}
430
431
432
433
434	# convert email addresses and URLs into links
435	sub convert_urls_into_links{
436	my ($text) = @_;
437
438	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
439	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
440
441	return $text;
442	}
443
444	# Clean up whitespace and convert \n charaters to <BR> or <P>
445	sub clean_up_whitespaces{
446	my ($text) = @_;
447
448	$text =~ s/%%%%%/<BR> <BR>/g;
449	$text =~ s/ +/ /g;
450	$text =~ s/\s*$//;
451	$text =~ s/^\s*//;
452	$text =~ s/\n/\n<BR>/g;
453	$text =~ s/<BR>\s*<BR>/<P>/g;
454
455	return $text;
456	}
457
458
459	sub convert_problem_characters_without_ampersand{
460	my ($text) = @_;
461	$text =~ s/</</g;
462	$text =~ s/>/>/g;
463
464	$text =~ s/\'\'/\"/g; #Latex -specific conversion
465	$text =~ s/\`\`/\"/g; #Latex -specific conversion
466
467	$text =~ s/\"/"/g;
468	$text =~ s/\'/’/g;
469	$text =~ s/\`/‘/g;
470
471	# $text =~ s/\+/ /g;
472	# $text =~ s/\(/ /g;
473	# $text =~ s/\)/ /g;
474
475	$text =~ s/\\/\\\\/g;
476
477	# $text =~ s/\./\\\./g;
478
479	return $text;
480	}
481
482	# Convert a text string into HTML.
483
484	# The HTML is going to be inserted into a GML file, so we have to be
485	# careful not to use symbols like ">", which occurs frequently in email
486	# messages (and use &gt instead.
487
488	# This function also turns URLs and email addresses into links, and
489	# replaces carriage returns with <BR> tags (and multiple carriage returns
490	# with <P> tags).
491
492	sub text_into_html {
493	my ($text) = @_;
494
495	# Convert problem characters into HTML symbols
496	$text =~ s/&/&/g;
497
498	$text = &convert_problem_characters_without_ampersand( $text );
499
500	# convert email addresses and URLs into links
501	$text = &convert_urls_into_links( $text );
502
503	$text = &clean_up_whitespaces( $text );
504
505	return $text;
506	}
507
508
509	sub expand_month {
510	my $text=shift;
511
512	# bibtex style files expand abbreviations for months.
513	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
514	$text =~ s/jan/_textmonth01_/g;
515	$text =~ s/feb/_textmonth02_/g;
516	$text =~ s/mar/_textmonth03_/g;
517	$text =~ s/apr/_textmonth04_/g;
518	$text =~ s/may/_textmonth05_/g;
519	$text =~ s/jun/_textmonth06_/g;
520	$text =~ s/jul/_textmonth07_/g;
521	$text =~ s/aug/_textmonth08_/g;
522	$text =~ s/sep/_textmonth09_/g;
523	$text =~ s/oct/_textmonth10_/g;
524	$text =~ s/nov/_textmonth11_/g;
525	$text =~ s/dec/_textmonth12_/g;
526
527	return $text;
528	}
529
530
531	# Convert accented characters, remove { }, interprete some commands....
532	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
533	sub process_latex {
534	my ($text) = @_;
535
536	# note - this is really ugly, but it works. There may be a prettier way
537	# of mapping latex accented chars to utf8, but we just brute force it here.
538	# Also, this isn't complete - not every single possible accented letter
539	# is in here yet, but most of the common ones are.
540
541	my %utf8_chars =
542	(
543	# acutes
544	'\'a' => chr(0xc3).chr(0xa1),
545	'\'c' => chr(0xc4).chr(0x87),
546	'\'e' => chr(0xc3).chr(0xa9),
547	'\'i' => chr(0xc3).chr(0xad),
548	'\'l' => chr(0xc3).chr(0xba),
549	'\'n' => chr(0xc3).chr(0x84),
550	'\'o' => chr(0xc3).chr(0xb3),
551	'\'r' => chr(0xc5).chr(0x95),
552	'\'s' => chr(0xc5).chr(0x9b),
553	'\'u' => chr(0xc3).chr(0xba),
554	'\'y' => chr(0xc3).chr(0xbd),
555	'\'z' => chr(0xc5).chr(0xba),
556	# graves
557	'`a' => chr(0xc3).chr(0xa0),
558	'`A' => chr(0xc3).chr(0x80),
559	'`e' => chr(0xc3).chr(0xa8),
560	'`E' => chr(0xc3).chr(0x88),
561	'`i' => chr(0xc3).chr(0xac),
562	'`I' => chr(0xc3).chr(0x8c),
563	'`o' => chr(0xc3).chr(0xb2),
564	'`O' => chr(0xc3).chr(0x92),
565	'`u' => chr(0xc3).chr(0xb9),
566	'`U' => chr(0xc3).chr(0x99),
567	# circumflex
568	'^a' => chr(0xc3).chr(0xa2),
569	'^A' => chr(0xc3).chr(0x82),
570	'^c' => chr(0xc4).chr(0x89),
571	'^C' => chr(0xc4).chr(0x88),
572	'^e' => chr(0xc3).chr(0xaa),
573	'^E' => chr(0xc3).chr(0x8a),
574	'^g' => chr(0xc4).chr(0x9d),
575	'^G' => chr(0xc4).chr(0x9c),
576	'^h' => chr(0xc4).chr(0xa5),
577	'^H' => chr(0xc4).chr(0xa4),
578	'^i' => chr(0xc3).chr(0xae),
579	'^I' => chr(0xc3).chr(0x8e),
580	'^j' => chr(0xc4).chr(0xb5),
581	'^J' => chr(0xc4).chr(0xb4),
582	'^o' => chr(0xc3).chr(0xb4),
583	'^O' => chr(0xc3).chr(0x94),
584	'^s' => chr(0xc5).chr(0x9d),
585	'^S' => chr(0xc5).chr(0x9c),
586	'^u' => chr(0xc3).chr(0xa2),
587	'^U' => chr(0xc3).chr(0xbb),
588	'^w' => chr(0xc5).chr(0xb5),
589	'^W' => chr(0xc5).chr(0xb4),
590	'^y' => chr(0xc5).chr(0xb7),
591	'^Y' => chr(0xc5).chr(0xb6),
592
593	# diaeresis
594	'"a' => chr(0xc3).chr(0xa4),
595	'"A' => chr(0xc3).chr(0x84),
596	'"e' => chr(0xc3).chr(0xab),
597	'"E' => chr(0xc3).chr(0x8b),
598	'"\\\\i' => chr(0xc3).chr(0xaf),
599	'"\\\\I' => chr(0xc3).chr(0x8f),
600	'"o' => chr(0xc3).chr(0xb6),
601	'"O' => chr(0xc3).chr(0x96),
602	'"u' => chr(0xc3).chr(0xbc),
603	'"U' => chr(0xc3).chr(0x9c),
604	'"y' => chr(0xc3).chr(0xbf),
605	'"Y' => chr(0xc3).chr(0xb8),
606	# tilde
607	'~A' => chr(0xc3).chr(0x83),
608	'~N' => chr(0xc3).chr(0x91),
609	'~O' => chr(0xc3).chr(0x95),
610	'~a' => chr(0xc3).chr(0xa3),
611	'~n' => chr(0xc3).chr(0xb1),
612	'~o' => chr(0xc3).chr(0xb5),
613	# caron - handled specially
614	# double acute
615	# ring
616	# dot
617	'.c' => chr(0xc4).chr(0x8b),
618	'.C' => chr(0xc4).chr(0x8a),
619	'.e' => chr(0xc4).chr(0x97),
620	'.E' => chr(0xc4).chr(0x96),
621	'.g' => chr(0xc4).chr(0xa1),
622	'.G' => chr(0xc4).chr(0xa0),
623	'.I' => chr(0xc4).chr(0xb0),
624	'.z' => chr(0xc5).chr(0xbc),
625	'.Z' => chr(0xc5).chr(0xbb),
626	# macron
627	'=a' => chr(0xc4).chr(0x81),
628	'=A' => chr(0xc4).chr(0x80),
629	'=e' => chr(0xc4).chr(0x93),
630	'=E' => chr(0xc4).chr(0x92),
631	'=i' => chr(0xc4).chr(0xab),
632	'=I' => chr(0xc4).chr(0xaa),
633	'=o' => chr(0xc4).chr(0x8d),
634	'=O' => chr(0xc4).chr(0x8c),
635	'=u' => chr(0xc4).chr(0xab),
636	'=U' => chr(0xc4).chr(0xaa),
637
638	# stroke - handled specially - see below
639
640	# cedilla - handled specially
641	);
642
643	# these are one letter latex commands - we make sure they're not a longer
644	# command name. eg {\d} is d+stroke, so careful of \d
645	my %special_utf8_chars =
646	(
647	# breve
648	'u g' => chr(0xc4).chr(0x9f),
649	'u G' => chr(0xc4).chr(0x9e),
650	'u i' => chr(0xc4).chr(0xad),
651	'u I' => chr(0xc4).chr(0xac),
652	'u o' => chr(0xc5).chr(0x8f),
653	'u O' => chr(0xc5).chr(0x8e),
654	'u u' => chr(0xc5).chr(0xad),
655	'u U' => chr(0xc5).chr(0xac),
656	# caron
657	'v c' => chr(0xc4).chr(0x8d),
658	'v C' => chr(0xc4).chr(0x8c),
659	'v n' => chr(0xc5).chr(0x88),
660	'v N' => chr(0xc5).chr(0x87),
661	'v s' => chr(0xc5).chr(0xa1),
662	'v S' => chr(0xc5).chr(0xa5),
663	'v z' => chr(0xc5).chr(0xbe),
664	'v Z' => chr(0xc5).chr(0xbd),
665	# cedilla
666	'c c' => chr(0xc3).chr(0xa7),
667	'c C' => chr(0xc3).chr(0x87),
668	'c g' => chr(0xc4).chr(0xa3),
669	'c G' => chr(0xc4).chr(0xa2),
670	'c k' => chr(0xc4).chr(0xb7),
671	'c K' => chr(0xc4).chr(0xb6),
672	'c l' => chr(0xc4).chr(0xbc),
673	'c L' => chr(0xc4).chr(0xbb),
674	'c n' => chr(0xc5).chr(0x86),
675	'c N' => chr(0xc5).chr(0x85),
676	'c r' => chr(0xc5).chr(0x97),
677	'c R' => chr(0xc5).chr(0x96),
678	'c s' => chr(0xc5).chr(0x9f),
679	'c S' => chr(0xc5).chr(0x9e),
680	'c t' => chr(0xc5).chr(0xa3),
681	'c T' => chr(0xc5).chr(0xa2),
682	# double acute / Hungarian accent
683	'H O' => chr(0xc5).chr(0x90),
684	'H o' => chr(0xc5).chr(0x91),
685	'H U' => chr(0xc5).chr(0xb0),
686	'H u' => chr(0xc5).chr(0xb1),
687
688	# stroke
689	'd' => chr(0xc4).chr(0x91),
690	'D' => chr(0xc4).chr(0x90),
691	'h' => chr(0xc4).chr(0xa7),
692	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
693	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
694	'l' => chr(0xc5).chr(0x82),
695	'L' => chr(0xc5).chr(0x81),
696	'o' => chr(0xc3).chr(0xb8),
697	'O' => chr(0xc3).chr(0x98),
698	't' => chr(0xc5).chr(0xa7),
699	'T' => chr(0xc5).chr(0xa6),
700	# german ss/szlig/sharp s
701	'ss' => chr(0xc3).chr(0x9f),
702	);
703
704	# convert latex-style accented characters.
705
706	# remove space (if any) between \ and letter to accent (eg {\' a})
707
708	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
709
710	# remove {} around a single character (eg \'{e})
711	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
712
713	# \, is another way of doing cedilla \c
714	$text =~ s@\\,(.)@\\c $1@g;
715
716	# remove {} around a single character for special 1 letter commands -
717	# need to insert a space. Eg \v{s} -> {\v s}
718	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
719
720	# only do if the text contains a '\' character.
721	if ($text =~ m\|\\\|) {
722	# "normal" accents - ie non-alpha latex tag
723	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
724	my $tex="$1$2"; my $char="$2";
725	my $replacement=$utf8_chars{$tex};
726	if (!defined($replacement)) {
727	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
728	$replacement=$char;
729	}
730	$text =~ s/\\\Q$tex/$replacement/g;
731	}
732
733	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
734	# only do the change if immediately followed by a space, }, {, or \
735	# one letter accents ( + ss)
736	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
737	my $tex=$1;
738	my $replacement=$special_utf8_chars{$tex};
739	if (!defined($replacement)) {
740	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
741	$replacement=$tex;
742	}
743	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
744
745	}
746
747	# one letter latex accent commands that affect following letter
748	while ($text =~ m@\\([uvcH]) ([\w])@) {
749	my $tex="$1 $2"; my $char="$2";
750	my $replacement=$special_utf8_chars{$tex};
751	if (!defined($replacement)) {
752	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
753	$replacement=$char;
754	}
755	$text =~ s/\\$tex/$replacement/g;
756	}
757	}
758
759	# escape html-sensitive characters
760	$text =~ s@&@&@g;
761	$text =~ s@<@<@g;
762	$text =~ s@>@>@g;
763	$text =~ s/''/"/g; # Latex-specific
764	$text =~ s/``/"/g; # Latex-specific
765	# greenstone-specific
766	$text =~ s@\[@&\#91;@g;
767	$text =~ s@\]@&\#93;@g;
768
769	# remove latex commands
770
771	# explicitly recognised commands
772	$text =~ s@\\ldots@…@g;
773
774	# maths mode
775	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
776
777	# remove all other commands with optional arguments...
778	$text =~ s@\\\w+(\[.?\])?\s@@g;
779	# $text =~ s@\\noopsort{[^}]+\}@@g;
780	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
781
782	# remove latex groupings { } (but not \{ or \} )
783	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
784	$text =~ s/^\{//; # remove { if first char
785
786	# latex characters
787	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
788	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
789	# optional break "\-"
790	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
791	# the non-macro bits have quotes around them - we just remove quotes
792	$text =~ s/([^&])[\"\#]/$1/g;
793	}
794	# dashes. Convert (m\|n)-dash into single dash for html.
795	$text =~ s@\-\-+@\-@g;
796
797	# quoted { } chars
798	$text =~ s@\\{@{@g;
799	$text =~ s@\\}@}@g;
800
801	# finally to protect against macro language...
802	$text =~ s@\\@\\\\@g;
803
804	return $text;
805	}
806
807
808	sub process_latex_math {
809	my $text = shift;
810
811	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
812	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
813	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
814	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
815	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
816
817	# put all other command names in italics
818	$text =~ s@\\([\w]+)@<i>$1</i>@g;
819
820	return $text;
821	}
822
823	sub set_OID {
824	my $self = shift (@_);
825	my ($doc_obj, $id, $segment_number) = @_;
826
827	if ( $self->{'key'} eq "default") {
828	$doc_obj->set_OID("$id\_$segment_number");
829	} else {
830	$doc_obj->set_OID($self->{'key'});
831	}
832	}
833
834	1;
835
836

Note: See TracBrowser for help on using the repository browser.

Download in other formats: