Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 9403

Last change on this file since 9403 was 9120, checked in by kjdon, 19 years ago
BibTex plug can do exploding - set 'explodes' to yes in xml description
Property svn:keywords set to `Author Date Id Revision`
File size: 22.5 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "{BasPlug.process_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_process_exp() },
59	{ 'name' => "split_exp",
60	'desc' => "{SplitPlug.split_exp}",
61	'type' => "regexp",
62	'deft' => &get_default_split_exp(),
63	'reqd' => "no" }
64	];
65
66	my $options = { 'name' => "BibTexPlug",
67	'desc' => "{BibTexPlug.desc}",
68	'abstract' => "no",
69	'inherits' => "yes",
70	'explodes' => "yes",
71	'args' => $arguments };
72
73	# This plugin processes files with the suffix ".bib"
74	sub get_default_process_exp {
75	return q^(?i)\.bib$^;
76	}
77
78	# This plugin splits the input text at blank lines
79	sub get_default_split_exp {
80	return q^\n+(?=@)^;
81	}
82	sub new {
83	my $class = shift (@_);
84	my $self = new SplitPlug ($class, @_);
85	$self->{'plugin_type'} = "BibTexPlug";
86	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
87	my $option_list = $self->{'option_list'};
88	push( @{$option_list}, $options );
89	return bless $self, $class;
90	}
91
92
93
94	# The process function reads a single bibliographic record and stores
95	# it as a new document.
96
97	sub process {
98	my $self = shift (@_);
99	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
100	my $outhandle = $self->{'outhandle'};
101
102	$self->{'key'} = "default";
103
104	# Check that we're dealing with a valid BibTex record
105	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
106
107	# Ignore things we can't use
108	return 0 if ($$textref =~ /^\@String/);
109
110	# Report that we're processing the file
111	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
112	print $outhandle "BibTexPlug: processing $file\n"
113	if ($self->{'verbosity'}) > 1;
114
115
116	# This hash translates BibTex field names into metadata names. The
117	# BibTex names are taken from the "Local Guide to Latex" Graeme
118	# McKinstry. Metadata names are consistent with ReferPlug.
119
120	my %field = (
121	'address', 'PublisherAddress',
122	'author', 'Creator',
123
124	'booktitle', 'Booktitle',
125	'chapter', 'Chapter',
126	'edition', 'Edition',
127	'editor', 'Editor',
128	'institution', 'Publisher',
129	'journal', 'Journal',
130	'month', 'Month',
131	'number', 'Number',
132	'pages', 'Pages',
133	'publisher', 'Publisher',
134	'school', 'Publisher',
135	'title', 'Title',
136	'volume', 'Volume',
137	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
138
139	'keywords', 'Keywords',
140	'abstract', 'Abstract',
141	'copyright', 'Copyright',
142	'note', 'Note',
143	);
144
145	# Metadata fields
146	my %metadata;
147	my ($EntryType, $Creator, $Keywords, $text);
148
149	my $verbosity = $self->{'verbosity'};
150	$verbosity = 0 unless $verbosity;
151
152	# Make sure the text has exactly one entry per line -
153	# append line to previous if it doesn't start with " <key> = "
154
155	my @input_lines=split(/\r?\n/, $$textref);
156	my @all_lines;
157	my $entry_line=shift @input_lines;
158	foreach my $input_line (@input_lines) {
159	if ($input_line =~ m/^\s\w+\s=\s*/) {
160	# this is a new key
161	push(@all_lines, $entry_line);
162	$entry_line=$input_line;
163	} else {
164	# this is a continuation of previous line
165	$entry_line .= " " . $input_line;
166	}
167
168	}
169	# add final line, removing trailing '}'
170	$entry_line =~ s/\}\s*$//;
171	push(@all_lines, $entry_line);
172	push(@all_lines, "}");
173
174	# Read and process each line in the bib file.
175	my ($entryname, $name, $value, $line);
176	foreach $line (@all_lines) {
177
178	# Add each line. Most lines consist of a field identifer and
179	# then data, and we simply store them, though we treat some
180	# of the fields a bit differently.
181
182	$line =~ s/\s+/ /g;
183	$text .= "$line\n";
184
185
186	print "Processing line = $line \n" if $verbosity>=4;
187
188	# The first line is special, it contains the reference type and OID
189	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
190	$EntryType = $1;
191	my $EntryID = (defined $2) ? $2 : "default";
192	print "** $EntryType - \"$EntryID\" \n"
193	if ($verbosity >= 4);
194
195	next;
196	}
197	if ($line =~ /\@/) {
198	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
199	if ($verbosity >= 2);
200	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
201	if ($verbosity >= 2);
202	}
203
204	# otherwise, parse the metadata out of this line
205	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
206	$entryname = lc($1);
207	$value = $2;
208	$value =~ s/,?\s*$//; # remove trailing comma and space
209	if ($value =~ /^"/ && $value =~ /"$/) {
210	# remove surrounding " marks
211	$value =~ s/^"//; $value =~ s/"$//;
212	}
213	$value = &process_latex($value);
214
215	# Add this line of metadata
216	$metadata{$entryname} .= "$value";
217
218	}
219
220	# Add the Entry type as metadata
221	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
222
223	#Add the fileformat as metadata
224	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
225
226
227	# Add the various field as metadata
228	foreach my $entryname (keys %metadata) {
229	next unless (defined $field{$entryname});
230	next unless (defined $metadata{$entryname});
231
232	$name = $field{$entryname};
233	$value = $metadata{$entryname};
234
235	if ($name =~ /^Month/) {
236	$value=expand_month($value);
237	}
238
239	# Several special operatons on metadata follow
240
241	# Add individual keywords.
242	# The full set of keywords will be added, in due course, as "Keywords".
243	# However, we also want to add them as individual "Keyword" metadata elements.
244	if ($entryname eq "keywords") {
245	my @keywordlist = split(/,/, $value);
246	foreach my $k (@keywordlist) {
247	$k = lc($k);
248	$k =~ s/\s*$//;
249	$k =~ s/^\s*//;
250	if ($k =~ /\w/) {
251	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
252	}
253	}
254	}
255
256	# Add individual authors
257	# The author metadata will be stored as one "Creator" entry, but we
258	# also want to split it into several individual "Author" fields in
259	# "Lastname, Firstnames" format so we can browse it.
260	if ($entryname eq "author") { #added also comparison with editor
261
262	# take care of "et al."...
263	$value =~ s/(\s+et\.?\s+al\.?)\s*$//;
264	my $etal=$1;
265	$etal="" if (!defined ($etal));
266	# und here for german language...
267	# don't use brackets in pattern, else the matched bit becomes
268	# an element in the list!
269	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
270	my @formattedlist = ();
271	foreach $a (@authorlist) {
272	$a =~ s/\s*$//;
273	$a =~ s/^\s*//;
274	# Reformat and add author name
275	next if $a=~ /^\s*$/;
276
277	# names are "First von Last", "von Last, First"
278	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
279	my $first="";
280	my $vonlast="";
281	my $jr="";
282
283	if ($a =~ /,/) {
284	my @parts=split(/,\s*/, $a);
285	$first = pop @parts;
286	if (scalar(@parts) == 2) {
287	$jr = pop @parts;
288	}
289	$vonlast=shift @parts;
290	if (scalar(@parts) > 0) {
291	print $outhandle "BibTexPlug: couldn't parse name $a\n";
292	# but we continue anyway...
293	}
294	} else { # First von Last
295	my @words = split(/ /, $a);
296	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
297	$first .= " " . shift (@words);
298	}
299	$first =~ s/^\s//;
300	$vonlast = join (' ', @words); # whatever's left...
301	}
302	my $von="";
303	my $last="";
304	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
305	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
306	$von = $1;
307	if (!defined ($von)) {
308	# some non-English names do start with lowercase
309	# eg "Marie desJardins". Also we can get typos...
310	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
311	$von="";
312	if ($vonlast =~ /^[a-z]+$/) {
313	# if it's all lowercase, uppercase 1st.
314	$vonlast =~ s/^(.)/\u$1/;
315
316	}
317	}
318	$von =~ s/\s*$//;
319	$last=$vonlast;
320	} else {
321	$last=$vonlast;
322	}
323	my $wholename="$first $von $last $jr";
324	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
325	push (@formattedlist, $wholename);
326	my $fullname = "$last";
327	$fullname .= " $jr" if ($jr);
328	$fullname .= ", $first";
329	$fullname .= " $von" if ($von);
330
331	# Add each name to set of Authors
332	# force utf8 pragma so that \w matches in this scope
333	use utf8;
334	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
335	}
336
337	# Only want at most one "and" in the Creator field
338	if (scalar(@formattedlist) > 2) {
339	my $lastauthor=pop @formattedlist;
340	$value=join(', ', @formattedlist);
341	$value.=" and $lastauthor";
342	} else { # 1 or 2 authors...
343	$value=join(" and ",@formattedlist);
344	}
345	$value.=$etal; # if there was "et al."
346	}
347
348	# Books and Journals are additionally marked for display purposes
349	if ($entryname eq "booktitle") {
350	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
351	} elsif ($entryname eq "journal") {
352	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
353	}
354
355
356	# Add the various fields as metadata
357	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
358	}
359
360	# Add Date (yyyymmdd) metadata
361	if (defined ($metadata{'year'}) ) {
362	my $date=$metadata{'year'};
363	chomp $date;
364	my $month=$metadata{'month'};
365	if (defined($month)) {
366	# month is currently 3 letter code or a range...
367	$month = expand_month($month);
368	# take the first month found... might not find one!
369	$month =~ m/_textmonth(\d\d)_/;
370	$month = $1;
371	}
372	if (!defined($month)) {
373	$month="00";
374	}
375	$date .= "${month}00";
376	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
377	}
378
379	# # Add the text in BibTex format (all fields)
380	if ($text =~ /\w/) {
381
382	$text =~ s@&@&@g;
383	$text =~ s@<@<@g;
384	$text =~ s@>@>@g;
385	$text =~ s@\n@<br/>\n@g;
386	$text =~ s@\\@\\\\@g;
387
388	$doc_obj->add_utf8_text ($cursection, $text);
389	}
390
391	return 1;
392	}
393
394
395
396
397	# convert email addresses and URLs into links
398	sub convert_urls_into_links{
399	my ($text) = @_;
400
401	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
402	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
403
404	return $text;
405	}
406
407	# Clean up whitespace and convert \n charaters to <BR> or <P>
408	sub clean_up_whitespaces{
409	my ($text) = @_;
410
411	$text =~ s/%%%%%/<BR> <BR>/g;
412	$text =~ s/ +/ /g;
413	$text =~ s/\s*$//;
414	$text =~ s/^\s*//;
415	$text =~ s/\n/\n<BR>/g;
416	$text =~ s/<BR>\s*<BR>/<P>/g;
417
418	return $text;
419	}
420
421
422	sub convert_problem_characters_without_ampersand{
423	my ($text) = @_;
424	$text =~ s/</</g;
425	$text =~ s/>/>/g;
426
427	$text =~ s/\'\'/\"/g; #Latex -specific conversion
428	$text =~ s/\`\`/\"/g; #Latex -specific conversion
429
430	$text =~ s/\"/"/g;
431	$text =~ s/\'/’/g;
432	$text =~ s/\`/‘/g;
433
434	# $text =~ s/\+/ /g;
435	# $text =~ s/\(/ /g;
436	# $text =~ s/\)/ /g;
437
438	$text =~ s/\\/\\\\/g;
439
440	# $text =~ s/\./\\\./g;
441
442	return $text;
443	}
444
445	# Convert a text string into HTML.
446
447	# The HTML is going to be inserted into a GML file, so we have to be
448	# careful not to use symbols like ">", which occurs frequently in email
449	# messages (and use &gt instead.
450
451	# This function also turns URLs and email addresses into links, and
452	# replaces carriage returns with <BR> tags (and multiple carriage returns
453	# with <P> tags).
454
455	sub text_into_html {
456	my ($text) = @_;
457
458	# Convert problem characters into HTML symbols
459	$text =~ s/&/&/g;
460
461	$text = &convert_problem_characters_without_ampersand( $text );
462
463	# convert email addresses and URLs into links
464	$text = &convert_urls_into_links( $text );
465
466	$text = &clean_up_whitespaces( $text );
467
468	return $text;
469	}
470
471
472	sub expand_month {
473	my $text=shift;
474
475	# bibtex style files expand abbreviations for months.
476	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
477	$text =~ s/jan/_textmonth01_/g;
478	$text =~ s/feb/_textmonth02_/g;
479	$text =~ s/mar/_textmonth03_/g;
480	$text =~ s/apr/_textmonth04_/g;
481	$text =~ s/may/_textmonth05_/g;
482	$text =~ s/jun/_textmonth06_/g;
483	$text =~ s/jul/_textmonth07_/g;
484	$text =~ s/aug/_textmonth08_/g;
485	$text =~ s/sep/_textmonth09_/g;
486	$text =~ s/oct/_textmonth10_/g;
487	$text =~ s/nov/_textmonth11_/g;
488	$text =~ s/dec/_textmonth12_/g;
489
490	return $text;
491	}
492
493
494	# Convert accented characters, remove { }, interprete some commands....
495	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
496	sub process_latex {
497	my ($text) = @_;
498
499	# note - this is really ugly, but it works. There may be a prettier way
500	# of mapping latex accented chars to utf8, but we just brute force it here.
501	# Also, this isn't complete - not every single possible accented letter
502	# is in here yet, but most of the common ones are.
503
504	my %utf8_chars =
505	(
506	# acutes
507	'\'a' => chr(0xc3).chr(0xa1),
508	'\'c' => chr(0xc4).chr(0x87),
509	'\'e' => chr(0xc3).chr(0xa9),
510	'\'i' => chr(0xc3).chr(0xad),
511	'\'l' => chr(0xc3).chr(0xba),
512	'\'n' => chr(0xc3).chr(0x84),
513	'\'o' => chr(0xc3).chr(0xb3),
514	'\'r' => chr(0xc5).chr(0x95),
515	'\'s' => chr(0xc5).chr(0x9b),
516	'\'u' => chr(0xc3).chr(0xba),
517	'\'y' => chr(0xc3).chr(0xbd),
518	'\'z' => chr(0xc5).chr(0xba),
519	# graves
520	'`a' => chr(0xc3).chr(0xa0),
521	'`A' => chr(0xc3).chr(0x80),
522	'`e' => chr(0xc3).chr(0xa8),
523	'`E' => chr(0xc3).chr(0x88),
524	'`i' => chr(0xc3).chr(0xac),
525	'`I' => chr(0xc3).chr(0x8c),
526	'`o' => chr(0xc3).chr(0xb2),
527	'`O' => chr(0xc3).chr(0x92),
528	'`u' => chr(0xc3).chr(0xb9),
529	'`U' => chr(0xc3).chr(0x99),
530	# circumflex
531	'^a' => chr(0xc3).chr(0xa2),
532	'^A' => chr(0xc3).chr(0x82),
533	'^c' => chr(0xc4).chr(0x89),
534	'^C' => chr(0xc4).chr(0x88),
535	'^e' => chr(0xc3).chr(0xaa),
536	'^E' => chr(0xc3).chr(0x8a),
537	'^g' => chr(0xc4).chr(0x9d),
538	'^G' => chr(0xc4).chr(0x9c),
539	'^h' => chr(0xc4).chr(0xa5),
540	'^H' => chr(0xc4).chr(0xa4),
541	'^i' => chr(0xc3).chr(0xae),
542	'^I' => chr(0xc3).chr(0x8e),
543	'^j' => chr(0xc4).chr(0xb5),
544	'^J' => chr(0xc4).chr(0xb4),
545	'^o' => chr(0xc3).chr(0xb4),
546	'^O' => chr(0xc3).chr(0x94),
547	'^s' => chr(0xc5).chr(0x9d),
548	'^S' => chr(0xc5).chr(0x9c),
549	'^u' => chr(0xc3).chr(0xa2),
550	'^U' => chr(0xc3).chr(0xbb),
551	'^w' => chr(0xc5).chr(0xb5),
552	'^W' => chr(0xc5).chr(0xb4),
553	'^y' => chr(0xc5).chr(0xb7),
554	'^Y' => chr(0xc5).chr(0xb6),
555
556	# diaeresis
557	'"a' => chr(0xc3).chr(0xa4),
558	'"A' => chr(0xc3).chr(0x84),
559	'"e' => chr(0xc3).chr(0xab),
560	'"E' => chr(0xc3).chr(0x8b),
561	'"\\\\i' => chr(0xc3).chr(0xaf),
562	'"\\\\I' => chr(0xc3).chr(0x8f),
563	'"o' => chr(0xc3).chr(0xb6),
564	'"O' => chr(0xc3).chr(0x96),
565	'"u' => chr(0xc3).chr(0xbc),
566	'"U' => chr(0xc3).chr(0x9c),
567	'"y' => chr(0xc3).chr(0xbf),
568	'"Y' => chr(0xc3).chr(0xb8),
569	# tilde
570	'~A' => chr(0xc3).chr(0x83),
571	'~N' => chr(0xc3).chr(0x91),
572	'~O' => chr(0xc3).chr(0x95),
573	'~a' => chr(0xc3).chr(0xa3),
574	'~n' => chr(0xc3).chr(0xb1),
575	'~o' => chr(0xc3).chr(0xb5),
576	# caron - handled specially
577	# double acute
578	# ring
579	# dot
580	'.c' => chr(0xc4).chr(0x8b),
581	'.C' => chr(0xc4).chr(0x8a),
582	'.e' => chr(0xc4).chr(0x97),
583	'.E' => chr(0xc4).chr(0x96),
584	'.g' => chr(0xc4).chr(0xa1),
585	'.G' => chr(0xc4).chr(0xa0),
586	'.I' => chr(0xc4).chr(0xb0),
587	'.z' => chr(0xc5).chr(0xbc),
588	'.Z' => chr(0xc5).chr(0xbb),
589	# macron
590	'=a' => chr(0xc4).chr(0x81),
591	'=A' => chr(0xc4).chr(0x80),
592	'=e' => chr(0xc4).chr(0x93),
593	'=E' => chr(0xc4).chr(0x92),
594	'=i' => chr(0xc4).chr(0xab),
595	'=I' => chr(0xc4).chr(0xaa),
596	'=o' => chr(0xc4).chr(0x8d),
597	'=O' => chr(0xc4).chr(0x8c),
598	'=u' => chr(0xc4).chr(0xab),
599	'=U' => chr(0xc4).chr(0xaa),
600
601	# stroke - handled specially - see below
602
603	# cedilla - handled specially
604	);
605
606	# these are one letter latex commands - we make sure they're not a longer
607	# command name. eg {\d} is d+stroke, so careful of \d
608	my %special_utf8_chars =
609	(
610	# breve
611	'u g' => chr(0xc4).chr(0x9f),
612	'u G' => chr(0xc4).chr(0x9e),
613	'u i' => chr(0xc4).chr(0xad),
614	'u I' => chr(0xc4).chr(0xac),
615	'u o' => chr(0xc5).chr(0x8f),
616	'u O' => chr(0xc5).chr(0x8e),
617	'u u' => chr(0xc5).chr(0xad),
618	'u U' => chr(0xc5).chr(0xac),
619	# caron
620	'v c' => chr(0xc4).chr(0x8d),
621	'v C' => chr(0xc4).chr(0x8c),
622	'v n' => chr(0xc5).chr(0x88),
623	'v N' => chr(0xc5).chr(0x87),
624	'v s' => chr(0xc5).chr(0xa1),
625	'v S' => chr(0xc5).chr(0xa5),
626	'v z' => chr(0xc5).chr(0xbe),
627	'v Z' => chr(0xc5).chr(0xbd),
628	# cedilla
629	'c c' => chr(0xc3).chr(0xa7),
630	'c C' => chr(0xc3).chr(0x87),
631	'c g' => chr(0xc4).chr(0xa3),
632	'c G' => chr(0xc4).chr(0xa2),
633	'c k' => chr(0xc4).chr(0xb7),
634	'c K' => chr(0xc4).chr(0xb6),
635	'c l' => chr(0xc4).chr(0xbc),
636	'c L' => chr(0xc4).chr(0xbb),
637	'c n' => chr(0xc5).chr(0x86),
638	'c N' => chr(0xc5).chr(0x85),
639	'c r' => chr(0xc5).chr(0x97),
640	'c R' => chr(0xc5).chr(0x96),
641	'c s' => chr(0xc5).chr(0x9f),
642	'c S' => chr(0xc5).chr(0x9e),
643	'c t' => chr(0xc5).chr(0xa3),
644	'c T' => chr(0xc5).chr(0xa2),
645	# double acute / Hungarian accent
646	'H O' => chr(0xc5).chr(0x90),
647	'H o' => chr(0xc5).chr(0x91),
648	'H U' => chr(0xc5).chr(0xb0),
649	'H u' => chr(0xc5).chr(0xb1),
650
651	# stroke
652	'd' => chr(0xc4).chr(0x91),
653	'D' => chr(0xc4).chr(0x90),
654	'h' => chr(0xc4).chr(0xa7),
655	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
656	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
657	'l' => chr(0xc5).chr(0x82),
658	'L' => chr(0xc5).chr(0x81),
659	'o' => chr(0xc3).chr(0xb8),
660	'O' => chr(0xc3).chr(0x98),
661	't' => chr(0xc5).chr(0xa7),
662	'T' => chr(0xc5).chr(0xa6),
663	# german ss/szlig/sharp s
664	'ss' => chr(0xc3).chr(0x9f),
665	);
666
667	# convert latex-style accented characters.
668
669	# remove space (if any) between \ and letter to accent (eg {\' a})
670
671	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
672
673	# remove {} around a single character (eg \'{e})
674	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
675
676	# \, is another way of doing cedilla \c
677	$text =~ s@\\,(.)@\\c $1@g;
678
679	# remove {} around a single character for special 1 letter commands -
680	# need to insert a space. Eg \v{s} -> {\v s}
681	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
682
683	# only do if the text contains a '\' character.
684	if ($text =~ m\|\\\|) {
685	# "normal" accents - ie non-alpha latex tag
686	while ($text =~ m@\\([`'="^~\.])([\w])@) {
687	my $tex="$1$2"; my $char="$2";
688	my $replacement=$utf8_chars{$tex};
689	if (!defined($replacement)) {
690	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
691	$replacement=$char;
692	}
693	$text =~ s/\\\Q$tex/$replacement/g;
694	}
695
696	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
697	# only do the change if immediately followed by a space, }, {, or \
698	# one letter accents ( + ss)
699	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
700	my $tex=$1;
701	my $replacement=$special_utf8_chars{$tex};
702	if (!defined($replacement)) {
703	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
704	$replacement=$tex;
705	}
706	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
707
708	}
709
710	# one letter latex accent commands that affect following letter
711	while ($text =~ m@\\([uvcH]) ([\w])@) {
712	my $tex="$1 $2"; my $char="$2";
713	my $replacement=$special_utf8_chars{$tex};
714	if (!defined($replacement)) {
715	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
716	$replacement=$char;
717	}
718	$text =~ s/\\$tex/$replacement/g;
719	}
720	}
721
722	# escape html-sensitive characters
723	$text =~ s@&@&@g;
724	$text =~ s@<@<@g;
725	$text =~ s@>@>@g;
726	$text =~ s/''/"/g; # Latex-specific
727	$text =~ s/``/"/g; # Latex-specific
728	# greenstone-specific
729	$text =~ s@\[@&\#91;@g;
730	$text =~ s@\]@&\#93;@g;
731
732	# remove latex commands
733
734	# explicitly recognised commands
735	$text =~ s@\\ldots@…@g;
736
737	# maths mode
738	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
739
740	# remove all other commands with optional arguments...
741	$text =~ s@\\\w+(\[.?\])?\s@@g;
742	# $text =~ s@\\noopsort{[^}]+\}@@g;
743	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
744
745	# remove latex groupings { } (but not \{ or \} )
746	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
747	$text =~ s/^\{//; # remove { if first char
748
749	# latex characters
750	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
751	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
752	# optional break "\-"
753	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
754	# the non-macro bits have quotes around them - we just remove quotes
755	$text =~ s/([^&])[\"\#]/$1/g;
756	}
757	# dashes. Convert (m\|n)-dash into single dash for html.
758	$text =~ s@\-\-+@\-@g;
759
760	# quoted { } chars
761	$text =~ s@\\{@{@g;
762	$text =~ s@\\}@}@g;
763
764	# finally to protect against macro language...
765	$text =~ s@\\@\\\\@g;
766
767	return $text;
768	}
769
770
771	sub process_latex_math {
772	my $text = shift;
773
774	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
775	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
776	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
777	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
778	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
779
780	# put all other command names in italics
781	$text =~ s@\\([\w]+)@<i>$1</i>@g;
782
783	return $text;
784	}
785
786	sub set_OID {
787	my $self = shift (@_);
788	my ($doc_obj, $id, $segment_number) = @_;
789
790	if ( $self->{'key'} eq "default") {
791	$doc_obj->set_OID("$id\_$segment_number");
792	} else {
793	$doc_obj->set_OID($self->{'key'});
794	}
795	}
796
797	1;
798
799

Note: See TracBrowser for help on using the repository browser.

Download in other formats: