Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 4785

Last change on this file since 4785 was 4744, checked in by mdewsnip, 21 years ago
Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.
Property svn:keywords set to `Author Date Id Revision`
File size: 22.4 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
56	'type' => "string",
57	'reqd' => "no" ,
58	'deft' => q^(?i)\.bib$^ } ];
59
60	my $options = { 'name' => "BibTexPlug",
61	'desc' => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
62	'inherits' => "Yes",
63	'args' => $arguments };
64
65	# This plugin processes files with the suffix ".bib"
66	sub get_default_process_exp {
67	return q^(?i)\.bib$^;
68	}
69
70	# This plugin splits the input text at blank lines
71	sub get_default_split_exp {
72	return q^\n+(?=@)^;
73	}
74	sub new {
75	my $class = shift (@_);
76	my $self = new SplitPlug ($class, @_);
77	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
78	my $option_list = $self->{'option_list'};
79	push( @{$option_list}, $options );
80	return bless $self, $class;
81	}
82
83
84
85	# The process function reads a single bibliographic record and stores
86	# it as a new document.
87
88	sub process {
89	my $self = shift (@_);
90	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
91	my $outhandle = $self->{'outhandle'};
92
93	$self->{'key'} = "default";
94
95	# Check that we're dealing with a valid BibTex record
96	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
97
98	# Ignore things we can't use
99	return 0 if ($$textref =~ /^\@String/);
100
101	# Report that we're processing the file
102	print $outhandle "BibTexPlug: processing $file\n"
103	if ($self->{'verbosity'}) > 1;
104
105
106	# This hash translates BibTex field names into metadata names. The
107	# BibTex names are taken from the "Local Guide to Latex" Graeme
108	# McKinstry. Metadata names are consistent with ReferPlug.
109
110	my %field = (
111	'address', 'PublisherAddress',
112	'author', 'Creator',
113
114	'booktitle', 'Booktitle',
115	'chapter', 'Chapter',
116	'edition', 'Edition',
117	'editor', 'Editor',
118	'institution', 'Publisher',
119	'journal', 'Journal',
120	'month', 'Month',
121	'number', 'Number',
122	'pages', 'Pages',
123	'publisher', 'Publisher',
124	'school', 'Publisher',
125	'title', 'Title',
126	'volume', 'Volume',
127	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
128
129	'keywords', 'Keywords',
130	'abstract', 'Abstract',
131	'copyright', 'Copyright'
132	);
133
134	# Metadata fields
135	my %metadata;
136	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
137
138	my $verbosity = $self->{'verbosity'};
139	$verbosity = 0 unless $verbosity;
140
141	# Make sure the text has exactly one entry per line -
142	# append line to previous if it doesn't start with " <key> = "
143
144	my @input_lines=split('\n', $$textref);
145	my @all_lines;
146	my $entry_line=shift @input_lines;
147	foreach my $input_line (@input_lines) {
148	if ($input_line =~ m/^\s\w+\s=\s*/) {
149	# this is a new key
150	push(@all_lines, $entry_line);
151	$entry_line=$input_line;
152	} else {
153	# this is a continuation of previous line
154	$entry_line .= " " . $input_line;
155	}
156
157	}
158	# add final line, removing trailing '}'
159	$entry_line =~ s/\}\s*$//;
160	push(@all_lines, $entry_line);
161	push(@all_lines, "}");
162
163
164
165	# Read and process each line in the bib file.
166	my ($entryname, $name, $value, $line);
167	foreach $line (@all_lines) {
168
169	# Add each line. Most lines consist of a field identifer and
170	# then data, and we simply store them, though we treat some
171	# of the fields a bit differently.
172
173	$line =~ s/\s+/ /g;
174	$text .= "$line\n";
175
176	print "Processing line = $line \n" if $verbosity>=4;
177
178	# The first line is special, it contains the reference type and OID
179	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
180	$EntryType = $1;
181	$EntryID = $2;
182	print "** $EntryType - \"$EntryID\" \n"
183	if ($verbosity >= 4);
184	$self->{'key'} = $EntryID;
185	next;
186	}
187	if ($line =~ /\@/) {
188	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
189	if ($verbosity >= 2);
190	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
191	if ($verbosity >= 2);
192	}
193
194	# otherwise, parse the metadata out of this line
195	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
196	$entryname = lc($1);
197	$value = $2;
198	$value =~ s/,?\s*$//; # remove trailing comma and space
199	if ($value =~ /^"/ && $value =~ /"$/) {
200	# remove surrounding " marks
201	$value =~ s/^"//; $value =~ s/"$//;
202	}
203	$value = &process_latex($value);
204
205	# Add this line of metadata
206	$metadata{$entryname} .= "$value";
207
208	}
209
210	# Add the Entry type as metadata
211	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
212
213	# Add the various field as metadata
214	foreach my $entryname (keys %metadata) {
215	next unless (defined $field{$entryname});
216	next unless (defined $metadata{$entryname});
217
218	$name = $field{$entryname};
219	$value = $metadata{$entryname};
220
221	if ($name =~ /^Month/) {
222	$value=expand_month($value);
223	}
224
225	# Several special operatons on metadata follow
226
227	# Add individual keywords.
228	# The full set of keywords will be added, in due course, as "Keywords".
229	# However, we also want to add them as individual "Keyword" metadata elements.
230	if ($entryname eq "keywords") {
231	my @keywordlist = split(/,/, $value);
232	foreach my $k (@keywordlist) {
233	$k = lc($k);
234	$k =~ s/\s*$//;
235	$k =~ s/^\s*//;
236	if ($k =~ /\w/) {
237	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
238	}
239	}
240	}
241
242	# Add individual authors
243	# The author metadata will be stored as one "Creator" entry, but we
244	# also want to split it into several individual "Author" fields in
245	# "Lastname, Firstnames" format so we can browse it.
246	if ($entryname eq "author") { #added also comparison with editor
247
248	# take care of "et al."...
249	$value =~ s/(\s+et\.?\s+al\.?)\s*$//;
250	my $etal=$1;
251	$etal="" if (!defined ($etal));
252	# und here for german language...
253	# don't use brackets in pattern, else the matched bit becomes
254	# an element in the list!
255	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
256	my @formattedlist = ();
257	foreach $a (@authorlist) {
258	$a =~ s/\s*$//;
259	$a =~ s/^\s*//;
260	# Reformat and add author name
261	next if $a=~ /^\s*$/;
262
263	# names are "First von Last", "von Last, First"
264	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
265	my $first="";
266	my $vonlast="";
267	my $jr="";
268
269	if ($a =~ /,/) {
270	my @parts=split(/,\s*/, $a);
271	$first = pop @parts;
272	if (scalar(@parts) == 2) {
273	$jr = pop @parts;
274	}
275	$vonlast=shift @parts;
276	if (scalar(@parts) > 0) {
277	print $outhandle "BibTexPlug: couldn't parse name $a\n";
278	# but we continue anyway...
279	}
280	} else { # First von Last
281	my @words = split(/ /, $a);
282	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
283	$first .= " " . shift (@words);
284	}
285	$first =~ s/^\s//;
286	$vonlast = join (' ', @words); # whatever's left...
287	}
288	my $von="";
289	my $last="";
290	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
291	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
292	$von = $1;
293	if (!defined ($von)) {
294	# some non-English names do start with lowercase
295	# eg "Marie desJardins". Also we can get typos...
296	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
297	$von="";
298	if ($vonlast =~ /^[a-z]+$/) {
299	# if it's all lowercase, uppercase 1st.
300	$vonlast =~ s/^(.)/\u$1/;
301
302	}
303	}
304	$von =~ s/\s*$//;
305	$last=$vonlast;
306	} else {
307	$last=$vonlast;
308	}
309	my $wholename="$first $von $last $jr";
310	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
311	push (@formattedlist, $wholename);
312	my $fullname = "$last";
313	$fullname .= " $jr" if ($jr);
314	$fullname .= ", $first";
315	$fullname .= " $von" if ($von);
316
317	# Add each name to set of Authors
318	# force utf8 pragma so that \w matches in this scope
319	use utf8;
320	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
321	}
322
323	# Only want at most one "and" in the Creator field
324	if (scalar(@formattedlist) > 2) {
325	my $lastauthor=pop @formattedlist;
326	$value=join(', ', @formattedlist);
327	$value.=" and $lastauthor";
328	} else { # 1 or 2 authors...
329	$value=join(" and ",@formattedlist);
330	}
331	$value.=$etal; # if there was "et al."
332	}
333
334	# Books and Journals are additionally marked for display purposes
335	if ($entryname eq "booktitle") {
336	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
337	} elsif ($entryname eq "journal") {
338	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
339	}
340
341
342	# Add the various fields as metadata
343	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
344	}
345
346	# Add Date (yyyymmdd) metadata
347	if (defined ($metadata{'year'}) ) {
348	my $date=$metadata{'year'};
349	chomp $date;
350	my $month=$metadata{'month'};
351	if (defined($month)) {
352	# month is currently 3 letter code or a range...
353	$month = expand_month($month);
354	# take the first month found... might not find one!
355	$month =~ m/_textmonth(\d\d)_/;
356	$month = $1;
357	}
358	if (!defined($month)) {
359	$month="00";
360	}
361	$date .= "${month}00";
362	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
363	}
364
365	# # Add the text in BibTex format (all fields)
366	if ($text =~ /\w/) {
367
368	$text =~ s@&@&@g;
369	$text =~ s@<@<@g;
370	$text =~ s@>@>@g;
371	$text =~ s@\n@<br/>\n@g;
372	$text =~ s@\\@\\\\@g;
373
374	$doc_obj->add_utf8_text ($cursection, $text);
375	}
376
377	return 1;
378	}
379
380
381
382
383	# convert email addresses and URLs into links
384	sub convert_urls_into_links{
385	my ($text) = @_;
386
387	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
388	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
389
390	return $text;
391	}
392
393	# Clean up whitespace and convert \n charaters to <BR> or <P>
394	sub clean_up_whitespaces{
395	my ($text) = @_;
396
397	$text =~ s/%%%%%/<BR> <BR>/g;
398	$text =~ s/ +/ /g;
399	$text =~ s/\s*$//;
400	$text =~ s/^\s*//;
401	$text =~ s/\n/\n<BR>/g;
402	$text =~ s/<BR>\s*<BR>/<P>/g;
403
404	return $text;
405	}
406
407
408	sub convert_problem_characters_without_ampersand{
409	my ($text) = @_;
410	$text =~ s/</</g;
411	$text =~ s/>/>/g;
412
413	$text =~ s/\'\'/\"/g; #Latex -specific conversion
414	$text =~ s/\`\`/\"/g; #Latex -specific conversion
415
416	$text =~ s/\"/"/g;
417	$text =~ s/\'/’/g;
418	$text =~ s/\`/‘/g;
419
420	# $text =~ s/\+/ /g;
421	# $text =~ s/\(/ /g;
422	# $text =~ s/\)/ /g;
423
424	$text =~ s/\\/\\\\/g;
425
426	# $text =~ s/\./\\\./g;
427
428	return $text;
429	}
430
431	# Convert a text string into HTML.
432
433	# The HTML is going to be inserted into a GML file, so we have to be
434	# careful not to use symbols like ">", which occurs frequently in email
435	# messages (and use &gt instead.
436
437	# This function also turns URLs and email addresses into links, and
438	# replaces carriage returns with <BR> tags (and multiple carriage returns
439	# with <P> tags).
440
441	sub text_into_html {
442	my ($text) = @_;
443
444	# Convert problem characters into HTML symbols
445	$text =~ s/&/&/g;
446
447	$text = &convert_problem_characters_without_ampersand( $text );
448
449	# convert email addresses and URLs into links
450	$text = &convert_urls_into_links( $text );
451
452	$text = &clean_up_whitespaces( $text );
453
454	return $text;
455	}
456
457
458	sub expand_month {
459	my $text=shift;
460
461	# bibtex style files expand abbreviations for months.
462	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
463	$text =~ s/jan/_textmonth01_/g;
464	$text =~ s/feb/_textmonth02_/g;
465	$text =~ s/mar/_textmonth03_/g;
466	$text =~ s/apr/_textmonth04_/g;
467	$text =~ s/may/_textmonth05_/g;
468	$text =~ s/jun/_textmonth06_/g;
469	$text =~ s/jul/_textmonth07_/g;
470	$text =~ s/aug/_textmonth08_/g;
471	$text =~ s/sep/_textmonth09_/g;
472	$text =~ s/oct/_textmonth10_/g;
473	$text =~ s/nov/_textmonth11_/g;
474	$text =~ s/dec/_textmonth12_/g;
475
476	return $text;
477	}
478
479
480	# Convert accented characters, remove { }, interprete some commands....
481	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
482	sub process_latex {
483	my ($text) = @_;
484
485	# note - this is really ugly, but it works. There may be a prettier way
486	# of mapping latex accented chars to utf8, but we just brute force it here.
487	# Also, this isn't complete - not every single possible accented letter
488	# is in here yet, but most of the common ones are.
489
490	my %utf8_chars =
491	(
492	# acutes
493	'\'a' => chr(0xc3).chr(0xa1),
494	'\'c' => chr(0xc4).chr(0x87),
495	'\'e' => chr(0xc3).chr(0xa9),
496	'\'i' => chr(0xc3).chr(0xad),
497	'\'l' => chr(0xc3).chr(0xba),
498	'\'n' => chr(0xc3).chr(0x84),
499	'\'o' => chr(0xc3).chr(0xb3),
500	'\'r' => chr(0xc5).chr(0x95),
501	'\'s' => chr(0xc5).chr(0x9b),
502	'\'u' => chr(0xc3).chr(0xba),
503	'\'y' => chr(0xc3).chr(0xbd),
504	'\'z' => chr(0xc5).chr(0xba),
505	# graves
506	'`a' => chr(0xc3).chr(0xa0),
507	'`A' => chr(0xc3).chr(0x80),
508	'`e' => chr(0xc3).chr(0xa8),
509	'`E' => chr(0xc3).chr(0x88),
510	'`i' => chr(0xc3).chr(0xac),
511	'`I' => chr(0xc3).chr(0x8c),
512	'`o' => chr(0xc3).chr(0xb2),
513	'`O' => chr(0xc3).chr(0x92),
514	'`u' => chr(0xc3).chr(0xb9),
515	'`U' => chr(0xc3).chr(0x99),
516	# circumflex
517	'^a' => chr(0xc3).chr(0xa2),
518	'^A' => chr(0xc3).chr(0x82),
519	'^c' => chr(0xc4).chr(0x89),
520	'^C' => chr(0xc4).chr(0x88),
521	'^e' => chr(0xc3).chr(0xaa),
522	'^E' => chr(0xc3).chr(0x8a),
523	'^g' => chr(0xc4).chr(0x9d),
524	'^G' => chr(0xc4).chr(0x9c),
525	'^h' => chr(0xc4).chr(0xa5),
526	'^H' => chr(0xc4).chr(0xa4),
527	'^i' => chr(0xc3).chr(0xae),
528	'^I' => chr(0xc3).chr(0x8e),
529	'^j' => chr(0xc4).chr(0xb5),
530	'^J' => chr(0xc4).chr(0xb4),
531	'^o' => chr(0xc3).chr(0xb4),
532	'^O' => chr(0xc3).chr(0x94),
533	'^s' => chr(0xc5).chr(0x9d),
534	'^S' => chr(0xc5).chr(0x9c),
535	'^u' => chr(0xc3).chr(0xa2),
536	'^U' => chr(0xc3).chr(0xbb),
537	'^w' => chr(0xc5).chr(0xb5),
538	'^W' => chr(0xc5).chr(0xb4),
539	'^y' => chr(0xc5).chr(0xb7),
540	'^Y' => chr(0xc5).chr(0xb6),
541
542	# diaeresis
543	'"a' => chr(0xc3).chr(0xa4),
544	'"A' => chr(0xc3).chr(0x84),
545	'"e' => chr(0xc3).chr(0xab),
546	'"E' => chr(0xc3).chr(0x8b),
547	'"\\\\i' => chr(0xc3).chr(0xaf),
548	'"\\\\I' => chr(0xc3).chr(0x8f),
549	'"o' => chr(0xc3).chr(0xb6),
550	'"O' => chr(0xc3).chr(0x96),
551	'"u' => chr(0xc3).chr(0xbc),
552	'"U' => chr(0xc3).chr(0x9c),
553	'"y' => chr(0xc3).chr(0xbf),
554	'"Y' => chr(0xc3).chr(0xb8),
555	# tilde
556	'~A' => chr(0xc3).chr(0x83),
557	'~N' => chr(0xc3).chr(0x91),
558	'~O' => chr(0xc3).chr(0x95),
559	'~a' => chr(0xc3).chr(0xa3),
560	'~n' => chr(0xc3).chr(0xb1),
561	'~o' => chr(0xc3).chr(0xb5),
562	# caron - handled specially
563	# double acute
564	# ring
565	# dot
566	'.c' => chr(0xc4).chr(0x8b),
567	'.C' => chr(0xc4).chr(0x8a),
568	'.e' => chr(0xc4).chr(0x97),
569	'.E' => chr(0xc4).chr(0x96),
570	'.g' => chr(0xc4).chr(0xa1),
571	'.G' => chr(0xc4).chr(0xa0),
572	'.I' => chr(0xc4).chr(0xb0),
573	'.z' => chr(0xc5).chr(0xbc),
574	'.Z' => chr(0xc5).chr(0xbb),
575	# macron
576	'=a' => chr(0xc4).chr(0x81),
577	'=A' => chr(0xc4).chr(0x80),
578	'=e' => chr(0xc4).chr(0x93),
579	'=E' => chr(0xc4).chr(0x92),
580	'=i' => chr(0xc4).chr(0xab),
581	'=I' => chr(0xc4).chr(0xaa),
582	'=o' => chr(0xc4).chr(0x8d),
583	'=O' => chr(0xc4).chr(0x8c),
584	'=u' => chr(0xc4).chr(0xab),
585	'=U' => chr(0xc4).chr(0xaa),
586
587	# stroke - handled specially - see below
588
589	# cedilla - handled specially
590	);
591
592	# these are one letter latex commands - we make sure they're not a longer
593	# command name. eg {\d} is d+stroke, so careful of \d
594	my %special_utf8_chars =
595	(
596	# breve
597	'u g' => chr(0xc4).chr(0x9f),
598	'u G' => chr(0xc4).chr(0x9e),
599	'u i' => chr(0xc4).chr(0xad),
600	'u I' => chr(0xc4).chr(0xac),
601	'u o' => chr(0xc5).chr(0x8f),
602	'u O' => chr(0xc5).chr(0x8e),
603	'u u' => chr(0xc5).chr(0xad),
604	'u U' => chr(0xc5).chr(0xac),
605	# caron
606	'v c' => chr(0xc4).chr(0x8d),
607	'v C' => chr(0xc4).chr(0x8c),
608	'v n' => chr(0xc5).chr(0x88),
609	'v N' => chr(0xc5).chr(0x87),
610	'v s' => chr(0xc5).chr(0xa1),
611	'v S' => chr(0xc5).chr(0xa5),
612	'v z' => chr(0xc5).chr(0xbe),
613	'v Z' => chr(0xc5).chr(0xbd),
614	# cedilla
615	'c c' => chr(0xc3).chr(0xa7),
616	'c C' => chr(0xc3).chr(0x87),
617	'c g' => chr(0xc4).chr(0xa3),
618	'c G' => chr(0xc4).chr(0xa2),
619	'c k' => chr(0xc4).chr(0xb7),
620	'c K' => chr(0xc4).chr(0xb6),
621	'c l' => chr(0xc4).chr(0xbc),
622	'c L' => chr(0xc4).chr(0xbb),
623	'c n' => chr(0xc5).chr(0x86),
624	'c N' => chr(0xc5).chr(0x85),
625	'c r' => chr(0xc5).chr(0x97),
626	'c R' => chr(0xc5).chr(0x96),
627	'c s' => chr(0xc5).chr(0x9f),
628	'c S' => chr(0xc5).chr(0x9e),
629	'c t' => chr(0xc5).chr(0xa3),
630	'c T' => chr(0xc5).chr(0xa2),
631	# double acute / Hungarian accent
632	'H O' => chr(0xc5).chr(0x90),
633	'H o' => chr(0xc5).chr(0x91),
634	'H U' => chr(0xc5).chr(0xb0),
635	'H u' => chr(0xc5).chr(0xb1),
636
637	# stroke
638	'd' => chr(0xc4).chr(0x91),
639	'D' => chr(0xc4).chr(0x90),
640	'h' => chr(0xc4).chr(0xa7),
641	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
642	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
643	'l' => chr(0xc5).chr(0x82),
644	'L' => chr(0xc5).chr(0x81),
645	'o' => chr(0xc3).chr(0xb8),
646	'O' => chr(0xc3).chr(0x98),
647	't' => chr(0xc5).chr(0xa7),
648	'T' => chr(0xc5).chr(0xa6),
649	# german ss/szlig/sharp s
650	'ss' => chr(0xc3).chr(0x9f),
651	);
652
653	# convert latex-style accented characters.
654
655	# remove space (if any) between \ and letter to accent (eg {\' a})
656
657	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
658
659	# remove {} around a single character (eg \'{e})
660	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
661
662	# \, is another way of doing cedilla \c
663	$text =~ s@\\,(.)@\\c $1@g;
664
665	# remove {} around a single character for special 1 letter commands -
666	# need to insert a space. Eg \v{s} -> {\v s}
667	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
668
669	# only do if the text contains a '\' character.
670	if ($text =~ m\|\\\|) {
671	# "normal" accents - ie non-alpha latex tag
672	while ($text =~ m@\\([`'="^~\.])([\w])@) {
673	my $tex="$1$2"; my $char="$2";
674	my $replacement=$utf8_chars{$tex};
675	if (!defined($replacement)) {
676	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
677	$replacement=$char;
678	}
679	$text =~ s/\\$tex/$replacement/g;
680	}
681
682	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
683	# only do the change if immediately followed by a space, }, {, or \
684	# one letter accents ( + ss)
685	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
686	my $tex=$1;
687	my $replacement=$special_utf8_chars{$tex};
688	if (!defined($replacement)) {
689	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
690	$replacement=$tex;
691	}
692	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
693
694	}
695
696	# one letter latex accent commands that affect following letter
697	while ($text =~ m@\\([uvcH]) ([\w])@) {
698	my $tex="$1 $2"; my $char="$2";
699	my $replacement=$special_utf8_chars{$tex};
700	if (!defined($replacement)) {
701	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
702	$replacement=$char;
703	}
704	$text =~ s/\\$tex/$replacement/g;
705	}
706	}
707
708	# escape html-sensitive characters
709	$text =~ s@&@&@g;
710	$text =~ s@<@<@g;
711	$text =~ s@>@>@g;
712	$text =~ s/''/"/g; # Latex-specific
713	$text =~ s/``/"/g; # Latex-specific
714	# greenstone-specific
715	$text =~ s@\[@&\#91;@g;
716	$text =~ s@\]@&\#93;@g;
717
718	# remove latex commands
719
720	# explicitly recognised commands
721	$text =~ s@\\ldots@…@g;
722
723	# maths mode
724	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
725
726	# remove all other commands with optional arguments...
727	$text =~ s@\\\w+(\[.?\])?\s@@g;
728	# $text =~ s@\\noopsort{[^}]+\}@@g;
729	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
730
731	# remove latex groupings { } (but not \{ or \} )
732	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
733	$text =~ s/^\{//; # remove { if first char
734
735	# latex characters
736	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
737	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
738	# optional break "\-"
739	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
740	# the non-macro bits have quotes around them - we just remove quotes
741	$text =~ s/([^&])[\"\#]/$1/g;
742	}
743	# dashes. Convert (m\|n)-dash into single dash for html.
744	$text =~ s@\-\-+@\-@g;
745
746	# quoted { } chars
747	$text =~ s@\\{@{@g;
748	$text =~ s@\\}@}@g;
749
750	# finally to protect against macro language...
751	$text =~ s@\\@\\\\@g;
752
753	return $text;
754	}
755
756
757	sub process_latex_math {
758	my $text = shift;
759
760	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
761	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
762	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
763	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
764	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
765
766	# put all other command names in italics
767	$text =~ s@\\([\w]+)@<i>$1</i>@g;
768
769	return $text;
770	}
771
772	sub set_OID {
773	my $self = shift (@_);
774	my ($doc_obj, $id, $segment_number) = @_;
775
776	if ( $self->{'key'} eq "default") {
777	$doc_obj->set_OID();
778	} else {
779	$doc_obj->set_OID($self->{'key'});
780	}
781	}
782
783	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: