Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago
Jeffrey's new parsing modifications, committed approx 6 July, 15.16
Property svn:keywords set to `Author Date Id Revision`
File size: 23.2 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "{BasPlug.process_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_process_exp() },
59	{ 'name' => "split_exp",
60	'desc' => "{SplitPlug.split_exp}",
61	'type' => "regexp",
62	'deft' => &get_default_split_exp(),
63	'reqd' => "no" }
64	];
65
66	my $options = { 'name' => "BibTexPlug",
67	'desc' => "{BibTexPlug.desc}",
68	'abstract' => "no",
69	'inherits' => "yes",
70	'explodes' => "yes",
71	'args' => $arguments };
72
73	# This plugin processes files with the suffix ".bib"
74	sub get_default_process_exp {
75	return q^(?i)\.bib$^;
76	}
77
78	# This plugin splits the input text at blank lines
79	sub get_default_split_exp {
80	return q^\n+(?=@)^;
81	}
82	sub new {
83	my ($class) = shift (@_);
84	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
85	push(@$pluginlist, $class);
86
87	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
88	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
89
90	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
91
92	return bless $self, $class;
93	}
94
95
96
97	# The process function reads a single bibliographic record and stores
98	# it as a new document.
99
100	sub process {
101	my $self = shift (@_);
102	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
103	my $outhandle = $self->{'outhandle'};
104
105	$self->{'key'} = "default";
106
107	# Check that we're dealing with a valid BibTex record
108	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
109
110	# Ignore things we can't use
111	return 0 if ($$textref =~ /^\@String/);
112
113	# Report that we're processing the file
114	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
115	print $outhandle "BibTexPlug: processing $file\n"
116	if ($self->{'verbosity'}) > 1;
117
118
119	# This hash translates BibTex field names into metadata names. The
120	# BibTex names are taken from the "Local Guide to Latex" Graeme
121	# McKinstry. Metadata names are consistent with ReferPlug.
122
123	# The author metadata will be stored as one "Creator" entry, but will
124	# also be split up into several individual "Author" fields.
125
126	my %field = (
127	'address', 'PublisherAddress',
128	'author', 'Creator',
129
130	'booktitle', 'Booktitle',
131	'chapter', 'Chapter',
132	'edition', 'Edition',
133	'editor', 'Editor',
134	'institution', 'Publisher',
135	'journal', 'Journal',
136	'month', 'Month',
137	'number', 'Number',
138	'pages', 'Pages',
139	'publisher', 'Publisher',
140	'school', 'Publisher',
141	'title', 'Title',
142	'volume', 'Volume',
143	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
144
145	'keywords', 'Keywords',
146	'abstract', 'Abstract',
147	'copyright', 'Copyright',
148	'note', 'Note',
149	);
150
151	# Metadata fields
152	my %metadata;
153	my ($EntryType, $Creator, $Keywords, $text);
154
155	my $verbosity = $self->{'verbosity'};
156	$verbosity = 0 unless $verbosity;
157
158	# Make sure the text has exactly one entry per line -
159	# append line to previous if it doesn't start with " <key> = "
160
161	my @input_lines=split(/\r?\n/, $$textref);
162	my @all_lines;
163	my $entry_line=shift @input_lines;
164	foreach my $input_line (@input_lines) {
165	if ($input_line =~ m/^\s\w+\s=\s*/) {
166	# this is a new key
167	push(@all_lines, $entry_line);
168	$entry_line=$input_line;
169	} else {
170	# this is a continuation of previous line
171	$entry_line .= " " . $input_line;
172	}
173
174	}
175	# add final line, removing trailing '}'
176	$entry_line =~ s/\}\s*$//;
177	push(@all_lines, $entry_line);
178	push(@all_lines, "}");
179
180	# Read and process each line in the bib file.
181	my ($entryname, $name, $value, $line);
182	foreach $line (@all_lines) {
183
184	# Add each line. Most lines consist of a field identifer and
185	# then data, and we simply store them, though we treat some
186	# of the fields a bit differently.
187
188	$line =~ s/\s+/ /g;
189	$text .= "$line\n";
190
191
192	print "Processing line = $line \n" if $verbosity>=4;
193
194	# The first line is special, it contains the reference type and OID
195	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
196	$EntryType = $1;
197	my $EntryID = (defined $2) ? $2 : "default";
198	print "** $EntryType - \"$EntryID\" \n"
199	if ($verbosity >= 4);
200
201	next;
202	}
203	if ($line =~ /\@/) {
204	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
205	if ($verbosity >= 2);
206	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
207	if ($verbosity >= 2);
208	}
209
210	# otherwise, parse the metadata out of this line
211	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
212	$entryname = lc($1);
213	$value = $2;
214	$value =~ s/,?\s*$//; # remove trailing comma and space
215	if ($value =~ /^"/ && $value =~ /"$/) {
216	# remove surrounding " marks
217	$value =~ s/^"//; $value =~ s/"$//;
218	}
219	$value = &process_latex($value);
220
221	# Add this line of metadata
222	$metadata{$entryname} .= "$value";
223
224	}
225
226	# Add the Entry type as metadata
227	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
228
229	#Add the fileformat as metadata
230	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
231
232
233	# Add the various field as metadata
234	foreach my $entryname (keys %metadata) {
235	next unless (defined $field{$entryname});
236	next unless (defined $metadata{$entryname});
237
238	$name = $field{$entryname};
239	$value = $metadata{$entryname};
240
241	if ($name =~ /^Month/) {
242	$value=expand_month($value);
243	}
244
245	# Several special operatons on metadata follow
246
247	# Add individual keywords.
248	# The full set of keywords will be added, in due course, as "Keywords".
249	# However, we also want to add them as individual "Keyword" metadata elements.
250	if ($entryname eq "keywords") {
251	my @keywordlist = split(/,/, $value);
252	foreach my $k (@keywordlist) {
253	$k = lc($k);
254	$k =~ s/\s*$//;
255	$k =~ s/^\s*//;
256	if ($k =~ /\w/) {
257	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
258	}
259	}
260	}
261
262	# Add individual authors
263	# The author metadata will be stored as one "Creator" entry, but we
264	# also want to split it into several individual "Author" fields in
265	# "Lastname, Firstnames" format so we can browse it.
266	if ($entryname eq "author") { #added also comparison with editor
267
268	# take care of "et al."...
269	my $etal='';
270	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
271	$etal=' <em>et. al.</em>';
272	}
273	# und here for german language...
274	# don't use brackets in pattern, else the matched bit becomes
275	# an element in the list!
276	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
277	my @formattedlist = ();
278	foreach $a (@authorlist) {
279	$a =~ s/\s*$//;
280	$a =~ s/^\s*//;
281	# Reformat and add author name
282	next if $a=~ /^\s*$/;
283
284	# names are "First von Last", "von Last, First"
285	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
286	my $first="";
287	my $vonlast="";
288	my $jr="";
289
290	if ($a =~ /,/) {
291	my @parts=split(/,\s*/, $a);
292	$first = pop @parts;
293	if (scalar(@parts) == 2) {
294	$jr = pop @parts;
295	}
296	$vonlast=shift @parts;
297	if (scalar(@parts) > 0) {
298	print $outhandle "BibTexPlug: couldn't parse name $a\n";
299	# but we continue anyway...
300	}
301	} else { # First von Last
302	my @words = split(/ /, $a);
303	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
304	$first .= " " . shift (@words);
305	}
306	$first =~ s/^\s//;
307	$vonlast = join (' ', @words); # whatever's left...
308	}
309	my $von="";
310	my $last="";
311	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
312	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
313	$von = $1;
314	if (!defined ($von)) {
315	# some non-English names do start with lowercase
316	# eg "Marie desJardins". Also we can get typos...
317	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
318	$von="";
319	if ($vonlast =~ /^[a-z]+$/) {
320	# if it's all lowercase, uppercase 1st.
321	$vonlast =~ s/^(.)/\u$1/;
322
323	}
324	}
325	$von =~ s/\s*$//;
326	$last=$vonlast;
327	} else {
328	$last=$vonlast;
329	}
330	my $wholename="$first $von $last $jr";
331	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
332	$wholename =~ s/ $//;
333	push (@formattedlist, $wholename);
334	my $fullname = "$last";
335	$fullname .= " $jr" if ($jr);
336	$fullname .= ", $first";
337	$fullname .= " $von" if ($von);
338
339	# Add each name to set of Authors
340	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
341	}
342
343	# Only want at most one "and" in the Creator field
344	if (scalar(@formattedlist) > 2) {
345	my $lastauthor=pop @formattedlist;
346	$value=join(', ', @formattedlist);
347	$value.=" and $lastauthor";
348	} else { # 1 or 2 authors...
349	$value=join(" and ",@formattedlist);
350	}
351	$value.=$etal; # if there was "et al."
352	}
353
354	# Books and Journals are additionally marked for display purposes
355	if ($entryname eq "booktitle") {
356	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
357	} elsif ($entryname eq "journal") {
358	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
359	}
360
361	# Add the various fields as metadata
362	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
363	}
364
365	# for books and journals...
366	if (!exists $metadata{'title'}) {
367	my $name=$field{'title'}; # get Greenstone metadata name
368	my $value;
369	if (exists $metadata{'booktitle'}) {
370	$value=$metadata{'booktitle'};
371	} elsif (exists $metadata{'journal'}) {
372	$value=$metadata{'journal'};
373	}
374	if ($value) {
375	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
376	}
377	}
378
379	# Add Date (yyyymmdd) metadata
380	if (defined ($metadata{'year'}) ) {
381	my $date=$metadata{'year'};
382	chomp $date;
383	my $month=$metadata{'month'};
384	if (defined($month)) {
385	# month is currently 3 letter code or a range...
386	$month = expand_month($month);
387	# take the first month found... might not find one!
388	$month =~ m/_textmonth(\d\d)_/;
389	$month = $1;
390	}
391	if (!defined($month)) {
392	$month="00";
393	}
394	$date .= "${month}00";
395	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
396	}
397
398	# # Add the text in BibTex format (all fields)
399	if ($text =~ /\w/) {
400
401	$text =~ s@&@&@g;
402	$text =~ s@<@<@g;
403	$text =~ s@>@>@g;
404	$text =~ s@\n@<br/>\n@g;
405	$text =~ s@\\@\\\\@g;
406
407	$doc_obj->add_utf8_text ($cursection, $text);
408	}
409
410	return 1;
411	}
412
413
414
415
416	# convert email addresses and URLs into links
417	sub convert_urls_into_links{
418	my ($text) = @_;
419
420	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
421	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
422
423	return $text;
424	}
425
426	# Clean up whitespace and convert \n charaters to <BR> or <P>
427	sub clean_up_whitespaces{
428	my ($text) = @_;
429
430	$text =~ s/%%%%%/<BR> <BR>/g;
431	$text =~ s/ +/ /g;
432	$text =~ s/\s*$//;
433	$text =~ s/^\s*//;
434	$text =~ s/\n/\n<BR>/g;
435	$text =~ s/<BR>\s*<BR>/<P>/g;
436
437	return $text;
438	}
439
440
441	sub convert_problem_characters_without_ampersand{
442	my ($text) = @_;
443	$text =~ s/</</g;
444	$text =~ s/>/>/g;
445
446	$text =~ s/\'\'/\"/g; #Latex -specific conversion
447	$text =~ s/\`\`/\"/g; #Latex -specific conversion
448
449	$text =~ s/\"/"/g;
450	$text =~ s/\'/’/g;
451	$text =~ s/\`/‘/g;
452
453	# $text =~ s/\+/ /g;
454	# $text =~ s/\(/ /g;
455	# $text =~ s/\)/ /g;
456
457	$text =~ s/\\/\\\\/g;
458
459	# $text =~ s/\./\\\./g;
460
461	return $text;
462	}
463
464	# Convert a text string into HTML.
465
466	# The HTML is going to be inserted into a GML file, so we have to be
467	# careful not to use symbols like ">", which occurs frequently in email
468	# messages (and use &gt instead.
469
470	# This function also turns URLs and email addresses into links, and
471	# replaces carriage returns with <BR> tags (and multiple carriage returns
472	# with <P> tags).
473
474	sub text_into_html {
475	my ($text) = @_;
476
477	# Convert problem characters into HTML symbols
478	$text =~ s/&/&/g;
479
480	$text = &convert_problem_characters_without_ampersand( $text );
481
482	# convert email addresses and URLs into links
483	$text = &convert_urls_into_links( $text );
484
485	$text = &clean_up_whitespaces( $text );
486
487	return $text;
488	}
489
490
491	sub expand_month {
492	my $text=shift;
493
494	# bibtex style files expand abbreviations for months.
495	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
496	$text =~ s/jan/_textmonth01_/g;
497	$text =~ s/feb/_textmonth02_/g;
498	$text =~ s/mar/_textmonth03_/g;
499	$text =~ s/apr/_textmonth04_/g;
500	$text =~ s/may/_textmonth05_/g;
501	$text =~ s/jun/_textmonth06_/g;
502	$text =~ s/jul/_textmonth07_/g;
503	$text =~ s/aug/_textmonth08_/g;
504	$text =~ s/sep/_textmonth09_/g;
505	$text =~ s/oct/_textmonth10_/g;
506	$text =~ s/nov/_textmonth11_/g;
507	$text =~ s/dec/_textmonth12_/g;
508
509	return $text;
510	}
511
512
513	# Convert accented characters, remove { }, interprete some commands....
514	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
515	sub process_latex {
516	my ($text) = @_;
517
518	# note - this is really ugly, but it works. There may be a prettier way
519	# of mapping latex accented chars to utf8, but we just brute force it here.
520	# Also, this isn't complete - not every single possible accented letter
521	# is in here yet, but most of the common ones are.
522
523	my %utf8_chars =
524	(
525	# acutes
526	'\'a' => chr(0xc3).chr(0xa1),
527	'\'c' => chr(0xc4).chr(0x87),
528	'\'e' => chr(0xc3).chr(0xa9),
529	'\'i' => chr(0xc3).chr(0xad),
530	'\'l' => chr(0xc3).chr(0xba),
531	'\'n' => chr(0xc3).chr(0x84),
532	'\'o' => chr(0xc3).chr(0xb3),
533	'\'r' => chr(0xc5).chr(0x95),
534	'\'s' => chr(0xc5).chr(0x9b),
535	'\'u' => chr(0xc3).chr(0xba),
536	'\'y' => chr(0xc3).chr(0xbd),
537	'\'z' => chr(0xc5).chr(0xba),
538	# graves
539	'`a' => chr(0xc3).chr(0xa0),
540	'`A' => chr(0xc3).chr(0x80),
541	'`e' => chr(0xc3).chr(0xa8),
542	'`E' => chr(0xc3).chr(0x88),
543	'`i' => chr(0xc3).chr(0xac),
544	'`I' => chr(0xc3).chr(0x8c),
545	'`o' => chr(0xc3).chr(0xb2),
546	'`O' => chr(0xc3).chr(0x92),
547	'`u' => chr(0xc3).chr(0xb9),
548	'`U' => chr(0xc3).chr(0x99),
549	# circumflex
550	'^a' => chr(0xc3).chr(0xa2),
551	'^A' => chr(0xc3).chr(0x82),
552	'^c' => chr(0xc4).chr(0x89),
553	'^C' => chr(0xc4).chr(0x88),
554	'^e' => chr(0xc3).chr(0xaa),
555	'^E' => chr(0xc3).chr(0x8a),
556	'^g' => chr(0xc4).chr(0x9d),
557	'^G' => chr(0xc4).chr(0x9c),
558	'^h' => chr(0xc4).chr(0xa5),
559	'^H' => chr(0xc4).chr(0xa4),
560	'^i' => chr(0xc3).chr(0xae),
561	'^I' => chr(0xc3).chr(0x8e),
562	'^j' => chr(0xc4).chr(0xb5),
563	'^J' => chr(0xc4).chr(0xb4),
564	'^o' => chr(0xc3).chr(0xb4),
565	'^O' => chr(0xc3).chr(0x94),
566	'^s' => chr(0xc5).chr(0x9d),
567	'^S' => chr(0xc5).chr(0x9c),
568	'^u' => chr(0xc3).chr(0xa2),
569	'^U' => chr(0xc3).chr(0xbb),
570	'^w' => chr(0xc5).chr(0xb5),
571	'^W' => chr(0xc5).chr(0xb4),
572	'^y' => chr(0xc5).chr(0xb7),
573	'^Y' => chr(0xc5).chr(0xb6),
574
575	# diaeresis
576	'"a' => chr(0xc3).chr(0xa4),
577	'"A' => chr(0xc3).chr(0x84),
578	'"e' => chr(0xc3).chr(0xab),
579	'"E' => chr(0xc3).chr(0x8b),
580	'"\\\\i' => chr(0xc3).chr(0xaf),
581	'"\\\\I' => chr(0xc3).chr(0x8f),
582	'"o' => chr(0xc3).chr(0xb6),
583	'"O' => chr(0xc3).chr(0x96),
584	'"u' => chr(0xc3).chr(0xbc),
585	'"U' => chr(0xc3).chr(0x9c),
586	'"y' => chr(0xc3).chr(0xbf),
587	'"Y' => chr(0xc3).chr(0xb8),
588	# tilde
589	'~A' => chr(0xc3).chr(0x83),
590	'~N' => chr(0xc3).chr(0x91),
591	'~O' => chr(0xc3).chr(0x95),
592	'~a' => chr(0xc3).chr(0xa3),
593	'~n' => chr(0xc3).chr(0xb1),
594	'~o' => chr(0xc3).chr(0xb5),
595	# caron - handled specially
596	# double acute
597	# ring
598	# dot
599	'.c' => chr(0xc4).chr(0x8b),
600	'.C' => chr(0xc4).chr(0x8a),
601	'.e' => chr(0xc4).chr(0x97),
602	'.E' => chr(0xc4).chr(0x96),
603	'.g' => chr(0xc4).chr(0xa1),
604	'.G' => chr(0xc4).chr(0xa0),
605	'.I' => chr(0xc4).chr(0xb0),
606	'.z' => chr(0xc5).chr(0xbc),
607	'.Z' => chr(0xc5).chr(0xbb),
608	# macron
609	'=a' => chr(0xc4).chr(0x81),
610	'=A' => chr(0xc4).chr(0x80),
611	'=e' => chr(0xc4).chr(0x93),
612	'=E' => chr(0xc4).chr(0x92),
613	'=i' => chr(0xc4).chr(0xab),
614	'=I' => chr(0xc4).chr(0xaa),
615	'=o' => chr(0xc4).chr(0x8d),
616	'=O' => chr(0xc4).chr(0x8c),
617	'=u' => chr(0xc4).chr(0xab),
618	'=U' => chr(0xc4).chr(0xaa),
619
620	# stroke - handled specially - see below
621
622	# cedilla - handled specially
623	);
624
625	# these are one letter latex commands - we make sure they're not a longer
626	# command name. eg {\d} is d+stroke, so careful of \d
627	my %special_utf8_chars =
628	(
629	# breve
630	'u g' => chr(0xc4).chr(0x9f),
631	'u G' => chr(0xc4).chr(0x9e),
632	'u i' => chr(0xc4).chr(0xad),
633	'u I' => chr(0xc4).chr(0xac),
634	'u o' => chr(0xc5).chr(0x8f),
635	'u O' => chr(0xc5).chr(0x8e),
636	'u u' => chr(0xc5).chr(0xad),
637	'u U' => chr(0xc5).chr(0xac),
638	# caron
639	'v c' => chr(0xc4).chr(0x8d),
640	'v C' => chr(0xc4).chr(0x8c),
641	'v n' => chr(0xc5).chr(0x88),
642	'v N' => chr(0xc5).chr(0x87),
643	'v s' => chr(0xc5).chr(0xa1),
644	'v S' => chr(0xc5).chr(0xa5),
645	'v z' => chr(0xc5).chr(0xbe),
646	'v Z' => chr(0xc5).chr(0xbd),
647	# cedilla
648	'c c' => chr(0xc3).chr(0xa7),
649	'c C' => chr(0xc3).chr(0x87),
650	'c g' => chr(0xc4).chr(0xa3),
651	'c G' => chr(0xc4).chr(0xa2),
652	'c k' => chr(0xc4).chr(0xb7),
653	'c K' => chr(0xc4).chr(0xb6),
654	'c l' => chr(0xc4).chr(0xbc),
655	'c L' => chr(0xc4).chr(0xbb),
656	'c n' => chr(0xc5).chr(0x86),
657	'c N' => chr(0xc5).chr(0x85),
658	'c r' => chr(0xc5).chr(0x97),
659	'c R' => chr(0xc5).chr(0x96),
660	'c s' => chr(0xc5).chr(0x9f),
661	'c S' => chr(0xc5).chr(0x9e),
662	'c t' => chr(0xc5).chr(0xa3),
663	'c T' => chr(0xc5).chr(0xa2),
664	# double acute / Hungarian accent
665	'H O' => chr(0xc5).chr(0x90),
666	'H o' => chr(0xc5).chr(0x91),
667	'H U' => chr(0xc5).chr(0xb0),
668	'H u' => chr(0xc5).chr(0xb1),
669
670	# stroke
671	'd' => chr(0xc4).chr(0x91),
672	'D' => chr(0xc4).chr(0x90),
673	'h' => chr(0xc4).chr(0xa7),
674	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
675	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
676	'l' => chr(0xc5).chr(0x82),
677	'L' => chr(0xc5).chr(0x81),
678	'o' => chr(0xc3).chr(0xb8),
679	'O' => chr(0xc3).chr(0x98),
680	't' => chr(0xc5).chr(0xa7),
681	'T' => chr(0xc5).chr(0xa6),
682	# german ss/szlig/sharp s
683	'ss' => chr(0xc3).chr(0x9f),
684	);
685
686	# convert latex-style accented characters.
687
688	# remove space (if any) between \ and letter to accent (eg {\' a})
689
690	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
691
692	# remove {} around a single character (eg \'{e})
693	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
694
695	# \, is another way of doing cedilla \c
696	$text =~ s@\\,(.)@\\c $1@g;
697
698	# remove {} around a single character for special 1 letter commands -
699	# need to insert a space. Eg \v{s} -> {\v s}
700	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
701
702	# only do if the text contains a '\' character.
703	if ($text =~ m\|\\\|) {
704	# "normal" accents - ie non-alpha latex tag
705	while ($text =~ m@\\([`'="^~\.])([\w])@) {
706	my $tex="$1$2"; my $char="$2";
707	my $replacement=$utf8_chars{$tex};
708	if (!defined($replacement)) {
709	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
710	$replacement=$char;
711	}
712	$text =~ s/\\\Q$tex/$replacement/g;
713	}
714
715	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
716	# only do the change if immediately followed by a space, }, {, or \
717	# one letter accents ( + ss)
718	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
719	my $tex=$1;
720	my $replacement=$special_utf8_chars{$tex};
721	if (!defined($replacement)) {
722	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
723	$replacement=$tex;
724	}
725	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
726
727	}
728
729	# one letter latex accent commands that affect following letter
730	while ($text =~ m@\\([uvcH]) ([\w])@) {
731	my $tex="$1 $2"; my $char="$2";
732	my $replacement=$special_utf8_chars{$tex};
733	if (!defined($replacement)) {
734	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
735	$replacement=$char;
736	}
737	$text =~ s/\\$tex/$replacement/g;
738	}
739	}
740
741	# escape html-sensitive characters
742	$text =~ s@&@&@g;
743	$text =~ s@<@<@g;
744	$text =~ s@>@>@g;
745	$text =~ s/''/"/g; # Latex-specific
746	$text =~ s/``/"/g; # Latex-specific
747	# greenstone-specific
748	$text =~ s@\[@&\#91;@g;
749	$text =~ s@\]@&\#93;@g;
750
751	# remove latex commands
752
753	# explicitly recognised commands
754	$text =~ s@\\ldots@…@g;
755
756	# maths mode
757	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
758
759	# remove all other commands with optional arguments...
760	$text =~ s@\\\w+(\[.?\])?\s@@g;
761	# $text =~ s@\\noopsort{[^}]+\}@@g;
762	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
763
764	# remove latex groupings { } (but not \{ or \} )
765	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
766	$text =~ s/^\{//; # remove { if first char
767
768	# latex characters
769	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
770	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
771	# optional break "\-"
772	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
773	# the non-macro bits have quotes around them - we just remove quotes
774	$text =~ s/([^&])[\"\#]/$1/g;
775	}
776	# dashes. Convert (m\|n)-dash into single dash for html.
777	$text =~ s@\-\-+@\-@g;
778
779	# quoted { } chars
780	$text =~ s@\\{@{@g;
781	$text =~ s@\\}@}@g;
782
783	# finally to protect against macro language...
784	$text =~ s@\\@\\\\@g;
785
786	return $text;
787	}
788
789
790	sub process_latex_math {
791	my $text = shift;
792
793	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
794	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
795	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
796	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
797	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
798
799	# put all other command names in italics
800	$text =~ s@\\([\w]+)@<i>$1</i>@g;
801
802	return $text;
803	}
804
805	sub set_OID {
806	my $self = shift (@_);
807	my ($doc_obj, $id, $segment_number) = @_;
808
809	if ( $self->{'key'} eq "default") {
810	$doc_obj->set_OID("$id\_$segment_number");
811	} else {
812	$doc_obj->set_OID($self->{'key'});
813	}
814	}
815
816	1;
817
818

Note: See TracBrowser for help on using the repository browser.

Download in other formats: