Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3517

Last change on this file since 3517 was 3426, checked in by jrm21, 22 years ago
Don't add \n to the end of each metadata value.
Property svn:keywords set to `Author Date Id Revision`
File size: 21.7 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42	# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	# This plugin processes files with the suffix ".bib"
54	sub get_default_process_exp {
55	return q^(?i)\.bib$^;
56	}
57
58	# This plugin splits the input text at blank lines
59	sub get_default_split_exp {
60	return q^\n+(?=@)^;
61	}
62
63
64
65	# The process function reads a single bibliographic record and stores
66	# it as a new document.
67
68	sub process {
69	my $self = shift (@_);
70	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71	my $outhandle = $self->{'outhandle'};
72
73	$self->{'key'} = "default";
74
75	# Check that we're dealing with a valid BibTex record
76	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78	# Ignore things we can't use
79	return 0 if ($$textref =~ /^\@String/);
80
81	# Report that we're processing the file
82	print $outhandle "BibTexPlug: processing $file\n"
83	if ($self->{'verbosity'}) > 1;
84
85
86	# This hash translates BibTex field names into metadata names. The
87	# BibTex names are taken from the "Local Guide to Latex" Graeme
88	# McKinstry. Metadata names are consistent with ReferPlug.
89
90	my %field = (
91	'address', 'PublisherAddress',
92	'author', 'Creator',
93
94	'booktitle', 'Booktitle',
95	'chapter', 'Chapter',
96	'edition', 'Edition',
97	'editor', 'Editor',
98	'institution', 'Publisher',
99	'journal', 'Journal',
100	'month', 'Month',
101	'number', 'Number',
102	'pages', 'Pages',
103	'publisher', 'Publisher',
104	'school', 'Publisher',
105	'title', 'Title',
106	'volume', 'Volume',
107	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109	'keywords', 'Keywords',
110	'abstract', 'Abstract',
111	'copyright', 'Copyright'
112	);
113
114	# Metadata fields
115	my %metadata;
116	my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118	my $verbosity = $self->{'verbosity'};
119	$verbosity = 0 unless $verbosity;
120
121	# Make sure the text has exactly one entry per line -
122	# append line to previous if it doesn't start with " <key> = "
123
124	my @input_lines=split('\n', $$textref);
125	my @all_lines;
126	my $entry_line=shift @input_lines;
127	foreach my $input_line (@input_lines) {
128	if ($input_line =~ m/^\s\w+\s=\s*/) {
129	# this is a new key
130	push(@all_lines, $entry_line);
131	$entry_line=$input_line;
132	} else {
133	# this is a continuation of previous line
134	$entry_line .= " " . $input_line;
135	}
136
137	}
138	# add final line, removing trailing '}'
139	$entry_line =~ s/\}\s*$//;
140	push(@all_lines, $entry_line);
141	push(@all_lines, "}");
142
143
144
145	# Read and process each line in the bib file.
146	my ($entryname, $name, $value, $line);
147	foreach $line (@all_lines) {
148
149	# Add each line. Most lines consist of a field identifer and
150	# then data, and we simply store them, though we treat some
151	# of the fields a bit differently.
152
153	$line =~ s/\s+/ /g;
154	$text .= "$line\n";
155
156	print "Processing line = $line \n" if $verbosity>=4;
157
158	# The first line is special, it contains the reference type and OID
159	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-]+)\W$/) {
160	$EntryType = $1;
161	$EntryID = $2;
162	print "** $EntryType - \"$EntryID\" \n"
163	if ($verbosity >= 4);
164	$self->{'key'} = $EntryID;
165	next;
166	}
167	if ($line =~ /\@/) {
168	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169	if ($verbosity >= 2);
170	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171	if ($verbosity >= 2);
172	}
173
174	# otherwise, parse the metadata out of this line
175	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
176	$entryname = lc($1);
177	$value = $2;
178	$value =~ s/,?\s*$//; # remove trailing comma and space
179	if ($value =~ /^"/ && $value =~ /"$/) {
180	# remove surrounding " marks
181	$value =~ s/^"//; $value =~ s/"$//;
182	}
183	$value = &process_latex($value);
184
185	# Add this line of metadata
186	$metadata{$entryname} .= "$value";
187
188	}
189
190	# Add the Entry type as metadata
191	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193	# Add the various field as metadata
194	foreach my $entryname (keys %metadata) {
195	next unless (defined $field{$entryname});
196	next unless (defined $metadata{$entryname});
197
198	$name = $field{$entryname};
199	$value = $metadata{$entryname};
200
201	if ($name =~ /^Month/) {
202	$value=expand_month($value);
203	}
204
205	# Several special operatons on metadata follow
206
207	# Add individual keywords.
208	# The full set of keywords will be added, in due course, as "Keywords".
209	# However, we also want to add them as individual "Keyword" metadata elements.
210	if ($entryname eq "keywords") {
211	my @keywordlist = split(/,/, $value);
212	foreach my $k (@keywordlist) {
213	$k = lc($k);
214	$k =~ s/\s*$//;
215	$k =~ s/^\s*//;
216	if ($k =~ /\w/) {
217	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
218	}
219	}
220	}
221
222	# Add individual authors
223	# The author metadata will be stored as one "Creator" entry, but we
224	# also want to split it into several individual "Author" fields in
225	# "Lastname, Firstnames" format so we can browse it.
226	if ($entryname eq "author") { #added also comparison with editor
227
228	# take care of "et al."...
229	$value =~ s/(\s+et\.?\s+al\.?)\s*$//;
230	my $etal=$1;
231	$etal="" if (!defined ($etal));
232	# und here for german language...
233	# don't use brackets in pattern, else the matched bit becomes
234	# an element in the list!
235	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
236	my @formattedlist = ();
237	foreach $a (@authorlist) {
238	$a =~ s/\s*$//;
239	$a =~ s/^\s*//;
240	# Reformat and add author name
241	next if $a=~ /^\s*$/;
242
243	# names are "First von Last", "von Last, First"
244	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
245	my $first="";
246	my $vonlast="";
247	my $jr="";
248
249	if ($a =~ /,/) {
250	my @parts=split(/,\s*/, $a);
251	$first = pop @parts;
252	if (scalar(@parts) == 2) {
253	$jr = pop @parts;
254	}
255	$vonlast=shift @parts;
256	if (scalar(@parts) > 0) {
257	print $outhandle "BibTexPlug: couldn't parse name $a\n";
258	# but we continue anyway...
259	}
260	} else { # First von Last
261	my @words = split(/ /, $a);
262	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
263	$first .= " " . shift (@words);
264	}
265	$first =~ s/^\s//;
266	$vonlast = join (' ', @words); # whatever's left...
267	}
268	my $von="";
269	my $last="";
270	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
271	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
272	$von = $1;
273	if (!defined ($von)) {
274	# some non-English names do start with lowercase
275	# eg "Marie desJardins". Also we can get typos...
276	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
277	$von="";
278	if ($vonlast =~ /^[a-z]+$/) {
279	# if it's all lowercase, uppercase 1st.
280	$vonlast =~ s/^(.)/\u$1/;
281
282	}
283	}
284	$von =~ s/\s*$//;
285	$last=$vonlast;
286	} else {
287	$last=$vonlast;
288	}
289	my $wholename="$first $von $last $jr";
290	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
291	push (@formattedlist, $wholename);
292	my $fullname = "$last";
293	$fullname .= " $jr" if ($jr);
294	$fullname .= ", $first";
295	$fullname .= " $von" if ($von);
296
297	# Add each name to set of Authors
298	# force utf8 pragma so that \w matches in this scope
299	use utf8;
300	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
301	}
302
303	# Only want at most one "and" in the Creator field
304	if (scalar(@formattedlist) > 2) {
305	my $lastauthor=pop @formattedlist;
306	$value=join(', ', @formattedlist);
307	$value.=" and $lastauthor";
308	} else { # 1 or 2 authors...
309	$value=join(" and ",@formattedlist);
310	}
311	$value.=$etal; # if there was "et al."
312	}
313
314	# Books and Journals are additionally marked for display purposes
315	if ($entryname eq "booktitle") {
316	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
317	} elsif ($entryname eq "journal") {
318	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
319	}
320
321
322	# Add the various fields as metadata
323	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
324	}
325
326	# Add Date (yyyymmdd) metadata
327	if (defined ($metadata{'year'}) ) {
328	my $date=$metadata{'year'};
329	chomp $date;
330	my $month=$metadata{'month'};
331	if (defined($month)) {
332	# month is currently 3 letter code or a range...
333	$month = expand_month($month);
334	# take the first month found... might not find one!
335	$month =~ m/_textmonth(\d\d)_/;
336	$month = $1;
337	}
338	if (!defined($month)) {
339	$month="00";
340	}
341	$date .= "${month}00";
342	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
343	}
344
345	# # Add the text in BibTex format (all fields)
346	if ($text =~ /\w/) {
347
348	$text =~ s@&@&@g;
349	$text =~ s@<@<@g;
350	$text =~ s@>@>@g;
351	$text =~ s@\n@<br/>\n@g;
352	$text =~ s@\\@\\\\@g;
353
354	# Not really required...
355	# $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
356	$doc_obj->add_utf8_text ($cursection, $text);
357	}
358
359	return 1;
360	}
361
362
363
364
365	# convert email addresses and URLs into links
366	sub convert_urls_into_links{
367	my ($text) = @_;
368
369	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
370	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
371
372	return $text;
373	}
374
375	# Clean up whitespace and convert \n charaters to <BR> or <P>
376	sub clean_up_whitespaces{
377	my ($text) = @_;
378
379	$text =~ s/%%%%%/<BR> <BR>/g;
380	$text =~ s/ +/ /g;
381	$text =~ s/\s*$//;
382	$text =~ s/^\s*//;
383	$text =~ s/\n/\n<BR>/g;
384	$text =~ s/<BR>\s*<BR>/<P>/g;
385
386	return $text;
387	}
388
389
390	sub convert_problem_characters_without_ampersand{
391	my ($text) = @_;
392	$text =~ s/</</g;
393	$text =~ s/>/>/g;
394
395	$text =~ s/\'\'/\"/g; #Latex -specific conversion
396	$text =~ s/\`\`/\"/g; #Latex -specific conversion
397
398	$text =~ s/\"/"/g;
399	$text =~ s/\'/’/g;
400	$text =~ s/\`/‘/g;
401
402	# $text =~ s/\+/ /g;
403	# $text =~ s/\(/ /g;
404	# $text =~ s/\)/ /g;
405
406	$text =~ s/\\/\\\\/g;
407
408	# $text =~ s/\./\\\./g;
409
410	return $text;
411	}
412
413	# Convert a text string into HTML.
414
415	# The HTML is going to be inserted into a GML file, so we have to be
416	# careful not to use symbols like ">", which occurs frequently in email
417	# messages (and use &gt instead.
418
419	# This function also turns URLs and email addresses into links, and
420	# replaces carriage returns with <BR> tags (and multiple carriage returns
421	# with <P> tags).
422
423	sub text_into_html {
424	my ($text) = @_;
425
426	# Convert problem characters into HTML symbols
427	$text =~ s/&/&/g;
428
429	$text = &convert_problem_characters_without_ampersand( $text );
430
431	# convert email addresses and URLs into links
432	$text = &convert_urls_into_links( $text );
433
434	$text = &clean_up_whitespaces( $text );
435
436	return $text;
437	}
438
439
440	sub expand_month {
441	my $text=shift;
442
443	# bibtex style files expand abbreviations for months.
444	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
445	$text =~ s/jan/_textmonth01_/g;
446	$text =~ s/feb/_textmonth02_/g;
447	$text =~ s/mar/_textmonth03_/g;
448	$text =~ s/apr/_textmonth04_/g;
449	$text =~ s/may/_textmonth05_/g;
450	$text =~ s/jun/_textmonth06_/g;
451	$text =~ s/jul/_textmonth07_/g;
452	$text =~ s/aug/_textmonth08_/g;
453	$text =~ s/sep/_textmonth09_/g;
454	$text =~ s/oct/_textmonth10_/g;
455	$text =~ s/nov/_textmonth11_/g;
456	$text =~ s/dec/_textmonth12_/g;
457
458	return $text;
459	}
460
461
462	# Convert accented characters, remove { }, interprete some commands....
463	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
464	sub process_latex {
465	my ($text) = @_;
466
467	# note - this is really ugly, but it works. There may be a prettier way
468	# of mapping latex accented chars to utf8, but we just brute force it here.
469	# Also, this isn't complete - not every single possible accented letter
470	# is in here yet, but most of the common ones are.
471
472	my %utf8_chars =
473	(
474	# acutes
475	'\'a' => chr(0xc3).chr(0xa1),
476	'\'c' => chr(0xc4).chr(0x87),
477	'\'e' => chr(0xc3).chr(0xa9),
478	'\'i' => chr(0xc3).chr(0xad),
479	'\'l' => chr(0xc3).chr(0xba),
480	'\'n' => chr(0xc3).chr(0x84),
481	'\'o' => chr(0xc3).chr(0xb3),
482	'\'r' => chr(0xc5).chr(0x95),
483	'\'s' => chr(0xc5).chr(0x9b),
484	'\'u' => chr(0xc3).chr(0xba),
485	'\'y' => chr(0xc3).chr(0xbd),
486	'\'z' => chr(0xc5).chr(0xba),
487	# graves
488	'`a' => chr(0xc3).chr(0xa0),
489	'`A' => chr(0xc3).chr(0x80),
490	'`e' => chr(0xc3).chr(0xa8),
491	'`E' => chr(0xc3).chr(0x88),
492	'`i' => chr(0xc3).chr(0xac),
493	'`I' => chr(0xc3).chr(0x8c),
494	'`o' => chr(0xc3).chr(0xb2),
495	'`O' => chr(0xc3).chr(0x92),
496	'`u' => chr(0xc3).chr(0xb9),
497	'`U' => chr(0xc3).chr(0x99),
498	# circumflex
499	'^a' => chr(0xc3).chr(0xa2),
500	'^A' => chr(0xc3).chr(0x82),
501	'^c' => chr(0xc4).chr(0x89),
502	'^C' => chr(0xc4).chr(0x88),
503	'^e' => chr(0xc3).chr(0xaa),
504	'^E' => chr(0xc3).chr(0x8a),
505	'^g' => chr(0xc4).chr(0x9d),
506	'^G' => chr(0xc4).chr(0x9c),
507	'^h' => chr(0xc4).chr(0xa5),
508	'^H' => chr(0xc4).chr(0xa4),
509	'^i' => chr(0xc3).chr(0xae),
510	'^I' => chr(0xc3).chr(0x8e),
511	'^j' => chr(0xc4).chr(0xb5),
512	'^J' => chr(0xc4).chr(0xb4),
513	'^o' => chr(0xc3).chr(0xb4),
514	'^O' => chr(0xc3).chr(0x94),
515	'^s' => chr(0xc5).chr(0x9d),
516	'^S' => chr(0xc5).chr(0x9c),
517	'^u' => chr(0xc3).chr(0xa2),
518	'^U' => chr(0xc3).chr(0xbb),
519	'^w' => chr(0xc5).chr(0xb5),
520	'^W' => chr(0xc5).chr(0xb4),
521	'^y' => chr(0xc5).chr(0xb7),
522	'^Y' => chr(0xc5).chr(0xb6),
523
524	# diaeresis
525	'"a' => chr(0xc3).chr(0xa4),
526	'"A' => chr(0xc3).chr(0x84),
527	'"e' => chr(0xc3).chr(0xab),
528	'"E' => chr(0xc3).chr(0x8b),
529	'"\\\\i' => chr(0xc3).chr(0xaf),
530	'"\\\\I' => chr(0xc3).chr(0x8f),
531	'"o' => chr(0xc3).chr(0xb6),
532	'"O' => chr(0xc3).chr(0x96),
533	'"u' => chr(0xc3).chr(0xbc),
534	'"U' => chr(0xc3).chr(0x9c),
535	'"y' => chr(0xc3).chr(0xbf),
536	'"Y' => chr(0xc3).chr(0xb8),
537	# tilde
538	'~A' => chr(0xc3).chr(0x83),
539	'~N' => chr(0xc3).chr(0x91),
540	'~O' => chr(0xc3).chr(0x95),
541	'~a' => chr(0xc3).chr(0xa3),
542	'~n' => chr(0xc3).chr(0xb1),
543	'~o' => chr(0xc3).chr(0xb5),
544	# caron - handled specially
545	# double acute
546	# ring
547	# dot
548	'.c' => chr(0xc4).chr(0x8b),
549	'.C' => chr(0xc4).chr(0x8a),
550	'.e' => chr(0xc4).chr(0x97),
551	'.E' => chr(0xc4).chr(0x96),
552	'.g' => chr(0xc4).chr(0xa1),
553	'.G' => chr(0xc4).chr(0xa0),
554	'.I' => chr(0xc4).chr(0xb0),
555	'.z' => chr(0xc5).chr(0xbc),
556	'.Z' => chr(0xc5).chr(0xbb),
557	# macron
558	'=a' => chr(0xc4).chr(0x81),
559	'=A' => chr(0xc4).chr(0x80),
560	'=e' => chr(0xc4).chr(0x93),
561	'=E' => chr(0xc4).chr(0x92),
562	'=i' => chr(0xc4).chr(0xab),
563	'=I' => chr(0xc4).chr(0xaa),
564	'=o' => chr(0xc4).chr(0x8d),
565	'=O' => chr(0xc4).chr(0x8c),
566	'=u' => chr(0xc4).chr(0xab),
567	'=U' => chr(0xc4).chr(0xaa),
568
569	# stroke - handled specially - see below
570
571	# cedilla - handled specially
572	);
573
574	# these are one letter latex commands - we make sure they're not a longer
575	# command name. eg {\d} is d+stroke, so careful of \d
576	my %special_utf8_chars =
577	(
578	# breve
579	'u g' => chr(0xc4).chr(0x9f),
580	'u G' => chr(0xc4).chr(0x9e),
581	'u i' => chr(0xc4).chr(0xad),
582	'u I' => chr(0xc4).chr(0xac),
583	'u o' => chr(0xc5).chr(0x8f),
584	'u O' => chr(0xc5).chr(0x8e),
585	'u u' => chr(0xc5).chr(0xad),
586	'u U' => chr(0xc5).chr(0xac),
587	# caron
588	'v c' => chr(0xc4).chr(0x8d),
589	'v C' => chr(0xc4).chr(0x8c),
590	'v n' => chr(0xc5).chr(0x88),
591	'v N' => chr(0xc5).chr(0x87),
592	'v s' => chr(0xc5).chr(0xa1),
593	'v S' => chr(0xc5).chr(0xa5),
594	'v z' => chr(0xc5).chr(0xbe),
595	'v Z' => chr(0xc5).chr(0xbd),
596	# cedilla
597	'c c' => chr(0xc3).chr(0xa7),
598	'c C' => chr(0xc3).chr(0x87),
599	'c g' => chr(0xc4).chr(0xa3),
600	'c G' => chr(0xc4).chr(0xa2),
601	'c k' => chr(0xc4).chr(0xb7),
602	'c K' => chr(0xc4).chr(0xb6),
603	'c l' => chr(0xc4).chr(0xbc),
604	'c L' => chr(0xc4).chr(0xbb),
605	'c n' => chr(0xc5).chr(0x86),
606	'c N' => chr(0xc5).chr(0x85),
607	'c r' => chr(0xc5).chr(0x97),
608	'c R' => chr(0xc5).chr(0x96),
609	'c s' => chr(0xc5).chr(0x9f),
610	'c S' => chr(0xc5).chr(0x9e),
611	'c t' => chr(0xc5).chr(0xa3),
612	'c T' => chr(0xc5).chr(0xa2),
613	# double acute / Hungarian accent
614	'H O' => chr(0xc5).chr(0x90),
615	'H o' => chr(0xc5).chr(0x91),
616	'H U' => chr(0xc5).chr(0xb0),
617	'H u' => chr(0xc5).chr(0xb1),
618
619	# stroke
620	'd' => chr(0xc4).chr(0x91),
621	'D' => chr(0xc4).chr(0x90),
622	'h' => chr(0xc4).chr(0xa7),
623	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
624	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
625	'l' => chr(0xc5).chr(0x82),
626	'L' => chr(0xc5).chr(0x81),
627	'o' => chr(0xc3).chr(0xb8),
628	'O' => chr(0xc3).chr(0x98),
629	't' => chr(0xc5).chr(0xa7),
630	'T' => chr(0xc5).chr(0xa6),
631	# german ss/szlig/sharp s
632	'ss' => chr(0xc3).chr(0x9f),
633	);
634
635	# convert latex-style accented characters.
636
637	# remove space (if any) between \ and letter to accent (eg {\' a})
638
639	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
640
641	# remove {} around a single character (eg \'{e})
642	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
643
644	# \, is another way of doing cedilla \c
645	$text =~ s@\\,(.)@\\c $1@g;
646
647	# remove {} around a single character for special 1 letter commands -
648	# need to insert a space. Eg \v{s} -> {\v s}
649	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
650
651	# only do if the text contains a '\' character.
652	if ($text =~ m\|\\\|) {
653	# "normal" accents - ie non-alpha latex tag
654	while ($text =~ m@\\([`'="^~\.])([\w])@) {
655	my $tex="$1$2"; my $char="$2";
656	my $replacement=$utf8_chars{$tex};
657	if (!defined($replacement)) {
658	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
659	$replacement=$char;
660	}
661	$text =~ s/\\$tex/$replacement/g;
662	}
663
664	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
665	# only do the change if immediately followed by a space, }, {, or \
666	# one letter accents ( + ss)
667	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
668	my $tex=$1;
669	my $replacement=$special_utf8_chars{$tex};
670	if (!defined($replacement)) {
671	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
672	$replacement=$tex;
673	}
674	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
675
676	}
677
678	# one letter latex accent commands that affect following letter
679	while ($text =~ m@\\([uvcH]) ([\w])@) {
680	my $tex="$1 $2"; my $char="$2";
681	my $replacement=$special_utf8_chars{$tex};
682	if (!defined($replacement)) {
683	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
684	$replacement=$char;
685	}
686	$text =~ s/\\$tex/$replacement/g;
687	}
688	}
689
690	# escape html-sensitive characters
691	$text =~ s@&@&@g;
692	$text =~ s@<@<@g;
693	$text =~ s@>@>@g;
694	$text =~ s/''/"/g; # Latex-specific
695	$text =~ s/``/"/g; # Latex-specific
696	# greenstone-specific
697	$text =~ s@\[@&\#91;@g;
698	$text =~ s@\]@&\#93;@g;
699
700	# remove latex commands
701
702	# explicitly recognised commands
703	$text =~ s@\\ldots@…@g;
704
705	# maths mode
706	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
707
708	# remove all other commands with optional arguments...
709	$text =~ s@\\\w+(\[.?\])?\s@@g;
710	# $text =~ s@\\noopsort{[^}]+\}@@g;
711	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
712
713	# remove latex groupings { } (but not \{ or \} )
714	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
715	$text =~ s/^\{//; # remove { if first char
716
717	# latex characters
718	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
719	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
720	# optional break "\-"
721	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
722	# the non-macro bits have quotes around them - we just remove quotes
723	$text =~ s/([^&])[\"\#]/$1/g;
724	}
725	# dashes. Convert (m\|n)-dash into single dash for html.
726	$text =~ s@\-\-+@\-@g;
727
728	# quoted { } chars
729	$text =~ s@\\{@{@g;
730	$text =~ s@\\}@}@g;
731
732	# finally to protect against macro language...
733	$text =~ s@\\@\\\\@g;
734
735	return $text;
736	}
737
738
739	sub process_latex_math {
740	my $text = shift;
741
742	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
743	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
744	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
745	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
746	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
747
748	# put all other command names in italics
749	$text =~ s@\\([\w]+)@<i>$1</i>@g;
750
751	return $text;
752	}
753
754	sub set_OID {
755	my $self = shift (@_);
756	my ($doc_obj, $id, $segment_number) = @_;
757
758	if ( $self->{'key'} eq "default") {
759	$doc_obj->set_OID();
760	} else {
761	$doc_obj->set_OID($self->{'key'});
762	}
763	}
764
765	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: