Context Navigation

source: gsdl/trunk/perllib/plugins/BibTexPlugin.pm@ 16822

Last change on this file since 16822 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 23.4 KB

Line
1	###########################################################################
2	#
3	# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlugin reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlugin creates a document object for every reference a the file.
35	# It is a subclass of SplitTextFile, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlugin;
45
46	use SplitTextFile;
47	use strict;
48	no strict 'refs'; # allow filehandles to be variables and viceversa
49
50	# BibTexPlugin is a sub-class of SplitTextFile.
51	sub BEGIN {
52	@BibTexPlugin::ISA = ('SplitTextFile');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasePlugin.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() },
61	{ 'name' => "split_exp",
62	'desc' => "{SplitTextFile.split_exp}",
63	'type' => "regexp",
64	'deft' => &get_default_split_exp(),
65	'reqd' => "no" }
66	];
67
68	my $options = { 'name' => "BibTexPlugin",
69	'desc' => "{BibTexPlugin.desc}",
70	'abstract' => "no",
71	'inherits' => "yes",
72	'explodes' => "yes",
73	'args' => $arguments };
74
75	# This plugin processes files with the suffix ".bib"
76	sub get_default_process_exp {
77	return q^(?i)\.bib$^;
78	}
79
80	# This plugin splits the input text at blank lines
81	sub get_default_split_exp {
82	return q^\n+(?=@)^;
83	}
84
85	sub new {
86	my ($class) = shift (@_);
87	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88	push(@$pluginlist, $class);
89
90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91	push(@{$hashArgOptLists->{"OptList"}},$options);
92
93	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
94
95	return bless $self, $class;
96	}
97
98
99
100	# The process function reads a single bibliographic record and stores
101	# it as a new document.
102
103	sub process {
104	my $self = shift (@_);
105	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
106	my $outhandle = $self->{'outhandle'};
107
108	my $cursection = $doc_obj->get_top_section();
109	$self->{'key'} = "default";
110
111	# Check that we're dealing with a valid BibTex record
112	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
113
114	# Ignore things we can't use
115	return 0 if ($$textref =~ /^\@String/);
116
117	# This hash translates BibTex field names into metadata names. The
118	# BibTex names are taken from the "Local Guide to Latex" Graeme
119	# McKinstry. Metadata names are consistent with ReferPlug.
120
121	# The author metadata will be stored as one "Creator" entry, but will
122	# also be split up into several individual "Author" fields.
123
124	my %field = (
125	'address', 'PublisherAddress',
126	'author', 'Creator',
127
128	'booktitle', 'Booktitle',
129	'chapter', 'Chapter',
130	'edition', 'Edition',
131	'editor', 'Editor',
132	'institution', 'Publisher',
133	'journal', 'Journal',
134	'month', 'Month',
135	'number', 'Number',
136	'pages', 'Pages',
137	'publisher', 'Publisher',
138	'school', 'Publisher',
139	'title', 'Title',
140	'volume', 'Volume',
141	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
142
143	'keywords', 'Keywords',
144	'abstract', 'Abstract',
145	'copyright', 'Copyright',
146	'note', 'Note',
147	'url', 'URL',
148	);
149
150	# Metadata fields
151	my %metadata;
152	my ($EntryType, $Creator, $Keywords, $text);
153
154	my $verbosity = $self->{'verbosity'};
155	$verbosity = 0 unless $verbosity;
156
157	# Make sure the text has exactly one entry per line -
158	# append line to previous if it doesn't start with " <key> = "
159
160	my @input_lines=split(/\r?\n/, $$textref);
161	my @all_lines;
162	my $entry_line=shift @input_lines;
163	foreach my $input_line (@input_lines) {
164	if ($input_line =~ m/^\s\w+\s=\s*/) {
165	# this is a new key
166	push(@all_lines, $entry_line);
167	$entry_line=$input_line;
168	} else {
169	# this is a continuation of previous line
170	$entry_line .= " " . $input_line;
171	}
172
173	}
174	# add final line, removing trailing '}'
175	$entry_line =~ s/\}\s*$//;
176	push(@all_lines, $entry_line);
177	push(@all_lines, "}");
178
179	# Read and process each line in the bib file.
180	my ($entryname, $name, $value, $line);
181	foreach $line (@all_lines) {
182
183	# Add each line. Most lines consist of a field identifer and
184	# then data, and we simply store them, though we treat some
185	# of the fields a bit differently.
186
187	$line =~ s/\s+/ /g;
188	$text .= "$line\n";
189
190
191	print "Processing line = $line \n" if $verbosity>=4;
192
193	# The first line is special, it contains the reference type and OID
194	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
195	$EntryType = $1;
196	my $EntryID = (defined $2) ? $2 : "default";
197	print "** $EntryType - \"$EntryID\" \n"
198	if ($verbosity >= 4);
199
200	next;
201	}
202	if ($line =~ /\@/) {
203	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
204	if ($verbosity >= 2);
205	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
206	if ($verbosity >= 2);
207	}
208
209	# otherwise, parse the metadata out of this line
210	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
211	$entryname = lc($1);
212	$value = $2;
213	$value =~ s/,?\s*$//; # remove trailing comma and space
214	if ($value =~ /^"/ && $value =~ /"$/) {
215	# remove surrounding " marks
216	$value =~ s/^"//; $value =~ s/"$//;
217	} elsif ($value =~ /^\{/ && $value =~ /\}$/) {
218	# remove surrounding {} marks
219	$value =~ s/^\{//; $value =~ s/\}$//;
220	}
221	# special case for year - we only want a 4 digit year, not
222	# "circa 2004" etc
223	if ($entryname=~ /^year$/i) {
224	if ($value=~ /(\d{4})/) {
225	$value=$1;
226	$metadata{$entryname} .= "$value";
227	}
228	}
229	else {
230
231	$value = &process_latex($value);
232	# Add this line of metadata
233	$metadata{$entryname} .= "$value";
234	}
235	}
236
237	# Add the Entry type as metadata
238	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
239
240	#Add the fileformat as metadata
241	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
242
243
244	# Add the various field as metadata
245	foreach my $entryname (keys %metadata) {
246	next unless (defined $field{$entryname});
247	next unless (defined $metadata{$entryname});
248
249	$name = $field{$entryname};
250	$value = $metadata{$entryname};
251
252	if ($name =~ /^Month/) {
253	$value=expand_month($value);
254	}
255
256	# Several special operatons on metadata follow
257
258	# Add individual keywords.
259	# The full set of keywords will be added, in due course, as "Keywords".
260	# However, we also want to add them as individual "Keyword" metadata elements.
261	if ($entryname eq "keywords") {
262	my @keywordlist = split(/,/, $value);
263	foreach my $k (@keywordlist) {
264	$k = lc($k);
265	$k =~ s/\s*$//;
266	$k =~ s/^\s*//;
267	if ($k =~ /\w/) {
268	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
269	}
270	}
271	}
272
273	# Add individual authors
274	# The author metadata will be stored as one "Creator" entry, but we
275	# also want to split it into several individual "Author" fields in
276	# "Lastname, Firstnames" format so we can browse it.
277	if ($entryname eq "author") { #added also comparison with editor
278
279	# take care of "et al."...
280	my $etal='';
281	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
282	$etal=' <em>et. al.</em>';
283	}
284	# und here for german language...
285	# don't use brackets in pattern, else the matched bit becomes
286	# an element in the list!
287	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
288	my @formattedlist = ();
289	foreach $a (@authorlist) {
290	$a =~ s/\s*$//;
291	$a =~ s/^\s*//;
292	# Reformat and add author name
293	next if $a=~ /^\s*$/;
294
295	# names are "First von Last", "von Last, First"
296	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
297	my $first="";
298	my $vonlast="";
299	my $jr="";
300
301	if ($a =~ /,/) {
302	my @parts=split(/,\s*/, $a);
303	$first = pop @parts;
304	if (scalar(@parts) == 2) {
305	$jr = pop @parts;
306	}
307	$vonlast=shift @parts;
308	if (scalar(@parts) > 0) {
309	print $outhandle "BibTexPlugin: couldn't parse name $a\n";
310	# but we continue anyway...
311	}
312	} else { # First von Last
313	my @words = split(/ /, $a);
314	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
315	$first .= " " . shift (@words);
316	}
317	$first =~ s/^\s//;
318	$vonlast = join (' ', @words); # whatever's left...
319	}
320	my $von="";
321	my $last="";
322	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
323	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
324	$von = $1;
325	if (!defined ($von)) {
326	# some non-English names do start with lowercase
327	# eg "Marie desJardins". Also we can get typos...
328	print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
329	$von="";
330	if ($vonlast =~ /^[a-z]+$/) {
331	# if it's all lowercase, uppercase 1st.
332	$vonlast =~ s/^(.)/\u$1/;
333
334	}
335	}
336	$von =~ s/\s*$//;
337	$last=$vonlast;
338	} else {
339	$last=$vonlast;
340	}
341	my $wholename="$first $von $last $jr";
342	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
343	$wholename =~ s/ $//;
344	push (@formattedlist, $wholename);
345	my $fullname = "$last";
346	$fullname .= " $jr" if ($jr);
347	$fullname .= ", $first";
348	$fullname .= " $von" if ($von);
349
350	# Add each name to set of Authors
351	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
352	}
353
354	# Only want at most one "and" in the Creator field
355	if (scalar(@formattedlist) > 2) {
356	my $lastauthor=pop @formattedlist;
357	$value=join(', ', @formattedlist);
358	$value.=" and $lastauthor";
359	} else { # 1 or 2 authors...
360	$value=join(" and ",@formattedlist);
361	}
362	$value.=$etal; # if there was "et al."
363	}
364
365	# Books and Journals are additionally marked for display purposes
366	if ($entryname eq "booktitle") {
367	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
368	} elsif ($entryname eq "journal") {
369	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
370	}
371
372	# Add the various fields as metadata
373	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
374
375	}
376
377	# for books and journals...
378	if (!exists $metadata{'title'}) {
379	my $name=$field{'title'}; # get Greenstone metadata name
380	my $value;
381	if (exists $metadata{'booktitle'}) {
382	$value=$metadata{'booktitle'};
383	} elsif (exists $metadata{'journal'}) {
384	$value=$metadata{'journal'};
385	}
386	if ($value) {
387	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
388	}
389	}
390
391	# Add Date (yyyymmdd) metadata
392	if (defined ($metadata{'year'}) ) {
393	my $date=$metadata{'year'};
394	chomp $date;
395	my $month=$metadata{'month'};
396	if (defined($month)) {
397	# month is currently 3 letter code or a range...
398	$month = expand_month($month);
399	# take the first month found... might not find one!
400	$month =~ m/_textmonth(\d\d)_/;
401	$month = $1;
402	}
403	if (!defined($month)) {
404	$month="00";
405	}
406	$date .= "${month}00";
407	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
408	}
409
410	# # Add the text in BibTex format (all fields)
411	if ($text =~ /\w/) {
412
413	$text =~ s@&@&@g;
414	$text =~ s@<@<@g;
415	$text =~ s@>@>@g;
416	$text =~ s@\n@<br/>\n@g;
417	$text =~ s@\\@\\\\@g;
418
419	$doc_obj->add_utf8_text ($cursection, $text);
420	}
421
422	return 1;
423	}
424
425
426
427
428	# convert email addresses and URLs into links
429	sub convert_urls_into_links{
430	my ($text) = @_;
431
432	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
433	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
434
435	return $text;
436	}
437
438	# Clean up whitespace and convert \n charaters to <BR> or <P>
439	sub clean_up_whitespaces{
440	my ($text) = @_;
441
442	$text =~ s/%%%%%/<BR> <BR>/g;
443	$text =~ s/ +/ /g;
444	$text =~ s/\s*$//;
445	$text =~ s/^\s*//;
446	$text =~ s/\n/\n<BR>/g;
447	$text =~ s/<BR>\s*<BR>/<P>/g;
448
449	return $text;
450	}
451
452
453	sub convert_problem_characters_without_ampersand{
454	my ($text) = @_;
455	$text =~ s/</</g;
456	$text =~ s/>/>/g;
457
458	$text =~ s/\'\'/\"/g; #Latex -specific conversion
459	$text =~ s/\`\`/\"/g; #Latex -specific conversion
460
461	$text =~ s/\"/"/g;
462	$text =~ s/\'/’/g;
463	$text =~ s/\`/‘/g;
464
465	# $text =~ s/\+/ /g;
466	# $text =~ s/\(/ /g;
467	# $text =~ s/\)/ /g;
468
469	$text =~ s/\\/\\\\/g;
470
471	# $text =~ s/\./\\\./g;
472
473	return $text;
474	}
475
476	# Convert a text string into HTML.
477
478	# The HTML is going to be inserted into a GML file, so we have to be
479	# careful not to use symbols like ">", which occurs frequently in email
480	# messages (and use &gt instead.
481
482	# This function also turns URLs and email addresses into links, and
483	# replaces carriage returns with <BR> tags (and multiple carriage returns
484	# with <P> tags).
485
486	sub text_into_html {
487	my ($text) = @_;
488
489	# Convert problem characters into HTML symbols
490	$text =~ s/&/&/g;
491
492	$text = &convert_problem_characters_without_ampersand( $text );
493
494	# convert email addresses and URLs into links
495	$text = &convert_urls_into_links( $text );
496
497	$text = &clean_up_whitespaces( $text );
498
499	return $text;
500	}
501
502
503	sub expand_month {
504	my $text=shift;
505
506	# bibtex style files expand abbreviations for months.
507	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
508	$text =~ s/jan/_textmonth01_/g;
509	$text =~ s/feb/_textmonth02_/g;
510	$text =~ s/mar/_textmonth03_/g;
511	$text =~ s/apr/_textmonth04_/g;
512	$text =~ s/may/_textmonth05_/g;
513	$text =~ s/jun/_textmonth06_/g;
514	$text =~ s/jul/_textmonth07_/g;
515	$text =~ s/aug/_textmonth08_/g;
516	$text =~ s/sep/_textmonth09_/g;
517	$text =~ s/oct/_textmonth10_/g;
518	$text =~ s/nov/_textmonth11_/g;
519	$text =~ s/dec/_textmonth12_/g;
520
521	return $text;
522	}
523
524
525	# Convert accented characters, remove { }, interprete some commands....
526	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
527	sub process_latex {
528	my ($text) = @_;
529
530	# note - this is really ugly, but it works. There may be a prettier way
531	# of mapping latex accented chars to utf8, but we just brute force it here.
532	# Also, this isn't complete - not every single possible accented letter
533	# is in here yet, but most of the common ones are.
534
535	my %utf8_chars =
536	(
537	# acutes
538	'\'a' => chr(0xc3).chr(0xa1),
539	'\'c' => chr(0xc4).chr(0x87),
540	'\'e' => chr(0xc3).chr(0xa9),
541	'\'i' => chr(0xc3).chr(0xad),
542	'\'l' => chr(0xc3).chr(0xba),
543	'\'n' => chr(0xc3).chr(0x84),
544	'\'o' => chr(0xc3).chr(0xb3),
545	'\'r' => chr(0xc5).chr(0x95),
546	'\'s' => chr(0xc5).chr(0x9b),
547	'\'u' => chr(0xc3).chr(0xba),
548	'\'y' => chr(0xc3).chr(0xbd),
549	'\'z' => chr(0xc5).chr(0xba),
550	# graves
551	'`a' => chr(0xc3).chr(0xa0),
552	'`A' => chr(0xc3).chr(0x80),
553	'`e' => chr(0xc3).chr(0xa8),
554	'`E' => chr(0xc3).chr(0x88),
555	'`i' => chr(0xc3).chr(0xac),
556	'`I' => chr(0xc3).chr(0x8c),
557	'`o' => chr(0xc3).chr(0xb2),
558	'`O' => chr(0xc3).chr(0x92),
559	'`u' => chr(0xc3).chr(0xb9),
560	'`U' => chr(0xc3).chr(0x99),
561	# circumflex
562	'^a' => chr(0xc3).chr(0xa2),
563	'^A' => chr(0xc3).chr(0x82),
564	'^c' => chr(0xc4).chr(0x89),
565	'^C' => chr(0xc4).chr(0x88),
566	'^e' => chr(0xc3).chr(0xaa),
567	'^E' => chr(0xc3).chr(0x8a),
568	'^g' => chr(0xc4).chr(0x9d),
569	'^G' => chr(0xc4).chr(0x9c),
570	'^h' => chr(0xc4).chr(0xa5),
571	'^H' => chr(0xc4).chr(0xa4),
572	'^i' => chr(0xc3).chr(0xae),
573	'^I' => chr(0xc3).chr(0x8e),
574	'^j' => chr(0xc4).chr(0xb5),
575	'^J' => chr(0xc4).chr(0xb4),
576	'^o' => chr(0xc3).chr(0xb4),
577	'^O' => chr(0xc3).chr(0x94),
578	'^s' => chr(0xc5).chr(0x9d),
579	'^S' => chr(0xc5).chr(0x9c),
580	'^u' => chr(0xc3).chr(0xa2),
581	'^U' => chr(0xc3).chr(0xbb),
582	'^w' => chr(0xc5).chr(0xb5),
583	'^W' => chr(0xc5).chr(0xb4),
584	'^y' => chr(0xc5).chr(0xb7),
585	'^Y' => chr(0xc5).chr(0xb6),
586
587	# diaeresis
588	'"a' => chr(0xc3).chr(0xa4),
589	'"A' => chr(0xc3).chr(0x84),
590	'"e' => chr(0xc3).chr(0xab),
591	'"E' => chr(0xc3).chr(0x8b),
592	'"\\\\i' => chr(0xc3).chr(0xaf),
593	'"\\\\I' => chr(0xc3).chr(0x8f),
594	'"o' => chr(0xc3).chr(0xb6),
595	'"O' => chr(0xc3).chr(0x96),
596	'"u' => chr(0xc3).chr(0xbc),
597	'"U' => chr(0xc3).chr(0x9c),
598	'"y' => chr(0xc3).chr(0xbf),
599	'"Y' => chr(0xc3).chr(0xb8),
600	# tilde
601	'~A' => chr(0xc3).chr(0x83),
602	'~N' => chr(0xc3).chr(0x91),
603	'~O' => chr(0xc3).chr(0x95),
604	'~a' => chr(0xc3).chr(0xa3),
605	'~n' => chr(0xc3).chr(0xb1),
606	'~o' => chr(0xc3).chr(0xb5),
607	# caron - handled specially
608	# double acute
609	# ring
610	# dot
611	'.c' => chr(0xc4).chr(0x8b),
612	'.C' => chr(0xc4).chr(0x8a),
613	'.e' => chr(0xc4).chr(0x97),
614	'.E' => chr(0xc4).chr(0x96),
615	'.g' => chr(0xc4).chr(0xa1),
616	'.G' => chr(0xc4).chr(0xa0),
617	'.I' => chr(0xc4).chr(0xb0),
618	'.z' => chr(0xc5).chr(0xbc),
619	'.Z' => chr(0xc5).chr(0xbb),
620	# macron
621	'=a' => chr(0xc4).chr(0x81),
622	'=A' => chr(0xc4).chr(0x80),
623	'=e' => chr(0xc4).chr(0x93),
624	'=E' => chr(0xc4).chr(0x92),
625	'=i' => chr(0xc4).chr(0xab),
626	'=I' => chr(0xc4).chr(0xaa),
627	'=o' => chr(0xc4).chr(0x8d),
628	'=O' => chr(0xc4).chr(0x8c),
629	'=u' => chr(0xc4).chr(0xab),
630	'=U' => chr(0xc4).chr(0xaa),
631
632	# stroke - handled specially - see below
633
634	# cedilla - handled specially
635	);
636
637	# these are one letter latex commands - we make sure they're not a longer
638	# command name. eg {\d} is d+stroke, so careful of \d
639	my %special_utf8_chars =
640	(
641	# breve
642	'u g' => chr(0xc4).chr(0x9f),
643	'u G' => chr(0xc4).chr(0x9e),
644	'u i' => chr(0xc4).chr(0xad),
645	'u I' => chr(0xc4).chr(0xac),
646	'u o' => chr(0xc5).chr(0x8f),
647	'u O' => chr(0xc5).chr(0x8e),
648	'u u' => chr(0xc5).chr(0xad),
649	'u U' => chr(0xc5).chr(0xac),
650	# caron
651	'v c' => chr(0xc4).chr(0x8d),
652	'v C' => chr(0xc4).chr(0x8c),
653	'v n' => chr(0xc5).chr(0x88),
654	'v N' => chr(0xc5).chr(0x87),
655	'v s' => chr(0xc5).chr(0xa1),
656	'v S' => chr(0xc5).chr(0xa5),
657	'v z' => chr(0xc5).chr(0xbe),
658	'v Z' => chr(0xc5).chr(0xbd),
659	# cedilla
660	'c c' => chr(0xc3).chr(0xa7),
661	'c C' => chr(0xc3).chr(0x87),
662	'c g' => chr(0xc4).chr(0xa3),
663	'c G' => chr(0xc4).chr(0xa2),
664	'c k' => chr(0xc4).chr(0xb7),
665	'c K' => chr(0xc4).chr(0xb6),
666	'c l' => chr(0xc4).chr(0xbc),
667	'c L' => chr(0xc4).chr(0xbb),
668	'c n' => chr(0xc5).chr(0x86),
669	'c N' => chr(0xc5).chr(0x85),
670	'c r' => chr(0xc5).chr(0x97),
671	'c R' => chr(0xc5).chr(0x96),
672	'c s' => chr(0xc5).chr(0x9f),
673	'c S' => chr(0xc5).chr(0x9e),
674	'c t' => chr(0xc5).chr(0xa3),
675	'c T' => chr(0xc5).chr(0xa2),
676	# double acute / Hungarian accent
677	'H O' => chr(0xc5).chr(0x90),
678	'H o' => chr(0xc5).chr(0x91),
679	'H U' => chr(0xc5).chr(0xb0),
680	'H u' => chr(0xc5).chr(0xb1),
681
682	# stroke
683	'd' => chr(0xc4).chr(0x91),
684	'D' => chr(0xc4).chr(0x90),
685	'h' => chr(0xc4).chr(0xa7),
686	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
687	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
688	'l' => chr(0xc5).chr(0x82),
689	'L' => chr(0xc5).chr(0x81),
690	'o' => chr(0xc3).chr(0xb8),
691	'O' => chr(0xc3).chr(0x98),
692	't' => chr(0xc5).chr(0xa7),
693	'T' => chr(0xc5).chr(0xa6),
694	# german ss/szlig/sharp s
695	'ss' => chr(0xc3).chr(0x9f),
696	);
697
698	# convert latex-style accented characters.
699
700	# remove space (if any) between \ and letter to accent (eg {\' a})
701
702	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
703
704	# remove {} around a single character (eg \'{e})
705	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
706
707	# \, is another way of doing cedilla \c
708	$text =~ s@\\,(.)@\\c $1@g;
709
710	# remove {} around a single character for special 1 letter commands -
711	# need to insert a space. Eg \v{s} -> {\v s}
712	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
713
714	# only do if the text contains a '\' character.
715	if ($text =~ m\|\\\|) {
716	# "normal" accents - ie non-alpha latex tag
717	while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
718	my $tex="$1$2"; my $char="$2";
719	my $replacement=$utf8_chars{$tex};
720	if (!defined($replacement)) {
721	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
722	$replacement=$char;
723	}
724	$text =~ s/\\\Q$tex/$replacement/g;
725	}
726
727	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
728	# only do the change if immediately followed by a space, }, {, or \
729	# one letter accents ( + ss)
730	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
731	my $tex=$1;
732	my $replacement=$special_utf8_chars{$tex};
733	if (!defined($replacement)) {
734	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
735	$replacement=$tex;
736	}
737	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
738
739	}
740
741	# one letter latex accent commands that affect following letter
742	while ($text =~ m@\\([uvcH]) ([\w])@) {
743	my $tex="$1 $2"; my $char="$2";
744	my $replacement=$special_utf8_chars{$tex};
745	if (!defined($replacement)) {
746	print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
747	$replacement=$char;
748	}
749	$text =~ s/\\$tex/$replacement/g;
750	}
751	}
752
753	# escape html-sensitive characters
754	$text =~ s@&@&@g;
755	$text =~ s@<@<@g;
756	$text =~ s@>@>@g;
757	$text =~ s/''/"/g; # Latex-specific
758	$text =~ s/``/"/g; # Latex-specific
759	# greenstone-specific
760	$text =~ s@\[@&\#91;@g;
761	$text =~ s@\]@&\#93;@g;
762
763	# remove latex commands
764
765	# explicitly recognised commands
766	$text =~ s@\\ldots@…@g;
767
768	# maths mode
769	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
770
771	# remove all other commands with optional arguments...
772	$text =~ s@\\\w+(\[.?\])?\s@@g;
773	# $text =~ s@\\noopsort{[^}]+\}@@g;
774	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
775
776	# remove latex groupings { } (but not \{ or \} )
777	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
778	$text =~ s/^\{//; # remove { if first char
779
780	# latex characters
781	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
782	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
783	# optional break "\-"
784	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
785	# the non-macro bits have quotes around them - we just remove quotes
786	$text =~ s/([^&])[\"\#]/$1/g;
787	}
788	# dashes. Convert (m\|n)-dash into single dash for html.
789	$text =~ s@\-\-+@\-@g;
790
791	# quoted { } chars
792	$text =~ s@\\{@{@g;
793	$text =~ s@\\}@}@g;
794
795	# finally to protect against macro language...
796	$text =~ s@\\@\\\\@g;
797
798	return $text;
799	}
800
801
802	sub process_latex_math {
803	my $text = shift;
804
805	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
806	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
807	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
808	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
809	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
810
811	# put all other command names in italics
812	$text =~ s@\\([\w]+)@<i>$1</i>@g;
813
814	return $text;
815	}
816
817	sub set_OID {
818	my $self = shift (@_);
819	my ($doc_obj, $id, $segment_number) = @_;
820
821	if ( $self->{'key'} eq "default") {
822	$doc_obj->set_OID("$id\_$segment_number");
823	} else {
824	$doc_obj->set_OID($self->{'key'});
825	}
826	}
827
828	1;
829
830

Note: See TracBrowser for help on using the repository browser.

Download in other formats: