Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 12969

Last change on this file since 12969 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.3 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47	use strict;
48	no strict 'refs'; # allow filehandles to be variables and viceversa
49
50	# BibTexPlug is a sub-class of BasPlug.
51	sub BEGIN {
52	@BibTexPlug::ISA = ('SplitPlug');
53	}
54
55	my $arguments =
56	[ { 'name' => "process_exp",
57	'desc' => "{BasPlug.process_exp}",
58	'type' => "regexp",
59	'reqd' => "no",
60	'deft' => &get_default_process_exp() },
61	{ 'name' => "split_exp",
62	'desc' => "{SplitPlug.split_exp}",
63	'type' => "regexp",
64	'deft' => &get_default_split_exp(),
65	'reqd' => "no" }
66	];
67
68	my $options = { 'name' => "BibTexPlug",
69	'desc' => "{BibTexPlug.desc}",
70	'abstract' => "no",
71	'inherits' => "yes",
72	'explodes' => "yes",
73	'args' => $arguments };
74
75	# This plugin processes files with the suffix ".bib"
76	sub get_default_process_exp {
77	return q^(?i)\.bib$^;
78	}
79
80	# This plugin splits the input text at blank lines
81	sub get_default_split_exp {
82	return q^\n+(?=@)^;
83	}
84	sub new {
85	my ($class) = shift (@_);
86	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87	push(@$pluginlist, $class);
88
89	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
93
94	return bless $self, $class;
95	}
96
97
98
99	# The process function reads a single bibliographic record and stores
100	# it as a new document.
101
102	sub process {
103	my $self = shift (@_);
104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105	my $outhandle = $self->{'outhandle'};
106
107	my $cursection = $doc_obj->get_top_section();
108	$self->{'key'} = "default";
109
110	# Check that we're dealing with a valid BibTex record
111	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113	# Ignore things we can't use
114	return 0 if ($$textref =~ /^\@String/);
115
116	# Report that we're processing the file
117	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118	print $outhandle "BibTexPlug: processing $file\n"
119	if ($self->{'verbosity'}) > 1;
120
121
122	# This hash translates BibTex field names into metadata names. The
123	# BibTex names are taken from the "Local Guide to Latex" Graeme
124	# McKinstry. Metadata names are consistent with ReferPlug.
125
126	# The author metadata will be stored as one "Creator" entry, but will
127	# also be split up into several individual "Author" fields.
128
129	my %field = (
130	'address', 'PublisherAddress',
131	'author', 'Creator',
132
133	'booktitle', 'Booktitle',
134	'chapter', 'Chapter',
135	'edition', 'Edition',
136	'editor', 'Editor',
137	'institution', 'Publisher',
138	'journal', 'Journal',
139	'month', 'Month',
140	'number', 'Number',
141	'pages', 'Pages',
142	'publisher', 'Publisher',
143	'school', 'Publisher',
144	'title', 'Title',
145	'volume', 'Volume',
146	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148	'keywords', 'Keywords',
149	'abstract', 'Abstract',
150	'copyright', 'Copyright',
151	'note', 'Note',
152	'url', 'URL',
153	);
154
155	# Metadata fields
156	my %metadata;
157	my ($EntryType, $Creator, $Keywords, $text);
158
159	my $verbosity = $self->{'verbosity'};
160	$verbosity = 0 unless $verbosity;
161
162	# Make sure the text has exactly one entry per line -
163	# append line to previous if it doesn't start with " <key> = "
164
165	my @input_lines=split(/\r?\n/, $$textref);
166	my @all_lines;
167	my $entry_line=shift @input_lines;
168	foreach my $input_line (@input_lines) {
169	if ($input_line =~ m/^\s\w+\s=\s*/) {
170	# this is a new key
171	push(@all_lines, $entry_line);
172	$entry_line=$input_line;
173	} else {
174	# this is a continuation of previous line
175	$entry_line .= " " . $input_line;
176	}
177
178	}
179	# add final line, removing trailing '}'
180	$entry_line =~ s/\}\s*$//;
181	push(@all_lines, $entry_line);
182	push(@all_lines, "}");
183
184	# Read and process each line in the bib file.
185	my ($entryname, $name, $value, $line);
186	foreach $line (@all_lines) {
187
188	# Add each line. Most lines consist of a field identifer and
189	# then data, and we simply store them, though we treat some
190	# of the fields a bit differently.
191
192	$line =~ s/\s+/ /g;
193	$text .= "$line\n";
194
195
196	print "Processing line = $line \n" if $verbosity>=4;
197
198	# The first line is special, it contains the reference type and OID
199	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
200	$EntryType = $1;
201	my $EntryID = (defined $2) ? $2 : "default";
202	print "** $EntryType - \"$EntryID\" \n"
203	if ($verbosity >= 4);
204
205	next;
206	}
207	if ($line =~ /\@/) {
208	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
209	if ($verbosity >= 2);
210	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
211	if ($verbosity >= 2);
212	}
213
214	# otherwise, parse the metadata out of this line
215	next unless ($line =~ /^\s(\w+)\s=\s(.)/);
216	$entryname = lc($1);
217	$value = $2;
218	$value =~ s/,?\s*$//; # remove trailing comma and space
219	if ($value =~ /^"/ && $value =~ /"$/) {
220	# remove surrounding " marks
221	$value =~ s/^"//; $value =~ s/"$//;
222	}
223	$value = &process_latex($value);
224
225	# Add this line of metadata
226	$metadata{$entryname} .= "$value";
227
228	}
229
230	# Add the Entry type as metadata
231	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
232
233	#Add the fileformat as metadata
234	$doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
235
236
237	# Add the various field as metadata
238	foreach my $entryname (keys %metadata) {
239	next unless (defined $field{$entryname});
240	next unless (defined $metadata{$entryname});
241
242	$name = $field{$entryname};
243	$value = $metadata{$entryname};
244
245	if ($name =~ /^Month/) {
246	$value=expand_month($value);
247	}
248
249	# Several special operatons on metadata follow
250
251	# Add individual keywords.
252	# The full set of keywords will be added, in due course, as "Keywords".
253	# However, we also want to add them as individual "Keyword" metadata elements.
254	if ($entryname eq "keywords") {
255	my @keywordlist = split(/,/, $value);
256	foreach my $k (@keywordlist) {
257	$k = lc($k);
258	$k =~ s/\s*$//;
259	$k =~ s/^\s*//;
260	if ($k =~ /\w/) {
261	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
262	}
263	}
264	}
265
266	# Add individual authors
267	# The author metadata will be stored as one "Creator" entry, but we
268	# also want to split it into several individual "Author" fields in
269	# "Lastname, Firstnames" format so we can browse it.
270	if ($entryname eq "author") { #added also comparison with editor
271
272	# take care of "et al."...
273	my $etal='';
274	if ($value =~ s/\s+(and\s+others\|et\.?\s+al\.?)\s*$//i) {
275	$etal=' <em>et. al.</em>';
276	}
277	# und here for german language...
278	# don't use brackets in pattern, else the matched bit becomes
279	# an element in the list!
280	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
281	my @formattedlist = ();
282	foreach $a (@authorlist) {
283	$a =~ s/\s*$//;
284	$a =~ s/^\s*//;
285	# Reformat and add author name
286	next if $a=~ /^\s*$/;
287
288	# names are "First von Last", "von Last, First"
289	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
290	my $first="";
291	my $vonlast="";
292	my $jr="";
293
294	if ($a =~ /,/) {
295	my @parts=split(/,\s*/, $a);
296	$first = pop @parts;
297	if (scalar(@parts) == 2) {
298	$jr = pop @parts;
299	}
300	$vonlast=shift @parts;
301	if (scalar(@parts) > 0) {
302	print $outhandle "BibTexPlug: couldn't parse name $a\n";
303	# but we continue anyway...
304	}
305	} else { # First von Last
306	my @words = split(/ /, $a);
307	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
308	$first .= " " . shift (@words);
309	}
310	$first =~ s/^\s//;
311	$vonlast = join (' ', @words); # whatever's left...
312	}
313	my $von="";
314	my $last="";
315	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
316	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
317	$von = $1;
318	if (!defined ($von)) {
319	# some non-English names do start with lowercase
320	# eg "Marie desJardins". Also we can get typos...
321	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
322	$von="";
323	if ($vonlast =~ /^[a-z]+$/) {
324	# if it's all lowercase, uppercase 1st.
325	$vonlast =~ s/^(.)/\u$1/;
326
327	}
328	}
329	$von =~ s/\s*$//;
330	$last=$vonlast;
331	} else {
332	$last=$vonlast;
333	}
334	my $wholename="$first $von $last $jr";
335	$wholename =~ s/\s+/ /g; # squeeze multiple spaces
336	$wholename =~ s/ $//;
337	push (@formattedlist, $wholename);
338	my $fullname = "$last";
339	$fullname .= " $jr" if ($jr);
340	$fullname .= ", $first";
341	$fullname .= " $von" if ($von);
342
343	# Add each name to set of Authors
344	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
345	}
346
347	# Only want at most one "and" in the Creator field
348	if (scalar(@formattedlist) > 2) {
349	my $lastauthor=pop @formattedlist;
350	$value=join(', ', @formattedlist);
351	$value.=" and $lastauthor";
352	} else { # 1 or 2 authors...
353	$value=join(" and ",@formattedlist);
354	}
355	$value.=$etal; # if there was "et al."
356	}
357
358	# Books and Journals are additionally marked for display purposes
359	if ($entryname eq "booktitle") {
360	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
361	} elsif ($entryname eq "journal") {
362	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
363	}
364
365	# Add the various fields as metadata
366	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
367	}
368
369	# for books and journals...
370	if (!exists $metadata{'title'}) {
371	my $name=$field{'title'}; # get Greenstone metadata name
372	my $value;
373	if (exists $metadata{'booktitle'}) {
374	$value=$metadata{'booktitle'};
375	} elsif (exists $metadata{'journal'}) {
376	$value=$metadata{'journal'};
377	}
378	if ($value) {
379	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
380	}
381	}
382
383	# Add Date (yyyymmdd) metadata
384	if (defined ($metadata{'year'}) ) {
385	my $date=$metadata{'year'};
386	chomp $date;
387	my $month=$metadata{'month'};
388	if (defined($month)) {
389	# month is currently 3 letter code or a range...
390	$month = expand_month($month);
391	# take the first month found... might not find one!
392	$month =~ m/_textmonth(\d\d)_/;
393	$month = $1;
394	}
395	if (!defined($month)) {
396	$month="00";
397	}
398	$date .= "${month}00";
399	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
400	}
401
402	# # Add the text in BibTex format (all fields)
403	if ($text =~ /\w/) {
404
405	$text =~ s@&@&@g;
406	$text =~ s@<@<@g;
407	$text =~ s@>@>@g;
408	$text =~ s@\n@<br/>\n@g;
409	$text =~ s@\\@\\\\@g;
410
411	$doc_obj->add_utf8_text ($cursection, $text);
412	}
413
414	return 1;
415	}
416
417
418
419
420	# convert email addresses and URLs into links
421	sub convert_urls_into_links{
422	my ($text) = @_;
423
424	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
425	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
426
427	return $text;
428	}
429
430	# Clean up whitespace and convert \n charaters to <BR> or <P>
431	sub clean_up_whitespaces{
432	my ($text) = @_;
433
434	$text =~ s/%%%%%/<BR> <BR>/g;
435	$text =~ s/ +/ /g;
436	$text =~ s/\s*$//;
437	$text =~ s/^\s*//;
438	$text =~ s/\n/\n<BR>/g;
439	$text =~ s/<BR>\s*<BR>/<P>/g;
440
441	return $text;
442	}
443
444
445	sub convert_problem_characters_without_ampersand{
446	my ($text) = @_;
447	$text =~ s/</</g;
448	$text =~ s/>/>/g;
449
450	$text =~ s/\'\'/\"/g; #Latex -specific conversion
451	$text =~ s/\`\`/\"/g; #Latex -specific conversion
452
453	$text =~ s/\"/"/g;
454	$text =~ s/\'/’/g;
455	$text =~ s/\`/‘/g;
456
457	# $text =~ s/\+/ /g;
458	# $text =~ s/\(/ /g;
459	# $text =~ s/\)/ /g;
460
461	$text =~ s/\\/\\\\/g;
462
463	# $text =~ s/\./\\\./g;
464
465	return $text;
466	}
467
468	# Convert a text string into HTML.
469
470	# The HTML is going to be inserted into a GML file, so we have to be
471	# careful not to use symbols like ">", which occurs frequently in email
472	# messages (and use &gt instead.
473
474	# This function also turns URLs and email addresses into links, and
475	# replaces carriage returns with <BR> tags (and multiple carriage returns
476	# with <P> tags).
477
478	sub text_into_html {
479	my ($text) = @_;
480
481	# Convert problem characters into HTML symbols
482	$text =~ s/&/&/g;
483
484	$text = &convert_problem_characters_without_ampersand( $text );
485
486	# convert email addresses and URLs into links
487	$text = &convert_urls_into_links( $text );
488
489	$text = &clean_up_whitespaces( $text );
490
491	return $text;
492	}
493
494
495	sub expand_month {
496	my $text=shift;
497
498	# bibtex style files expand abbreviations for months.
499	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
500	$text =~ s/jan/_textmonth01_/g;
501	$text =~ s/feb/_textmonth02_/g;
502	$text =~ s/mar/_textmonth03_/g;
503	$text =~ s/apr/_textmonth04_/g;
504	$text =~ s/may/_textmonth05_/g;
505	$text =~ s/jun/_textmonth06_/g;
506	$text =~ s/jul/_textmonth07_/g;
507	$text =~ s/aug/_textmonth08_/g;
508	$text =~ s/sep/_textmonth09_/g;
509	$text =~ s/oct/_textmonth10_/g;
510	$text =~ s/nov/_textmonth11_/g;
511	$text =~ s/dec/_textmonth12_/g;
512
513	return $text;
514	}
515
516
517	# Convert accented characters, remove { }, interprete some commands....
518	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
519	sub process_latex {
520	my ($text) = @_;
521
522	# note - this is really ugly, but it works. There may be a prettier way
523	# of mapping latex accented chars to utf8, but we just brute force it here.
524	# Also, this isn't complete - not every single possible accented letter
525	# is in here yet, but most of the common ones are.
526
527	my %utf8_chars =
528	(
529	# acutes
530	'\'a' => chr(0xc3).chr(0xa1),
531	'\'c' => chr(0xc4).chr(0x87),
532	'\'e' => chr(0xc3).chr(0xa9),
533	'\'i' => chr(0xc3).chr(0xad),
534	'\'l' => chr(0xc3).chr(0xba),
535	'\'n' => chr(0xc3).chr(0x84),
536	'\'o' => chr(0xc3).chr(0xb3),
537	'\'r' => chr(0xc5).chr(0x95),
538	'\'s' => chr(0xc5).chr(0x9b),
539	'\'u' => chr(0xc3).chr(0xba),
540	'\'y' => chr(0xc3).chr(0xbd),
541	'\'z' => chr(0xc5).chr(0xba),
542	# graves
543	'`a' => chr(0xc3).chr(0xa0),
544	'`A' => chr(0xc3).chr(0x80),
545	'`e' => chr(0xc3).chr(0xa8),
546	'`E' => chr(0xc3).chr(0x88),
547	'`i' => chr(0xc3).chr(0xac),
548	'`I' => chr(0xc3).chr(0x8c),
549	'`o' => chr(0xc3).chr(0xb2),
550	'`O' => chr(0xc3).chr(0x92),
551	'`u' => chr(0xc3).chr(0xb9),
552	'`U' => chr(0xc3).chr(0x99),
553	# circumflex
554	'^a' => chr(0xc3).chr(0xa2),
555	'^A' => chr(0xc3).chr(0x82),
556	'^c' => chr(0xc4).chr(0x89),
557	'^C' => chr(0xc4).chr(0x88),
558	'^e' => chr(0xc3).chr(0xaa),
559	'^E' => chr(0xc3).chr(0x8a),
560	'^g' => chr(0xc4).chr(0x9d),
561	'^G' => chr(0xc4).chr(0x9c),
562	'^h' => chr(0xc4).chr(0xa5),
563	'^H' => chr(0xc4).chr(0xa4),
564	'^i' => chr(0xc3).chr(0xae),
565	'^I' => chr(0xc3).chr(0x8e),
566	'^j' => chr(0xc4).chr(0xb5),
567	'^J' => chr(0xc4).chr(0xb4),
568	'^o' => chr(0xc3).chr(0xb4),
569	'^O' => chr(0xc3).chr(0x94),
570	'^s' => chr(0xc5).chr(0x9d),
571	'^S' => chr(0xc5).chr(0x9c),
572	'^u' => chr(0xc3).chr(0xa2),
573	'^U' => chr(0xc3).chr(0xbb),
574	'^w' => chr(0xc5).chr(0xb5),
575	'^W' => chr(0xc5).chr(0xb4),
576	'^y' => chr(0xc5).chr(0xb7),
577	'^Y' => chr(0xc5).chr(0xb6),
578
579	# diaeresis
580	'"a' => chr(0xc3).chr(0xa4),
581	'"A' => chr(0xc3).chr(0x84),
582	'"e' => chr(0xc3).chr(0xab),
583	'"E' => chr(0xc3).chr(0x8b),
584	'"\\\\i' => chr(0xc3).chr(0xaf),
585	'"\\\\I' => chr(0xc3).chr(0x8f),
586	'"o' => chr(0xc3).chr(0xb6),
587	'"O' => chr(0xc3).chr(0x96),
588	'"u' => chr(0xc3).chr(0xbc),
589	'"U' => chr(0xc3).chr(0x9c),
590	'"y' => chr(0xc3).chr(0xbf),
591	'"Y' => chr(0xc3).chr(0xb8),
592	# tilde
593	'~A' => chr(0xc3).chr(0x83),
594	'~N' => chr(0xc3).chr(0x91),
595	'~O' => chr(0xc3).chr(0x95),
596	'~a' => chr(0xc3).chr(0xa3),
597	'~n' => chr(0xc3).chr(0xb1),
598	'~o' => chr(0xc3).chr(0xb5),
599	# caron - handled specially
600	# double acute
601	# ring
602	# dot
603	'.c' => chr(0xc4).chr(0x8b),
604	'.C' => chr(0xc4).chr(0x8a),
605	'.e' => chr(0xc4).chr(0x97),
606	'.E' => chr(0xc4).chr(0x96),
607	'.g' => chr(0xc4).chr(0xa1),
608	'.G' => chr(0xc4).chr(0xa0),
609	'.I' => chr(0xc4).chr(0xb0),
610	'.z' => chr(0xc5).chr(0xbc),
611	'.Z' => chr(0xc5).chr(0xbb),
612	# macron
613	'=a' => chr(0xc4).chr(0x81),
614	'=A' => chr(0xc4).chr(0x80),
615	'=e' => chr(0xc4).chr(0x93),
616	'=E' => chr(0xc4).chr(0x92),
617	'=i' => chr(0xc4).chr(0xab),
618	'=I' => chr(0xc4).chr(0xaa),
619	'=o' => chr(0xc4).chr(0x8d),
620	'=O' => chr(0xc4).chr(0x8c),
621	'=u' => chr(0xc4).chr(0xab),
622	'=U' => chr(0xc4).chr(0xaa),
623
624	# stroke - handled specially - see below
625
626	# cedilla - handled specially
627	);
628
629	# these are one letter latex commands - we make sure they're not a longer
630	# command name. eg {\d} is d+stroke, so careful of \d
631	my %special_utf8_chars =
632	(
633	# breve
634	'u g' => chr(0xc4).chr(0x9f),
635	'u G' => chr(0xc4).chr(0x9e),
636	'u i' => chr(0xc4).chr(0xad),
637	'u I' => chr(0xc4).chr(0xac),
638	'u o' => chr(0xc5).chr(0x8f),
639	'u O' => chr(0xc5).chr(0x8e),
640	'u u' => chr(0xc5).chr(0xad),
641	'u U' => chr(0xc5).chr(0xac),
642	# caron
643	'v c' => chr(0xc4).chr(0x8d),
644	'v C' => chr(0xc4).chr(0x8c),
645	'v n' => chr(0xc5).chr(0x88),
646	'v N' => chr(0xc5).chr(0x87),
647	'v s' => chr(0xc5).chr(0xa1),
648	'v S' => chr(0xc5).chr(0xa5),
649	'v z' => chr(0xc5).chr(0xbe),
650	'v Z' => chr(0xc5).chr(0xbd),
651	# cedilla
652	'c c' => chr(0xc3).chr(0xa7),
653	'c C' => chr(0xc3).chr(0x87),
654	'c g' => chr(0xc4).chr(0xa3),
655	'c G' => chr(0xc4).chr(0xa2),
656	'c k' => chr(0xc4).chr(0xb7),
657	'c K' => chr(0xc4).chr(0xb6),
658	'c l' => chr(0xc4).chr(0xbc),
659	'c L' => chr(0xc4).chr(0xbb),
660	'c n' => chr(0xc5).chr(0x86),
661	'c N' => chr(0xc5).chr(0x85),
662	'c r' => chr(0xc5).chr(0x97),
663	'c R' => chr(0xc5).chr(0x96),
664	'c s' => chr(0xc5).chr(0x9f),
665	'c S' => chr(0xc5).chr(0x9e),
666	'c t' => chr(0xc5).chr(0xa3),
667	'c T' => chr(0xc5).chr(0xa2),
668	# double acute / Hungarian accent
669	'H O' => chr(0xc5).chr(0x90),
670	'H o' => chr(0xc5).chr(0x91),
671	'H U' => chr(0xc5).chr(0xb0),
672	'H u' => chr(0xc5).chr(0xb1),
673
674	# stroke
675	'd' => chr(0xc4).chr(0x91),
676	'D' => chr(0xc4).chr(0x90),
677	'h' => chr(0xc4).chr(0xa7),
678	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
679	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
680	'l' => chr(0xc5).chr(0x82),
681	'L' => chr(0xc5).chr(0x81),
682	'o' => chr(0xc3).chr(0xb8),
683	'O' => chr(0xc3).chr(0x98),
684	't' => chr(0xc5).chr(0xa7),
685	'T' => chr(0xc5).chr(0xa6),
686	# german ss/szlig/sharp s
687	'ss' => chr(0xc3).chr(0x9f),
688	);
689
690	# convert latex-style accented characters.
691
692	# remove space (if any) between \ and letter to accent (eg {\' a})
693
694	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
695
696	# remove {} around a single character (eg \'{e})
697	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
698
699	# \, is another way of doing cedilla \c
700	$text =~ s@\\,(.)@\\c $1@g;
701
702	# remove {} around a single character for special 1 letter commands -
703	# need to insert a space. Eg \v{s} -> {\v s}
704	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
705
706	# only do if the text contains a '\' character.
707	if ($text =~ m\|\\\|) {
708	# "normal" accents - ie non-alpha latex tag
709	while ($text =~ m@\\([`'="^~\.])([\w])@) {
710	my $tex="$1$2"; my $char="$2";
711	my $replacement=$utf8_chars{$tex};
712	if (!defined($replacement)) {
713	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
714	$replacement=$char;
715	}
716	$text =~ s/\\\Q$tex/$replacement/g;
717	}
718
719	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
720	# only do the change if immediately followed by a space, }, {, or \
721	# one letter accents ( + ss)
722	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
723	my $tex=$1;
724	my $replacement=$special_utf8_chars{$tex};
725	if (!defined($replacement)) {
726	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
727	$replacement=$tex;
728	}
729	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
730
731	}
732
733	# one letter latex accent commands that affect following letter
734	while ($text =~ m@\\([uvcH]) ([\w])@) {
735	my $tex="$1 $2"; my $char="$2";
736	my $replacement=$special_utf8_chars{$tex};
737	if (!defined($replacement)) {
738	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
739	$replacement=$char;
740	}
741	$text =~ s/\\$tex/$replacement/g;
742	}
743	}
744
745	# escape html-sensitive characters
746	$text =~ s@&@&@g;
747	$text =~ s@<@<@g;
748	$text =~ s@>@>@g;
749	$text =~ s/''/"/g; # Latex-specific
750	$text =~ s/``/"/g; # Latex-specific
751	# greenstone-specific
752	$text =~ s@\[@&\#91;@g;
753	$text =~ s@\]@&\#93;@g;
754
755	# remove latex commands
756
757	# explicitly recognised commands
758	$text =~ s@\\ldots@…@g;
759
760	# maths mode
761	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
762
763	# remove all other commands with optional arguments...
764	$text =~ s@\\\w+(\[.?\])?\s@@g;
765	# $text =~ s@\\noopsort{[^}]+\}@@g;
766	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
767
768	# remove latex groupings { } (but not \{ or \} )
769	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
770	$text =~ s/^\{//; # remove { if first char
771
772	# latex characters
773	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
774	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
775	# optional break "\-"
776	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
777	# the non-macro bits have quotes around them - we just remove quotes
778	$text =~ s/([^&])[\"\#]/$1/g;
779	}
780	# dashes. Convert (m\|n)-dash into single dash for html.
781	$text =~ s@\-\-+@\-@g;
782
783	# quoted { } chars
784	$text =~ s@\\{@{@g;
785	$text =~ s@\\}@}@g;
786
787	# finally to protect against macro language...
788	$text =~ s@\\@\\\\@g;
789
790	return $text;
791	}
792
793
794	sub process_latex_math {
795	my $text = shift;
796
797	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
798	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
799	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
800	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
801	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
802
803	# put all other command names in italics
804	$text =~ s@\\([\w]+)@<i>$1</i>@g;
805
806	return $text;
807	}
808
809	sub set_OID {
810	my $self = shift (@_);
811	my ($doc_obj, $id, $segment_number) = @_;
812
813	if ( $self->{'key'} eq "default") {
814	$doc_obj->set_OID("$id\_$segment_number");
815	} else {
816	$doc_obj->set_OID($self->{'key'});
817	}
818	}
819
820	1;
821
822

Note: See TracBrowser for help on using the repository browser.

Download in other formats: