Context Navigation

source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago
Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)
Property svn:keywords set to `Author Date Id Revision`
File size: 22.4 KB

Line
1	###########################################################################
2	#
3	# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 2000 Gordon W. Paynter
10	# Copyright 1999-2001 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28
29	# BibTexPlug reads bibliography files in BibTex format.
30	#
31	# by Gordon W. Paynter ([email protected]), November 2000
32	# Based on ReferPlug. See ReferPlug for geneology.
33	#
34	# BibTexPlug creates a document object for every reference a the file.
35	# It is a subclass of SplitPlug, so if there are multiple records, all
36	# are read.
37	#
38	# Modified Dec 2001 by John McPherson:
39	# * some modifications submitted by Sergey Yevtushenko
40	# <[email protected]>
41	# * some non-ascii char support (ie mostly Latin)
42
43
44	package BibTexPlug;
45
46	use SplitPlug;
47
48	# BibTexPlug is a sub-class of BasPlug.
49	sub BEGIN {
50	@ISA = ('SplitPlug');
51	}
52
53	my $arguments =
54	[ { 'name' => "process_exp",
55	'desc' => "{BasPlug.process_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_process_exp() },
59	{ 'name' => "split_exp",
60	'desc' => "{SplitPlug.split_exp}",
61	'type' => "regexp",
62	'deft' => &get_default_split_exp(),
63	'reqd' => "no" }
64	];
65
66	my $options = { 'name' => "BibTexPlug",
67	'desc' => "{BibTexPlug.desc}",
68	'abstract' => "no",
69	'inherits' => "yes",
70	'args' => $arguments };
71
72	# This plugin processes files with the suffix ".bib"
73	sub get_default_process_exp {
74	return q^(?i)\.bib$^;
75	}
76
77	# This plugin splits the input text at blank lines
78	sub get_default_split_exp {
79	return q^\n+(?=@)^;
80	}
81	sub new {
82	my $class = shift (@_);
83	my $self = new SplitPlug ($class, @_);
84	$self->{'plugin_type'} = "BibTexPlug";
85	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
86	my $option_list = $self->{'option_list'};
87	push( @{$option_list}, $options );
88	return bless $self, $class;
89	}
90
91
92
93	# The process function reads a single bibliographic record and stores
94	# it as a new document.
95
96	sub process {
97	my $self = shift (@_);
98	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
99	my $outhandle = $self->{'outhandle'};
100
101	$self->{'key'} = "default";
102
103	# Check that we're dealing with a valid BibTex record
104	return undef unless ($$textref =~ /^@\w+\{.*\}/s);
105
106	# Ignore things we can't use
107	return 0 if ($$textref =~ /^\@String/);
108
109	# Report that we're processing the file
110	print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
111	print $outhandle "BibTexPlug: processing $file\n"
112	if ($self->{'verbosity'}) > 1;
113
114
115	# This hash translates BibTex field names into metadata names. The
116	# BibTex names are taken from the "Local Guide to Latex" Graeme
117	# McKinstry. Metadata names are consistent with ReferPlug.
118
119	my %field = (
120	'address', 'PublisherAddress',
121	'author', 'Creator',
122
123	'booktitle', 'Booktitle',
124	'chapter', 'Chapter',
125	'edition', 'Edition',
126	'editor', 'Editor',
127	'institution', 'Publisher',
128	'journal', 'Journal',
129	'month', 'Month',
130	'number', 'Number',
131	'pages', 'Pages',
132	'publisher', 'Publisher',
133	'school', 'Publisher',
134	'title', 'Title',
135	'volume', 'Volume',
136	'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
137
138	'keywords', 'Keywords',
139	'abstract', 'Abstract',
140	'copyright', 'Copyright',
141	'note', 'Note',
142	);
143
144	# Metadata fields
145	my %metadata;
146	my ($EntryType, $Creator, $Keywords, $text);
147
148	my $verbosity = $self->{'verbosity'};
149	$verbosity = 0 unless $verbosity;
150
151	# Make sure the text has exactly one entry per line -
152	# append line to previous if it doesn't start with " <key> = "
153
154	my @input_lines=split(/\r?\n/, $$textref);
155	my @all_lines;
156	my $entry_line=shift @input_lines;
157	foreach my $input_line (@input_lines) {
158	if ($input_line =~ m/^\s\w+\s=\s*/) {
159	# this is a new key
160	push(@all_lines, $entry_line);
161	$entry_line=$input_line;
162	} else {
163	# this is a continuation of previous line
164	$entry_line .= " " . $input_line;
165	}
166
167	}
168	# add final line, removing trailing '}'
169	$entry_line =~ s/\}\s*$//;
170	push(@all_lines, $entry_line);
171	push(@all_lines, "}");
172
173	# Read and process each line in the bib file.
174	my ($entryname, $name, $value, $line);
175	foreach $line (@all_lines) {
176
177	# Add each line. Most lines consist of a field identifer and
178	# then data, and we simply store them, though we treat some
179	# of the fields a bit differently.
180
181	$line =~ s/\s+/ /g;
182	$text .= "$line\n";
183
184
185	print "Processing line = $line \n" if $verbosity>=4;
186
187	# The first line is special, it contains the reference type and OID
188	if ($line =~ /\@(\w+)\W\{\W([\\.\w\d:-])\W*$/) {
189	$EntryType = $1;
190	my $EntryID = (defined $2) ? $2 : "default";
191	print "** $EntryType - \"$EntryID\" \n"
192	if ($verbosity >= 4);
193
194	next;
195	}
196	if ($line =~ /\@/) {
197	print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
198	if ($verbosity >= 2);
199	print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
200	if ($verbosity >= 2);
201	}
202
203	# otherwise, parse the metadata out of this line
204	next unless ($line =~ /^\s(\w+)\s+=\s+(.)/);
205	$entryname = lc($1);
206	$value = $2;
207	$value =~ s/,?\s*$//; # remove trailing comma and space
208	if ($value =~ /^"/ && $value =~ /"$/) {
209	# remove surrounding " marks
210	$value =~ s/^"//; $value =~ s/"$//;
211	}
212	$value = &process_latex($value);
213
214	# Add this line of metadata
215	$metadata{$entryname} .= "$value";
216
217	}
218
219	# Add the Entry type as metadata
220	$doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
221
222	# Add the various field as metadata
223	foreach my $entryname (keys %metadata) {
224	next unless (defined $field{$entryname});
225	next unless (defined $metadata{$entryname});
226
227	$name = $field{$entryname};
228	$value = $metadata{$entryname};
229
230	if ($name =~ /^Month/) {
231	$value=expand_month($value);
232	}
233
234	# Several special operatons on metadata follow
235
236	# Add individual keywords.
237	# The full set of keywords will be added, in due course, as "Keywords".
238	# However, we also want to add them as individual "Keyword" metadata elements.
239	if ($entryname eq "keywords") {
240	my @keywordlist = split(/,/, $value);
241	foreach my $k (@keywordlist) {
242	$k = lc($k);
243	$k =~ s/\s*$//;
244	$k =~ s/^\s*//;
245	if ($k =~ /\w/) {
246	$doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
247	}
248	}
249	}
250
251	# Add individual authors
252	# The author metadata will be stored as one "Creator" entry, but we
253	# also want to split it into several individual "Author" fields in
254	# "Lastname, Firstnames" format so we can browse it.
255	if ($entryname eq "author") { #added also comparison with editor
256
257	# take care of "et al."...
258	$value =~ s/(\s+et\.?\s+al\.?)\s*$//;
259	my $etal=$1;
260	$etal="" if (!defined ($etal));
261	# und here for german language...
262	# don't use brackets in pattern, else the matched bit becomes
263	# an element in the list!
264	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $value);
265	my @formattedlist = ();
266	foreach $a (@authorlist) {
267	$a =~ s/\s*$//;
268	$a =~ s/^\s*//;
269	# Reformat and add author name
270	next if $a=~ /^\s*$/;
271
272	# names are "First von Last", "von Last, First"
273	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
274	my $first="";
275	my $vonlast="";
276	my $jr="";
277
278	if ($a =~ /,/) {
279	my @parts=split(/,\s*/, $a);
280	$first = pop @parts;
281	if (scalar(@parts) == 2) {
282	$jr = pop @parts;
283	}
284	$vonlast=shift @parts;
285	if (scalar(@parts) > 0) {
286	print $outhandle "BibTexPlug: couldn't parse name $a\n";
287	# but we continue anyway...
288	}
289	} else { # First von Last
290	my @words = split(/ /, $a);
291	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
292	$first .= " " . shift (@words);
293	}
294	$first =~ s/^\s//;
295	$vonlast = join (' ', @words); # whatever's left...
296	}
297	my $von="";
298	my $last="";
299	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
300	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
301	$von = $1;
302	if (!defined ($von)) {
303	# some non-English names do start with lowercase
304	# eg "Marie desJardins". Also we can get typos...
305	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
306	$von="";
307	if ($vonlast =~ /^[a-z]+$/) {
308	# if it's all lowercase, uppercase 1st.
309	$vonlast =~ s/^(.)/\u$1/;
310
311	}
312	}
313	$von =~ s/\s*$//;
314	$last=$vonlast;
315	} else {
316	$last=$vonlast;
317	}
318	my $wholename="$first $von $last $jr";
319	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
320	push (@formattedlist, $wholename);
321	my $fullname = "$last";
322	$fullname .= " $jr" if ($jr);
323	$fullname .= ", $first";
324	$fullname .= " $von" if ($von);
325
326	# Add each name to set of Authors
327	# force utf8 pragma so that \w matches in this scope
328	use utf8;
329	$doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
330	}
331
332	# Only want at most one "and" in the Creator field
333	if (scalar(@formattedlist) > 2) {
334	my $lastauthor=pop @formattedlist;
335	$value=join(', ', @formattedlist);
336	$value.=" and $lastauthor";
337	} else { # 1 or 2 authors...
338	$value=join(" and ",@formattedlist);
339	}
340	$value.=$etal; # if there was "et al."
341	}
342
343	# Books and Journals are additionally marked for display purposes
344	if ($entryname eq "booktitle") {
345	$doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
346	} elsif ($entryname eq "journal") {
347	$doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
348	}
349
350
351	# Add the various fields as metadata
352	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
353	}
354
355	# Add Date (yyyymmdd) metadata
356	if (defined ($metadata{'year'}) ) {
357	my $date=$metadata{'year'};
358	chomp $date;
359	my $month=$metadata{'month'};
360	if (defined($month)) {
361	# month is currently 3 letter code or a range...
362	$month = expand_month($month);
363	# take the first month found... might not find one!
364	$month =~ m/_textmonth(\d\d)_/;
365	$month = $1;
366	}
367	if (!defined($month)) {
368	$month="00";
369	}
370	$date .= "${month}00";
371	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
372	}
373
374	# # Add the text in BibTex format (all fields)
375	if ($text =~ /\w/) {
376
377	$text =~ s@&@&@g;
378	$text =~ s@<@<@g;
379	$text =~ s@>@>@g;
380	$text =~ s@\n@<br/>\n@g;
381	$text =~ s@\\@\\\\@g;
382
383	$doc_obj->add_utf8_text ($cursection, $text);
384	}
385
386	return 1;
387	}
388
389
390
391
392	# convert email addresses and URLs into links
393	sub convert_urls_into_links{
394	my ($text) = @_;
395
396	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
397	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
398
399	return $text;
400	}
401
402	# Clean up whitespace and convert \n charaters to <BR> or <P>
403	sub clean_up_whitespaces{
404	my ($text) = @_;
405
406	$text =~ s/%%%%%/<BR> <BR>/g;
407	$text =~ s/ +/ /g;
408	$text =~ s/\s*$//;
409	$text =~ s/^\s*//;
410	$text =~ s/\n/\n<BR>/g;
411	$text =~ s/<BR>\s*<BR>/<P>/g;
412
413	return $text;
414	}
415
416
417	sub convert_problem_characters_without_ampersand{
418	my ($text) = @_;
419	$text =~ s/</</g;
420	$text =~ s/>/>/g;
421
422	$text =~ s/\'\'/\"/g; #Latex -specific conversion
423	$text =~ s/\`\`/\"/g; #Latex -specific conversion
424
425	$text =~ s/\"/"/g;
426	$text =~ s/\'/’/g;
427	$text =~ s/\`/‘/g;
428
429	# $text =~ s/\+/ /g;
430	# $text =~ s/\(/ /g;
431	# $text =~ s/\)/ /g;
432
433	$text =~ s/\\/\\\\/g;
434
435	# $text =~ s/\./\\\./g;
436
437	return $text;
438	}
439
440	# Convert a text string into HTML.
441
442	# The HTML is going to be inserted into a GML file, so we have to be
443	# careful not to use symbols like ">", which occurs frequently in email
444	# messages (and use &gt instead.
445
446	# This function also turns URLs and email addresses into links, and
447	# replaces carriage returns with <BR> tags (and multiple carriage returns
448	# with <P> tags).
449
450	sub text_into_html {
451	my ($text) = @_;
452
453	# Convert problem characters into HTML symbols
454	$text =~ s/&/&/g;
455
456	$text = &convert_problem_characters_without_ampersand( $text );
457
458	# convert email addresses and URLs into links
459	$text = &convert_urls_into_links( $text );
460
461	$text = &clean_up_whitespaces( $text );
462
463	return $text;
464	}
465
466
467	sub expand_month {
468	my $text=shift;
469
470	# bibtex style files expand abbreviations for months.
471	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
472	$text =~ s/jan/_textmonth01_/g;
473	$text =~ s/feb/_textmonth02_/g;
474	$text =~ s/mar/_textmonth03_/g;
475	$text =~ s/apr/_textmonth04_/g;
476	$text =~ s/may/_textmonth05_/g;
477	$text =~ s/jun/_textmonth06_/g;
478	$text =~ s/jul/_textmonth07_/g;
479	$text =~ s/aug/_textmonth08_/g;
480	$text =~ s/sep/_textmonth09_/g;
481	$text =~ s/oct/_textmonth10_/g;
482	$text =~ s/nov/_textmonth11_/g;
483	$text =~ s/dec/_textmonth12_/g;
484
485	return $text;
486	}
487
488
489	# Convert accented characters, remove { }, interprete some commands....
490	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
491	sub process_latex {
492	my ($text) = @_;
493
494	# note - this is really ugly, but it works. There may be a prettier way
495	# of mapping latex accented chars to utf8, but we just brute force it here.
496	# Also, this isn't complete - not every single possible accented letter
497	# is in here yet, but most of the common ones are.
498
499	my %utf8_chars =
500	(
501	# acutes
502	'\'a' => chr(0xc3).chr(0xa1),
503	'\'c' => chr(0xc4).chr(0x87),
504	'\'e' => chr(0xc3).chr(0xa9),
505	'\'i' => chr(0xc3).chr(0xad),
506	'\'l' => chr(0xc3).chr(0xba),
507	'\'n' => chr(0xc3).chr(0x84),
508	'\'o' => chr(0xc3).chr(0xb3),
509	'\'r' => chr(0xc5).chr(0x95),
510	'\'s' => chr(0xc5).chr(0x9b),
511	'\'u' => chr(0xc3).chr(0xba),
512	'\'y' => chr(0xc3).chr(0xbd),
513	'\'z' => chr(0xc5).chr(0xba),
514	# graves
515	'`a' => chr(0xc3).chr(0xa0),
516	'`A' => chr(0xc3).chr(0x80),
517	'`e' => chr(0xc3).chr(0xa8),
518	'`E' => chr(0xc3).chr(0x88),
519	'`i' => chr(0xc3).chr(0xac),
520	'`I' => chr(0xc3).chr(0x8c),
521	'`o' => chr(0xc3).chr(0xb2),
522	'`O' => chr(0xc3).chr(0x92),
523	'`u' => chr(0xc3).chr(0xb9),
524	'`U' => chr(0xc3).chr(0x99),
525	# circumflex
526	'^a' => chr(0xc3).chr(0xa2),
527	'^A' => chr(0xc3).chr(0x82),
528	'^c' => chr(0xc4).chr(0x89),
529	'^C' => chr(0xc4).chr(0x88),
530	'^e' => chr(0xc3).chr(0xaa),
531	'^E' => chr(0xc3).chr(0x8a),
532	'^g' => chr(0xc4).chr(0x9d),
533	'^G' => chr(0xc4).chr(0x9c),
534	'^h' => chr(0xc4).chr(0xa5),
535	'^H' => chr(0xc4).chr(0xa4),
536	'^i' => chr(0xc3).chr(0xae),
537	'^I' => chr(0xc3).chr(0x8e),
538	'^j' => chr(0xc4).chr(0xb5),
539	'^J' => chr(0xc4).chr(0xb4),
540	'^o' => chr(0xc3).chr(0xb4),
541	'^O' => chr(0xc3).chr(0x94),
542	'^s' => chr(0xc5).chr(0x9d),
543	'^S' => chr(0xc5).chr(0x9c),
544	'^u' => chr(0xc3).chr(0xa2),
545	'^U' => chr(0xc3).chr(0xbb),
546	'^w' => chr(0xc5).chr(0xb5),
547	'^W' => chr(0xc5).chr(0xb4),
548	'^y' => chr(0xc5).chr(0xb7),
549	'^Y' => chr(0xc5).chr(0xb6),
550
551	# diaeresis
552	'"a' => chr(0xc3).chr(0xa4),
553	'"A' => chr(0xc3).chr(0x84),
554	'"e' => chr(0xc3).chr(0xab),
555	'"E' => chr(0xc3).chr(0x8b),
556	'"\\\\i' => chr(0xc3).chr(0xaf),
557	'"\\\\I' => chr(0xc3).chr(0x8f),
558	'"o' => chr(0xc3).chr(0xb6),
559	'"O' => chr(0xc3).chr(0x96),
560	'"u' => chr(0xc3).chr(0xbc),
561	'"U' => chr(0xc3).chr(0x9c),
562	'"y' => chr(0xc3).chr(0xbf),
563	'"Y' => chr(0xc3).chr(0xb8),
564	# tilde
565	'~A' => chr(0xc3).chr(0x83),
566	'~N' => chr(0xc3).chr(0x91),
567	'~O' => chr(0xc3).chr(0x95),
568	'~a' => chr(0xc3).chr(0xa3),
569	'~n' => chr(0xc3).chr(0xb1),
570	'~o' => chr(0xc3).chr(0xb5),
571	# caron - handled specially
572	# double acute
573	# ring
574	# dot
575	'.c' => chr(0xc4).chr(0x8b),
576	'.C' => chr(0xc4).chr(0x8a),
577	'.e' => chr(0xc4).chr(0x97),
578	'.E' => chr(0xc4).chr(0x96),
579	'.g' => chr(0xc4).chr(0xa1),
580	'.G' => chr(0xc4).chr(0xa0),
581	'.I' => chr(0xc4).chr(0xb0),
582	'.z' => chr(0xc5).chr(0xbc),
583	'.Z' => chr(0xc5).chr(0xbb),
584	# macron
585	'=a' => chr(0xc4).chr(0x81),
586	'=A' => chr(0xc4).chr(0x80),
587	'=e' => chr(0xc4).chr(0x93),
588	'=E' => chr(0xc4).chr(0x92),
589	'=i' => chr(0xc4).chr(0xab),
590	'=I' => chr(0xc4).chr(0xaa),
591	'=o' => chr(0xc4).chr(0x8d),
592	'=O' => chr(0xc4).chr(0x8c),
593	'=u' => chr(0xc4).chr(0xab),
594	'=U' => chr(0xc4).chr(0xaa),
595
596	# stroke - handled specially - see below
597
598	# cedilla - handled specially
599	);
600
601	# these are one letter latex commands - we make sure they're not a longer
602	# command name. eg {\d} is d+stroke, so careful of \d
603	my %special_utf8_chars =
604	(
605	# breve
606	'u g' => chr(0xc4).chr(0x9f),
607	'u G' => chr(0xc4).chr(0x9e),
608	'u i' => chr(0xc4).chr(0xad),
609	'u I' => chr(0xc4).chr(0xac),
610	'u o' => chr(0xc5).chr(0x8f),
611	'u O' => chr(0xc5).chr(0x8e),
612	'u u' => chr(0xc5).chr(0xad),
613	'u U' => chr(0xc5).chr(0xac),
614	# caron
615	'v c' => chr(0xc4).chr(0x8d),
616	'v C' => chr(0xc4).chr(0x8c),
617	'v n' => chr(0xc5).chr(0x88),
618	'v N' => chr(0xc5).chr(0x87),
619	'v s' => chr(0xc5).chr(0xa1),
620	'v S' => chr(0xc5).chr(0xa5),
621	'v z' => chr(0xc5).chr(0xbe),
622	'v Z' => chr(0xc5).chr(0xbd),
623	# cedilla
624	'c c' => chr(0xc3).chr(0xa7),
625	'c C' => chr(0xc3).chr(0x87),
626	'c g' => chr(0xc4).chr(0xa3),
627	'c G' => chr(0xc4).chr(0xa2),
628	'c k' => chr(0xc4).chr(0xb7),
629	'c K' => chr(0xc4).chr(0xb6),
630	'c l' => chr(0xc4).chr(0xbc),
631	'c L' => chr(0xc4).chr(0xbb),
632	'c n' => chr(0xc5).chr(0x86),
633	'c N' => chr(0xc5).chr(0x85),
634	'c r' => chr(0xc5).chr(0x97),
635	'c R' => chr(0xc5).chr(0x96),
636	'c s' => chr(0xc5).chr(0x9f),
637	'c S' => chr(0xc5).chr(0x9e),
638	'c t' => chr(0xc5).chr(0xa3),
639	'c T' => chr(0xc5).chr(0xa2),
640	# double acute / Hungarian accent
641	'H O' => chr(0xc5).chr(0x90),
642	'H o' => chr(0xc5).chr(0x91),
643	'H U' => chr(0xc5).chr(0xb0),
644	'H u' => chr(0xc5).chr(0xb1),
645
646	# stroke
647	'd' => chr(0xc4).chr(0x91),
648	'D' => chr(0xc4).chr(0x90),
649	'h' => chr(0xc4).chr(0xa7),
650	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
651	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
652	'l' => chr(0xc5).chr(0x82),
653	'L' => chr(0xc5).chr(0x81),
654	'o' => chr(0xc3).chr(0xb8),
655	'O' => chr(0xc3).chr(0x98),
656	't' => chr(0xc5).chr(0xa7),
657	'T' => chr(0xc5).chr(0xa6),
658	# german ss/szlig/sharp s
659	'ss' => chr(0xc3).chr(0x9f),
660	);
661
662	# convert latex-style accented characters.
663
664	# remove space (if any) between \ and letter to accent (eg {\' a})
665
666	$text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
667
668	# remove {} around a single character (eg \'{e})
669	$text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
670
671	# \, is another way of doing cedilla \c
672	$text =~ s@\\,(.)@\\c $1@g;
673
674	# remove {} around a single character for special 1 letter commands -
675	# need to insert a space. Eg \v{s} -> {\v s}
676	$text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
677
678	# only do if the text contains a '\' character.
679	if ($text =~ m\|\\\|) {
680	# "normal" accents - ie non-alpha latex tag
681	while ($text =~ m@\\([`'="^~\.])([\w])@) {
682	my $tex="$1$2"; my $char="$2";
683	my $replacement=$utf8_chars{$tex};
684	if (!defined($replacement)) {
685	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
686	$replacement=$char;
687	}
688	$text =~ s/\\\Q$tex/$replacement/g;
689	}
690
691	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
692	# only do the change if immediately followed by a space, }, {, or \
693	# one letter accents ( + ss)
694	while ($text =~ m@\\([DdhiLlOoTt]\|ss)[{}\s\"\\]@) {
695	my $tex=$1;
696	my $replacement=$special_utf8_chars{$tex};
697	if (!defined($replacement)) {
698	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
699	$replacement=$tex;
700	}
701	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
702
703	}
704
705	# one letter latex accent commands that affect following letter
706	while ($text =~ m@\\([uvcH]) ([\w])@) {
707	my $tex="$1 $2"; my $char="$2";
708	my $replacement=$special_utf8_chars{$tex};
709	if (!defined($replacement)) {
710	print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
711	$replacement=$char;
712	}
713	$text =~ s/\\$tex/$replacement/g;
714	}
715	}
716
717	# escape html-sensitive characters
718	$text =~ s@&@&@g;
719	$text =~ s@<@<@g;
720	$text =~ s@>@>@g;
721	$text =~ s/''/"/g; # Latex-specific
722	$text =~ s/``/"/g; # Latex-specific
723	# greenstone-specific
724	$text =~ s@\[@&\#91;@g;
725	$text =~ s@\]@&\#93;@g;
726
727	# remove latex commands
728
729	# explicitly recognised commands
730	$text =~ s@\\ldots@…@g;
731
732	# maths mode
733	$text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
734
735	# remove all other commands with optional arguments...
736	$text =~ s@\\\w+(\[.?\])?\s@@g;
737	# $text =~ s@\\noopsort{[^}]+\}@@g;
738	# $text =~ s@\\\w+{(([^}][^\\]))}@$1@g; # all other commands
739
740	# remove latex groupings { } (but not \{ or \} )
741	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
742	$text =~ s/^\{//; # remove { if first char
743
744	# latex characters
745	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
746	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
747	# optional break "\-"
748	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
749	# the non-macro bits have quotes around them - we just remove quotes
750	$text =~ s/([^&])[\"\#]/$1/g;
751	}
752	# dashes. Convert (m\|n)-dash into single dash for html.
753	$text =~ s@\-\-+@\-@g;
754
755	# quoted { } chars
756	$text =~ s@\\{@{@g;
757	$text =~ s@\\}@}@g;
758
759	# finally to protect against macro language...
760	$text =~ s@\\@\\\\@g;
761
762	return $text;
763	}
764
765
766	sub process_latex_math {
767	my $text = shift;
768
769	$text =~ s@\\infty@infinity@g; # or unicode 0x221E...
770	$text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
771	$text =~ s@\^([^\{])@<sup>$1</sup>@g;
772	$text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
773	$text =~ s@\_([^\{])@<sub>$1</sub>@g;
774
775	# put all other command names in italics
776	$text =~ s@\\([\w]+)@<i>$1</i>@g;
777
778	return $text;
779	}
780
781	sub set_OID {
782	my $self = shift (@_);
783	my ($doc_obj, $id, $segment_number) = @_;
784
785	if ( $self->{'key'} eq "default") {
786	$doc_obj->set_OID("$id\_$segment_number");
787	} else {
788	$doc_obj->set_OID($self->{'key'});
789	}
790	}
791
792	1;
793
794

Note: See TracBrowser for help on using the repository browser.

Download in other formats: