Context Navigation

source: trunk/gsdl/perllib/plugins/LaTeXPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 23.1 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlug.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlug;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use BasPlug;
36	use parsargv;
37	use unicode;
38	use util;
39
40	my $arguments = [
41	];
42	my $options={ 'name' => 'LaTeXPlug',
43	'desc' => '{LaTeXPlug.desc}',
44	'abstract' => 'no',
45	'inherits' => 'yes' };
46	sub BEGIN {
47	@LaTeXPlug::ISA = ('BasPlug');
48	}
49
50	sub print_usage {
51	print STDERR "\n usage: plugin LaTeXPlug [options]\n\n";
52	}
53
54	sub new {
55	my ($class) = shift (@_);
56	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57	push(@$pluginlist, $class);
58
59	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62	my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
63
64	$self->{'aux_files'} = {};
65	$self->{'dir_num'} = 0;
66	$self->{'file_num'} = 0;
67	return bless $self, $class;
68	}
69
70
71	sub get_default_process_exp {
72	my $self = shift (@_);
73	return q^\.tex$^;
74	}
75
76	sub get_default_block_exp {
77	# assume any .eps files are part of the latex stuff
78	return '\.(?:eps)$';
79	}
80
81
82	sub process {
83	my $self = shift (@_);
84	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
85
86	my $start=substr($$textref, 0, 200); # first 200 bytes
87
88	if ($start !~ m~\\ (?:documentclass \| documentstyle \| input \| section
89	\| chapter \| contents \| begin) ~x) {
90	# this doesn't look like latex...
91	return undef;
92	}
93	my $outhandle = $self->{'outhandle'};
94	if ($gli) {
95	print STDERR "<Processing n='$file' p='LaTeXPlug'>\n";
96	} elsif ($self->{'verbosity'} > 1) {
97	print $outhandle "LaTeXPlug: processing $file\n"
98	}
99	my $cursection = $doc_obj->get_top_section();
100
101	###### clean up text ######
102	$$textref =~ s/\r$//mg; # remove dos ^M
103	$$textref =~ s/%.*$//mg; # remove comments
104
105	# convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
106
107	$$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
108	# check that both sides are ascii, so we don't screw up utf-8 chars
109	$$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
110	unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
111
112
113	###### find metadata ######
114
115	## FileFormat metadata ##
116	$doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
117
118	### title metadata ###
119	$$textref =~ m@\\title\s{(.?)}@s;
120	my $title = $1;
121	if (!$title) {
122	# no title tag. look for a chapter/section heading
123	$$textref =~ m@\\(?:chapter\|section)\s{(.?)}@s; # will get 1st match
124	$title = $1;
125	}
126	if (!$title) {
127	# no chapter/section heading tags either... use filename
128	$title = $file;
129	$title =~ s/\.tex$//i;
130	$title =~ s/[-_.]/ /g; # turn punctuation into spaces
131	}
132	if ($title) {
133	$title =~ s@\\\\@ @g; # embedded newlines
134	$title = $self->process_latex($title); # no "-html" for title eg in browser
135	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
136	}
137
138	### creator/individual author metadata ###
139	$$textref =~ m@\\author\s{((?:{.?}\|.?)+)}\s$@sm;
140	my $authors=$1;
141	if ($authors) {
142	# take care of "et al."...
143	$authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
144	my $etal=$1;
145	$etal="" if (!defined ($etal));
146
147	my @authorlist=parse_authors($self, $authors);
148
149	foreach my $author (@authorlist) {
150	# Add each name to set of Authors
151	$doc_obj->add_utf8_metadata ($cursection, "Author", $author);
152	}
153
154	# Only want at most one "and" in the Creator field
155	my $creator_str="";
156	if (scalar(@authorlist) > 2) {
157	my $lastauthor=pop @authorlist;
158	$creator_str=join(', ', @authorlist);
159	$creator_str.=" and $lastauthor";
160	} else { # 1 or 2 authors...
161	$creator_str=join(" and ",@authorlist);
162	}
163	$creator_str.=$etal; # if there was "et al."
164	$doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
165	}
166	### end of author metadata ###
167
168	###### process latex for the main text ######
169	$$textref =~ s/^.*?\\begin{document}//s;
170	$$textref =~ s/\\end{document}.*?$//s;
171	$$textref = $self->process_latex("-html",$$textref);
172	$doc_obj->add_utf8_text($cursection, $$textref);
173
174	return 1;
175	}
176
177
178	# returns a list of author names
179	sub parse_authors {
180	my $self=shift;
181	my $authors=shift;
182
183	my $outhandle=$self->{'outhandle'};
184
185	$authors =~ s/\n/ /g; # remove newlines
186
187	# some people do this for affiliation footnote/dagger
188	$authors =~ s@\$.*?\$@@g; # remove maths from author :(
189
190	# und here for german language...
191	# don't use brackets in pattern, else the matched bit becomes
192	# an element in the list!
193	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $authors);
194	my @formattedlist = ();
195	foreach my $author (@authorlist) {
196	$author =~ s/\s*$//;
197	$author =~ s/^\s*//;
198	# Reformat and add author name
199	next if $author=~ /^\s*$/;
200
201	# names are "First von Last", "von Last, First"
202	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
203	my $first="";
204	my $vonlast="";
205	my $jr="";
206
207	if ($author =~ /,/) {
208	my @parts=split(/,\s*/, $author);
209	$first = pop @parts;
210	if (scalar(@parts) == 2) {
211	$jr = pop @parts;
212	}
213	$vonlast=shift @parts;
214	if (scalar(@parts) > 0) {
215	print $outhandle $self->{'plugin_type'} .
216	": couldn't parse name $author\n";
217	# but we continue anyway...
218	}
219	} else { # First von Last
220	my @words = split(/ /, $author);
221	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
222	$first .= " " . shift (@words);
223	}
224	$first =~ s/^\s//;
225	$vonlast = join (' ', @words); # whatever's left...
226	}
227	my $von="";
228	my $last="";
229	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
230	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
231	$von = $1;
232	if (!defined ($von)) {
233	# some non-English names do start with lowercase
234	# eg "Marie desJardins". Also we can get typos...
235	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
236	$von="";
237	if ($vonlast =~ /^[a-z]+$/) {
238	# if it's all lowercase, uppercase 1st.
239	$vonlast =~ s/^(.)/\u$1/;
240	}
241	}
242	$von =~ s/\s*$//;
243	$last=$vonlast;
244	} else {
245	$last=$vonlast;
246	}
247	my $wholename="$first $von $last $jr";
248	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
249	# my $fullname = "$last";
250	# $fullname .= " $jr" if ($jr);
251	# $fullname .= ", $first";
252	# $fullname .= " $von" if ($von);
253	push (@formattedlist, $wholename);
254	}
255	return @formattedlist;
256	}
257
258
259	## following functions based on bibtex plugin ##
260	# not actually used at the moment, but might be useful in future?
261	sub expand_month {
262	my $text=shift;
263
264	# bibtex style files expand abbreviations for months.
265	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
266	$text =~ s/jan/_textmonth01_/g;
267	$text =~ s/feb/_textmonth02_/g;
268	$text =~ s/mar/_textmonth03_/g;
269	$text =~ s/apr/_textmonth04_/g;
270	$text =~ s/may/_textmonth05_/g;
271	$text =~ s/jun/_textmonth06_/g;
272	$text =~ s/jul/_textmonth07_/g;
273	$text =~ s/aug/_textmonth08_/g;
274	$text =~ s/sep/_textmonth09_/g;
275	$text =~ s/oct/_textmonth10_/g;
276	$text =~ s/nov/_textmonth11_/g;
277	$text =~ s/dec/_textmonth12_/g;
278
279	return $text;
280	}
281
282
283	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
284	# as the first argument to this function.
285	#
286	# Convert accented characters, remove { }, interprete some commands....
287	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
288
289	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
290	# match { with the corresponding }, only the nearest }
291
292	sub process_latex {
293	my $self=shift;
294	my $text=shift;
295
296	my $outhandle=$self->{'outhandle'};
297
298	my $html_markup=0;
299	if ($text =~ /^\-html/) {
300	$html_markup=1;
301	$text=shift;
302	}
303
304	if (! $text) {
305	return $text;
306	}
307	# escape html-sensitive characters
308	$text =~ s@&@&@g;
309	$text =~ s@<@<@g;
310	$text =~ s@>@>@g;
311
312	# do this before accents, since \= means something different in tabbing
313	# also \> is a tab stop too, and \\ is newline
314	sub do_tabbing {
315	my $tabbing=shift;
316	$tabbing =~ s!^.\\kill\s$!!mg; # \kill sets tab stops, kills line
317	$tabbing =~ s~\\(?:=\|>)~\xc2\xa0~g; # replace with nbsp
318	$tabbing =~ s~[\\][\\](?:\[.?\])?\s$~<br/>~mg;
319	return "<br/>" . $tabbing . "<br/>\n";
320	}
321	$text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
322	sub do_tabular {
323	my $tabular=shift;
324	$tabular =~ s~(?<!\\)\s&\s~</td><td>~g;
325	$tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
326	$tabular =~ s~\\hline~~g; # for now...
327	$tabular =~ s~<td>\s\\multicolumn{(\d+)}{.?}~<td colspan="$1">~g;
328	return "<table border=\"1\">\n <tr><td>"
329	. $tabular . "</td></tr></table>\n";
330	}
331	$text =~ s@\\begin{tabular}(?:\[.?\])?{.?}(.*?)\\end{tabular} @
332	do_tabular($1) @xges;
333
334	$text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
335
336	# process maths mode before accents... things like \, mean different!
337	# maths mode
338	$text =~ s@\$\$(.*?)\$\$
339	@ process_latex_math($html_markup,$1)
340	@xsge; # multi-line maths: $$ .... $$
341
342	$text =~ s@([^\\])\$(.*?[^\\])\$
343	@$1.process_latex_math($html_markup,$2)@xsge;
344
345
346	# is this an amstext environment, or just custom for that input file?
347	$text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
348
349	# convert latex-style accented characters.
350	$self->latex_accents_to_utf8(\$text);
351
352	# replace quotes with utf-8
353
354	$text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
355	$text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
356	$text =~ s/`/\xe2\xc0\x98/g; # single left quote
357	$text =~ s/'/\xe2\xc0\x99/g; # single right quote
358
359	###### remove/replace latex commands ######
360	### commands that expand to something that gets displayed ###
361	$text =~ s~\\ldots~…~g;
362	$text =~ s~\\hrule~<hr/>\n~g;
363	$text =~ s~\\maketitle~ ~;
364	### space commands ###
365	$text =~ s~\\[vh]skip\s+\S+~~g;
366	$text =~ s~\\vspace\?{.?}~<div> </div>~g; # vertical space
367	$text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
368	$text =~ s~\\noindent\b~~g;
369	# newpage, etc
370	$text =~ s~\\(?:clearemptydoublepage\|newpage)~~g;
371	### counters, contents, environments, labels, etc ###
372	$text =~ s~\\(?:addcontentsline){.?}{.?}{.*}~~g;
373	$text =~ s~\s\\begin{itemize}\s~\n<ul>\n~g;
374	$text =~ s~\s\\end{itemize}\s~</li></ul>\n~g;
375	$text =~ s~\s\\begin{enumerate}\s~<ol>\n~g;
376	$text =~ s~\s\\end{enumerate}\s~</li></ol>\n~g;
377	if ($text =~ s~\s*\\item~</li>\n<li>~g) {
378	# (count for first list item)
379	$text =~ s~<([ou])l>\s</li>\s~<$1l>~g;
380	}
381	$text =~ s~\\(?:label\|begin\|end){.?}\s\n?~ ~g; # remove tag and contents
382	$text =~ s~\\(?:tableofcontents\|listoffigures)~ ~g;
383	### font sizes/styles ###
384	$text =~ s~\\(?:tiny\|small\|footnotesize\|normalsize\|large\|Large\|huge\|Huge)\b~~g;
385
386	if ($html_markup) {
387	$text =~ s~\\section\?{([^\}]+)}\s\n?~<H1>$1</H1>\n~g;
388	$text =~ s~\\subsection\?{(.?)}\s*\n?~<H2>$1</H2>\n~g;
389	$text =~ s~{\\tt\s(.?)}~<tt>$1</tt>~g;
390	$text =~ s~\\(?:texttt\|tt\|ttseries)\s{(.?)}~<tt>$1</tt>~g;
391	$text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
392	$text =~ s~{\\(?:em\|it)\s(.?)}~<em>$1</em>~g;
393	$text =~ s~{\\(?:bf\|bfseries)\s(.?)}~<strong>$1</strong>~g;
394	$text =~ s~\\(?:textbf\|bf\|bfseries)\s{(.?)}~<strong>$1</strong>~g;
395	} else {
396	# remove tags for text-only
397	$text =~ s~\\(?:textbf\|bf\|bfseries\|em\|emph\|tt\|rm\|texttt)\b~ ~g;
398	}
399	$text =~ s ~ {\\sc\s+(.*?)} ~
400	{<span style="font-variant:\ small-caps">$1</span>} ~gx;
401	# ignore these font tags (if there are any left)
402	# sf is sans-serif
403	$text =~ s~\\(?:mdseries\|textmd\|bfseries\|textbf\|sffamily\|sf\|sc)\b~ ~;
404	#### end font-related stuff ####
405
406	### remove all other commands with optional arguments... ###
407	# don't remove commands without { }....
408	# $text =~ s~\\\w+(\[.?\])?\s~~g;
409	# $text =~ s~\\noopsort{[^}]+\}~~g;
410	# verbatim
411	$text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
412	# remove tags, keep contents for \tag[optional]{contents}
413	while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
414
415	# remove latex groupings { } (but not \{ or \} )
416	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
417	$text =~ s/^\{//; # remove { if first char
418
419	# latex characters
420	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
421	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
422	# optional break "\-"
423	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
424	# the non-macro bits have quotes around them - we just remove quotes
425	# XXX bibtex and latex differ here (for the '#' char)...
426	$text =~ s/([^&])[\"\#]/$1/g;
427	}
428	# dashes. Convert (m\|n)-dash into single dash for html.
429	$text =~ s~\-\-+~\-~g;
430
431	# quoted { } chars
432	$text =~ s~\\{~{~g;
433	$text =~ s~\\}~}~g;
434
435	# spaces
436	$text =~ s~\\ ~ ~g;
437
438	# finally to protect against macro language...
439	# greenstone-specific
440	$text =~ s~\[~&\#91;~g;
441	$text =~ s~\]~&\#93;~g;
442	$text =~ s~(?<!\\)([\\_])~\\$1~g;
443
444	if ($html_markup) {
445	$text =~ s~\n{2,}~\n</p>\n<p>~g;
446	return "<p>$text</p>";
447	}
448
449	return $text;
450	}
451
452	# only used by process_latex for \verb....
453	sub verb_text {
454	my $verbatim=shift;
455	$verbatim =~ s/([{}_])/\\$1/g;
456	return $verbatim;
457	}
458	# only used by process_latex_math
459	# returns a unicode char if applicable, otherwise ascii
460	sub math_fraction {
461	my $num=$1;
462	my $denom=$2;
463
464	if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
465	if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
466	if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
467	return "$num/$denom";
468	}
469
470	sub process_latex_math {
471
472	my $text = pop; # if given one or two args, this is the last one...
473	my $html_markup=pop; # if given two args, this is the first one else undef
474
475	$text =~ s~\\,~ ~g; # forces a space?
476	$text =~ s~\\infty~infinity~g; # or unicode 0x221E...
477
478	# use this one when more things can read 3-byte utf8 values like this!
479	# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
480	$text =~ s~\\cup\b~ U ~g;
481
482	$text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
483
484	if ($html_markup) {
485	$text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
486	$text =~ s~\^([^\{])~<sup>$1</sup>~g;
487	$text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
488	$text =~ s~\_([^\{])~<sub>$1</sub>~g;
489
490	$text =~ s~\\ldots~…~g; # use html named entity for now
491
492	# put all other command names in italics for now
493	$text =~ s~\\([\w]+)~<i>$1</i> ~g;
494	}
495
496	# special cases, for some input files
497	if ($text =~ m~^\\\w+$~) {
498	$text="{" . $text . "}";
499	}
500
501	return $text;
502	}
503
504
505
506	sub latex_accents_to_utf8 {
507
508	# note - this is really ugly, but it works. There may be a prettier way
509	# of mapping latex accented chars to utf8, but we just brute force it here.
510	# Also, this isn't complete - not every single possible accented letter
511	# is in here yet, but most of the common ones are.
512
513	my %utf8_chars =
514	(
515	# acutes
516	'\'a' => chr(0xc3).chr(0xa1),
517	'\'c' => chr(0xc4).chr(0x87),
518	'\'e' => chr(0xc3).chr(0xa9),
519	'\'i' => chr(0xc3).chr(0xad),
520	'\'l' => chr(0xc3).chr(0xba),
521	'\'n' => chr(0xc3).chr(0x84),
522	'\'o' => chr(0xc3).chr(0xb3),
523	'\'r' => chr(0xc5).chr(0x95),
524	'\'s' => chr(0xc5).chr(0x9b),
525	'\'u' => chr(0xc3).chr(0xba),
526	'\'y' => chr(0xc3).chr(0xbd),
527	'\'z' => chr(0xc5).chr(0xba),
528	# graves
529	'`a' => chr(0xc3).chr(0xa0),
530	'`A' => chr(0xc3).chr(0x80),
531	'`e' => chr(0xc3).chr(0xa8),
532	'`E' => chr(0xc3).chr(0x88),
533	'`i' => chr(0xc3).chr(0xac),
534	'`I' => chr(0xc3).chr(0x8c),
535	'`o' => chr(0xc3).chr(0xb2),
536	'`O' => chr(0xc3).chr(0x92),
537	'`u' => chr(0xc3).chr(0xb9),
538	'`U' => chr(0xc3).chr(0x99),
539	# circumflex
540	'^a' => chr(0xc3).chr(0xa2),
541	'^A' => chr(0xc3).chr(0x82),
542	'^c' => chr(0xc4).chr(0x89),
543	'^C' => chr(0xc4).chr(0x88),
544	'^e' => chr(0xc3).chr(0xaa),
545	'^E' => chr(0xc3).chr(0x8a),
546	'^g' => chr(0xc4).chr(0x9d),
547	'^G' => chr(0xc4).chr(0x9c),
548	'^h' => chr(0xc4).chr(0xa5),
549	'^H' => chr(0xc4).chr(0xa4),
550	'^i' => chr(0xc3).chr(0xae),
551	'^I' => chr(0xc3).chr(0x8e),
552	'^j' => chr(0xc4).chr(0xb5),
553	'^J' => chr(0xc4).chr(0xb4),
554	'^o' => chr(0xc3).chr(0xb4),
555	'^O' => chr(0xc3).chr(0x94),
556	'^s' => chr(0xc5).chr(0x9d),
557	'^S' => chr(0xc5).chr(0x9c),
558	'^u' => chr(0xc3).chr(0xa2),
559	'^U' => chr(0xc3).chr(0xbb),
560	'^w' => chr(0xc5).chr(0xb5),
561	'^W' => chr(0xc5).chr(0xb4),
562	'^y' => chr(0xc5).chr(0xb7),
563	'^Y' => chr(0xc5).chr(0xb6),
564
565	# diaeresis
566	'"a' => chr(0xc3).chr(0xa4),
567	'"A' => chr(0xc3).chr(0x84),
568	'"e' => chr(0xc3).chr(0xab),
569	'"E' => chr(0xc3).chr(0x8b),
570	'"i' => chr(0xc3).chr(0xaf),
571	'"I' => chr(0xc3).chr(0x8f),
572	'"\\\\i' => chr(0xc3).chr(0xaf),
573	'"\\\\I' => chr(0xc3).chr(0x8f),
574	'"o' => chr(0xc3).chr(0xb6),
575	'"O' => chr(0xc3).chr(0x96),
576	'"u' => chr(0xc3).chr(0xbc),
577	'"U' => chr(0xc3).chr(0x9c),
578	'"y' => chr(0xc3).chr(0xbf),
579	'"Y' => chr(0xc3).chr(0xb8),
580	# tilde
581	'~A' => chr(0xc3).chr(0x83),
582	'~N' => chr(0xc3).chr(0x91),
583	'~O' => chr(0xc3).chr(0x95),
584	'~a' => chr(0xc3).chr(0xa3),
585	'~n' => chr(0xc3).chr(0xb1),
586	'~o' => chr(0xc3).chr(0xb5),
587	# caron - handled specially
588	# double acute
589	# ring
590	# dot
591	'.c' => chr(0xc4).chr(0x8b),
592	'.C' => chr(0xc4).chr(0x8a),
593	'.e' => chr(0xc4).chr(0x97),
594	'.E' => chr(0xc4).chr(0x96),
595	'.g' => chr(0xc4).chr(0xa1),
596	'.G' => chr(0xc4).chr(0xa0),
597	'.I' => chr(0xc4).chr(0xb0),
598	'.z' => chr(0xc5).chr(0xbc),
599	'.Z' => chr(0xc5).chr(0xbb),
600	# macron
601	'=a' => chr(0xc4).chr(0x81),
602	'=A' => chr(0xc4).chr(0x80),
603	'=e' => chr(0xc4).chr(0x93),
604	'=E' => chr(0xc4).chr(0x92),
605	'=i' => chr(0xc4).chr(0xab),
606	'=I' => chr(0xc4).chr(0xaa),
607	'=o' => chr(0xc4).chr(0x8d),
608	'=O' => chr(0xc4).chr(0x8c),
609	'=u' => chr(0xc4).chr(0xab),
610	'=U' => chr(0xc4).chr(0xaa),
611
612	# stroke - handled specially - see below
613
614	# cedilla - handled specially
615	);
616
617	# these are one letter latex commands - we make sure they're not a longer
618	# command name. eg {\d} is d+stroke, so careful of \d
619	my %special_utf8_chars =
620	(
621	# breve
622	'u g' => chr(0xc4).chr(0x9f),
623	'u G' => chr(0xc4).chr(0x9e),
624	'u i' => chr(0xc4).chr(0xad),
625	'u I' => chr(0xc4).chr(0xac),
626	'u o' => chr(0xc5).chr(0x8f),
627	'u O' => chr(0xc5).chr(0x8e),
628	'u u' => chr(0xc5).chr(0xad),
629	'u U' => chr(0xc5).chr(0xac),
630	'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
631	'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
632	# caron
633	'v c' => chr(0xc4).chr(0x8d),
634	'v C' => chr(0xc4).chr(0x8c),
635	'v n' => chr(0xc5).chr(0x88),
636	'v N' => chr(0xc5).chr(0x87),
637	'v s' => chr(0xc5).chr(0xa1),
638	'v S' => chr(0xc5).chr(0xa5),
639	'v z' => chr(0xc5).chr(0xbe),
640	'v Z' => chr(0xc5).chr(0xbd),
641	# cedilla
642	'c c' => chr(0xc3).chr(0xa7),
643	'c C' => chr(0xc3).chr(0x87),
644	'c g' => chr(0xc4).chr(0xa3),
645	'c G' => chr(0xc4).chr(0xa2),
646	'c k' => chr(0xc4).chr(0xb7),
647	'c K' => chr(0xc4).chr(0xb6),
648	'c l' => chr(0xc4).chr(0xbc),
649	'c L' => chr(0xc4).chr(0xbb),
650	'c n' => chr(0xc5).chr(0x86),
651	'c N' => chr(0xc5).chr(0x85),
652	'c r' => chr(0xc5).chr(0x97),
653	'c R' => chr(0xc5).chr(0x96),
654	'c s' => chr(0xc5).chr(0x9f),
655	'c S' => chr(0xc5).chr(0x9e),
656	'c t' => chr(0xc5).chr(0xa3),
657	'c T' => chr(0xc5).chr(0xa2),
658	# double acute / Hungarian accent
659	'H O' => chr(0xc5).chr(0x90),
660	'H o' => chr(0xc5).chr(0x91),
661	'H U' => chr(0xc5).chr(0xb0),
662	'H u' => chr(0xc5).chr(0xb1),
663
664	# stroke
665	'd' => chr(0xc4).chr(0x91),
666	'D' => chr(0xc4).chr(0x90),
667	'h' => chr(0xc4).chr(0xa7),
668	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
669	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
670	'l' => chr(0xc5).chr(0x82),
671	'L' => chr(0xc5).chr(0x81),
672	'o' => chr(0xc3).chr(0xb8),
673	'O' => chr(0xc3).chr(0x98),
674	't' => chr(0xc5).chr(0xa7),
675	'T' => chr(0xc5).chr(0xa6),
676
677	# other special chars
678	'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
679	'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
680	);
681
682	my $self=shift;
683	my $textref=shift;
684
685	my $outhandle=$self->{'outhandle'};
686	my $text=$$textref;
687
688	# remove space (if any) between \ and letter to accent (eg {\' a})
689	$text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
690
691	# remove {} around a single character (eg \'{e})
692	$text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
693
694	## only in bibtex... not in latex proper?!
695	### \, is another way of doing cedilla \c
696	##$text =~ s~\\,(.)~\\c $1~g;
697
698	# remove {} around a single character for special 1 letter commands -
699	# need to insert a space. Eg \v{s} -> {\v s}
700	$text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
701
702	# only do if the text contains a '\' character.
703	if ($text =~ m\|\\\|) {
704	# "normal" accents - ie non-alpha latex tag
705	# xxx used to have ([\w]\b)@ (for word boundary)
706	while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
707	my $tex="$1$2"; my $char="$2";
708	my $replacement=$utf8_chars{$tex};
709	if (!defined($replacement)) {
710	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
711	print $outhandle . $self->{'plugin_type'} .
712	": Warning: unknown latex accent \"$tex\""
713	. " in \"$1\"\n";
714	$replacement=$char;
715	}
716	$text =~ s/\\\Q$tex/$replacement/g;
717	}
718
719	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
720	# only do the change if immediately followed by a space, }, {, or \
721	# one letter accents ( + ss / aa)
722	while ($text =~ m~\\([DdhiLlOoTt]\|ss\|aa)[{}\s\"\\]~) {
723	my $tex=$1;
724	my $replacement=$special_utf8_chars{$tex};
725	if (!defined($replacement)) {
726	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
727	print $outhandle $self->{'plugin_type'} .
728	": Warning: unknown latex accent \"$tex\""
729	. " in \"$1\"\n";
730	$replacement=$tex;
731	}
732	($text =~ s/{\\$tex}/$replacement/g) or
733	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
734
735	}
736
737	# one letter latex accent commands that affect following letter
738	while ($text =~ m~\\([uvcH]) ([\w])~) {
739	my $tex="$1 $2"; my $char="$2";
740	my $replacement=$special_utf8_chars{$tex};
741	if (!defined($replacement)) {
742	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
743	print $outhandle $self->{'plugin_type'} .
744	": Warning: unknown latex accent \"$tex\""
745	. " in \"$1\"\n";
746	$replacement=$char;
747	}
748	$text =~ s/\\$tex/$replacement/g;
749	}
750	}
751	$textref=\$text;
752	}
753
754
755	# modules must return true
756	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: