Context Navigation

source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm@ 23484

Last change on this file since 23484 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 23.0 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlugin.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlugin;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use ReadTextFile;
36	use unicode;
37	use util;
38
39	my $arguments =
40	[ { 'name' => "process_exp",
41	'desc' => "{BasePlugin.process_exp}",
42	'type' => "regexp",
43	'reqd' => "no",
44	'deft' => &get_default_process_exp() } ];
45
46	my $options = { 'name' => "LaTeXPlugin",
47	'desc' => "{LaTeXPlugin.desc}",
48	'abstract' => "no",
49	'inherits' => "yes",
50	'args' => $arguments };
51
52	sub BEGIN {
53	@LaTeXPlugin::ISA = ('ReadTextFile');
54	}
55
56	sub new {
57	my ($class) = shift (@_);
58	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59	push(@$pluginlist, $class);
60
61	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62	push(@{$hashArgOptLists->{"OptList"}},$options);
63
64	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
65
66	$self->{'aux_files'} = {};
67	$self->{'dir_num'} = 0;
68	$self->{'file_num'} = 0;
69	return bless $self, $class;
70	}
71
72
73	sub get_default_process_exp {
74	my $self = shift (@_);
75	return q^\.tex$^;
76	}
77
78	sub get_default_block_exp {
79	# assume any .eps files are part of the latex stuff
80	return '\.(?:eps)$';
81	}
82
83
84	sub process {
85	my $self = shift (@_);
86	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
87
88	my $start=substr($$textref, 0, 200); # first 200 bytes
89
90	if ($start !~ m~\\ (?:documentclass \| documentstyle \| input \| section
91	\| chapter \| contents \| begin) ~x) {
92	# this doesn't look like latex...
93	return undef;
94	}
95	my $outhandle = $self->{'outhandle'};
96
97	my $cursection = $doc_obj->get_top_section();
98
99	###### clean up text ######
100	$$textref =~ s/\r$//mg; # remove dos ^M
101	$$textref =~ s/%.*$//mg; # remove comments
102
103	# convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
104
105	$$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
106	# check that both sides are ascii, so we don't screw up utf-8 chars
107	$$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
108	unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
109
110
111	###### find metadata ######
112
113	## FileFormat metadata ##
114	$doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
115
116	### title metadata ###
117	$$textref =~ m@\\title\s{(.?)}@s;
118	my $title = $1;
119	if (!$title) {
120	# no title tag. look for a chapter/section heading
121	$$textref =~ m@\\(?:chapter\|section)\s{(.?)}@s; # will get 1st match
122	$title = $1;
123	}
124	if (!$title) {
125	# no chapter/section heading tags either... use filename
126	$title = $file;
127	$title =~ s/\.tex$//i;
128	$title =~ s/[-_.]/ /g; # turn punctuation into spaces
129	}
130	if ($title) {
131	$title =~ s@\\\\@ @g; # embedded newlines
132	$title = $self->process_latex($title); # no "-html" for title eg in browser
133	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
134	}
135
136	### creator/individual author metadata ###
137	$$textref =~ m@\\author\s{((?:{.?}\|.?)+)}\s$@sm;
138	my $authors=$1;
139	if ($authors) {
140	# take care of "et al."...
141	$authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
142	my $etal=$1;
143	$etal="" if (!defined ($etal));
144
145	my @authorlist=parse_authors($self, $authors);
146
147	foreach my $author (@authorlist) {
148	# Add each name to set of Authors
149	$doc_obj->add_utf8_metadata ($cursection, "Author", $author);
150	}
151
152	# Only want at most one "and" in the Creator field
153	my $creator_str="";
154	if (scalar(@authorlist) > 2) {
155	my $lastauthor=pop @authorlist;
156	$creator_str=join(', ', @authorlist);
157	$creator_str.=" and $lastauthor";
158	} else { # 1 or 2 authors...
159	$creator_str=join(" and ",@authorlist);
160	}
161	$creator_str.=$etal; # if there was "et al."
162	$doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
163	}
164	### end of author metadata ###
165
166	###### process latex for the main text ######
167	$$textref =~ s/^.*?\\begin{document}//s;
168	$$textref =~ s/\\end{document}.*?$//s;
169	$$textref = $self->process_latex("-html",$$textref);
170	$doc_obj->add_utf8_text($cursection, $$textref);
171
172	return 1;
173	}
174
175
176	# returns a list of author names
177	sub parse_authors {
178	my $self=shift;
179	my $authors=shift;
180
181	my $outhandle=$self->{'outhandle'};
182
183	$authors =~ s/\n/ /g; # remove newlines
184
185	# some people do this for affiliation footnote/dagger
186	$authors =~ s@\$.*?\$@@g; # remove maths from author :(
187
188	# und here for german language...
189	# don't use brackets in pattern, else the matched bit becomes
190	# an element in the list!
191	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $authors);
192	my @formattedlist = ();
193	foreach my $author (@authorlist) {
194	$author =~ s/\s*$//;
195	$author =~ s/^\s*//;
196	# Reformat and add author name
197	next if $author=~ /^\s*$/;
198
199	# names are "First von Last", "von Last, First"
200	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
201	my $first="";
202	my $vonlast="";
203	my $jr="";
204
205	if ($author =~ /,/) {
206	my @parts=split(/,\s*/, $author);
207	$first = pop @parts;
208	if (scalar(@parts) == 2) {
209	$jr = pop @parts;
210	}
211	$vonlast=shift @parts;
212	if (scalar(@parts) > 0) {
213	print $outhandle $self->{'plugin_type'} .
214	": couldn't parse name $author\n";
215	# but we continue anyway...
216	}
217	} else { # First von Last
218	my @words = split(/ /, $author);
219	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
220	$first .= " " . shift (@words);
221	}
222	$first =~ s/^\s//;
223	$vonlast = join (' ', @words); # whatever's left...
224	}
225	my $von="";
226	my $last="";
227	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
228	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
229	$von = $1;
230	if (!defined ($von)) {
231	# some non-English names do start with lowercase
232	# eg "Marie desJardins". Also we can get typos...
233	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
234	$von="";
235	if ($vonlast =~ /^[a-z]+$/) {
236	# if it's all lowercase, uppercase 1st.
237	$vonlast =~ s/^(.)/\u$1/;
238	}
239	}
240	$von =~ s/\s*$//;
241	$last=$vonlast;
242	} else {
243	$last=$vonlast;
244	}
245	my $wholename="$first $von $last $jr";
246	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
247	# my $fullname = "$last";
248	# $fullname .= " $jr" if ($jr);
249	# $fullname .= ", $first";
250	# $fullname .= " $von" if ($von);
251	push (@formattedlist, $wholename);
252	}
253	return @formattedlist;
254	}
255
256
257	## following functions based on bibtex plugin ##
258	# not actually used at the moment, but might be useful in future?
259	sub expand_month {
260	my $text=shift;
261
262	# bibtex style files expand abbreviations for months.
263	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
264	$text =~ s/jan/_textmonth01_/g;
265	$text =~ s/feb/_textmonth02_/g;
266	$text =~ s/mar/_textmonth03_/g;
267	$text =~ s/apr/_textmonth04_/g;
268	$text =~ s/may/_textmonth05_/g;
269	$text =~ s/jun/_textmonth06_/g;
270	$text =~ s/jul/_textmonth07_/g;
271	$text =~ s/aug/_textmonth08_/g;
272	$text =~ s/sep/_textmonth09_/g;
273	$text =~ s/oct/_textmonth10_/g;
274	$text =~ s/nov/_textmonth11_/g;
275	$text =~ s/dec/_textmonth12_/g;
276
277	return $text;
278	}
279
280
281	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
282	# as the first argument to this function.
283	#
284	# Convert accented characters, remove { }, interprete some commands....
285	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
286
287	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
288	# match { with the corresponding }, only the nearest }
289
290	sub process_latex {
291	my $self=shift;
292	my $text=shift;
293
294	my $outhandle=$self->{'outhandle'};
295
296	my $html_markup=0;
297	if ($text =~ /^\-html/) {
298	$html_markup=1;
299	$text=shift;
300	}
301
302	if (! $text) {
303	return $text;
304	}
305	# escape html-sensitive characters
306	$text =~ s@&@&@g;
307	$text =~ s@<@<@g;
308	$text =~ s@>@>@g;
309
310	# do this before accents, since \= means something different in tabbing
311	# also \> is a tab stop too, and \\ is newline
312	sub do_tabbing {
313	my $tabbing=shift;
314	$tabbing =~ s!^.\\kill\s$!!mg; # \kill sets tab stops, kills line
315	$tabbing =~ s~\\(?:=\|>)~\xc2\xa0~g; # replace with nbsp
316	$tabbing =~ s~[\\][\\](?:\[.?\])?\s$~<br/>~mg;
317	return "<br/>" . $tabbing . "<br/>\n";
318	}
319	$text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
320	sub do_tabular {
321	my $tabular=shift;
322	$tabular =~ s~(?<!\\)\s&\s~</td><td>~g;
323	$tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
324	$tabular =~ s~\\hline~~g; # for now...
325	$tabular =~ s~<td>\s\\multicolumn{(\d+)}{.?}~<td colspan="$1">~g;
326	return "<table border=\"1\">\n <tr><td>"
327	. $tabular . "</td></tr></table>\n";
328	}
329	$text =~ s@\\begin{tabular}(?:\[.?\])?{.?}(.*?)\\end{tabular} @
330	do_tabular($1) @xges;
331
332	$text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
333
334	# process maths mode before accents... things like \, mean different!
335	# maths mode
336	$text =~ s@\$\$(.*?)\$\$
337	@ process_latex_math($html_markup,$1)
338	@xsge; # multi-line maths: $$ .... $$
339
340	$text =~ s@([^\\])\$(.*?[^\\])\$
341	@$1.process_latex_math($html_markup,$2)@xsge;
342
343
344	# is this an amstext environment, or just custom for that input file?
345	$text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
346
347	# convert latex-style accented characters.
348	$self->latex_accents_to_utf8(\$text);
349
350	# replace quotes with utf-8
351
352	$text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
353	$text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
354	$text =~ s/`/\xe2\xc0\x98/g; # single left quote
355	$text =~ s/'/\xe2\xc0\x99/g; # single right quote
356
357	###### remove/replace latex commands ######
358	### commands that expand to something that gets displayed ###
359	$text =~ s~\\ldots~…~g;
360	$text =~ s~\\hrule~<hr/>\n~g;
361	$text =~ s~\\maketitle~ ~;
362	### space commands ###
363	$text =~ s~\\[vh]skip\s+\S+~~g;
364	$text =~ s~\\vspace\?{.?}~<div> </div>~g; # vertical space
365	$text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
366	$text =~ s~\\noindent\b~~g;
367	# newpage, etc
368	$text =~ s~\\(?:clearemptydoublepage\|newpage)~~g;
369	### counters, contents, environments, labels, etc ###
370	$text =~ s~\\(?:addcontentsline){.?}{.?}{.*}~~g;
371	$text =~ s~\s\\begin{itemize}\s~\n<ul>\n~g;
372	$text =~ s~\s\\end{itemize}\s~</li></ul>\n~g;
373	$text =~ s~\s\\begin{enumerate}\s~<ol>\n~g;
374	$text =~ s~\s\\end{enumerate}\s~</li></ol>\n~g;
375	if ($text =~ s~\s*\\item~</li>\n<li>~g) {
376	# (count for first list item)
377	$text =~ s~<([ou])l>\s</li>\s~<$1l>~g;
378	}
379	$text =~ s~\\(?:label\|begin\|end){.?}\s\n?~ ~g; # remove tag and contents
380	$text =~ s~\\(?:tableofcontents\|listoffigures)~ ~g;
381	### font sizes/styles ###
382	$text =~ s~\\(?:tiny\|small\|footnotesize\|normalsize\|large\|Large\|huge\|Huge)\b~~g;
383
384	if ($html_markup) {
385	$text =~ s~\\section\?{([^\}]+)}\s\n?~<H1>$1</H1>\n~g;
386	$text =~ s~\\subsection\?{(.?)}\s*\n?~<H2>$1</H2>\n~g;
387	$text =~ s~{\\tt\s(.?)}~<tt>$1</tt>~g;
388	$text =~ s~\\(?:texttt\|tt\|ttseries)\s{(.?)}~<tt>$1</tt>~g;
389	$text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
390	$text =~ s~{\\(?:em\|it)\s(.?)}~<em>$1</em>~g;
391	$text =~ s~{\\(?:bf\|bfseries)\s(.?)}~<strong>$1</strong>~g;
392	$text =~ s~\\(?:textbf\|bf\|bfseries)\s{(.?)}~<strong>$1</strong>~g;
393	} else {
394	# remove tags for text-only
395	$text =~ s~\\(?:textbf\|bf\|bfseries\|em\|emph\|tt\|rm\|texttt)\b~ ~g;
396	}
397	$text =~ s ~ {\\sc\s+(.*?)} ~
398	{<span style="font-variant:\ small-caps">$1</span>} ~gx;
399	# ignore these font tags (if there are any left)
400	# sf is sans-serif
401	$text =~ s~\\(?:mdseries\|textmd\|bfseries\|textbf\|sffamily\|sf\|sc)\b~ ~;
402	#### end font-related stuff ####
403
404	### remove all other commands with optional arguments... ###
405	# don't remove commands without { }....
406	# $text =~ s~\\\w+(\[.?\])?\s~~g;
407	# $text =~ s~\\noopsort{[^}]+\}~~g;
408	# verbatim
409	$text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
410	# remove tags, keep contents for \tag[optional]{contents}
411	while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
412
413	# remove latex groupings { } (but not \{ or \} )
414	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
415	$text =~ s/^\{//; # remove { if first char
416
417	# latex characters
418	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
419	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
420	# optional break "\-"
421	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
422	# the non-macro bits have quotes around them - we just remove quotes
423	# XXX bibtex and latex differ here (for the '#' char)...
424	$text =~ s/([^&])[\"\#]/$1/g;
425	}
426	# dashes. Convert (m\|n)-dash into single dash for html.
427	$text =~ s~\-\-+~\-~g;
428
429	# quoted { } chars
430	$text =~ s~\\{~{~g;
431	$text =~ s~\\}~}~g;
432
433	# spaces
434	$text =~ s~\\ ~ ~g;
435
436	# finally to protect against macro language...
437	# greenstone-specific
438	$text =~ s~\[~&\#91;~g;
439	$text =~ s~\]~&\#93;~g;
440	$text =~ s~(?<!\\)([\\_])~\\$1~g;
441
442	if ($html_markup) {
443	$text =~ s~\n{2,}~\n</p>\n<p>~g;
444	return "<p>$text</p>";
445	}
446
447	return $text;
448	}
449
450	# only used by process_latex for \verb....
451	sub verb_text {
452	my $verbatim=shift;
453	$verbatim =~ s/([{}_])/\\$1/g;
454	return $verbatim;
455	}
456	# only used by process_latex_math
457	# returns a unicode char if applicable, otherwise ascii
458	sub math_fraction {
459	my $num=$1;
460	my $denom=$2;
461
462	if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
463	if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
464	if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
465	return "$num/$denom";
466	}
467
468	sub process_latex_math {
469
470	my $text = pop; # if given one or two args, this is the last one...
471	my $html_markup=pop; # if given two args, this is the first one else undef
472
473	$text =~ s~\\,~ ~g; # forces a space?
474	$text =~ s~\\infty~infinity~g; # or unicode 0x221E...
475
476	# use this one when more things can read 3-byte utf8 values like this!
477	# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
478	$text =~ s~\\cup\b~ U ~g;
479
480	$text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
481
482	if ($html_markup) {
483	$text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
484	$text =~ s~\^([^\{])~<sup>$1</sup>~g;
485	$text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
486	$text =~ s~\_([^\{])~<sub>$1</sub>~g;
487
488	$text =~ s~\\ldots~…~g; # use html named entity for now
489
490	# put all other command names in italics for now
491	$text =~ s~\\([\w]+)~<i>$1</i> ~g;
492	}
493
494	# special cases, for some input files
495	if ($text =~ m~^\\\w+$~) {
496	$text="{" . $text . "}";
497	}
498
499	return $text;
500	}
501
502
503
504	sub latex_accents_to_utf8 {
505
506	# note - this is really ugly, but it works. There may be a prettier way
507	# of mapping latex accented chars to utf8, but we just brute force it here.
508	# Also, this isn't complete - not every single possible accented letter
509	# is in here yet, but most of the common ones are.
510
511	my %utf8_chars =
512	(
513	# acutes
514	'\'a' => chr(0xc3).chr(0xa1),
515	'\'c' => chr(0xc4).chr(0x87),
516	'\'e' => chr(0xc3).chr(0xa9),
517	'\'i' => chr(0xc3).chr(0xad),
518	'\'l' => chr(0xc3).chr(0xba),
519	'\'n' => chr(0xc3).chr(0x84),
520	'\'o' => chr(0xc3).chr(0xb3),
521	'\'r' => chr(0xc5).chr(0x95),
522	'\'s' => chr(0xc5).chr(0x9b),
523	'\'u' => chr(0xc3).chr(0xba),
524	'\'y' => chr(0xc3).chr(0xbd),
525	'\'z' => chr(0xc5).chr(0xba),
526	# graves
527	'`a' => chr(0xc3).chr(0xa0),
528	'`A' => chr(0xc3).chr(0x80),
529	'`e' => chr(0xc3).chr(0xa8),
530	'`E' => chr(0xc3).chr(0x88),
531	'`i' => chr(0xc3).chr(0xac),
532	'`I' => chr(0xc3).chr(0x8c),
533	'`o' => chr(0xc3).chr(0xb2),
534	'`O' => chr(0xc3).chr(0x92),
535	'`u' => chr(0xc3).chr(0xb9),
536	'`U' => chr(0xc3).chr(0x99),
537	# circumflex
538	'^a' => chr(0xc3).chr(0xa2),
539	'^A' => chr(0xc3).chr(0x82),
540	'^c' => chr(0xc4).chr(0x89),
541	'^C' => chr(0xc4).chr(0x88),
542	'^e' => chr(0xc3).chr(0xaa),
543	'^E' => chr(0xc3).chr(0x8a),
544	'^g' => chr(0xc4).chr(0x9d),
545	'^G' => chr(0xc4).chr(0x9c),
546	'^h' => chr(0xc4).chr(0xa5),
547	'^H' => chr(0xc4).chr(0xa4),
548	'^i' => chr(0xc3).chr(0xae),
549	'^I' => chr(0xc3).chr(0x8e),
550	'^j' => chr(0xc4).chr(0xb5),
551	'^J' => chr(0xc4).chr(0xb4),
552	'^o' => chr(0xc3).chr(0xb4),
553	'^O' => chr(0xc3).chr(0x94),
554	'^s' => chr(0xc5).chr(0x9d),
555	'^S' => chr(0xc5).chr(0x9c),
556	'^u' => chr(0xc3).chr(0xa2),
557	'^U' => chr(0xc3).chr(0xbb),
558	'^w' => chr(0xc5).chr(0xb5),
559	'^W' => chr(0xc5).chr(0xb4),
560	'^y' => chr(0xc5).chr(0xb7),
561	'^Y' => chr(0xc5).chr(0xb6),
562
563	# diaeresis
564	'"a' => chr(0xc3).chr(0xa4),
565	'"A' => chr(0xc3).chr(0x84),
566	'"e' => chr(0xc3).chr(0xab),
567	'"E' => chr(0xc3).chr(0x8b),
568	'"i' => chr(0xc3).chr(0xaf),
569	'"I' => chr(0xc3).chr(0x8f),
570	'"\\\\i' => chr(0xc3).chr(0xaf),
571	'"\\\\I' => chr(0xc3).chr(0x8f),
572	'"o' => chr(0xc3).chr(0xb6),
573	'"O' => chr(0xc3).chr(0x96),
574	'"u' => chr(0xc3).chr(0xbc),
575	'"U' => chr(0xc3).chr(0x9c),
576	'"y' => chr(0xc3).chr(0xbf),
577	'"Y' => chr(0xc3).chr(0xb8),
578	# tilde
579	'~A' => chr(0xc3).chr(0x83),
580	'~N' => chr(0xc3).chr(0x91),
581	'~O' => chr(0xc3).chr(0x95),
582	'~a' => chr(0xc3).chr(0xa3),
583	'~n' => chr(0xc3).chr(0xb1),
584	'~o' => chr(0xc3).chr(0xb5),
585	# caron - handled specially
586	# double acute
587	# ring
588	# dot
589	'.c' => chr(0xc4).chr(0x8b),
590	'.C' => chr(0xc4).chr(0x8a),
591	'.e' => chr(0xc4).chr(0x97),
592	'.E' => chr(0xc4).chr(0x96),
593	'.g' => chr(0xc4).chr(0xa1),
594	'.G' => chr(0xc4).chr(0xa0),
595	'.I' => chr(0xc4).chr(0xb0),
596	'.z' => chr(0xc5).chr(0xbc),
597	'.Z' => chr(0xc5).chr(0xbb),
598	# macron
599	'=a' => chr(0xc4).chr(0x81),
600	'=A' => chr(0xc4).chr(0x80),
601	'=e' => chr(0xc4).chr(0x93),
602	'=E' => chr(0xc4).chr(0x92),
603	'=i' => chr(0xc4).chr(0xab),
604	'=I' => chr(0xc4).chr(0xaa),
605	'=o' => chr(0xc4).chr(0x8d),
606	'=O' => chr(0xc4).chr(0x8c),
607	'=u' => chr(0xc4).chr(0xab),
608	'=U' => chr(0xc4).chr(0xaa),
609
610	# stroke - handled specially - see below
611
612	# cedilla - handled specially
613	);
614
615	# these are one letter latex commands - we make sure they're not a longer
616	# command name. eg {\d} is d+stroke, so careful of \d
617	my %special_utf8_chars =
618	(
619	# breve
620	'u g' => chr(0xc4).chr(0x9f),
621	'u G' => chr(0xc4).chr(0x9e),
622	'u i' => chr(0xc4).chr(0xad),
623	'u I' => chr(0xc4).chr(0xac),
624	'u o' => chr(0xc5).chr(0x8f),
625	'u O' => chr(0xc5).chr(0x8e),
626	'u u' => chr(0xc5).chr(0xad),
627	'u U' => chr(0xc5).chr(0xac),
628	'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
629	'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
630	# caron
631	'v c' => chr(0xc4).chr(0x8d),
632	'v C' => chr(0xc4).chr(0x8c),
633	'v n' => chr(0xc5).chr(0x88),
634	'v N' => chr(0xc5).chr(0x87),
635	'v s' => chr(0xc5).chr(0xa1),
636	'v S' => chr(0xc5).chr(0xa5),
637	'v z' => chr(0xc5).chr(0xbe),
638	'v Z' => chr(0xc5).chr(0xbd),
639	# cedilla
640	'c c' => chr(0xc3).chr(0xa7),
641	'c C' => chr(0xc3).chr(0x87),
642	'c g' => chr(0xc4).chr(0xa3),
643	'c G' => chr(0xc4).chr(0xa2),
644	'c k' => chr(0xc4).chr(0xb7),
645	'c K' => chr(0xc4).chr(0xb6),
646	'c l' => chr(0xc4).chr(0xbc),
647	'c L' => chr(0xc4).chr(0xbb),
648	'c n' => chr(0xc5).chr(0x86),
649	'c N' => chr(0xc5).chr(0x85),
650	'c r' => chr(0xc5).chr(0x97),
651	'c R' => chr(0xc5).chr(0x96),
652	'c s' => chr(0xc5).chr(0x9f),
653	'c S' => chr(0xc5).chr(0x9e),
654	'c t' => chr(0xc5).chr(0xa3),
655	'c T' => chr(0xc5).chr(0xa2),
656	# double acute / Hungarian accent
657	'H O' => chr(0xc5).chr(0x90),
658	'H o' => chr(0xc5).chr(0x91),
659	'H U' => chr(0xc5).chr(0xb0),
660	'H u' => chr(0xc5).chr(0xb1),
661
662	# stroke
663	'd' => chr(0xc4).chr(0x91),
664	'D' => chr(0xc4).chr(0x90),
665	'h' => chr(0xc4).chr(0xa7),
666	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
667	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
668	'l' => chr(0xc5).chr(0x82),
669	'L' => chr(0xc5).chr(0x81),
670	'o' => chr(0xc3).chr(0xb8),
671	'O' => chr(0xc3).chr(0x98),
672	't' => chr(0xc5).chr(0xa7),
673	'T' => chr(0xc5).chr(0xa6),
674
675	# other special chars
676	'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
677	'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
678	);
679
680	my $self=shift;
681	my $textref=shift;
682
683	my $outhandle=$self->{'outhandle'};
684	my $text=$$textref;
685
686	# remove space (if any) between \ and letter to accent (eg {\' a})
687	$text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
688
689	# remove {} around a single character (eg \'{e})
690	$text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
691
692	## only in bibtex... not in latex proper?!
693	### \, is another way of doing cedilla \c
694	##$text =~ s~\\,(.)~\\c $1~g;
695
696	# remove {} around a single character for special 1 letter commands -
697	# need to insert a space. Eg \v{s} -> {\v s}
698	$text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
699
700	# only do if the text contains a '\' character.
701	if ($text =~ m\|\\\|) {
702	# "normal" accents - ie non-alpha latex tag
703	# xxx used to have ([\w]\b)@ (for word boundary)
704	while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
705	my $tex="$1$2"; my $char="$2";
706	my $replacement=$utf8_chars{$tex};
707	if (!defined($replacement)) {
708	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
709	print $outhandle . $self->{'plugin_type'} .
710	": Warning: unknown latex accent \"$tex\""
711	. " in \"$1\"\n";
712	$replacement=$char;
713	}
714	$text =~ s/\\\Q$tex/$replacement/g;
715	}
716
717	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
718	# only do the change if immediately followed by a space, }, {, or \
719	# one letter accents ( + ss / aa)
720	while ($text =~ m~\\([DdhiLlOoTt]\|ss\|aa)[{}\s\"\\]~) {
721	my $tex=$1;
722	my $replacement=$special_utf8_chars{$tex};
723	if (!defined($replacement)) {
724	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
725	print $outhandle $self->{'plugin_type'} .
726	": Warning: unknown latex accent \"$tex\""
727	. " in \"$1\"\n";
728	$replacement=$tex;
729	}
730	($text =~ s/{\\$tex}/$replacement/g) or
731	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
732
733	}
734
735	# one letter latex accent commands that affect following letter
736	while ($text =~ m~\\([uvcH]) ([\w])~) {
737	my $tex="$1 $2"; my $char="$2";
738	my $replacement=$special_utf8_chars{$tex};
739	if (!defined($replacement)) {
740	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
741	print $outhandle $self->{'plugin_type'} .
742	": Warning: unknown latex accent \"$tex\""
743	. " in \"$1\"\n";
744	$replacement=$char;
745	}
746	$text =~ s/\\$tex/$replacement/g;
747	}
748	}
749	$textref=\$text;
750	}
751
752
753	# modules must return true
754	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: