Context Navigation

source: gsdl/trunk/perllib/plugins/LaTeXPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 23.1 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlugin.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlugin;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use ReadTextFile;
36	use unicode;
37	use util;
38
39	my $arguments =
40	[ { 'name' => "process_exp",
41	'desc' => "{ReadTextFile.process_exp}",
42	'type' => "regexp",
43	'reqd' => "no",
44	'deft' => &get_default_process_exp() } ];
45
46	my $options = { 'name' => 'LaTeXPlugin',
47	'desc' => '{LaTeXPlugin.desc}',
48	'abstract' => 'no',
49	'inherits' => 'yes',
50	'args' => $arguments };
51
52	sub BEGIN {
53	@LaTeXPlugin::ISA = ('ReadTextFile');
54	}
55
56	sub new {
57	my ($class) = shift (@_);
58	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59	push(@$pluginlist, $class);
60
61	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62	push(@{$hashArgOptLists->{"OptList"}},$options);
63
64	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
65
66	$self->{'aux_files'} = {};
67	$self->{'dir_num'} = 0;
68	$self->{'file_num'} = 0;
69	return bless $self, $class;
70	}
71
72
73	sub get_default_process_exp {
74	my $self = shift (@_);
75	return q^\.tex$^;
76	}
77
78	sub get_default_block_exp {
79	# assume any .eps files are part of the latex stuff
80	return '\.(?:eps)$';
81	}
82
83
84	sub process {
85	my $self = shift (@_);
86	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
87
88	my $start=substr($$textref, 0, 200); # first 200 bytes
89
90	if ($start !~ m~\\ (?:documentclass \| documentstyle \| input \| section
91	\| chapter \| contents \| begin) ~x) {
92	# this doesn't look like latex...
93	return undef;
94	}
95	my $outhandle = $self->{'outhandle'};
96	if ($gli) {
97	print STDERR "<Processing n='$file' p='LaTeXPlugin'>\n";
98	} elsif ($self->{'verbosity'} > 1) {
99	print $outhandle "LaTeXPlugin: processing $file\n"
100	}
101	my $cursection = $doc_obj->get_top_section();
102
103	###### clean up text ######
104	$$textref =~ s/\r$//mg; # remove dos ^M
105	$$textref =~ s/%.*$//mg; # remove comments
106
107	# convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
108
109	$$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
110	# check that both sides are ascii, so we don't screw up utf-8 chars
111	$$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
112	unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
113
114
115	###### find metadata ######
116
117	## FileFormat metadata ##
118	$doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
119
120	### title metadata ###
121	$$textref =~ m@\\title\s{(.?)}@s;
122	my $title = $1;
123	if (!$title) {
124	# no title tag. look for a chapter/section heading
125	$$textref =~ m@\\(?:chapter\|section)\s{(.?)}@s; # will get 1st match
126	$title = $1;
127	}
128	if (!$title) {
129	# no chapter/section heading tags either... use filename
130	$title = $file;
131	$title =~ s/\.tex$//i;
132	$title =~ s/[-_.]/ /g; # turn punctuation into spaces
133	}
134	if ($title) {
135	$title =~ s@\\\\@ @g; # embedded newlines
136	$title = $self->process_latex($title); # no "-html" for title eg in browser
137	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
138	}
139
140	### creator/individual author metadata ###
141	$$textref =~ m@\\author\s{((?:{.?}\|.?)+)}\s$@sm;
142	my $authors=$1;
143	if ($authors) {
144	# take care of "et al."...
145	$authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
146	my $etal=$1;
147	$etal="" if (!defined ($etal));
148
149	my @authorlist=parse_authors($self, $authors);
150
151	foreach my $author (@authorlist) {
152	# Add each name to set of Authors
153	$doc_obj->add_utf8_metadata ($cursection, "Author", $author);
154	}
155
156	# Only want at most one "and" in the Creator field
157	my $creator_str="";
158	if (scalar(@authorlist) > 2) {
159	my $lastauthor=pop @authorlist;
160	$creator_str=join(', ', @authorlist);
161	$creator_str.=" and $lastauthor";
162	} else { # 1 or 2 authors...
163	$creator_str=join(" and ",@authorlist);
164	}
165	$creator_str.=$etal; # if there was "et al."
166	$doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
167	}
168	### end of author metadata ###
169
170	###### process latex for the main text ######
171	$$textref =~ s/^.*?\\begin{document}//s;
172	$$textref =~ s/\\end{document}.*?$//s;
173	$$textref = $self->process_latex("-html",$$textref);
174	$doc_obj->add_utf8_text($cursection, $$textref);
175
176	return 1;
177	}
178
179
180	# returns a list of author names
181	sub parse_authors {
182	my $self=shift;
183	my $authors=shift;
184
185	my $outhandle=$self->{'outhandle'};
186
187	$authors =~ s/\n/ /g; # remove newlines
188
189	# some people do this for affiliation footnote/dagger
190	$authors =~ s@\$.*?\$@@g; # remove maths from author :(
191
192	# und here for german language...
193	# don't use brackets in pattern, else the matched bit becomes
194	# an element in the list!
195	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $authors);
196	my @formattedlist = ();
197	foreach my $author (@authorlist) {
198	$author =~ s/\s*$//;
199	$author =~ s/^\s*//;
200	# Reformat and add author name
201	next if $author=~ /^\s*$/;
202
203	# names are "First von Last", "von Last, First"
204	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
205	my $first="";
206	my $vonlast="";
207	my $jr="";
208
209	if ($author =~ /,/) {
210	my @parts=split(/,\s*/, $author);
211	$first = pop @parts;
212	if (scalar(@parts) == 2) {
213	$jr = pop @parts;
214	}
215	$vonlast=shift @parts;
216	if (scalar(@parts) > 0) {
217	print $outhandle $self->{'plugin_type'} .
218	": couldn't parse name $author\n";
219	# but we continue anyway...
220	}
221	} else { # First von Last
222	my @words = split(/ /, $author);
223	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
224	$first .= " " . shift (@words);
225	}
226	$first =~ s/^\s//;
227	$vonlast = join (' ', @words); # whatever's left...
228	}
229	my $von="";
230	my $last="";
231	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
232	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
233	$von = $1;
234	if (!defined ($von)) {
235	# some non-English names do start with lowercase
236	# eg "Marie desJardins". Also we can get typos...
237	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
238	$von="";
239	if ($vonlast =~ /^[a-z]+$/) {
240	# if it's all lowercase, uppercase 1st.
241	$vonlast =~ s/^(.)/\u$1/;
242	}
243	}
244	$von =~ s/\s*$//;
245	$last=$vonlast;
246	} else {
247	$last=$vonlast;
248	}
249	my $wholename="$first $von $last $jr";
250	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
251	# my $fullname = "$last";
252	# $fullname .= " $jr" if ($jr);
253	# $fullname .= ", $first";
254	# $fullname .= " $von" if ($von);
255	push (@formattedlist, $wholename);
256	}
257	return @formattedlist;
258	}
259
260
261	## following functions based on bibtex plugin ##
262	# not actually used at the moment, but might be useful in future?
263	sub expand_month {
264	my $text=shift;
265
266	# bibtex style files expand abbreviations for months.
267	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
268	$text =~ s/jan/_textmonth01_/g;
269	$text =~ s/feb/_textmonth02_/g;
270	$text =~ s/mar/_textmonth03_/g;
271	$text =~ s/apr/_textmonth04_/g;
272	$text =~ s/may/_textmonth05_/g;
273	$text =~ s/jun/_textmonth06_/g;
274	$text =~ s/jul/_textmonth07_/g;
275	$text =~ s/aug/_textmonth08_/g;
276	$text =~ s/sep/_textmonth09_/g;
277	$text =~ s/oct/_textmonth10_/g;
278	$text =~ s/nov/_textmonth11_/g;
279	$text =~ s/dec/_textmonth12_/g;
280
281	return $text;
282	}
283
284
285	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
286	# as the first argument to this function.
287	#
288	# Convert accented characters, remove { }, interprete some commands....
289	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
290
291	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
292	# match { with the corresponding }, only the nearest }
293
294	sub process_latex {
295	my $self=shift;
296	my $text=shift;
297
298	my $outhandle=$self->{'outhandle'};
299
300	my $html_markup=0;
301	if ($text =~ /^\-html/) {
302	$html_markup=1;
303	$text=shift;
304	}
305
306	if (! $text) {
307	return $text;
308	}
309	# escape html-sensitive characters
310	$text =~ s@&@&@g;
311	$text =~ s@<@<@g;
312	$text =~ s@>@>@g;
313
314	# do this before accents, since \= means something different in tabbing
315	# also \> is a tab stop too, and \\ is newline
316	sub do_tabbing {
317	my $tabbing=shift;
318	$tabbing =~ s!^.\\kill\s$!!mg; # \kill sets tab stops, kills line
319	$tabbing =~ s~\\(?:=\|>)~\xc2\xa0~g; # replace with nbsp
320	$tabbing =~ s~[\\][\\](?:\[.?\])?\s$~<br/>~mg;
321	return "<br/>" . $tabbing . "<br/>\n";
322	}
323	$text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
324	sub do_tabular {
325	my $tabular=shift;
326	$tabular =~ s~(?<!\\)\s&\s~</td><td>~g;
327	$tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
328	$tabular =~ s~\\hline~~g; # for now...
329	$tabular =~ s~<td>\s\\multicolumn{(\d+)}{.?}~<td colspan="$1">~g;
330	return "<table border=\"1\">\n <tr><td>"
331	. $tabular . "</td></tr></table>\n";
332	}
333	$text =~ s@\\begin{tabular}(?:\[.?\])?{.?}(.*?)\\end{tabular} @
334	do_tabular($1) @xges;
335
336	$text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
337
338	# process maths mode before accents... things like \, mean different!
339	# maths mode
340	$text =~ s@\$\$(.*?)\$\$
341	@ process_latex_math($html_markup,$1)
342	@xsge; # multi-line maths: $$ .... $$
343
344	$text =~ s@([^\\])\$(.*?[^\\])\$
345	@$1.process_latex_math($html_markup,$2)@xsge;
346
347
348	# is this an amstext environment, or just custom for that input file?
349	$text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
350
351	# convert latex-style accented characters.
352	$self->latex_accents_to_utf8(\$text);
353
354	# replace quotes with utf-8
355
356	$text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
357	$text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
358	$text =~ s/`/\xe2\xc0\x98/g; # single left quote
359	$text =~ s/'/\xe2\xc0\x99/g; # single right quote
360
361	###### remove/replace latex commands ######
362	### commands that expand to something that gets displayed ###
363	$text =~ s~\\ldots~…~g;
364	$text =~ s~\\hrule~<hr/>\n~g;
365	$text =~ s~\\maketitle~ ~;
366	### space commands ###
367	$text =~ s~\\[vh]skip\s+\S+~~g;
368	$text =~ s~\\vspace\?{.?}~<div> </div>~g; # vertical space
369	$text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
370	$text =~ s~\\noindent\b~~g;
371	# newpage, etc
372	$text =~ s~\\(?:clearemptydoublepage\|newpage)~~g;
373	### counters, contents, environments, labels, etc ###
374	$text =~ s~\\(?:addcontentsline){.?}{.?}{.*}~~g;
375	$text =~ s~\s\\begin{itemize}\s~\n<ul>\n~g;
376	$text =~ s~\s\\end{itemize}\s~</li></ul>\n~g;
377	$text =~ s~\s\\begin{enumerate}\s~<ol>\n~g;
378	$text =~ s~\s\\end{enumerate}\s~</li></ol>\n~g;
379	if ($text =~ s~\s*\\item~</li>\n<li>~g) {
380	# (count for first list item)
381	$text =~ s~<([ou])l>\s</li>\s~<$1l>~g;
382	}
383	$text =~ s~\\(?:label\|begin\|end){.?}\s\n?~ ~g; # remove tag and contents
384	$text =~ s~\\(?:tableofcontents\|listoffigures)~ ~g;
385	### font sizes/styles ###
386	$text =~ s~\\(?:tiny\|small\|footnotesize\|normalsize\|large\|Large\|huge\|Huge)\b~~g;
387
388	if ($html_markup) {
389	$text =~ s~\\section\?{([^\}]+)}\s\n?~<H1>$1</H1>\n~g;
390	$text =~ s~\\subsection\?{(.?)}\s*\n?~<H2>$1</H2>\n~g;
391	$text =~ s~{\\tt\s(.?)}~<tt>$1</tt>~g;
392	$text =~ s~\\(?:texttt\|tt\|ttseries)\s{(.?)}~<tt>$1</tt>~g;
393	$text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
394	$text =~ s~{\\(?:em\|it)\s(.?)}~<em>$1</em>~g;
395	$text =~ s~{\\(?:bf\|bfseries)\s(.?)}~<strong>$1</strong>~g;
396	$text =~ s~\\(?:textbf\|bf\|bfseries)\s{(.?)}~<strong>$1</strong>~g;
397	} else {
398	# remove tags for text-only
399	$text =~ s~\\(?:textbf\|bf\|bfseries\|em\|emph\|tt\|rm\|texttt)\b~ ~g;
400	}
401	$text =~ s ~ {\\sc\s+(.*?)} ~
402	{<span style="font-variant:\ small-caps">$1</span>} ~gx;
403	# ignore these font tags (if there are any left)
404	# sf is sans-serif
405	$text =~ s~\\(?:mdseries\|textmd\|bfseries\|textbf\|sffamily\|sf\|sc)\b~ ~;
406	#### end font-related stuff ####
407
408	### remove all other commands with optional arguments... ###
409	# don't remove commands without { }....
410	# $text =~ s~\\\w+(\[.?\])?\s~~g;
411	# $text =~ s~\\noopsort{[^}]+\}~~g;
412	# verbatim
413	$text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
414	# remove tags, keep contents for \tag[optional]{contents}
415	while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
416
417	# remove latex groupings { } (but not \{ or \} )
418	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
419	$text =~ s/^\{//; # remove { if first char
420
421	# latex characters
422	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
423	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
424	# optional break "\-"
425	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
426	# the non-macro bits have quotes around them - we just remove quotes
427	# XXX bibtex and latex differ here (for the '#' char)...
428	$text =~ s/([^&])[\"\#]/$1/g;
429	}
430	# dashes. Convert (m\|n)-dash into single dash for html.
431	$text =~ s~\-\-+~\-~g;
432
433	# quoted { } chars
434	$text =~ s~\\{~{~g;
435	$text =~ s~\\}~}~g;
436
437	# spaces
438	$text =~ s~\\ ~ ~g;
439
440	# finally to protect against macro language...
441	# greenstone-specific
442	$text =~ s~\[~&\#91;~g;
443	$text =~ s~\]~&\#93;~g;
444	$text =~ s~(?<!\\)([\\_])~\\$1~g;
445
446	if ($html_markup) {
447	$text =~ s~\n{2,}~\n</p>\n<p>~g;
448	return "<p>$text</p>";
449	}
450
451	return $text;
452	}
453
454	# only used by process_latex for \verb....
455	sub verb_text {
456	my $verbatim=shift;
457	$verbatim =~ s/([{}_])/\\$1/g;
458	return $verbatim;
459	}
460	# only used by process_latex_math
461	# returns a unicode char if applicable, otherwise ascii
462	sub math_fraction {
463	my $num=$1;
464	my $denom=$2;
465
466	if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
467	if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
468	if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
469	return "$num/$denom";
470	}
471
472	sub process_latex_math {
473
474	my $text = pop; # if given one or two args, this is the last one...
475	my $html_markup=pop; # if given two args, this is the first one else undef
476
477	$text =~ s~\\,~ ~g; # forces a space?
478	$text =~ s~\\infty~infinity~g; # or unicode 0x221E...
479
480	# use this one when more things can read 3-byte utf8 values like this!
481	# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
482	$text =~ s~\\cup\b~ U ~g;
483
484	$text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
485
486	if ($html_markup) {
487	$text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
488	$text =~ s~\^([^\{])~<sup>$1</sup>~g;
489	$text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
490	$text =~ s~\_([^\{])~<sub>$1</sub>~g;
491
492	$text =~ s~\\ldots~…~g; # use html named entity for now
493
494	# put all other command names in italics for now
495	$text =~ s~\\([\w]+)~<i>$1</i> ~g;
496	}
497
498	# special cases, for some input files
499	if ($text =~ m~^\\\w+$~) {
500	$text="{" . $text . "}";
501	}
502
503	return $text;
504	}
505
506
507
508	sub latex_accents_to_utf8 {
509
510	# note - this is really ugly, but it works. There may be a prettier way
511	# of mapping latex accented chars to utf8, but we just brute force it here.
512	# Also, this isn't complete - not every single possible accented letter
513	# is in here yet, but most of the common ones are.
514
515	my %utf8_chars =
516	(
517	# acutes
518	'\'a' => chr(0xc3).chr(0xa1),
519	'\'c' => chr(0xc4).chr(0x87),
520	'\'e' => chr(0xc3).chr(0xa9),
521	'\'i' => chr(0xc3).chr(0xad),
522	'\'l' => chr(0xc3).chr(0xba),
523	'\'n' => chr(0xc3).chr(0x84),
524	'\'o' => chr(0xc3).chr(0xb3),
525	'\'r' => chr(0xc5).chr(0x95),
526	'\'s' => chr(0xc5).chr(0x9b),
527	'\'u' => chr(0xc3).chr(0xba),
528	'\'y' => chr(0xc3).chr(0xbd),
529	'\'z' => chr(0xc5).chr(0xba),
530	# graves
531	'`a' => chr(0xc3).chr(0xa0),
532	'`A' => chr(0xc3).chr(0x80),
533	'`e' => chr(0xc3).chr(0xa8),
534	'`E' => chr(0xc3).chr(0x88),
535	'`i' => chr(0xc3).chr(0xac),
536	'`I' => chr(0xc3).chr(0x8c),
537	'`o' => chr(0xc3).chr(0xb2),
538	'`O' => chr(0xc3).chr(0x92),
539	'`u' => chr(0xc3).chr(0xb9),
540	'`U' => chr(0xc3).chr(0x99),
541	# circumflex
542	'^a' => chr(0xc3).chr(0xa2),
543	'^A' => chr(0xc3).chr(0x82),
544	'^c' => chr(0xc4).chr(0x89),
545	'^C' => chr(0xc4).chr(0x88),
546	'^e' => chr(0xc3).chr(0xaa),
547	'^E' => chr(0xc3).chr(0x8a),
548	'^g' => chr(0xc4).chr(0x9d),
549	'^G' => chr(0xc4).chr(0x9c),
550	'^h' => chr(0xc4).chr(0xa5),
551	'^H' => chr(0xc4).chr(0xa4),
552	'^i' => chr(0xc3).chr(0xae),
553	'^I' => chr(0xc3).chr(0x8e),
554	'^j' => chr(0xc4).chr(0xb5),
555	'^J' => chr(0xc4).chr(0xb4),
556	'^o' => chr(0xc3).chr(0xb4),
557	'^O' => chr(0xc3).chr(0x94),
558	'^s' => chr(0xc5).chr(0x9d),
559	'^S' => chr(0xc5).chr(0x9c),
560	'^u' => chr(0xc3).chr(0xa2),
561	'^U' => chr(0xc3).chr(0xbb),
562	'^w' => chr(0xc5).chr(0xb5),
563	'^W' => chr(0xc5).chr(0xb4),
564	'^y' => chr(0xc5).chr(0xb7),
565	'^Y' => chr(0xc5).chr(0xb6),
566
567	# diaeresis
568	'"a' => chr(0xc3).chr(0xa4),
569	'"A' => chr(0xc3).chr(0x84),
570	'"e' => chr(0xc3).chr(0xab),
571	'"E' => chr(0xc3).chr(0x8b),
572	'"i' => chr(0xc3).chr(0xaf),
573	'"I' => chr(0xc3).chr(0x8f),
574	'"\\\\i' => chr(0xc3).chr(0xaf),
575	'"\\\\I' => chr(0xc3).chr(0x8f),
576	'"o' => chr(0xc3).chr(0xb6),
577	'"O' => chr(0xc3).chr(0x96),
578	'"u' => chr(0xc3).chr(0xbc),
579	'"U' => chr(0xc3).chr(0x9c),
580	'"y' => chr(0xc3).chr(0xbf),
581	'"Y' => chr(0xc3).chr(0xb8),
582	# tilde
583	'~A' => chr(0xc3).chr(0x83),
584	'~N' => chr(0xc3).chr(0x91),
585	'~O' => chr(0xc3).chr(0x95),
586	'~a' => chr(0xc3).chr(0xa3),
587	'~n' => chr(0xc3).chr(0xb1),
588	'~o' => chr(0xc3).chr(0xb5),
589	# caron - handled specially
590	# double acute
591	# ring
592	# dot
593	'.c' => chr(0xc4).chr(0x8b),
594	'.C' => chr(0xc4).chr(0x8a),
595	'.e' => chr(0xc4).chr(0x97),
596	'.E' => chr(0xc4).chr(0x96),
597	'.g' => chr(0xc4).chr(0xa1),
598	'.G' => chr(0xc4).chr(0xa0),
599	'.I' => chr(0xc4).chr(0xb0),
600	'.z' => chr(0xc5).chr(0xbc),
601	'.Z' => chr(0xc5).chr(0xbb),
602	# macron
603	'=a' => chr(0xc4).chr(0x81),
604	'=A' => chr(0xc4).chr(0x80),
605	'=e' => chr(0xc4).chr(0x93),
606	'=E' => chr(0xc4).chr(0x92),
607	'=i' => chr(0xc4).chr(0xab),
608	'=I' => chr(0xc4).chr(0xaa),
609	'=o' => chr(0xc4).chr(0x8d),
610	'=O' => chr(0xc4).chr(0x8c),
611	'=u' => chr(0xc4).chr(0xab),
612	'=U' => chr(0xc4).chr(0xaa),
613
614	# stroke - handled specially - see below
615
616	# cedilla - handled specially
617	);
618
619	# these are one letter latex commands - we make sure they're not a longer
620	# command name. eg {\d} is d+stroke, so careful of \d
621	my %special_utf8_chars =
622	(
623	# breve
624	'u g' => chr(0xc4).chr(0x9f),
625	'u G' => chr(0xc4).chr(0x9e),
626	'u i' => chr(0xc4).chr(0xad),
627	'u I' => chr(0xc4).chr(0xac),
628	'u o' => chr(0xc5).chr(0x8f),
629	'u O' => chr(0xc5).chr(0x8e),
630	'u u' => chr(0xc5).chr(0xad),
631	'u U' => chr(0xc5).chr(0xac),
632	'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
633	'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
634	# caron
635	'v c' => chr(0xc4).chr(0x8d),
636	'v C' => chr(0xc4).chr(0x8c),
637	'v n' => chr(0xc5).chr(0x88),
638	'v N' => chr(0xc5).chr(0x87),
639	'v s' => chr(0xc5).chr(0xa1),
640	'v S' => chr(0xc5).chr(0xa5),
641	'v z' => chr(0xc5).chr(0xbe),
642	'v Z' => chr(0xc5).chr(0xbd),
643	# cedilla
644	'c c' => chr(0xc3).chr(0xa7),
645	'c C' => chr(0xc3).chr(0x87),
646	'c g' => chr(0xc4).chr(0xa3),
647	'c G' => chr(0xc4).chr(0xa2),
648	'c k' => chr(0xc4).chr(0xb7),
649	'c K' => chr(0xc4).chr(0xb6),
650	'c l' => chr(0xc4).chr(0xbc),
651	'c L' => chr(0xc4).chr(0xbb),
652	'c n' => chr(0xc5).chr(0x86),
653	'c N' => chr(0xc5).chr(0x85),
654	'c r' => chr(0xc5).chr(0x97),
655	'c R' => chr(0xc5).chr(0x96),
656	'c s' => chr(0xc5).chr(0x9f),
657	'c S' => chr(0xc5).chr(0x9e),
658	'c t' => chr(0xc5).chr(0xa3),
659	'c T' => chr(0xc5).chr(0xa2),
660	# double acute / Hungarian accent
661	'H O' => chr(0xc5).chr(0x90),
662	'H o' => chr(0xc5).chr(0x91),
663	'H U' => chr(0xc5).chr(0xb0),
664	'H u' => chr(0xc5).chr(0xb1),
665
666	# stroke
667	'd' => chr(0xc4).chr(0x91),
668	'D' => chr(0xc4).chr(0x90),
669	'h' => chr(0xc4).chr(0xa7),
670	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
671	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
672	'l' => chr(0xc5).chr(0x82),
673	'L' => chr(0xc5).chr(0x81),
674	'o' => chr(0xc3).chr(0xb8),
675	'O' => chr(0xc3).chr(0x98),
676	't' => chr(0xc5).chr(0xa7),
677	'T' => chr(0xc5).chr(0xa6),
678
679	# other special chars
680	'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
681	'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
682	);
683
684	my $self=shift;
685	my $textref=shift;
686
687	my $outhandle=$self->{'outhandle'};
688	my $text=$$textref;
689
690	# remove space (if any) between \ and letter to accent (eg {\' a})
691	$text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
692
693	# remove {} around a single character (eg \'{e})
694	$text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
695
696	## only in bibtex... not in latex proper?!
697	### \, is another way of doing cedilla \c
698	##$text =~ s~\\,(.)~\\c $1~g;
699
700	# remove {} around a single character for special 1 letter commands -
701	# need to insert a space. Eg \v{s} -> {\v s}
702	$text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
703
704	# only do if the text contains a '\' character.
705	if ($text =~ m\|\\\|) {
706	# "normal" accents - ie non-alpha latex tag
707	# xxx used to have ([\w]\b)@ (for word boundary)
708	while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
709	my $tex="$1$2"; my $char="$2";
710	my $replacement=$utf8_chars{$tex};
711	if (!defined($replacement)) {
712	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
713	print $outhandle . $self->{'plugin_type'} .
714	": Warning: unknown latex accent \"$tex\""
715	. " in \"$1\"\n";
716	$replacement=$char;
717	}
718	$text =~ s/\\\Q$tex/$replacement/g;
719	}
720
721	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
722	# only do the change if immediately followed by a space, }, {, or \
723	# one letter accents ( + ss / aa)
724	while ($text =~ m~\\([DdhiLlOoTt]\|ss\|aa)[{}\s\"\\]~) {
725	my $tex=$1;
726	my $replacement=$special_utf8_chars{$tex};
727	if (!defined($replacement)) {
728	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
729	print $outhandle $self->{'plugin_type'} .
730	": Warning: unknown latex accent \"$tex\""
731	. " in \"$1\"\n";
732	$replacement=$tex;
733	}
734	($text =~ s/{\\$tex}/$replacement/g) or
735	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
736
737	}
738
739	# one letter latex accent commands that affect following letter
740	while ($text =~ m~\\([uvcH]) ([\w])~) {
741	my $tex="$1 $2"; my $char="$2";
742	my $replacement=$special_utf8_chars{$tex};
743	if (!defined($replacement)) {
744	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
745	print $outhandle $self->{'plugin_type'} .
746	": Warning: unknown latex accent \"$tex\""
747	. " in \"$1\"\n";
748	$replacement=$char;
749	}
750	$text =~ s/\\$tex/$replacement/g;
751	}
752	}
753	$textref=\$text;
754	}
755
756
757	# modules must return true
758	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: