source: gsdl/trunk/perllib/plugins/LaTeXPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 23.1 KB
Line 
1###########################################################################
2#
3# LaTeXPlugin.pm
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Written by John McPherson
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22###########################################################################
23
24# todo:
25# \includegraphics
26# parse/remove tex \if ... macros
27
28package LaTeXPlugin;
29
30# System complains about $arguments if the strict is set
31use strict;
32no strict 'refs'; # so we can print to a handle named by a variable
33
34# greenstone packages
35use ReadTextFile;
36use unicode;
37use util;
38
39my $arguments =
40 [ { 'name' => "process_exp",
41 'desc' => "{ReadTextFile.process_exp}",
42 'type' => "regexp",
43 'reqd' => "no",
44 'deft' => &get_default_process_exp() } ];
45
46my $options = { 'name' => 'LaTeXPlugin',
47 'desc' => '{LaTeXPlugin.desc}',
48 'abstract' => 'no',
49 'inherits' => 'yes',
50 'args' => $arguments };
51
52sub BEGIN {
53 @LaTeXPlugin::ISA = ('ReadTextFile');
54}
55
56sub new {
57 my ($class) = shift (@_);
58 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59 push(@$pluginlist, $class);
60
61 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62 push(@{$hashArgOptLists->{"OptList"}},$options);
63
64 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
65
66 $self->{'aux_files'} = {};
67 $self->{'dir_num'} = 0;
68 $self->{'file_num'} = 0;
69 return bless $self, $class;
70}
71
72
73sub get_default_process_exp {
74 my $self = shift (@_);
75 return q^\.tex$^;
76}
77
78sub get_default_block_exp {
79 # assume any .eps files are part of the latex stuff
80 return '\.(?:eps)$';
81}
82
83
84sub process {
85 my $self = shift (@_);
86 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
87
88 my $start=substr($$textref, 0, 200); # first 200 bytes
89
90 if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
91 | chapter | contents | begin) ~x) {
92 # this doesn't look like latex...
93 return undef;
94 }
95 my $outhandle = $self->{'outhandle'};
96 if ($gli) {
97 print STDERR "<Processing n='$file' p='LaTeXPlugin'>\n";
98 } elsif ($self->{'verbosity'} > 1) {
99 print $outhandle "LaTeXPlugin: processing $file\n"
100 }
101 my $cursection = $doc_obj->get_top_section();
102
103 ###### clean up text ######
104 $$textref =~ s/\r$//mg; # remove dos ^M
105 $$textref =~ s/%.*$//mg; # remove comments
106
107 # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
108
109 $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
110 # check that both sides are ascii, so we don't screw up utf-8 chars
111 $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
112 unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
113
114
115 ###### find metadata ######
116
117 ## FileFormat metadata ##
118 $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
119
120 ### title metadata ###
121 $$textref =~ m@\\title\s*{(.*?)}@s;
122 my $title = $1;
123 if (!$title) {
124 # no title tag. look for a chapter/section heading
125 $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
126 $title = $1;
127 }
128 if (!$title) {
129 # no chapter/section heading tags either... use filename
130 $title = $file;
131 $title =~ s/\.tex$//i;
132 $title =~ s/[-_.]/ /g; # turn punctuation into spaces
133 }
134 if ($title) {
135 $title =~ s@\\\\@ @g; # embedded newlines
136 $title = $self->process_latex($title); # no "-html" for title eg in browser
137 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
138 }
139
140 ### creator/individual author metadata ###
141 $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
142 my $authors=$1;
143 if ($authors) {
144 # take care of "et al."...
145 $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
146 my $etal=$1;
147 $etal="" if (!defined ($etal));
148
149 my @authorlist=parse_authors($self, $authors);
150
151 foreach my $author (@authorlist) {
152 # Add each name to set of Authors
153 $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
154 }
155
156 # Only want at most one "and" in the Creator field
157 my $creator_str="";
158 if (scalar(@authorlist) > 2) {
159 my $lastauthor=pop @authorlist;
160 $creator_str=join(', ', @authorlist);
161 $creator_str.=" and $lastauthor";
162 } else { # 1 or 2 authors...
163 $creator_str=join(" and ",@authorlist);
164 }
165 $creator_str.=$etal; # if there was "et al."
166 $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
167 }
168 ### end of author metadata ###
169
170 ###### process latex for the main text ######
171 $$textref =~ s/^.*?\\begin{document}//s;
172 $$textref =~ s/\\end{document}.*?$//s;
173 $$textref = $self->process_latex("-html",$$textref);
174 $doc_obj->add_utf8_text($cursection, $$textref);
175
176 return 1;
177}
178
179
180# returns a list of author names
181sub parse_authors {
182 my $self=shift;
183 my $authors=shift;
184
185 my $outhandle=$self->{'outhandle'};
186
187 $authors =~ s/\n/ /g; # remove newlines
188
189 # some people do this for affiliation footnote/dagger
190 $authors =~ s@\$.*?\$@@g; # remove maths from author :(
191
192 # und here for german language...
193 # don't use brackets in pattern, else the matched bit becomes
194 # an element in the list!
195 my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
196 my @formattedlist = ();
197 foreach my $author (@authorlist) {
198 $author =~ s/\s*$//;
199 $author =~ s/^\s*//;
200 # Reformat and add author name
201 next if $author=~ /^\s*$/;
202
203 # names are "First von Last", "von Last, First"
204 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
205 my $first="";
206 my $vonlast="";
207 my $jr="";
208
209 if ($author =~ /,/) {
210 my @parts=split(/,\s*/, $author);
211 $first = pop @parts;
212 if (scalar(@parts) == 2) {
213 $jr = pop @parts;
214 }
215 $vonlast=shift @parts;
216 if (scalar(@parts) > 0) {
217 print $outhandle $self->{'plugin_type'} .
218 ": couldn't parse name $author\n";
219 # but we continue anyway...
220 }
221 } else { # First von Last
222 my @words = split(/ /, $author);
223 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
224 $first .= " " . shift (@words);
225 }
226 $first =~ s/^\s//;
227 $vonlast = join (' ', @words); # whatever's left...
228 }
229 my $von="";
230 my $last="";
231 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
232 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
233 $von = $1;
234 if (!defined ($von)) {
235 # some non-English names do start with lowercase
236 # eg "Marie desJardins". Also we can get typos...
237 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
238 $von="";
239 if ($vonlast =~ /^[a-z]+$/) {
240 # if it's all lowercase, uppercase 1st.
241 $vonlast =~ s/^(.)/\u$1/;
242 }
243 }
244 $von =~ s/\s*$//;
245 $last=$vonlast;
246 } else {
247 $last=$vonlast;
248 }
249 my $wholename="$first $von $last $jr";
250 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
251# my $fullname = "$last";
252# $fullname .= " $jr" if ($jr);
253# $fullname .= ", $first";
254# $fullname .= " $von" if ($von);
255 push (@formattedlist, $wholename);
256 }
257 return @formattedlist;
258}
259
260
261## following functions based on bibtex plugin ##
262# not actually used at the moment, but might be useful in future?
263sub expand_month {
264 my $text=shift;
265
266 # bibtex style files expand abbreviations for months.
267 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
268 $text =~ s/jan/_textmonth01_/g;
269 $text =~ s/feb/_textmonth02_/g;
270 $text =~ s/mar/_textmonth03_/g;
271 $text =~ s/apr/_textmonth04_/g;
272 $text =~ s/may/_textmonth05_/g;
273 $text =~ s/jun/_textmonth06_/g;
274 $text =~ s/jul/_textmonth07_/g;
275 $text =~ s/aug/_textmonth08_/g;
276 $text =~ s/sep/_textmonth09_/g;
277 $text =~ s/oct/_textmonth10_/g;
278 $text =~ s/nov/_textmonth11_/g;
279 $text =~ s/dec/_textmonth12_/g;
280
281 return $text;
282}
283
284
285# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
286# as the first argument to this function.
287#
288# Convert accented characters, remove { }, interprete some commands....
289# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
290
291# Also, it sucks quite a bit for complicated/nested commands since it doesn't
292# match { with the corresponding }, only the nearest }
293
294sub process_latex {
295 my $self=shift;
296 my $text=shift;
297
298 my $outhandle=$self->{'outhandle'};
299
300 my $html_markup=0;
301 if ($text =~ /^\-html/) {
302 $html_markup=1;
303 $text=shift;
304 }
305
306 if (! $text) {
307 return $text;
308 }
309 # escape html-sensitive characters
310 $text =~ s@&@&amp;@g;
311 $text =~ s@<@&lt;@g;
312 $text =~ s@>@&gt;@g;
313
314 # do this before accents, since \= means something different in tabbing
315 # also \> is a tab stop too, and \\ is newline
316 sub do_tabbing {
317 my $tabbing=shift;
318 $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
319 $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
320 $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
321 return "<br/>" . $tabbing . "<br/>\n";
322 }
323 $text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
324 sub do_tabular {
325 my $tabular=shift;
326 $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
327 $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
328 $tabular =~ s~\\hline~~g; # for now...
329 $tabular =~ s~<td>\s*\\multicolumn{(\d+)}{.*?}~<td colspan="$1">~g;
330 return "<table border=\"1\">\n <tr><td>"
331 . $tabular . "</td></tr></table>\n";
332 }
333 $text =~ s@\\begin{tabular}(?:\[.*?\])?{.*?}(.*?)\\end{tabular} @
334 do_tabular($1) @xges;
335
336 $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
337
338 # process maths mode before accents... things like \, mean different!
339 # maths mode
340 $text =~ s@\$\$(.*?)\$\$
341 @ process_latex_math($html_markup,$1)
342 @xsge; # multi-line maths: $$ .... $$
343
344 $text =~ s@([^\\])\$(.*?[^\\])\$
345 @$1.process_latex_math($html_markup,$2)@xsge;
346
347
348 # is this an amstext environment, or just custom for that input file?
349 $text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
350
351 # convert latex-style accented characters.
352 $self->latex_accents_to_utf8(\$text);
353
354 # replace quotes with utf-8
355
356 $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (&ldquo;)
357 $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (&rdquo;)
358 $text =~ s/`/\xe2\xc0\x98/g; # single left quote
359 $text =~ s/'/\xe2\xc0\x99/g; # single right quote
360
361 ###### remove/replace latex commands ######
362 ### commands that expand to something that gets displayed ###
363 $text =~ s~\\ldots~&hellip;~g;
364 $text =~ s~\\hrule~<hr/>\n~g;
365 $text =~ s~\\maketitle~ ~;
366 ### space commands ###
367 $text =~ s~\\[vh]skip\s+\S+~~g;
368 $text =~ s~\\vspace\*?{.*?}~<div>&nbsp;</div>~g; # vertical space
369 $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
370 $text =~ s~\\noindent\b~~g;
371 # newpage, etc
372 $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
373 ### counters, contents, environments, labels, etc ###
374 $text =~ s~\\(?:addcontentsline){.*?}{.*?}{.*}~~g;
375 $text =~ s~\s*\\begin{itemize}\s*~\n<ul>\n~g;
376 $text =~ s~\s*\\end{itemize}\s*~</li></ul>\n~g;
377 $text =~ s~\s*\\begin{enumerate}\s*~<ol>\n~g;
378 $text =~ s~\s*\\end{enumerate}\s*~</li></ol>\n~g;
379 if ($text =~ s~\s*\\item~</li>\n<li>~g) {
380 # (count for first list item)
381 $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
382 }
383 $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
384 $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
385 ### font sizes/styles ###
386 $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
387
388 if ($html_markup) {
389 $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
390 $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
391 $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
392 $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
393 $text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
394 $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
395 $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
396 $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
397 } else {
398 # remove tags for text-only
399 $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
400 }
401 $text =~ s ~ {\\sc\s+(.*?)} ~
402 {<span style="font-variant:\ small-caps">$1</span>} ~gx;
403 # ignore these font tags (if there are any left)
404 # sf is sans-serif
405 $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
406 #### end font-related stuff ####
407
408 ### remove all other commands with optional arguments... ###
409 # don't remove commands without { }....
410 # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
411 # $text =~ s~\\noopsort{[^}]+\}~~g;
412 # verbatim
413 $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
414 # remove tags, keep contents for \tag[optional]{contents}
415 while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
416
417 # remove latex groupings { } (but not \{ or \} )
418 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
419 $text =~ s/^\{//; # remove { if first char
420
421 # latex characters
422 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
423 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
424 # optional break "\-"
425 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
426 # the non-macro bits have quotes around them - we just remove quotes
427 # XXX bibtex and latex differ here (for the '#' char)...
428 $text =~ s/([^&])[\"\#]/$1/g;
429 }
430 # dashes. Convert (m|n)-dash into single dash for html.
431 $text =~ s~\-\-+~\-~g;
432
433 # quoted { } chars
434 $text =~ s~\\{~{~g;
435 $text =~ s~\\}~}~g;
436
437 # spaces
438 $text =~ s~\\ ~ ~g;
439
440 # finally to protect against macro language...
441 # greenstone-specific
442 $text =~ s~\[~&\#91;~g;
443 $text =~ s~\]~&\#93;~g;
444 $text =~ s~(?<!\\)([\\_])~\\$1~g;
445
446 if ($html_markup) {
447 $text =~ s~\n{2,}~\n</p>\n<p>~g;
448 return "<p>$text</p>";
449 }
450
451 return $text;
452}
453
454# only used by process_latex for \verb....
455sub verb_text {
456 my $verbatim=shift;
457 $verbatim =~ s/([{}_])/\\$1/g;
458 return $verbatim;
459}
460# only used by process_latex_math
461# returns a unicode char if applicable, otherwise ascii
462sub math_fraction {
463 my $num=$1;
464 my $denom=$2;
465
466 if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
467 if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
468 if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
469 return "$num/$denom";
470}
471
472sub process_latex_math {
473
474 my $text = pop; # if given one or two args, this is the last one...
475 my $html_markup=pop; # if given two args, this is the first one else undef
476
477 $text =~ s~\\,~ ~g; # forces a space?
478 $text =~ s~\\infty~infinity~g; # or unicode 0x221E...
479
480# use this one when more things can read 3-byte utf8 values like this!
481# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
482 $text =~ s~\\cup\b~ U ~g;
483
484 $text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
485
486 if ($html_markup) {
487 $text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
488 $text =~ s~\^([^\{])~<sup>$1</sup>~g;
489 $text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
490 $text =~ s~\_([^\{])~<sub>$1</sub>~g;
491
492 $text =~ s~\\ldots~&hellip;~g; # use html named entity for now
493
494 # put all other command names in italics for now
495 $text =~ s~\\([\w]+)~<i>$1</i> ~g;
496 }
497
498 # special cases, for some input files
499 if ($text =~ m~^\\\w+$~) {
500 $text="{" . $text . "}";
501 }
502
503 return $text;
504}
505
506
507
508sub latex_accents_to_utf8 {
509
510 # note - this is really ugly, but it works. There may be a prettier way
511 # of mapping latex accented chars to utf8, but we just brute force it here.
512 # Also, this isn't complete - not every single possible accented letter
513 # is in here yet, but most of the common ones are.
514
515 my %utf8_chars =
516 (
517 # acutes
518 '\'a' => chr(0xc3).chr(0xa1),
519 '\'c' => chr(0xc4).chr(0x87),
520 '\'e' => chr(0xc3).chr(0xa9),
521 '\'i' => chr(0xc3).chr(0xad),
522 '\'l' => chr(0xc3).chr(0xba),
523 '\'n' => chr(0xc3).chr(0x84),
524 '\'o' => chr(0xc3).chr(0xb3),
525 '\'r' => chr(0xc5).chr(0x95),
526 '\'s' => chr(0xc5).chr(0x9b),
527 '\'u' => chr(0xc3).chr(0xba),
528 '\'y' => chr(0xc3).chr(0xbd),
529 '\'z' => chr(0xc5).chr(0xba),
530 # graves
531 '`a' => chr(0xc3).chr(0xa0),
532 '`A' => chr(0xc3).chr(0x80),
533 '`e' => chr(0xc3).chr(0xa8),
534 '`E' => chr(0xc3).chr(0x88),
535 '`i' => chr(0xc3).chr(0xac),
536 '`I' => chr(0xc3).chr(0x8c),
537 '`o' => chr(0xc3).chr(0xb2),
538 '`O' => chr(0xc3).chr(0x92),
539 '`u' => chr(0xc3).chr(0xb9),
540 '`U' => chr(0xc3).chr(0x99),
541 # circumflex
542 '^a' => chr(0xc3).chr(0xa2),
543 '^A' => chr(0xc3).chr(0x82),
544 '^c' => chr(0xc4).chr(0x89),
545 '^C' => chr(0xc4).chr(0x88),
546 '^e' => chr(0xc3).chr(0xaa),
547 '^E' => chr(0xc3).chr(0x8a),
548 '^g' => chr(0xc4).chr(0x9d),
549 '^G' => chr(0xc4).chr(0x9c),
550 '^h' => chr(0xc4).chr(0xa5),
551 '^H' => chr(0xc4).chr(0xa4),
552 '^i' => chr(0xc3).chr(0xae),
553 '^I' => chr(0xc3).chr(0x8e),
554 '^j' => chr(0xc4).chr(0xb5),
555 '^J' => chr(0xc4).chr(0xb4),
556 '^o' => chr(0xc3).chr(0xb4),
557 '^O' => chr(0xc3).chr(0x94),
558 '^s' => chr(0xc5).chr(0x9d),
559 '^S' => chr(0xc5).chr(0x9c),
560 '^u' => chr(0xc3).chr(0xa2),
561 '^U' => chr(0xc3).chr(0xbb),
562 '^w' => chr(0xc5).chr(0xb5),
563 '^W' => chr(0xc5).chr(0xb4),
564 '^y' => chr(0xc5).chr(0xb7),
565 '^Y' => chr(0xc5).chr(0xb6),
566
567 # diaeresis
568 '"a' => chr(0xc3).chr(0xa4),
569 '"A' => chr(0xc3).chr(0x84),
570 '"e' => chr(0xc3).chr(0xab),
571 '"E' => chr(0xc3).chr(0x8b),
572 '"i' => chr(0xc3).chr(0xaf),
573 '"I' => chr(0xc3).chr(0x8f),
574 '"\\\\i' => chr(0xc3).chr(0xaf),
575 '"\\\\I' => chr(0xc3).chr(0x8f),
576 '"o' => chr(0xc3).chr(0xb6),
577 '"O' => chr(0xc3).chr(0x96),
578 '"u' => chr(0xc3).chr(0xbc),
579 '"U' => chr(0xc3).chr(0x9c),
580 '"y' => chr(0xc3).chr(0xbf),
581 '"Y' => chr(0xc3).chr(0xb8),
582 # tilde
583 '~A' => chr(0xc3).chr(0x83),
584 '~N' => chr(0xc3).chr(0x91),
585 '~O' => chr(0xc3).chr(0x95),
586 '~a' => chr(0xc3).chr(0xa3),
587 '~n' => chr(0xc3).chr(0xb1),
588 '~o' => chr(0xc3).chr(0xb5),
589 # caron - handled specially
590 # double acute
591 # ring
592 # dot
593 '.c' => chr(0xc4).chr(0x8b),
594 '.C' => chr(0xc4).chr(0x8a),
595 '.e' => chr(0xc4).chr(0x97),
596 '.E' => chr(0xc4).chr(0x96),
597 '.g' => chr(0xc4).chr(0xa1),
598 '.G' => chr(0xc4).chr(0xa0),
599 '.I' => chr(0xc4).chr(0xb0),
600 '.z' => chr(0xc5).chr(0xbc),
601 '.Z' => chr(0xc5).chr(0xbb),
602 # macron
603 '=a' => chr(0xc4).chr(0x81),
604 '=A' => chr(0xc4).chr(0x80),
605 '=e' => chr(0xc4).chr(0x93),
606 '=E' => chr(0xc4).chr(0x92),
607 '=i' => chr(0xc4).chr(0xab),
608 '=I' => chr(0xc4).chr(0xaa),
609 '=o' => chr(0xc4).chr(0x8d),
610 '=O' => chr(0xc4).chr(0x8c),
611 '=u' => chr(0xc4).chr(0xab),
612 '=U' => chr(0xc4).chr(0xaa),
613
614 # stroke - handled specially - see below
615
616 # cedilla - handled specially
617 );
618
619# these are one letter latex commands - we make sure they're not a longer
620# command name. eg {\d} is d+stroke, so careful of \d
621 my %special_utf8_chars =
622 (
623 # breve
624 'u g' => chr(0xc4).chr(0x9f),
625 'u G' => chr(0xc4).chr(0x9e),
626 'u i' => chr(0xc4).chr(0xad),
627 'u I' => chr(0xc4).chr(0xac),
628 'u o' => chr(0xc5).chr(0x8f),
629 'u O' => chr(0xc5).chr(0x8e),
630 'u u' => chr(0xc5).chr(0xad),
631 'u U' => chr(0xc5).chr(0xac),
632 'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
633 'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
634 # caron
635 'v c' => chr(0xc4).chr(0x8d),
636 'v C' => chr(0xc4).chr(0x8c),
637 'v n' => chr(0xc5).chr(0x88),
638 'v N' => chr(0xc5).chr(0x87),
639 'v s' => chr(0xc5).chr(0xa1),
640 'v S' => chr(0xc5).chr(0xa5),
641 'v z' => chr(0xc5).chr(0xbe),
642 'v Z' => chr(0xc5).chr(0xbd),
643 # cedilla
644 'c c' => chr(0xc3).chr(0xa7),
645 'c C' => chr(0xc3).chr(0x87),
646 'c g' => chr(0xc4).chr(0xa3),
647 'c G' => chr(0xc4).chr(0xa2),
648 'c k' => chr(0xc4).chr(0xb7),
649 'c K' => chr(0xc4).chr(0xb6),
650 'c l' => chr(0xc4).chr(0xbc),
651 'c L' => chr(0xc4).chr(0xbb),
652 'c n' => chr(0xc5).chr(0x86),
653 'c N' => chr(0xc5).chr(0x85),
654 'c r' => chr(0xc5).chr(0x97),
655 'c R' => chr(0xc5).chr(0x96),
656 'c s' => chr(0xc5).chr(0x9f),
657 'c S' => chr(0xc5).chr(0x9e),
658 'c t' => chr(0xc5).chr(0xa3),
659 'c T' => chr(0xc5).chr(0xa2),
660 # double acute / Hungarian accent
661 'H O' => chr(0xc5).chr(0x90),
662 'H o' => chr(0xc5).chr(0x91),
663 'H U' => chr(0xc5).chr(0xb0),
664 'H u' => chr(0xc5).chr(0xb1),
665
666 # stroke
667 'd' => chr(0xc4).chr(0x91),
668 'D' => chr(0xc4).chr(0x90),
669 'h' => chr(0xc4).chr(0xa7),
670# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
671 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
672 'l' => chr(0xc5).chr(0x82),
673 'L' => chr(0xc5).chr(0x81),
674 'o' => chr(0xc3).chr(0xb8),
675 'O' => chr(0xc3).chr(0x98),
676 't' => chr(0xc5).chr(0xa7),
677 'T' => chr(0xc5).chr(0xa6),
678
679 # other special chars
680 'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
681 'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
682 );
683
684 my $self=shift;
685 my $textref=shift;
686
687 my $outhandle=$self->{'outhandle'};
688 my $text=$$textref;
689
690 # remove space (if any) between \ and letter to accent (eg {\' a})
691 $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
692
693 # remove {} around a single character (eg \'{e})
694 $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
695
696 ## only in bibtex... not in latex proper?!
697 ### \, is another way of doing cedilla \c
698 ##$text =~ s~\\,(.)~\\c $1~g;
699
700 # remove {} around a single character for special 1 letter commands -
701 # need to insert a space. Eg \v{s} -> {\v s}
702 $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
703
704 # only do if the text contains a '\' character.
705 if ($text =~ m|\\|) {
706 # "normal" accents - ie non-alpha latex tag
707 # xxx used to have ([\w]\b)@ (for word boundary)
708 while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
709 my $tex="$1$2"; my $char="$2";
710 my $replacement=$utf8_chars{$tex};
711 if (!defined($replacement)) {
712 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
713 print $outhandle . $self->{'plugin_type'} .
714 ": Warning: unknown latex accent \"$tex\""
715 . " in \"$1\"\n";
716 $replacement=$char;
717 }
718 $text =~ s/\\\Q$tex/$replacement/g;
719 }
720
721 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
722 # only do the change if immediately followed by a space, }, {, or \
723 # one letter accents ( + ss / aa)
724 while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
725 my $tex=$1;
726 my $replacement=$special_utf8_chars{$tex};
727 if (!defined($replacement)) {
728 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
729 print $outhandle $self->{'plugin_type'} .
730 ": Warning: unknown latex accent \"$tex\""
731 . " in \"$1\"\n";
732 $replacement=$tex;
733 }
734 ($text =~ s/{\\$tex}/$replacement/g) or
735 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
736
737 }
738
739 # one letter latex accent commands that affect following letter
740 while ($text =~ m~\\([uvcH]) ([\w])~) {
741 my $tex="$1 $2"; my $char="$2";
742 my $replacement=$special_utf8_chars{$tex};
743 if (!defined($replacement)) {
744 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
745 print $outhandle $self->{'plugin_type'} .
746 ": Warning: unknown latex accent \"$tex\""
747 . " in \"$1\"\n";
748 $replacement=$char;
749 }
750 $text =~ s/\\$tex/$replacement/g;
751 }
752 }
753 $textref=\$text;
754}
755
756
757# modules must return true
7581;
Note: See TracBrowser for help on using the repository browser.