source: trunk/gsdl/perllib/plugins/LaTeXPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 23.1 KB
Line 
1###########################################################################
2#
3# LaTeXPlug.pm
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Written by John McPherson
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22###########################################################################
23
24# todo:
25# \includegraphics
26# parse/remove tex \if ... macros
27
28package LaTeXPlug;
29
30# System complains about $arguments if the strict is set
31use strict;
32no strict 'refs'; # so we can print to a handle named by a variable
33
34# greenstone packages
35use BasPlug;
36use parsargv;
37use unicode;
38use util;
39
40my $arguments = [
41 ];
42my $options={ 'name' => 'LaTeXPlug',
43 'desc' => '{LaTeXPlug.desc}',
44 'abstract' => 'no',
45 'inherits' => 'yes' };
46sub BEGIN {
47 @LaTeXPlug::ISA = ('BasPlug');
48}
49
50sub print_usage {
51 print STDERR "\n usage: plugin LaTeXPlug [options]\n\n";
52}
53
54sub new {
55 my ($class) = shift (@_);
56 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$pluginlist, $class);
58
59 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
60 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
61
62 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
63
64 $self->{'aux_files'} = {};
65 $self->{'dir_num'} = 0;
66 $self->{'file_num'} = 0;
67 return bless $self, $class;
68}
69
70
71sub get_default_process_exp {
72 my $self = shift (@_);
73 return q^\.tex$^;
74}
75
76sub get_default_block_exp {
77 # assume any .eps files are part of the latex stuff
78 return '\.(?:eps)$';
79}
80
81
82sub process {
83 my $self = shift (@_);
84 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
85
86 my $start=substr($$textref, 0, 200); # first 200 bytes
87
88 if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
89 | chapter | contents | begin) ~x) {
90 # this doesn't look like latex...
91 return undef;
92 }
93 my $outhandle = $self->{'outhandle'};
94 if ($gli) {
95 print STDERR "<Processing n='$file' p='LaTeXPlug'>\n";
96 } elsif ($self->{'verbosity'} > 1) {
97 print $outhandle "LaTeXPlug: processing $file\n"
98 }
99 my $cursection = $doc_obj->get_top_section();
100
101 ###### clean up text ######
102 $$textref =~ s/\r$//mg; # remove dos ^M
103 $$textref =~ s/%.*$//mg; # remove comments
104
105 # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
106
107 $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
108 # check that both sides are ascii, so we don't screw up utf-8 chars
109 $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
110 unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
111
112
113 ###### find metadata ######
114
115 ## FileFormat metadata ##
116 $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
117
118 ### title metadata ###
119 $$textref =~ m@\\title\s*{(.*?)}@s;
120 my $title = $1;
121 if (!$title) {
122 # no title tag. look for a chapter/section heading
123 $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
124 $title = $1;
125 }
126 if (!$title) {
127 # no chapter/section heading tags either... use filename
128 $title = $file;
129 $title =~ s/\.tex$//i;
130 $title =~ s/[-_.]/ /g; # turn punctuation into spaces
131 }
132 if ($title) {
133 $title =~ s@\\\\@ @g; # embedded newlines
134 $title = $self->process_latex($title); # no "-html" for title eg in browser
135 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
136 }
137
138 ### creator/individual author metadata ###
139 $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
140 my $authors=$1;
141 if ($authors) {
142 # take care of "et al."...
143 $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
144 my $etal=$1;
145 $etal="" if (!defined ($etal));
146
147 my @authorlist=parse_authors($self, $authors);
148
149 foreach my $author (@authorlist) {
150 # Add each name to set of Authors
151 $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
152 }
153
154 # Only want at most one "and" in the Creator field
155 my $creator_str="";
156 if (scalar(@authorlist) > 2) {
157 my $lastauthor=pop @authorlist;
158 $creator_str=join(', ', @authorlist);
159 $creator_str.=" and $lastauthor";
160 } else { # 1 or 2 authors...
161 $creator_str=join(" and ",@authorlist);
162 }
163 $creator_str.=$etal; # if there was "et al."
164 $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
165 }
166 ### end of author metadata ###
167
168 ###### process latex for the main text ######
169 $$textref =~ s/^.*?\\begin{document}//s;
170 $$textref =~ s/\\end{document}.*?$//s;
171 $$textref = $self->process_latex("-html",$$textref);
172 $doc_obj->add_utf8_text($cursection, $$textref);
173
174 return 1;
175}
176
177
178# returns a list of author names
179sub parse_authors {
180 my $self=shift;
181 my $authors=shift;
182
183 my $outhandle=$self->{'outhandle'};
184
185 $authors =~ s/\n/ /g; # remove newlines
186
187 # some people do this for affiliation footnote/dagger
188 $authors =~ s@\$.*?\$@@g; # remove maths from author :(
189
190 # und here for german language...
191 # don't use brackets in pattern, else the matched bit becomes
192 # an element in the list!
193 my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
194 my @formattedlist = ();
195 foreach my $author (@authorlist) {
196 $author =~ s/\s*$//;
197 $author =~ s/^\s*//;
198 # Reformat and add author name
199 next if $author=~ /^\s*$/;
200
201 # names are "First von Last", "von Last, First"
202 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
203 my $first="";
204 my $vonlast="";
205 my $jr="";
206
207 if ($author =~ /,/) {
208 my @parts=split(/,\s*/, $author);
209 $first = pop @parts;
210 if (scalar(@parts) == 2) {
211 $jr = pop @parts;
212 }
213 $vonlast=shift @parts;
214 if (scalar(@parts) > 0) {
215 print $outhandle $self->{'plugin_type'} .
216 ": couldn't parse name $author\n";
217 # but we continue anyway...
218 }
219 } else { # First von Last
220 my @words = split(/ /, $author);
221 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
222 $first .= " " . shift (@words);
223 }
224 $first =~ s/^\s//;
225 $vonlast = join (' ', @words); # whatever's left...
226 }
227 my $von="";
228 my $last="";
229 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
230 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
231 $von = $1;
232 if (!defined ($von)) {
233 # some non-English names do start with lowercase
234 # eg "Marie desJardins". Also we can get typos...
235 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
236 $von="";
237 if ($vonlast =~ /^[a-z]+$/) {
238 # if it's all lowercase, uppercase 1st.
239 $vonlast =~ s/^(.)/\u$1/;
240 }
241 }
242 $von =~ s/\s*$//;
243 $last=$vonlast;
244 } else {
245 $last=$vonlast;
246 }
247 my $wholename="$first $von $last $jr";
248 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
249# my $fullname = "$last";
250# $fullname .= " $jr" if ($jr);
251# $fullname .= ", $first";
252# $fullname .= " $von" if ($von);
253 push (@formattedlist, $wholename);
254 }
255 return @formattedlist;
256}
257
258
259## following functions based on bibtex plugin ##
260# not actually used at the moment, but might be useful in future?
261sub expand_month {
262 my $text=shift;
263
264 # bibtex style files expand abbreviations for months.
265 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
266 $text =~ s/jan/_textmonth01_/g;
267 $text =~ s/feb/_textmonth02_/g;
268 $text =~ s/mar/_textmonth03_/g;
269 $text =~ s/apr/_textmonth04_/g;
270 $text =~ s/may/_textmonth05_/g;
271 $text =~ s/jun/_textmonth06_/g;
272 $text =~ s/jul/_textmonth07_/g;
273 $text =~ s/aug/_textmonth08_/g;
274 $text =~ s/sep/_textmonth09_/g;
275 $text =~ s/oct/_textmonth10_/g;
276 $text =~ s/nov/_textmonth11_/g;
277 $text =~ s/dec/_textmonth12_/g;
278
279 return $text;
280}
281
282
283# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
284# as the first argument to this function.
285#
286# Convert accented characters, remove { }, interprete some commands....
287# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
288
289# Also, it sucks quite a bit for complicated/nested commands since it doesn't
290# match { with the corresponding }, only the nearest }
291
292sub process_latex {
293 my $self=shift;
294 my $text=shift;
295
296 my $outhandle=$self->{'outhandle'};
297
298 my $html_markup=0;
299 if ($text =~ /^\-html/) {
300 $html_markup=1;
301 $text=shift;
302 }
303
304 if (! $text) {
305 return $text;
306 }
307 # escape html-sensitive characters
308 $text =~ s@&@&amp;@g;
309 $text =~ s@<@&lt;@g;
310 $text =~ s@>@&gt;@g;
311
312 # do this before accents, since \= means something different in tabbing
313 # also \> is a tab stop too, and \\ is newline
314 sub do_tabbing {
315 my $tabbing=shift;
316 $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
317 $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
318 $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
319 return "<br/>" . $tabbing . "<br/>\n";
320 }
321 $text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
322 sub do_tabular {
323 my $tabular=shift;
324 $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
325 $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
326 $tabular =~ s~\\hline~~g; # for now...
327 $tabular =~ s~<td>\s*\\multicolumn{(\d+)}{.*?}~<td colspan="$1">~g;
328 return "<table border=\"1\">\n <tr><td>"
329 . $tabular . "</td></tr></table>\n";
330 }
331 $text =~ s@\\begin{tabular}(?:\[.*?\])?{.*?}(.*?)\\end{tabular} @
332 do_tabular($1) @xges;
333
334 $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
335
336 # process maths mode before accents... things like \, mean different!
337 # maths mode
338 $text =~ s@\$\$(.*?)\$\$
339 @ process_latex_math($html_markup,$1)
340 @xsge; # multi-line maths: $$ .... $$
341
342 $text =~ s@([^\\])\$(.*?[^\\])\$
343 @$1.process_latex_math($html_markup,$2)@xsge;
344
345
346 # is this an amstext environment, or just custom for that input file?
347 $text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
348
349 # convert latex-style accented characters.
350 $self->latex_accents_to_utf8(\$text);
351
352 # replace quotes with utf-8
353
354 $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (&ldquo;)
355 $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (&rdquo;)
356 $text =~ s/`/\xe2\xc0\x98/g; # single left quote
357 $text =~ s/'/\xe2\xc0\x99/g; # single right quote
358
359 ###### remove/replace latex commands ######
360 ### commands that expand to something that gets displayed ###
361 $text =~ s~\\ldots~&hellip;~g;
362 $text =~ s~\\hrule~<hr/>\n~g;
363 $text =~ s~\\maketitle~ ~;
364 ### space commands ###
365 $text =~ s~\\[vh]skip\s+\S+~~g;
366 $text =~ s~\\vspace\*?{.*?}~<div>&nbsp;</div>~g; # vertical space
367 $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
368 $text =~ s~\\noindent\b~~g;
369 # newpage, etc
370 $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
371 ### counters, contents, environments, labels, etc ###
372 $text =~ s~\\(?:addcontentsline){.*?}{.*?}{.*}~~g;
373 $text =~ s~\s*\\begin{itemize}\s*~\n<ul>\n~g;
374 $text =~ s~\s*\\end{itemize}\s*~</li></ul>\n~g;
375 $text =~ s~\s*\\begin{enumerate}\s*~<ol>\n~g;
376 $text =~ s~\s*\\end{enumerate}\s*~</li></ol>\n~g;
377 if ($text =~ s~\s*\\item~</li>\n<li>~g) {
378 # (count for first list item)
379 $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
380 }
381 $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
382 $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
383 ### font sizes/styles ###
384 $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
385
386 if ($html_markup) {
387 $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
388 $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
389 $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
390 $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
391 $text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
392 $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
393 $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
394 $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
395 } else {
396 # remove tags for text-only
397 $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
398 }
399 $text =~ s ~ {\\sc\s+(.*?)} ~
400 {<span style="font-variant:\ small-caps">$1</span>} ~gx;
401 # ignore these font tags (if there are any left)
402 # sf is sans-serif
403 $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
404 #### end font-related stuff ####
405
406 ### remove all other commands with optional arguments... ###
407 # don't remove commands without { }....
408 # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
409 # $text =~ s~\\noopsort{[^}]+\}~~g;
410 # verbatim
411 $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
412 # remove tags, keep contents for \tag[optional]{contents}
413 while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
414
415 # remove latex groupings { } (but not \{ or \} )
416 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
417 $text =~ s/^\{//; # remove { if first char
418
419 # latex characters
420 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
421 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
422 # optional break "\-"
423 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
424 # the non-macro bits have quotes around them - we just remove quotes
425 # XXX bibtex and latex differ here (for the '#' char)...
426 $text =~ s/([^&])[\"\#]/$1/g;
427 }
428 # dashes. Convert (m|n)-dash into single dash for html.
429 $text =~ s~\-\-+~\-~g;
430
431 # quoted { } chars
432 $text =~ s~\\{~{~g;
433 $text =~ s~\\}~}~g;
434
435 # spaces
436 $text =~ s~\\ ~ ~g;
437
438 # finally to protect against macro language...
439 # greenstone-specific
440 $text =~ s~\[~&\#91;~g;
441 $text =~ s~\]~&\#93;~g;
442 $text =~ s~(?<!\\)([\\_])~\\$1~g;
443
444 if ($html_markup) {
445 $text =~ s~\n{2,}~\n</p>\n<p>~g;
446 return "<p>$text</p>";
447 }
448
449 return $text;
450}
451
452# only used by process_latex for \verb....
453sub verb_text {
454 my $verbatim=shift;
455 $verbatim =~ s/([{}_])/\\$1/g;
456 return $verbatim;
457}
458# only used by process_latex_math
459# returns a unicode char if applicable, otherwise ascii
460sub math_fraction {
461 my $num=$1;
462 my $denom=$2;
463
464 if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
465 if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
466 if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
467 return "$num/$denom";
468}
469
470sub process_latex_math {
471
472 my $text = pop; # if given one or two args, this is the last one...
473 my $html_markup=pop; # if given two args, this is the first one else undef
474
475 $text =~ s~\\,~ ~g; # forces a space?
476 $text =~ s~\\infty~infinity~g; # or unicode 0x221E...
477
478# use this one when more things can read 3-byte utf8 values like this!
479# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
480 $text =~ s~\\cup\b~ U ~g;
481
482 $text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
483
484 if ($html_markup) {
485 $text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
486 $text =~ s~\^([^\{])~<sup>$1</sup>~g;
487 $text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
488 $text =~ s~\_([^\{])~<sub>$1</sub>~g;
489
490 $text =~ s~\\ldots~&hellip;~g; # use html named entity for now
491
492 # put all other command names in italics for now
493 $text =~ s~\\([\w]+)~<i>$1</i> ~g;
494 }
495
496 # special cases, for some input files
497 if ($text =~ m~^\\\w+$~) {
498 $text="{" . $text . "}";
499 }
500
501 return $text;
502}
503
504
505
506sub latex_accents_to_utf8 {
507
508 # note - this is really ugly, but it works. There may be a prettier way
509 # of mapping latex accented chars to utf8, but we just brute force it here.
510 # Also, this isn't complete - not every single possible accented letter
511 # is in here yet, but most of the common ones are.
512
513 my %utf8_chars =
514 (
515 # acutes
516 '\'a' => chr(0xc3).chr(0xa1),
517 '\'c' => chr(0xc4).chr(0x87),
518 '\'e' => chr(0xc3).chr(0xa9),
519 '\'i' => chr(0xc3).chr(0xad),
520 '\'l' => chr(0xc3).chr(0xba),
521 '\'n' => chr(0xc3).chr(0x84),
522 '\'o' => chr(0xc3).chr(0xb3),
523 '\'r' => chr(0xc5).chr(0x95),
524 '\'s' => chr(0xc5).chr(0x9b),
525 '\'u' => chr(0xc3).chr(0xba),
526 '\'y' => chr(0xc3).chr(0xbd),
527 '\'z' => chr(0xc5).chr(0xba),
528 # graves
529 '`a' => chr(0xc3).chr(0xa0),
530 '`A' => chr(0xc3).chr(0x80),
531 '`e' => chr(0xc3).chr(0xa8),
532 '`E' => chr(0xc3).chr(0x88),
533 '`i' => chr(0xc3).chr(0xac),
534 '`I' => chr(0xc3).chr(0x8c),
535 '`o' => chr(0xc3).chr(0xb2),
536 '`O' => chr(0xc3).chr(0x92),
537 '`u' => chr(0xc3).chr(0xb9),
538 '`U' => chr(0xc3).chr(0x99),
539 # circumflex
540 '^a' => chr(0xc3).chr(0xa2),
541 '^A' => chr(0xc3).chr(0x82),
542 '^c' => chr(0xc4).chr(0x89),
543 '^C' => chr(0xc4).chr(0x88),
544 '^e' => chr(0xc3).chr(0xaa),
545 '^E' => chr(0xc3).chr(0x8a),
546 '^g' => chr(0xc4).chr(0x9d),
547 '^G' => chr(0xc4).chr(0x9c),
548 '^h' => chr(0xc4).chr(0xa5),
549 '^H' => chr(0xc4).chr(0xa4),
550 '^i' => chr(0xc3).chr(0xae),
551 '^I' => chr(0xc3).chr(0x8e),
552 '^j' => chr(0xc4).chr(0xb5),
553 '^J' => chr(0xc4).chr(0xb4),
554 '^o' => chr(0xc3).chr(0xb4),
555 '^O' => chr(0xc3).chr(0x94),
556 '^s' => chr(0xc5).chr(0x9d),
557 '^S' => chr(0xc5).chr(0x9c),
558 '^u' => chr(0xc3).chr(0xa2),
559 '^U' => chr(0xc3).chr(0xbb),
560 '^w' => chr(0xc5).chr(0xb5),
561 '^W' => chr(0xc5).chr(0xb4),
562 '^y' => chr(0xc5).chr(0xb7),
563 '^Y' => chr(0xc5).chr(0xb6),
564
565 # diaeresis
566 '"a' => chr(0xc3).chr(0xa4),
567 '"A' => chr(0xc3).chr(0x84),
568 '"e' => chr(0xc3).chr(0xab),
569 '"E' => chr(0xc3).chr(0x8b),
570 '"i' => chr(0xc3).chr(0xaf),
571 '"I' => chr(0xc3).chr(0x8f),
572 '"\\\\i' => chr(0xc3).chr(0xaf),
573 '"\\\\I' => chr(0xc3).chr(0x8f),
574 '"o' => chr(0xc3).chr(0xb6),
575 '"O' => chr(0xc3).chr(0x96),
576 '"u' => chr(0xc3).chr(0xbc),
577 '"U' => chr(0xc3).chr(0x9c),
578 '"y' => chr(0xc3).chr(0xbf),
579 '"Y' => chr(0xc3).chr(0xb8),
580 # tilde
581 '~A' => chr(0xc3).chr(0x83),
582 '~N' => chr(0xc3).chr(0x91),
583 '~O' => chr(0xc3).chr(0x95),
584 '~a' => chr(0xc3).chr(0xa3),
585 '~n' => chr(0xc3).chr(0xb1),
586 '~o' => chr(0xc3).chr(0xb5),
587 # caron - handled specially
588 # double acute
589 # ring
590 # dot
591 '.c' => chr(0xc4).chr(0x8b),
592 '.C' => chr(0xc4).chr(0x8a),
593 '.e' => chr(0xc4).chr(0x97),
594 '.E' => chr(0xc4).chr(0x96),
595 '.g' => chr(0xc4).chr(0xa1),
596 '.G' => chr(0xc4).chr(0xa0),
597 '.I' => chr(0xc4).chr(0xb0),
598 '.z' => chr(0xc5).chr(0xbc),
599 '.Z' => chr(0xc5).chr(0xbb),
600 # macron
601 '=a' => chr(0xc4).chr(0x81),
602 '=A' => chr(0xc4).chr(0x80),
603 '=e' => chr(0xc4).chr(0x93),
604 '=E' => chr(0xc4).chr(0x92),
605 '=i' => chr(0xc4).chr(0xab),
606 '=I' => chr(0xc4).chr(0xaa),
607 '=o' => chr(0xc4).chr(0x8d),
608 '=O' => chr(0xc4).chr(0x8c),
609 '=u' => chr(0xc4).chr(0xab),
610 '=U' => chr(0xc4).chr(0xaa),
611
612 # stroke - handled specially - see below
613
614 # cedilla - handled specially
615 );
616
617# these are one letter latex commands - we make sure they're not a longer
618# command name. eg {\d} is d+stroke, so careful of \d
619 my %special_utf8_chars =
620 (
621 # breve
622 'u g' => chr(0xc4).chr(0x9f),
623 'u G' => chr(0xc4).chr(0x9e),
624 'u i' => chr(0xc4).chr(0xad),
625 'u I' => chr(0xc4).chr(0xac),
626 'u o' => chr(0xc5).chr(0x8f),
627 'u O' => chr(0xc5).chr(0x8e),
628 'u u' => chr(0xc5).chr(0xad),
629 'u U' => chr(0xc5).chr(0xac),
630 'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
631 'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
632 # caron
633 'v c' => chr(0xc4).chr(0x8d),
634 'v C' => chr(0xc4).chr(0x8c),
635 'v n' => chr(0xc5).chr(0x88),
636 'v N' => chr(0xc5).chr(0x87),
637 'v s' => chr(0xc5).chr(0xa1),
638 'v S' => chr(0xc5).chr(0xa5),
639 'v z' => chr(0xc5).chr(0xbe),
640 'v Z' => chr(0xc5).chr(0xbd),
641 # cedilla
642 'c c' => chr(0xc3).chr(0xa7),
643 'c C' => chr(0xc3).chr(0x87),
644 'c g' => chr(0xc4).chr(0xa3),
645 'c G' => chr(0xc4).chr(0xa2),
646 'c k' => chr(0xc4).chr(0xb7),
647 'c K' => chr(0xc4).chr(0xb6),
648 'c l' => chr(0xc4).chr(0xbc),
649 'c L' => chr(0xc4).chr(0xbb),
650 'c n' => chr(0xc5).chr(0x86),
651 'c N' => chr(0xc5).chr(0x85),
652 'c r' => chr(0xc5).chr(0x97),
653 'c R' => chr(0xc5).chr(0x96),
654 'c s' => chr(0xc5).chr(0x9f),
655 'c S' => chr(0xc5).chr(0x9e),
656 'c t' => chr(0xc5).chr(0xa3),
657 'c T' => chr(0xc5).chr(0xa2),
658 # double acute / Hungarian accent
659 'H O' => chr(0xc5).chr(0x90),
660 'H o' => chr(0xc5).chr(0x91),
661 'H U' => chr(0xc5).chr(0xb0),
662 'H u' => chr(0xc5).chr(0xb1),
663
664 # stroke
665 'd' => chr(0xc4).chr(0x91),
666 'D' => chr(0xc4).chr(0x90),
667 'h' => chr(0xc4).chr(0xa7),
668# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
669 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
670 'l' => chr(0xc5).chr(0x82),
671 'L' => chr(0xc5).chr(0x81),
672 'o' => chr(0xc3).chr(0xb8),
673 'O' => chr(0xc3).chr(0x98),
674 't' => chr(0xc5).chr(0xa7),
675 'T' => chr(0xc5).chr(0xa6),
676
677 # other special chars
678 'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
679 'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
680 );
681
682 my $self=shift;
683 my $textref=shift;
684
685 my $outhandle=$self->{'outhandle'};
686 my $text=$$textref;
687
688 # remove space (if any) between \ and letter to accent (eg {\' a})
689 $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
690
691 # remove {} around a single character (eg \'{e})
692 $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
693
694 ## only in bibtex... not in latex proper?!
695 ### \, is another way of doing cedilla \c
696 ##$text =~ s~\\,(.)~\\c $1~g;
697
698 # remove {} around a single character for special 1 letter commands -
699 # need to insert a space. Eg \v{s} -> {\v s}
700 $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
701
702 # only do if the text contains a '\' character.
703 if ($text =~ m|\\|) {
704 # "normal" accents - ie non-alpha latex tag
705 # xxx used to have ([\w]\b)@ (for word boundary)
706 while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
707 my $tex="$1$2"; my $char="$2";
708 my $replacement=$utf8_chars{$tex};
709 if (!defined($replacement)) {
710 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
711 print $outhandle . $self->{'plugin_type'} .
712 ": Warning: unknown latex accent \"$tex\""
713 . " in \"$1\"\n";
714 $replacement=$char;
715 }
716 $text =~ s/\\\Q$tex/$replacement/g;
717 }
718
719 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
720 # only do the change if immediately followed by a space, }, {, or \
721 # one letter accents ( + ss / aa)
722 while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
723 my $tex=$1;
724 my $replacement=$special_utf8_chars{$tex};
725 if (!defined($replacement)) {
726 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
727 print $outhandle $self->{'plugin_type'} .
728 ": Warning: unknown latex accent \"$tex\""
729 . " in \"$1\"\n";
730 $replacement=$tex;
731 }
732 ($text =~ s/{\\$tex}/$replacement/g) or
733 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
734
735 }
736
737 # one letter latex accent commands that affect following letter
738 while ($text =~ m~\\([uvcH]) ([\w])~) {
739 my $tex="$1 $2"; my $char="$2";
740 my $replacement=$special_utf8_chars{$tex};
741 if (!defined($replacement)) {
742 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
743 print $outhandle $self->{'plugin_type'} .
744 ": Warning: unknown latex accent \"$tex\""
745 . " in \"$1\"\n";
746 $replacement=$char;
747 }
748 $text =~ s/\\$tex/$replacement/g;
749 }
750 }
751 $textref=\$text;
752}
753
754
755# modules must return true
7561;
Note: See TracBrowser for help on using the repository browser.