source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm@ 22658

Last change on this file since 22658 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
Line 
1###########################################################################
2#
3# LaTeXPlugin.pm
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Written by John McPherson
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22###########################################################################
23
24# todo:
25# \includegraphics
26# parse/remove tex \if ... macros
27
28package LaTeXPlugin;
29
30# System complains about $arguments if the strict is set
31use strict;
32no strict 'refs'; # so we can print to a handle named by a variable
33
34# greenstone packages
35use ReadTextFile;
36use unicode;
37use util;
38
39my $arguments =
40 [ { 'name' => "process_exp",
41 'desc' => "{BasePlugin.process_exp}",
42 'type' => "regexp",
43 'reqd' => "no",
44 'deft' => &get_default_process_exp() } ];
45
46my $options = { 'name' => "LaTeXPlugin",
47 'desc' => "{LaTeXPlugin.desc}",
48 'abstract' => "no",
49 'inherits' => "yes",
50 'args' => $arguments };
51
52sub BEGIN {
53 @LaTeXPlugin::ISA = ('ReadTextFile');
54}
55
56sub new {
57 my ($class) = shift (@_);
58 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59 push(@$pluginlist, $class);
60
61 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62 push(@{$hashArgOptLists->{"OptList"}},$options);
63
64 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
65
66 $self->{'aux_files'} = {};
67 $self->{'dir_num'} = 0;
68 $self->{'file_num'} = 0;
69 return bless $self, $class;
70}
71
72
73sub get_default_process_exp {
74 my $self = shift (@_);
75 return q^\.tex$^;
76}
77
78sub get_default_block_exp {
79 # assume any .eps files are part of the latex stuff
80 return '\.(?:eps)$';
81}
82
83
84sub process {
85 my $self = shift (@_);
86 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
87
88 my $start=substr($$textref, 0, 200); # first 200 bytes
89
90 if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
91 | chapter | contents | begin) ~x) {
92 # this doesn't look like latex...
93 return undef;
94 }
95 my $outhandle = $self->{'outhandle'};
96
97 my $cursection = $doc_obj->get_top_section();
98
99 ###### clean up text ######
100 $$textref =~ s/\r$//mg; # remove dos ^M
101 $$textref =~ s/%.*$//mg; # remove comments
102
103 # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
104
105 $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
106 # check that both sides are ascii, so we don't screw up utf-8 chars
107 $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
108 unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
109
110
111 ###### find metadata ######
112
113 ## FileFormat metadata ##
114 $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
115
116 ### title metadata ###
117 $$textref =~ m@\\title\s*{(.*?)}@s;
118 my $title = $1;
119 if (!$title) {
120 # no title tag. look for a chapter/section heading
121 $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
122 $title = $1;
123 }
124 if (!$title) {
125 # no chapter/section heading tags either... use filename
126 $title = $file;
127 $title =~ s/\.tex$//i;
128 $title =~ s/[-_.]/ /g; # turn punctuation into spaces
129 }
130 if ($title) {
131 $title =~ s@\\\\@ @g; # embedded newlines
132 $title = $self->process_latex($title); # no "-html" for title eg in browser
133 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
134 }
135
136 ### creator/individual author metadata ###
137 $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
138 my $authors=$1;
139 if ($authors) {
140 # take care of "et al."...
141 $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
142 my $etal=$1;
143 $etal="" if (!defined ($etal));
144
145 my @authorlist=parse_authors($self, $authors);
146
147 foreach my $author (@authorlist) {
148 # Add each name to set of Authors
149 $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
150 }
151
152 # Only want at most one "and" in the Creator field
153 my $creator_str="";
154 if (scalar(@authorlist) > 2) {
155 my $lastauthor=pop @authorlist;
156 $creator_str=join(', ', @authorlist);
157 $creator_str.=" and $lastauthor";
158 } else { # 1 or 2 authors...
159 $creator_str=join(" and ",@authorlist);
160 }
161 $creator_str.=$etal; # if there was "et al."
162 $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
163 }
164 ### end of author metadata ###
165
166 ###### process latex for the main text ######
167 $$textref =~ s/^.*?\\begin{document}//s;
168 $$textref =~ s/\\end{document}.*?$//s;
169 $$textref = $self->process_latex("-html",$$textref);
170 $doc_obj->add_utf8_text($cursection, $$textref);
171
172 return 1;
173}
174
175
176# returns a list of author names
177sub parse_authors {
178 my $self=shift;
179 my $authors=shift;
180
181 my $outhandle=$self->{'outhandle'};
182
183 $authors =~ s/\n/ /g; # remove newlines
184
185 # some people do this for affiliation footnote/dagger
186 $authors =~ s@\$.*?\$@@g; # remove maths from author :(
187
188 # und here for german language...
189 # don't use brackets in pattern, else the matched bit becomes
190 # an element in the list!
191 my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
192 my @formattedlist = ();
193 foreach my $author (@authorlist) {
194 $author =~ s/\s*$//;
195 $author =~ s/^\s*//;
196 # Reformat and add author name
197 next if $author=~ /^\s*$/;
198
199 # names are "First von Last", "von Last, First"
200 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
201 my $first="";
202 my $vonlast="";
203 my $jr="";
204
205 if ($author =~ /,/) {
206 my @parts=split(/,\s*/, $author);
207 $first = pop @parts;
208 if (scalar(@parts) == 2) {
209 $jr = pop @parts;
210 }
211 $vonlast=shift @parts;
212 if (scalar(@parts) > 0) {
213 print $outhandle $self->{'plugin_type'} .
214 ": couldn't parse name $author\n";
215 # but we continue anyway...
216 }
217 } else { # First von Last
218 my @words = split(/ /, $author);
219 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
220 $first .= " " . shift (@words);
221 }
222 $first =~ s/^\s//;
223 $vonlast = join (' ', @words); # whatever's left...
224 }
225 my $von="";
226 my $last="";
227 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
228 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
229 $von = $1;
230 if (!defined ($von)) {
231 # some non-English names do start with lowercase
232 # eg "Marie desJardins". Also we can get typos...
233 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
234 $von="";
235 if ($vonlast =~ /^[a-z]+$/) {
236 # if it's all lowercase, uppercase 1st.
237 $vonlast =~ s/^(.)/\u$1/;
238 }
239 }
240 $von =~ s/\s*$//;
241 $last=$vonlast;
242 } else {
243 $last=$vonlast;
244 }
245 my $wholename="$first $von $last $jr";
246 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
247# my $fullname = "$last";
248# $fullname .= " $jr" if ($jr);
249# $fullname .= ", $first";
250# $fullname .= " $von" if ($von);
251 push (@formattedlist, $wholename);
252 }
253 return @formattedlist;
254}
255
256
257## following functions based on bibtex plugin ##
258# not actually used at the moment, but might be useful in future?
259sub expand_month {
260 my $text=shift;
261
262 # bibtex style files expand abbreviations for months.
263 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
264 $text =~ s/jan/_textmonth01_/g;
265 $text =~ s/feb/_textmonth02_/g;
266 $text =~ s/mar/_textmonth03_/g;
267 $text =~ s/apr/_textmonth04_/g;
268 $text =~ s/may/_textmonth05_/g;
269 $text =~ s/jun/_textmonth06_/g;
270 $text =~ s/jul/_textmonth07_/g;
271 $text =~ s/aug/_textmonth08_/g;
272 $text =~ s/sep/_textmonth09_/g;
273 $text =~ s/oct/_textmonth10_/g;
274 $text =~ s/nov/_textmonth11_/g;
275 $text =~ s/dec/_textmonth12_/g;
276
277 return $text;
278}
279
280
281# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
282# as the first argument to this function.
283#
284# Convert accented characters, remove { }, interprete some commands....
285# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
286
287# Also, it sucks quite a bit for complicated/nested commands since it doesn't
288# match { with the corresponding }, only the nearest }
289
290sub process_latex {
291 my $self=shift;
292 my $text=shift;
293
294 my $outhandle=$self->{'outhandle'};
295
296 my $html_markup=0;
297 if ($text =~ /^\-html/) {
298 $html_markup=1;
299 $text=shift;
300 }
301
302 if (! $text) {
303 return $text;
304 }
305 # escape html-sensitive characters
306 $text =~ s@&@&amp;@g;
307 $text =~ s@<@&lt;@g;
308 $text =~ s@>@&gt;@g;
309
310 # do this before accents, since \= means something different in tabbing
311 # also \> is a tab stop too, and \\ is newline
312 sub do_tabbing {
313 my $tabbing=shift;
314 $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
315 $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
316 $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
317 return "<br/>" . $tabbing . "<br/>\n";
318 }
319 $text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
320 sub do_tabular {
321 my $tabular=shift;
322 $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
323 $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
324 $tabular =~ s~\\hline~~g; # for now...
325 $tabular =~ s~<td>\s*\\multicolumn{(\d+)}{.*?}~<td colspan="$1">~g;
326 return "<table border=\"1\">\n <tr><td>"
327 . $tabular . "</td></tr></table>\n";
328 }
329 $text =~ s@\\begin{tabular}(?:\[.*?\])?{.*?}(.*?)\\end{tabular} @
330 do_tabular($1) @xges;
331
332 $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
333
334 # process maths mode before accents... things like \, mean different!
335 # maths mode
336 $text =~ s@\$\$(.*?)\$\$
337 @ process_latex_math($html_markup,$1)
338 @xsge; # multi-line maths: $$ .... $$
339
340 $text =~ s@([^\\])\$(.*?[^\\])\$
341 @$1.process_latex_math($html_markup,$2)@xsge;
342
343
344 # is this an amstext environment, or just custom for that input file?
345 $text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
346
347 # convert latex-style accented characters.
348 $self->latex_accents_to_utf8(\$text);
349
350 # replace quotes with utf-8
351
352 $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (&ldquo;)
353 $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (&rdquo;)
354 $text =~ s/`/\xe2\xc0\x98/g; # single left quote
355 $text =~ s/'/\xe2\xc0\x99/g; # single right quote
356
357 ###### remove/replace latex commands ######
358 ### commands that expand to something that gets displayed ###
359 $text =~ s~\\ldots~&hellip;~g;
360 $text =~ s~\\hrule~<hr/>\n~g;
361 $text =~ s~\\maketitle~ ~;
362 ### space commands ###
363 $text =~ s~\\[vh]skip\s+\S+~~g;
364 $text =~ s~\\vspace\*?{.*?}~<div>&nbsp;</div>~g; # vertical space
365 $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
366 $text =~ s~\\noindent\b~~g;
367 # newpage, etc
368 $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
369 ### counters, contents, environments, labels, etc ###
370 $text =~ s~\\(?:addcontentsline){.*?}{.*?}{.*}~~g;
371 $text =~ s~\s*\\begin{itemize}\s*~\n<ul>\n~g;
372 $text =~ s~\s*\\end{itemize}\s*~</li></ul>\n~g;
373 $text =~ s~\s*\\begin{enumerate}\s*~<ol>\n~g;
374 $text =~ s~\s*\\end{enumerate}\s*~</li></ol>\n~g;
375 if ($text =~ s~\s*\\item~</li>\n<li>~g) {
376 # (count for first list item)
377 $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
378 }
379 $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
380 $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
381 ### font sizes/styles ###
382 $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
383
384 if ($html_markup) {
385 $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
386 $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
387 $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
388 $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
389 $text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
390 $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
391 $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
392 $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
393 } else {
394 # remove tags for text-only
395 $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
396 }
397 $text =~ s ~ {\\sc\s+(.*?)} ~
398 {<span style="font-variant:\ small-caps">$1</span>} ~gx;
399 # ignore these font tags (if there are any left)
400 # sf is sans-serif
401 $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
402 #### end font-related stuff ####
403
404 ### remove all other commands with optional arguments... ###
405 # don't remove commands without { }....
406 # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
407 # $text =~ s~\\noopsort{[^}]+\}~~g;
408 # verbatim
409 $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
410 # remove tags, keep contents for \tag[optional]{contents}
411 while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
412
413 # remove latex groupings { } (but not \{ or \} )
414 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
415 $text =~ s/^\{//; # remove { if first char
416
417 # latex characters
418 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
419 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
420 # optional break "\-"
421 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
422 # the non-macro bits have quotes around them - we just remove quotes
423 # XXX bibtex and latex differ here (for the '#' char)...
424 $text =~ s/([^&])[\"\#]/$1/g;
425 }
426 # dashes. Convert (m|n)-dash into single dash for html.
427 $text =~ s~\-\-+~\-~g;
428
429 # quoted { } chars
430 $text =~ s~\\{~{~g;
431 $text =~ s~\\}~}~g;
432
433 # spaces
434 $text =~ s~\\ ~ ~g;
435
436 # finally to protect against macro language...
437 # greenstone-specific
438 $text =~ s~\[~&\#91;~g;
439 $text =~ s~\]~&\#93;~g;
440 $text =~ s~(?<!\\)([\\_])~\\$1~g;
441
442 if ($html_markup) {
443 $text =~ s~\n{2,}~\n</p>\n<p>~g;
444 return "<p>$text</p>";
445 }
446
447 return $text;
448}
449
450# only used by process_latex for \verb....
451sub verb_text {
452 my $verbatim=shift;
453 $verbatim =~ s/([{}_])/\\$1/g;
454 return $verbatim;
455}
456# only used by process_latex_math
457# returns a unicode char if applicable, otherwise ascii
458sub math_fraction {
459 my $num=$1;
460 my $denom=$2;
461
462 if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
463 if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
464 if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
465 return "$num/$denom";
466}
467
468sub process_latex_math {
469
470 my $text = pop; # if given one or two args, this is the last one...
471 my $html_markup=pop; # if given two args, this is the first one else undef
472
473 $text =~ s~\\,~ ~g; # forces a space?
474 $text =~ s~\\infty~infinity~g; # or unicode 0x221E...
475
476# use this one when more things can read 3-byte utf8 values like this!
477# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
478 $text =~ s~\\cup\b~ U ~g;
479
480 $text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
481
482 if ($html_markup) {
483 $text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
484 $text =~ s~\^([^\{])~<sup>$1</sup>~g;
485 $text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
486 $text =~ s~\_([^\{])~<sub>$1</sub>~g;
487
488 $text =~ s~\\ldots~&hellip;~g; # use html named entity for now
489
490 # put all other command names in italics for now
491 $text =~ s~\\([\w]+)~<i>$1</i> ~g;
492 }
493
494 # special cases, for some input files
495 if ($text =~ m~^\\\w+$~) {
496 $text="{" . $text . "}";
497 }
498
499 return $text;
500}
501
502
503
504sub latex_accents_to_utf8 {
505
506 # note - this is really ugly, but it works. There may be a prettier way
507 # of mapping latex accented chars to utf8, but we just brute force it here.
508 # Also, this isn't complete - not every single possible accented letter
509 # is in here yet, but most of the common ones are.
510
511 my %utf8_chars =
512 (
513 # acutes
514 '\'a' => chr(0xc3).chr(0xa1),
515 '\'c' => chr(0xc4).chr(0x87),
516 '\'e' => chr(0xc3).chr(0xa9),
517 '\'i' => chr(0xc3).chr(0xad),
518 '\'l' => chr(0xc3).chr(0xba),
519 '\'n' => chr(0xc3).chr(0x84),
520 '\'o' => chr(0xc3).chr(0xb3),
521 '\'r' => chr(0xc5).chr(0x95),
522 '\'s' => chr(0xc5).chr(0x9b),
523 '\'u' => chr(0xc3).chr(0xba),
524 '\'y' => chr(0xc3).chr(0xbd),
525 '\'z' => chr(0xc5).chr(0xba),
526 # graves
527 '`a' => chr(0xc3).chr(0xa0),
528 '`A' => chr(0xc3).chr(0x80),
529 '`e' => chr(0xc3).chr(0xa8),
530 '`E' => chr(0xc3).chr(0x88),
531 '`i' => chr(0xc3).chr(0xac),
532 '`I' => chr(0xc3).chr(0x8c),
533 '`o' => chr(0xc3).chr(0xb2),
534 '`O' => chr(0xc3).chr(0x92),
535 '`u' => chr(0xc3).chr(0xb9),
536 '`U' => chr(0xc3).chr(0x99),
537 # circumflex
538 '^a' => chr(0xc3).chr(0xa2),
539 '^A' => chr(0xc3).chr(0x82),
540 '^c' => chr(0xc4).chr(0x89),
541 '^C' => chr(0xc4).chr(0x88),
542 '^e' => chr(0xc3).chr(0xaa),
543 '^E' => chr(0xc3).chr(0x8a),
544 '^g' => chr(0xc4).chr(0x9d),
545 '^G' => chr(0xc4).chr(0x9c),
546 '^h' => chr(0xc4).chr(0xa5),
547 '^H' => chr(0xc4).chr(0xa4),
548 '^i' => chr(0xc3).chr(0xae),
549 '^I' => chr(0xc3).chr(0x8e),
550 '^j' => chr(0xc4).chr(0xb5),
551 '^J' => chr(0xc4).chr(0xb4),
552 '^o' => chr(0xc3).chr(0xb4),
553 '^O' => chr(0xc3).chr(0x94),
554 '^s' => chr(0xc5).chr(0x9d),
555 '^S' => chr(0xc5).chr(0x9c),
556 '^u' => chr(0xc3).chr(0xa2),
557 '^U' => chr(0xc3).chr(0xbb),
558 '^w' => chr(0xc5).chr(0xb5),
559 '^W' => chr(0xc5).chr(0xb4),
560 '^y' => chr(0xc5).chr(0xb7),
561 '^Y' => chr(0xc5).chr(0xb6),
562
563 # diaeresis
564 '"a' => chr(0xc3).chr(0xa4),
565 '"A' => chr(0xc3).chr(0x84),
566 '"e' => chr(0xc3).chr(0xab),
567 '"E' => chr(0xc3).chr(0x8b),
568 '"i' => chr(0xc3).chr(0xaf),
569 '"I' => chr(0xc3).chr(0x8f),
570 '"\\\\i' => chr(0xc3).chr(0xaf),
571 '"\\\\I' => chr(0xc3).chr(0x8f),
572 '"o' => chr(0xc3).chr(0xb6),
573 '"O' => chr(0xc3).chr(0x96),
574 '"u' => chr(0xc3).chr(0xbc),
575 '"U' => chr(0xc3).chr(0x9c),
576 '"y' => chr(0xc3).chr(0xbf),
577 '"Y' => chr(0xc3).chr(0xb8),
578 # tilde
579 '~A' => chr(0xc3).chr(0x83),
580 '~N' => chr(0xc3).chr(0x91),
581 '~O' => chr(0xc3).chr(0x95),
582 '~a' => chr(0xc3).chr(0xa3),
583 '~n' => chr(0xc3).chr(0xb1),
584 '~o' => chr(0xc3).chr(0xb5),
585 # caron - handled specially
586 # double acute
587 # ring
588 # dot
589 '.c' => chr(0xc4).chr(0x8b),
590 '.C' => chr(0xc4).chr(0x8a),
591 '.e' => chr(0xc4).chr(0x97),
592 '.E' => chr(0xc4).chr(0x96),
593 '.g' => chr(0xc4).chr(0xa1),
594 '.G' => chr(0xc4).chr(0xa0),
595 '.I' => chr(0xc4).chr(0xb0),
596 '.z' => chr(0xc5).chr(0xbc),
597 '.Z' => chr(0xc5).chr(0xbb),
598 # macron
599 '=a' => chr(0xc4).chr(0x81),
600 '=A' => chr(0xc4).chr(0x80),
601 '=e' => chr(0xc4).chr(0x93),
602 '=E' => chr(0xc4).chr(0x92),
603 '=i' => chr(0xc4).chr(0xab),
604 '=I' => chr(0xc4).chr(0xaa),
605 '=o' => chr(0xc4).chr(0x8d),
606 '=O' => chr(0xc4).chr(0x8c),
607 '=u' => chr(0xc4).chr(0xab),
608 '=U' => chr(0xc4).chr(0xaa),
609
610 # stroke - handled specially - see below
611
612 # cedilla - handled specially
613 );
614
615# these are one letter latex commands - we make sure they're not a longer
616# command name. eg {\d} is d+stroke, so careful of \d
617 my %special_utf8_chars =
618 (
619 # breve
620 'u g' => chr(0xc4).chr(0x9f),
621 'u G' => chr(0xc4).chr(0x9e),
622 'u i' => chr(0xc4).chr(0xad),
623 'u I' => chr(0xc4).chr(0xac),
624 'u o' => chr(0xc5).chr(0x8f),
625 'u O' => chr(0xc5).chr(0x8e),
626 'u u' => chr(0xc5).chr(0xad),
627 'u U' => chr(0xc5).chr(0xac),
628 'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
629 'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
630 # caron
631 'v c' => chr(0xc4).chr(0x8d),
632 'v C' => chr(0xc4).chr(0x8c),
633 'v n' => chr(0xc5).chr(0x88),
634 'v N' => chr(0xc5).chr(0x87),
635 'v s' => chr(0xc5).chr(0xa1),
636 'v S' => chr(0xc5).chr(0xa5),
637 'v z' => chr(0xc5).chr(0xbe),
638 'v Z' => chr(0xc5).chr(0xbd),
639 # cedilla
640 'c c' => chr(0xc3).chr(0xa7),
641 'c C' => chr(0xc3).chr(0x87),
642 'c g' => chr(0xc4).chr(0xa3),
643 'c G' => chr(0xc4).chr(0xa2),
644 'c k' => chr(0xc4).chr(0xb7),
645 'c K' => chr(0xc4).chr(0xb6),
646 'c l' => chr(0xc4).chr(0xbc),
647 'c L' => chr(0xc4).chr(0xbb),
648 'c n' => chr(0xc5).chr(0x86),
649 'c N' => chr(0xc5).chr(0x85),
650 'c r' => chr(0xc5).chr(0x97),
651 'c R' => chr(0xc5).chr(0x96),
652 'c s' => chr(0xc5).chr(0x9f),
653 'c S' => chr(0xc5).chr(0x9e),
654 'c t' => chr(0xc5).chr(0xa3),
655 'c T' => chr(0xc5).chr(0xa2),
656 # double acute / Hungarian accent
657 'H O' => chr(0xc5).chr(0x90),
658 'H o' => chr(0xc5).chr(0x91),
659 'H U' => chr(0xc5).chr(0xb0),
660 'H u' => chr(0xc5).chr(0xb1),
661
662 # stroke
663 'd' => chr(0xc4).chr(0x91),
664 'D' => chr(0xc4).chr(0x90),
665 'h' => chr(0xc4).chr(0xa7),
666# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
667 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
668 'l' => chr(0xc5).chr(0x82),
669 'L' => chr(0xc5).chr(0x81),
670 'o' => chr(0xc3).chr(0xb8),
671 'O' => chr(0xc3).chr(0x98),
672 't' => chr(0xc5).chr(0xa7),
673 'T' => chr(0xc5).chr(0xa6),
674
675 # other special chars
676 'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
677 'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
678 );
679
680 my $self=shift;
681 my $textref=shift;
682
683 my $outhandle=$self->{'outhandle'};
684 my $text=$$textref;
685
686 # remove space (if any) between \ and letter to accent (eg {\' a})
687 $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
688
689 # remove {} around a single character (eg \'{e})
690 $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
691
692 ## only in bibtex... not in latex proper?!
693 ### \, is another way of doing cedilla \c
694 ##$text =~ s~\\,(.)~\\c $1~g;
695
696 # remove {} around a single character for special 1 letter commands -
697 # need to insert a space. Eg \v{s} -> {\v s}
698 $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
699
700 # only do if the text contains a '\' character.
701 if ($text =~ m|\\|) {
702 # "normal" accents - ie non-alpha latex tag
703 # xxx used to have ([\w]\b)@ (for word boundary)
704 while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
705 my $tex="$1$2"; my $char="$2";
706 my $replacement=$utf8_chars{$tex};
707 if (!defined($replacement)) {
708 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
709 print $outhandle . $self->{'plugin_type'} .
710 ": Warning: unknown latex accent \"$tex\""
711 . " in \"$1\"\n";
712 $replacement=$char;
713 }
714 $text =~ s/\\\Q$tex/$replacement/g;
715 }
716
717 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
718 # only do the change if immediately followed by a space, }, {, or \
719 # one letter accents ( + ss / aa)
720 while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
721 my $tex=$1;
722 my $replacement=$special_utf8_chars{$tex};
723 if (!defined($replacement)) {
724 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
725 print $outhandle $self->{'plugin_type'} .
726 ": Warning: unknown latex accent \"$tex\""
727 . " in \"$1\"\n";
728 $replacement=$tex;
729 }
730 ($text =~ s/{\\$tex}/$replacement/g) or
731 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
732
733 }
734
735 # one letter latex accent commands that affect following letter
736 while ($text =~ m~\\([uvcH]) ([\w])~) {
737 my $tex="$1 $2"; my $char="$2";
738 my $replacement=$special_utf8_chars{$tex};
739 if (!defined($replacement)) {
740 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
741 print $outhandle $self->{'plugin_type'} .
742 ": Warning: unknown latex accent \"$tex\""
743 . " in \"$1\"\n";
744 $replacement=$char;
745 }
746 $text =~ s/\\$tex/$replacement/g;
747 }
748 }
749 $textref=\$text;
750}
751
752
753# modules must return true
7541;
Note: See TracBrowser for help on using the repository browser.