root/main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm @ 23484

Revision 16104, 23.0 KB (checked in by kjdon, 12 years ago)

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# LaTeXPlugin.pm
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Written by John McPherson
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22###########################################################################
23
24# todo:
25#  \includegraphics
26#  parse/remove tex \if ... macros
27
28package LaTeXPlugin;
29
30# System complains about $arguments if the strict is set
31use strict;
32no strict 'refs'; # so we can print to a handle named by a variable
33
34# greenstone packages
35use ReadTextFile;
36use unicode;
37use util;
38
39my $arguments =
40    [ { 'name' => "process_exp",
41    'desc' => "{BasePlugin.process_exp}",
42    'type' => "regexp",
43    'reqd' => "no",
44    'deft' => &get_default_process_exp() } ];
45
46my $options = { 'name'     => "LaTeXPlugin",
47        'desc'     => "{LaTeXPlugin.desc}",
48        'abstract' => "no",
49        'inherits' => "yes",
50        'args'     => $arguments };
51
52sub BEGIN {
53    @LaTeXPlugin::ISA = ('ReadTextFile');
54}
55
56sub new {
57    my ($class) = shift (@_);
58    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
59    push(@$pluginlist, $class);
60
61    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
62    push(@{$hashArgOptLists->{"OptList"}},$options);
63
64    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
65
66    $self->{'aux_files'} = {};
67    $self->{'dir_num'} = 0;
68    $self->{'file_num'} = 0;
69    return bless $self, $class;
70}
71
72
73sub get_default_process_exp {
74    my $self = shift (@_);
75    return q^\.tex$^;
76}
77
78sub get_default_block_exp {
79    # assume any .eps files are part of the latex stuff
80    return '\.(?:eps)$';
81}
82
83
84sub process {
85    my $self = shift (@_);
86    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
87
88    my $start=substr($$textref, 0, 200); # first 200 bytes
89
90    if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
91            | chapter | contents | begin) ~x) {
92    # this doesn't look like latex...
93    return undef;
94    }
95    my $outhandle = $self->{'outhandle'};
96
97    my $cursection = $doc_obj->get_top_section();
98
99    ###### clean up text ######
100    $$textref =~ s/\r$//mg;  # remove dos ^M
101    $$textref =~ s/%.*$//mg; # remove comments
102
103    # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
104
105    $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
106    # check that both sides are ascii, so we don't screw up utf-8 chars
107    $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
108    unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
109
110
111    ###### find metadata ######
112
113    ## FileFormat metadata ##
114    $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
115
116    ### title metadata ###
117    $$textref =~ m@\\title\s*{(.*?)}@s;
118    my $title = $1;
119    if (!$title) {
120    # no title tag. look for a chapter/section heading
121    $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
122    $title = $1;
123    }
124    if (!$title) {
125    # no chapter/section heading tags either... use filename
126    $title = $file;
127    $title =~ s/\.tex$//i;
128    $title =~ s/[-_.]/ /g; # turn punctuation into spaces
129    }
130    if ($title) {
131    $title =~ s@\\\\@ @g; # embedded newlines
132    $title = $self->process_latex($title); # no "-html" for title eg in browser
133    $doc_obj->add_utf8_metadata($cursection, "Title", $title);
134    }
135
136    ### creator/individual author metadata ###
137    $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
138    my $authors=$1;
139    if ($authors) {
140    # take care of "et al."...
141    $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
142    my $etal=$1;
143    $etal="" if (!defined ($etal));
144
145    my @authorlist=parse_authors($self, $authors);
146
147    foreach my $author (@authorlist) {
148        # Add each name to set of Authors
149        $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
150    }
151
152    # Only want at most one "and" in the Creator field
153    my $creator_str="";
154    if (scalar(@authorlist) > 2) {
155        my $lastauthor=pop @authorlist;
156        $creator_str=join(', ', @authorlist);
157        $creator_str.=" and $lastauthor";
158    } else { # 1 or 2 authors...
159        $creator_str=join(" and ",@authorlist);
160    }
161    $creator_str.=$etal; # if there was "et al."
162        $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
163    }
164    ### end of author metadata ###
165
166    ###### process latex for the main text ######
167    $$textref =~ s/^.*?\\begin{document}//s;
168    $$textref =~ s/\\end{document}.*?$//s;
169    $$textref = $self->process_latex("-html",$$textref);
170    $doc_obj->add_utf8_text($cursection, $$textref);
171
172    return 1;
173}
174
175
176# returns a list of author names
177sub parse_authors {
178    my $self=shift;
179    my $authors=shift;
180
181    my $outhandle=$self->{'outhandle'};
182
183    $authors =~ s/\n/ /g; # remove newlines
184   
185    # some people do this for affiliation footnote/dagger
186    $authors =~ s@\$.*?\$@@g; # remove maths from author :(
187
188    # und here for german language...
189    # don't use brackets in pattern, else the matched bit becomes
190    # an element in the list!
191    my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
192    my @formattedlist = ();
193    foreach my $author (@authorlist) {
194    $author =~ s/\s*$//;
195    $author =~ s/^\s*//;
196    # Reformat and add author name
197    next if $author=~ /^\s*$/;
198
199    # names are "First von Last", "von Last, First"
200    # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
201    my $first="";
202    my $vonlast="";
203    my $jr="";
204       
205    if ($author =~ /,/) {
206        my @parts=split(/,\s*/, $author);
207        $first = pop @parts;
208        if (scalar(@parts) == 2) {
209        $jr = pop @parts;
210        }
211        $vonlast=shift @parts;
212        if (scalar(@parts) > 0) {
213        print $outhandle $self->{'plugin_type'} .
214            ": couldn't parse name $author\n";
215        # but we continue anyway...
216        }
217    } else { # First von Last
218        my @words = split(/ /, $author);
219        while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
220        $first .= " " . shift (@words);
221        }
222        $first =~ s/^\s//;
223        $vonlast = join (' ', @words); # whatever's left...
224    }
225    my $von="";
226    my $last="";
227    if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
228        $vonlast =~ s/^(([a-z]\w+\s+)+)//;
229        $von = $1;
230        if (!defined ($von)) {
231        # some non-English names do start with lowercase
232        # eg "Marie desJardins". Also we can get typos...
233        print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
234        $von="";
235        if ($vonlast =~ /^[a-z]+$/) {
236            # if it's all lowercase, uppercase 1st.
237            $vonlast =~ s/^(.)/\u$1/;
238        }
239        }
240        $von =~ s/\s*$//;
241        $last=$vonlast;
242    } else {
243        $last=$vonlast;
244    }
245    my $wholename="$first $von $last $jr";
246    $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
247#   my $fullname = "$last";
248#   $fullname .= " $jr" if ($jr);
249#   $fullname .= ", $first";
250#   $fullname .= " $von" if ($von);
251    push (@formattedlist, $wholename);
252    }
253    return @formattedlist;
254}
255
256
257## following functions based on bibtex plugin ##
258# not actually used at the moment, but might be useful in future?
259sub expand_month {
260    my $text=shift;
261
262    # bibtex style files expand abbreviations for months.
263    # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
264    $text =~ s/jan/_textmonth01_/g;
265    $text =~ s/feb/_textmonth02_/g;
266    $text =~ s/mar/_textmonth03_/g;
267    $text =~ s/apr/_textmonth04_/g;
268    $text =~ s/may/_textmonth05_/g;
269    $text =~ s/jun/_textmonth06_/g;
270    $text =~ s/jul/_textmonth07_/g;
271    $text =~ s/aug/_textmonth08_/g;
272    $text =~ s/sep/_textmonth09_/g;
273    $text =~ s/oct/_textmonth10_/g;
274    $text =~ s/nov/_textmonth11_/g;
275    $text =~ s/dec/_textmonth12_/g;
276
277    return $text;
278}
279
280
281# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
282# as the first argument to this function.
283#
284# Convert accented characters, remove { }, interprete some commands....
285# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
286
287# Also, it sucks quite a bit for complicated/nested commands since it doesn't
288# match { with the corresponding }, only the nearest }
289
290sub process_latex {
291    my $self=shift;
292    my $text=shift;
293
294    my $outhandle=$self->{'outhandle'};
295
296    my $html_markup=0;
297    if ($text =~ /^\-html/) {
298    $html_markup=1;
299    $text=shift;
300    }
301
302    if (! $text) {
303    return $text;
304    }
305    # escape html-sensitive characters
306    $text =~ s@&@&amp;@g;
307    $text =~ s@<@&lt;@g;
308    $text =~ s@>@&gt;@g;
309 
310    # do this before accents, since \= means something different in tabbing
311    # also \> is a tab stop too, and \\ is newline
312    sub do_tabbing {
313    my $tabbing=shift;
314    $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
315    $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
316    $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
317    return "<br/>" . $tabbing . "<br/>\n";
318    }
319    $text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
320    sub do_tabular {
321    my $tabular=shift;
322    $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
323    $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
324    $tabular =~ s~\\hline~~g; # for now...
325    $tabular =~ s~<td>\s*\\multicolumn{(\d+)}{.*?}~<td colspan="$1">~g;
326    return "<table border=\"1\">\n <tr><td>"
327        . $tabular . "</td></tr></table>\n";
328    }
329    $text =~ s@\\begin{tabular}(?:\[.*?\])?{.*?}(.*?)\\end{tabular} @
330            do_tabular($1)  @xges;
331
332    $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
333
334    # process maths mode before accents... things like \, mean different!
335    # maths mode
336    $text =~ s@\$\$(.*?)\$\$
337    @ process_latex_math($html_markup,$1)
338    @xsge; # multi-line maths: $$ .... $$
339
340    $text =~ s@([^\\])\$(.*?[^\\])\$
341    @$1.process_latex_math($html_markup,$2)@xsge;
342
343
344    # is this an amstext environment, or just custom for that input file?
345    $text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
346
347    # convert latex-style accented characters.
348    $self->latex_accents_to_utf8(\$text);
349
350    # replace quotes with utf-8
351
352    $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (&ldquo;)
353    $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (&rdquo;)
354    $text =~ s/`/\xe2\xc0\x98/g; # single left quote
355    $text =~ s/'/\xe2\xc0\x99/g; # single right quote
356
357    ###### remove/replace latex commands ######
358    ### commands that expand to something that gets displayed ###
359    $text =~ s~\\ldots~&hellip;~g;
360    $text =~ s~\\hrule~<hr/>\n~g;
361    $text =~ s~\\maketitle~ ~;
362    ### space commands ###
363    $text =~ s~\\[vh]skip\s+\S+~~g;
364    $text =~ s~\\vspace\*?{.*?}~<div>&nbsp;</div>~g; # vertical space
365    $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
366    $text =~ s~\\noindent\b~~g;
367    # newpage, etc
368    $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
369    ### counters, contents, environments, labels, etc ###
370    $text =~ s~\\(?:addcontentsline){.*?}{.*?}{.*}~~g;
371    $text =~ s~\s*\\begin{itemize}\s*~\n<ul>\n~g;
372    $text =~ s~\s*\\end{itemize}\s*~</li></ul>\n~g;
373    $text =~ s~\s*\\begin{enumerate}\s*~<ol>\n~g;
374    $text =~ s~\s*\\end{enumerate}\s*~</li></ol>\n~g;
375    if ($text =~ s~\s*\\item~</li>\n<li>~g) {
376    # (count for first list item)
377    $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
378    }
379    $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
380    $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
381    ### font sizes/styles ###
382    $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
383
384    if ($html_markup) {
385    $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
386    $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
387    $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
388    $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
389    $text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
390    $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
391    $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
392    $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
393    } else {
394    # remove tags for text-only
395    $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
396    }
397    $text =~ s ~ {\\sc\s+(.*?)} ~
398        {<span style="font-variant:\ small-caps">$1</span>} ~gx;
399    # ignore these font tags (if there are any left)
400    # sf is sans-serif
401    $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
402    #### end font-related stuff ####
403
404    ### remove all other commands with optional arguments... ###
405    # don't remove commands without { }....
406    # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
407    # $text =~ s~\\noopsort{[^}]+\}~~g;
408    # verbatim
409    $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
410    # remove tags, keep contents for \tag[optional]{contents}
411    while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
412   
413    # remove latex groupings { } (but not \{ or \} )
414    while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
415    $text =~ s/^\{//; # remove { if first char
416
417    # latex characters
418    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
419    $text =~ s/([^\\])~+/$1 /g; # non-breaking space  "~"
420    # optional break "\-"
421    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
422    # the non-macro bits have quotes around them - we just remove quotes
423    # XXX bibtex and latex differ here (for the '#' char)...
424    $text =~ s/([^&])[\"\#]/$1/g;
425    }
426    # dashes. Convert (m|n)-dash into single dash for html.
427    $text =~ s~\-\-+~\-~g;
428
429    # quoted { } chars
430    $text =~ s~\\{~{~g;
431    $text =~ s~\\}~}~g;
432
433    # spaces
434    $text =~ s~\\ ~ ~g;
435
436    # finally to protect against macro language...
437    # greenstone-specific
438    $text =~ s~\[~&\#91;~g;
439    $text =~ s~\]~&\#93;~g;
440    $text =~ s~(?<!\\)([\\_])~\\$1~g;
441
442    if ($html_markup) {
443    $text =~ s~\n{2,}~\n</p>\n<p>~g;
444    return "<p>$text</p>";
445    }
446
447    return $text;
448}
449
450# only used by process_latex for \verb....
451sub verb_text {
452    my $verbatim=shift;
453    $verbatim =~ s/([{}_])/\\$1/g;
454    return $verbatim;
455}
456# only used by process_latex_math
457# returns a unicode char if applicable, otherwise ascii
458sub math_fraction {
459    my $num=$1;
460    my $denom=$2;
461
462    if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
463    if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
464    if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
465    return "$num/$denom";
466}
467
468sub process_latex_math {
469
470    my $text = pop; # if given one or two args, this is the last one...
471    my $html_markup=pop; # if given two args, this is the first one else undef
472
473    $text =~ s~\\,~ ~g; # forces a space?
474    $text =~ s~\\infty~infinity~g;             # or unicode 0x221E...
475
476# use this one when more things can read 3-byte utf8 values like this!
477#    $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
478    $text =~ s~\\cup\b~ U ~g;
479
480    $text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
481
482    if ($html_markup) {
483    $text =~ s~\^{(.*?)}~<sup>$1</sup>~g;  # a^b superscript
484    $text =~ s~\^([^\{])~<sup>$1</sup>~g;
485    $text =~ s~\_{(.*?)}~<sub>$1</sub>~g;  # a_b subscript
486    $text =~ s~\_([^\{])~<sub>$1</sub>~g;
487   
488    $text =~ s~\\ldots~&hellip;~g;         # use html named entity for now
489
490    # put all other command names in italics for now
491    $text =~ s~\\([\w]+)~<i>$1</i> ~g;
492    }
493
494    # special cases, for some input files
495    if ($text =~ m~^\\\w+$~) {
496    $text="{" . $text . "}";
497    }
498
499    return $text;
500}
501
502
503
504sub latex_accents_to_utf8 {
505
506    # note - this is really ugly, but it works. There may be a prettier way
507    # of mapping latex accented chars to utf8, but we just brute force it here.
508    # Also, this isn't complete - not every single possible accented letter
509    # is in here yet, but most of the common ones are.
510
511    my %utf8_chars =
512    (
513     # acutes
514     '\'a' => chr(0xc3).chr(0xa1),
515     '\'c' => chr(0xc4).chr(0x87),
516     '\'e' => chr(0xc3).chr(0xa9),
517     '\'i' => chr(0xc3).chr(0xad),
518     '\'l' => chr(0xc3).chr(0xba),
519     '\'n' => chr(0xc3).chr(0x84),
520     '\'o' => chr(0xc3).chr(0xb3),
521     '\'r' => chr(0xc5).chr(0x95),
522     '\'s' => chr(0xc5).chr(0x9b),
523     '\'u' => chr(0xc3).chr(0xba),
524     '\'y' => chr(0xc3).chr(0xbd),
525     '\'z' => chr(0xc5).chr(0xba),
526     # graves
527     '`a' => chr(0xc3).chr(0xa0),
528     '`A' => chr(0xc3).chr(0x80),
529     '`e' => chr(0xc3).chr(0xa8),
530     '`E' => chr(0xc3).chr(0x88),
531     '`i' => chr(0xc3).chr(0xac),
532     '`I' => chr(0xc3).chr(0x8c),
533     '`o' => chr(0xc3).chr(0xb2),
534     '`O' => chr(0xc3).chr(0x92),
535     '`u' => chr(0xc3).chr(0xb9),
536     '`U' => chr(0xc3).chr(0x99),
537     # circumflex
538     '^a' => chr(0xc3).chr(0xa2),
539     '^A' => chr(0xc3).chr(0x82),
540     '^c' => chr(0xc4).chr(0x89),
541     '^C' => chr(0xc4).chr(0x88),
542     '^e' => chr(0xc3).chr(0xaa),
543     '^E' => chr(0xc3).chr(0x8a),
544     '^g' => chr(0xc4).chr(0x9d),
545     '^G' => chr(0xc4).chr(0x9c),
546     '^h' => chr(0xc4).chr(0xa5),
547     '^H' => chr(0xc4).chr(0xa4),
548     '^i' => chr(0xc3).chr(0xae),
549     '^I' => chr(0xc3).chr(0x8e),
550     '^j' => chr(0xc4).chr(0xb5),
551     '^J' => chr(0xc4).chr(0xb4),
552     '^o' => chr(0xc3).chr(0xb4),
553     '^O' => chr(0xc3).chr(0x94),
554     '^s' => chr(0xc5).chr(0x9d),
555     '^S' => chr(0xc5).chr(0x9c),
556     '^u' => chr(0xc3).chr(0xa2),
557     '^U' => chr(0xc3).chr(0xbb),
558     '^w' => chr(0xc5).chr(0xb5),
559     '^W' => chr(0xc5).chr(0xb4),
560     '^y' => chr(0xc5).chr(0xb7),
561     '^Y' => chr(0xc5).chr(0xb6),
562     
563     # diaeresis
564     '"a' => chr(0xc3).chr(0xa4),
565     '"A' => chr(0xc3).chr(0x84),
566     '"e' => chr(0xc3).chr(0xab),
567     '"E' => chr(0xc3).chr(0x8b),
568     '"i' => chr(0xc3).chr(0xaf),
569     '"I' => chr(0xc3).chr(0x8f),
570     '"\\\\i' => chr(0xc3).chr(0xaf),
571     '"\\\\I' => chr(0xc3).chr(0x8f),
572     '"o' => chr(0xc3).chr(0xb6),
573     '"O' => chr(0xc3).chr(0x96),
574     '"u' => chr(0xc3).chr(0xbc),
575     '"U' => chr(0xc3).chr(0x9c),
576     '"y' => chr(0xc3).chr(0xbf),
577     '"Y' => chr(0xc3).chr(0xb8),
578     # tilde
579     '~A' => chr(0xc3).chr(0x83),
580     '~N' => chr(0xc3).chr(0x91),
581     '~O' => chr(0xc3).chr(0x95),
582     '~a' => chr(0xc3).chr(0xa3),
583     '~n' => chr(0xc3).chr(0xb1),
584     '~o' => chr(0xc3).chr(0xb5),
585     # caron - handled specially
586     # double acute
587     # ring
588     # dot
589     '.c' => chr(0xc4).chr(0x8b),
590     '.C' => chr(0xc4).chr(0x8a),
591     '.e' => chr(0xc4).chr(0x97),
592     '.E' => chr(0xc4).chr(0x96),
593     '.g' => chr(0xc4).chr(0xa1),
594     '.G' => chr(0xc4).chr(0xa0),
595     '.I' => chr(0xc4).chr(0xb0),
596     '.z' => chr(0xc5).chr(0xbc),
597     '.Z' => chr(0xc5).chr(0xbb),
598     # macron
599     '=a' => chr(0xc4).chr(0x81),
600     '=A' => chr(0xc4).chr(0x80),
601     '=e' => chr(0xc4).chr(0x93),
602     '=E' => chr(0xc4).chr(0x92),
603     '=i' => chr(0xc4).chr(0xab),
604     '=I' => chr(0xc4).chr(0xaa),
605     '=o' => chr(0xc4).chr(0x8d),
606     '=O' => chr(0xc4).chr(0x8c),
607     '=u' => chr(0xc4).chr(0xab),
608     '=U' => chr(0xc4).chr(0xaa),
609     
610     # stroke - handled specially - see below
611     
612     # cedilla - handled specially
613     );
614   
615# these are one letter latex commands - we make sure they're not a longer
616# command name. eg {\d} is d+stroke, so careful of \d
617    my %special_utf8_chars =
618    (
619     # breve
620     'u g' => chr(0xc4).chr(0x9f),
621     'u G' => chr(0xc4).chr(0x9e),
622     'u i' => chr(0xc4).chr(0xad),
623     'u I' => chr(0xc4).chr(0xac),
624     'u o' => chr(0xc5).chr(0x8f),
625     'u O' => chr(0xc5).chr(0x8e),
626     'u u' => chr(0xc5).chr(0xad),
627     'u U' => chr(0xc5).chr(0xac),
628     'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
629     'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
630     # caron
631     'v c' => chr(0xc4).chr(0x8d),
632     'v C' => chr(0xc4).chr(0x8c),
633     'v n' => chr(0xc5).chr(0x88),
634     'v N' => chr(0xc5).chr(0x87),
635     'v s' => chr(0xc5).chr(0xa1),
636     'v S' => chr(0xc5).chr(0xa5),
637     'v z' => chr(0xc5).chr(0xbe),
638     'v Z' => chr(0xc5).chr(0xbd),
639     # cedilla
640     'c c' => chr(0xc3).chr(0xa7),
641     'c C' => chr(0xc3).chr(0x87),
642     'c g' => chr(0xc4).chr(0xa3),
643     'c G' => chr(0xc4).chr(0xa2),
644     'c k' => chr(0xc4).chr(0xb7),
645     'c K' => chr(0xc4).chr(0xb6),
646     'c l' => chr(0xc4).chr(0xbc),
647     'c L' => chr(0xc4).chr(0xbb),
648     'c n' => chr(0xc5).chr(0x86),
649     'c N' => chr(0xc5).chr(0x85),
650     'c r' => chr(0xc5).chr(0x97),
651     'c R' => chr(0xc5).chr(0x96),
652     'c s' => chr(0xc5).chr(0x9f),
653     'c S' => chr(0xc5).chr(0x9e),
654     'c t' => chr(0xc5).chr(0xa3),
655     'c T' => chr(0xc5).chr(0xa2),
656     # double acute / Hungarian accent
657     'H O' => chr(0xc5).chr(0x90),
658     'H o' => chr(0xc5).chr(0x91),
659     'H U' => chr(0xc5).chr(0xb0),
660     'H u' => chr(0xc5).chr(0xb1),
661     
662     # stroke
663     'd' => chr(0xc4).chr(0x91),
664     'D' => chr(0xc4).chr(0x90),
665     'h' => chr(0xc4).chr(0xa7),
666#    'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
667     'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
668     'l' => chr(0xc5).chr(0x82),
669     'L' => chr(0xc5).chr(0x81),
670     'o' => chr(0xc3).chr(0xb8),
671     'O' => chr(0xc3).chr(0x98),
672     't' => chr(0xc5).chr(0xa7),
673     'T' => chr(0xc5).chr(0xa6),
674
675         # other special chars
676         'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
677         'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
678     );
679
680    my $self=shift;
681    my $textref=shift;
682
683    my $outhandle=$self->{'outhandle'};
684    my $text=$$textref;
685   
686    # remove space (if any) between \ and letter to accent (eg {\' a})
687    $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g;   # for emacs indenting... `]);
688
689    # remove {} around a single character (eg \'{e})
690    $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g;  # for emacs indenting... `]);
691
692    ## only in bibtex... not in latex proper?!
693    ### \, is another way of doing cedilla \c
694    ##$text =~ s~\\,(.)~\\c $1~g;
695
696    # remove {} around a single character for special 1 letter commands -
697    # need to insert a space. Eg \v{s}  ->  {\v s}
698    $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
699
700    # only do if the text contains a '\' character.
701    if ($text =~ m|\\|) {
702    # "normal" accents - ie non-alpha latex tag
703    # xxx used to have ([\w]\b)@ (for word boundary)
704    while ($text =~ m/\\([`'="^~\.])([\w])/) {      # for emacs `])){
705        my $tex="$1$2"; my $char="$2";
706        my $replacement=$utf8_chars{$tex};
707        if (!defined($replacement)) {
708        $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
709        print $outhandle . $self->{'plugin_type'} .
710            ": Warning: unknown latex accent \"$tex\""
711            . " in \"$1\"\n";
712        $replacement=$char;
713        }
714        $text =~ s/\\\Q$tex/$replacement/g;
715    }
716
717        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
718        # only do the change if immediately followed by a space, }, {, or \
719    # one letter accents ( + ss / aa)
720        while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
721        my $tex=$1;
722        my $replacement=$special_utf8_chars{$tex};
723        if (!defined($replacement)) {
724        $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
725        print $outhandle $self->{'plugin_type'} .
726            ": Warning: unknown latex accent \"$tex\""
727            . " in \"$1\"\n";
728        $replacement=$tex;
729        }
730        ($text =~ s/{\\$tex}/$replacement/g) or
731        $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
732
733    }
734
735    # one letter latex accent commands that affect following letter
736        while ($text =~ m~\\([uvcH]) ([\w])~) {
737          my $tex="$1 $2"; my $char="$2";
738          my $replacement=$special_utf8_chars{$tex};
739          if (!defined($replacement)) {
740          $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
741          print  $outhandle $self->{'plugin_type'} .
742          ": Warning: unknown latex accent \"$tex\""
743          . " in \"$1\"\n";
744          $replacement=$char;
745      }
746          $text =~ s/\\$tex/$replacement/g;
747        }
748    }
749    $textref=\$text;
750}
751
752
753# modules must return true
7541;
Note: See TracBrowser for help on using the browser.