source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm@ 28836

Last change on this file since 28836 was 24548, checked in by ak19, 13 years ago

Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead? that defines can_process_this_file_for_metadata that MetadataPlugin? subclasses can inherit (if MetadataRead? is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin?. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead?. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
Line 
1###########################################################################
2#
3# LaTeXPlugin.pm
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Written by John McPherson
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22###########################################################################
23
24# todo:
25# \includegraphics
26# parse/remove tex \if ... macros
27
28package LaTeXPlugin;
29
30# System complains about $arguments if the strict is set
31use strict;
32no strict 'refs'; # so we can print to a handle named by a variable
33
34# greenstone packages
35use ReadTextFile;
36use MetadataRead;
37use unicode;
38use util;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'reqd' => "no",
45 'deft' => &get_default_process_exp() } ];
46
47my $options = { 'name' => "LaTeXPlugin",
48 'desc' => "{LaTeXPlugin.desc}",
49 'abstract' => "no",
50 'inherits' => "yes",
51 'args' => $arguments };
52
53sub BEGIN {
54 @LaTeXPlugin::ISA = ('MetadataRead', 'ReadTextFile');
55}
56
57sub new {
58 my ($class) = shift (@_);
59 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
60 push(@$pluginlist, $class);
61
62 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
63 push(@{$hashArgOptLists->{"OptList"}},$options);
64
65 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
66
67 $self->{'aux_files'} = {};
68 $self->{'dir_num'} = 0;
69 $self->{'file_num'} = 0;
70 return bless $self, $class;
71}
72
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76 return q^\.tex$^;
77}
78
79sub get_default_block_exp {
80 # assume any .eps files are part of the latex stuff
81 return '\.(?:eps)$';
82}
83
84
85sub process {
86 my $self = shift (@_);
87 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
88
89 my $start=substr($$textref, 0, 200); # first 200 bytes
90
91 if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
92 | chapter | contents | begin) ~x) {
93 # this doesn't look like latex...
94 return undef;
95 }
96 my $outhandle = $self->{'outhandle'};
97
98 my $cursection = $doc_obj->get_top_section();
99
100 ###### clean up text ######
101 $$textref =~ s/\r$//mg; # remove dos ^M
102 $$textref =~ s/%.*$//mg; # remove comments
103
104 # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
105
106 $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
107 # check that both sides are ascii, so we don't screw up utf-8 chars
108 $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
109 unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
110
111
112 ###### find metadata ######
113
114 ## FileFormat metadata ##
115 $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
116
117 ### title metadata ###
118 $$textref =~ m@\\title\s*{(.*?)}@s;
119 my $title = $1;
120 if (!$title) {
121 # no title tag. look for a chapter/section heading
122 $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
123 $title = $1;
124 }
125 if (!$title) {
126 # no chapter/section heading tags either... use filename
127 $title = $file;
128 $title =~ s/\.tex$//i;
129 $title =~ s/[-_.]/ /g; # turn punctuation into spaces
130 }
131 if ($title) {
132 $title =~ s@\\\\@ @g; # embedded newlines
133 $title = $self->process_latex($title); # no "-html" for title eg in browser
134 $doc_obj->add_utf8_metadata($cursection, "Title", $title);
135 }
136
137 ### creator/individual author metadata ###
138 $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
139 my $authors=$1;
140 if ($authors) {
141 # take care of "et al."...
142 $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
143 my $etal=$1;
144 $etal="" if (!defined ($etal));
145
146 my @authorlist=parse_authors($self, $authors);
147
148 foreach my $author (@authorlist) {
149 # Add each name to set of Authors
150 $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
151 }
152
153 # Only want at most one "and" in the Creator field
154 my $creator_str="";
155 if (scalar(@authorlist) > 2) {
156 my $lastauthor=pop @authorlist;
157 $creator_str=join(', ', @authorlist);
158 $creator_str.=" and $lastauthor";
159 } else { # 1 or 2 authors...
160 $creator_str=join(" and ",@authorlist);
161 }
162 $creator_str.=$etal; # if there was "et al."
163 $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
164 }
165 ### end of author metadata ###
166
167 ###### process latex for the main text ######
168 $$textref =~ s/^.*?\\begin{document}//s;
169 $$textref =~ s/\\end{document}.*?$//s;
170 $$textref = $self->process_latex("-html",$$textref);
171 $doc_obj->add_utf8_text($cursection, $$textref);
172
173 return 1;
174}
175
176
177# returns a list of author names
178sub parse_authors {
179 my $self=shift;
180 my $authors=shift;
181
182 my $outhandle=$self->{'outhandle'};
183
184 $authors =~ s/\n/ /g; # remove newlines
185
186 # some people do this for affiliation footnote/dagger
187 $authors =~ s@\$.*?\$@@g; # remove maths from author :(
188
189 # und here for german language...
190 # don't use brackets in pattern, else the matched bit becomes
191 # an element in the list!
192 my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
193 my @formattedlist = ();
194 foreach my $author (@authorlist) {
195 $author =~ s/\s*$//;
196 $author =~ s/^\s*//;
197 # Reformat and add author name
198 next if $author=~ /^\s*$/;
199
200 # names are "First von Last", "von Last, First"
201 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
202 my $first="";
203 my $vonlast="";
204 my $jr="";
205
206 if ($author =~ /,/) {
207 my @parts=split(/,\s*/, $author);
208 $first = pop @parts;
209 if (scalar(@parts) == 2) {
210 $jr = pop @parts;
211 }
212 $vonlast=shift @parts;
213 if (scalar(@parts) > 0) {
214 print $outhandle $self->{'plugin_type'} .
215 ": couldn't parse name $author\n";
216 # but we continue anyway...
217 }
218 } else { # First von Last
219 my @words = split(/ /, $author);
220 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
221 $first .= " " . shift (@words);
222 }
223 $first =~ s/^\s//;
224 $vonlast = join (' ', @words); # whatever's left...
225 }
226 my $von="";
227 my $last="";
228 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
229 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
230 $von = $1;
231 if (!defined ($von)) {
232 # some non-English names do start with lowercase
233 # eg "Marie desJardins". Also we can get typos...
234 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
235 $von="";
236 if ($vonlast =~ /^[a-z]+$/) {
237 # if it's all lowercase, uppercase 1st.
238 $vonlast =~ s/^(.)/\u$1/;
239 }
240 }
241 $von =~ s/\s*$//;
242 $last=$vonlast;
243 } else {
244 $last=$vonlast;
245 }
246 my $wholename="$first $von $last $jr";
247 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
248# my $fullname = "$last";
249# $fullname .= " $jr" if ($jr);
250# $fullname .= ", $first";
251# $fullname .= " $von" if ($von);
252 push (@formattedlist, $wholename);
253 }
254 return @formattedlist;
255}
256
257
258## following functions based on bibtex plugin ##
259# not actually used at the moment, but might be useful in future?
260sub expand_month {
261 my $text=shift;
262
263 # bibtex style files expand abbreviations for months.
264 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
265 $text =~ s/jan/_textmonth01_/g;
266 $text =~ s/feb/_textmonth02_/g;
267 $text =~ s/mar/_textmonth03_/g;
268 $text =~ s/apr/_textmonth04_/g;
269 $text =~ s/may/_textmonth05_/g;
270 $text =~ s/jun/_textmonth06_/g;
271 $text =~ s/jul/_textmonth07_/g;
272 $text =~ s/aug/_textmonth08_/g;
273 $text =~ s/sep/_textmonth09_/g;
274 $text =~ s/oct/_textmonth10_/g;
275 $text =~ s/nov/_textmonth11_/g;
276 $text =~ s/dec/_textmonth12_/g;
277
278 return $text;
279}
280
281
282# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
283# as the first argument to this function.
284#
285# Convert accented characters, remove { }, interprete some commands....
286# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
287
288# Also, it sucks quite a bit for complicated/nested commands since it doesn't
289# match { with the corresponding }, only the nearest }
290
291sub process_latex {
292 my $self=shift;
293 my $text=shift;
294
295 my $outhandle=$self->{'outhandle'};
296
297 my $html_markup=0;
298 if ($text =~ /^\-html/) {
299 $html_markup=1;
300 $text=shift;
301 }
302
303 if (! $text) {
304 return $text;
305 }
306 # escape html-sensitive characters
307 $text =~ s@&@&amp;@g;
308 $text =~ s@<@&lt;@g;
309 $text =~ s@>@&gt;@g;
310
311 # do this before accents, since \= means something different in tabbing
312 # also \> is a tab stop too, and \\ is newline
313 sub do_tabbing {
314 my $tabbing=shift;
315 $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
316 $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
317 $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
318 return "<br/>" . $tabbing . "<br/>\n";
319 }
320 $text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
321 sub do_tabular {
322 my $tabular=shift;
323 $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
324 $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
325 $tabular =~ s~\\hline~~g; # for now...
326 $tabular =~ s~<td>\s*\\multicolumn{(\d+)}{.*?}~<td colspan="$1">~g;
327 return "<table border=\"1\">\n <tr><td>"
328 . $tabular . "</td></tr></table>\n";
329 }
330 $text =~ s@\\begin{tabular}(?:\[.*?\])?{.*?}(.*?)\\end{tabular} @
331 do_tabular($1) @xges;
332
333 $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
334
335 # process maths mode before accents... things like \, mean different!
336 # maths mode
337 $text =~ s@\$\$(.*?)\$\$
338 @ process_latex_math($html_markup,$1)
339 @xsge; # multi-line maths: $$ .... $$
340
341 $text =~ s@([^\\])\$(.*?[^\\])\$
342 @$1.process_latex_math($html_markup,$2)@xsge;
343
344
345 # is this an amstext environment, or just custom for that input file?
346 $text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
347
348 # convert latex-style accented characters.
349 $self->latex_accents_to_utf8(\$text);
350
351 # replace quotes with utf-8
352
353 $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (&ldquo;)
354 $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (&rdquo;)
355 $text =~ s/`/\xe2\xc0\x98/g; # single left quote
356 $text =~ s/'/\xe2\xc0\x99/g; # single right quote
357
358 ###### remove/replace latex commands ######
359 ### commands that expand to something that gets displayed ###
360 $text =~ s~\\ldots~&hellip;~g;
361 $text =~ s~\\hrule~<hr/>\n~g;
362 $text =~ s~\\maketitle~ ~;
363 ### space commands ###
364 $text =~ s~\\[vh]skip\s+\S+~~g;
365 $text =~ s~\\vspace\*?{.*?}~<div>&nbsp;</div>~g; # vertical space
366 $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
367 $text =~ s~\\noindent\b~~g;
368 # newpage, etc
369 $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
370 ### counters, contents, environments, labels, etc ###
371 $text =~ s~\\(?:addcontentsline){.*?}{.*?}{.*}~~g;
372 $text =~ s~\s*\\begin{itemize}\s*~\n<ul>\n~g;
373 $text =~ s~\s*\\end{itemize}\s*~</li></ul>\n~g;
374 $text =~ s~\s*\\begin{enumerate}\s*~<ol>\n~g;
375 $text =~ s~\s*\\end{enumerate}\s*~</li></ol>\n~g;
376 if ($text =~ s~\s*\\item~</li>\n<li>~g) {
377 # (count for first list item)
378 $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
379 }
380 $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
381 $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
382 ### font sizes/styles ###
383 $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
384
385 if ($html_markup) {
386 $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
387 $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
388 $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
389 $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
390 $text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
391 $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
392 $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
393 $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
394 } else {
395 # remove tags for text-only
396 $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
397 }
398 $text =~ s ~ {\\sc\s+(.*?)} ~
399 {<span style="font-variant:\ small-caps">$1</span>} ~gx;
400 # ignore these font tags (if there are any left)
401 # sf is sans-serif
402 $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
403 #### end font-related stuff ####
404
405 ### remove all other commands with optional arguments... ###
406 # don't remove commands without { }....
407 # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
408 # $text =~ s~\\noopsort{[^}]+\}~~g;
409 # verbatim
410 $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
411 # remove tags, keep contents for \tag[optional]{contents}
412 while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
413
414 # remove latex groupings { } (but not \{ or \} )
415 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
416 $text =~ s/^\{//; # remove { if first char
417
418 # latex characters
419 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
420 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
421 # optional break "\-"
422 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
423 # the non-macro bits have quotes around them - we just remove quotes
424 # XXX bibtex and latex differ here (for the '#' char)...
425 $text =~ s/([^&])[\"\#]/$1/g;
426 }
427 # dashes. Convert (m|n)-dash into single dash for html.
428 $text =~ s~\-\-+~\-~g;
429
430 # quoted { } chars
431 $text =~ s~\\{~{~g;
432 $text =~ s~\\}~}~g;
433
434 # spaces
435 $text =~ s~\\ ~ ~g;
436
437 # finally to protect against macro language...
438 # greenstone-specific
439 $text =~ s~\[~&\#91;~g;
440 $text =~ s~\]~&\#93;~g;
441 $text =~ s~(?<!\\)([\\_])~\\$1~g;
442
443 if ($html_markup) {
444 $text =~ s~\n{2,}~\n</p>\n<p>~g;
445 return "<p>$text</p>";
446 }
447
448 return $text;
449}
450
451# only used by process_latex for \verb....
452sub verb_text {
453 my $verbatim=shift;
454 $verbatim =~ s/([{}_])/\\$1/g;
455 return $verbatim;
456}
457# only used by process_latex_math
458# returns a unicode char if applicable, otherwise ascii
459sub math_fraction {
460 my $num=$1;
461 my $denom=$2;
462
463 if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
464 if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
465 if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
466 return "$num/$denom";
467}
468
469sub process_latex_math {
470
471 my $text = pop; # if given one or two args, this is the last one...
472 my $html_markup=pop; # if given two args, this is the first one else undef
473
474 $text =~ s~\\,~ ~g; # forces a space?
475 $text =~ s~\\infty~infinity~g; # or unicode 0x221E...
476
477# use this one when more things can read 3-byte utf8 values like this!
478# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
479 $text =~ s~\\cup\b~ U ~g;
480
481 $text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
482
483 if ($html_markup) {
484 $text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
485 $text =~ s~\^([^\{])~<sup>$1</sup>~g;
486 $text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
487 $text =~ s~\_([^\{])~<sub>$1</sub>~g;
488
489 $text =~ s~\\ldots~&hellip;~g; # use html named entity for now
490
491 # put all other command names in italics for now
492 $text =~ s~\\([\w]+)~<i>$1</i> ~g;
493 }
494
495 # special cases, for some input files
496 if ($text =~ m~^\\\w+$~) {
497 $text="{" . $text . "}";
498 }
499
500 return $text;
501}
502
503
504
505sub latex_accents_to_utf8 {
506
507 # note - this is really ugly, but it works. There may be a prettier way
508 # of mapping latex accented chars to utf8, but we just brute force it here.
509 # Also, this isn't complete - not every single possible accented letter
510 # is in here yet, but most of the common ones are.
511
512 my %utf8_chars =
513 (
514 # acutes
515 '\'a' => chr(0xc3).chr(0xa1),
516 '\'c' => chr(0xc4).chr(0x87),
517 '\'e' => chr(0xc3).chr(0xa9),
518 '\'i' => chr(0xc3).chr(0xad),
519 '\'l' => chr(0xc3).chr(0xba),
520 '\'n' => chr(0xc3).chr(0x84),
521 '\'o' => chr(0xc3).chr(0xb3),
522 '\'r' => chr(0xc5).chr(0x95),
523 '\'s' => chr(0xc5).chr(0x9b),
524 '\'u' => chr(0xc3).chr(0xba),
525 '\'y' => chr(0xc3).chr(0xbd),
526 '\'z' => chr(0xc5).chr(0xba),
527 # graves
528 '`a' => chr(0xc3).chr(0xa0),
529 '`A' => chr(0xc3).chr(0x80),
530 '`e' => chr(0xc3).chr(0xa8),
531 '`E' => chr(0xc3).chr(0x88),
532 '`i' => chr(0xc3).chr(0xac),
533 '`I' => chr(0xc3).chr(0x8c),
534 '`o' => chr(0xc3).chr(0xb2),
535 '`O' => chr(0xc3).chr(0x92),
536 '`u' => chr(0xc3).chr(0xb9),
537 '`U' => chr(0xc3).chr(0x99),
538 # circumflex
539 '^a' => chr(0xc3).chr(0xa2),
540 '^A' => chr(0xc3).chr(0x82),
541 '^c' => chr(0xc4).chr(0x89),
542 '^C' => chr(0xc4).chr(0x88),
543 '^e' => chr(0xc3).chr(0xaa),
544 '^E' => chr(0xc3).chr(0x8a),
545 '^g' => chr(0xc4).chr(0x9d),
546 '^G' => chr(0xc4).chr(0x9c),
547 '^h' => chr(0xc4).chr(0xa5),
548 '^H' => chr(0xc4).chr(0xa4),
549 '^i' => chr(0xc3).chr(0xae),
550 '^I' => chr(0xc3).chr(0x8e),
551 '^j' => chr(0xc4).chr(0xb5),
552 '^J' => chr(0xc4).chr(0xb4),
553 '^o' => chr(0xc3).chr(0xb4),
554 '^O' => chr(0xc3).chr(0x94),
555 '^s' => chr(0xc5).chr(0x9d),
556 '^S' => chr(0xc5).chr(0x9c),
557 '^u' => chr(0xc3).chr(0xa2),
558 '^U' => chr(0xc3).chr(0xbb),
559 '^w' => chr(0xc5).chr(0xb5),
560 '^W' => chr(0xc5).chr(0xb4),
561 '^y' => chr(0xc5).chr(0xb7),
562 '^Y' => chr(0xc5).chr(0xb6),
563
564 # diaeresis
565 '"a' => chr(0xc3).chr(0xa4),
566 '"A' => chr(0xc3).chr(0x84),
567 '"e' => chr(0xc3).chr(0xab),
568 '"E' => chr(0xc3).chr(0x8b),
569 '"i' => chr(0xc3).chr(0xaf),
570 '"I' => chr(0xc3).chr(0x8f),
571 '"\\\\i' => chr(0xc3).chr(0xaf),
572 '"\\\\I' => chr(0xc3).chr(0x8f),
573 '"o' => chr(0xc3).chr(0xb6),
574 '"O' => chr(0xc3).chr(0x96),
575 '"u' => chr(0xc3).chr(0xbc),
576 '"U' => chr(0xc3).chr(0x9c),
577 '"y' => chr(0xc3).chr(0xbf),
578 '"Y' => chr(0xc3).chr(0xb8),
579 # tilde
580 '~A' => chr(0xc3).chr(0x83),
581 '~N' => chr(0xc3).chr(0x91),
582 '~O' => chr(0xc3).chr(0x95),
583 '~a' => chr(0xc3).chr(0xa3),
584 '~n' => chr(0xc3).chr(0xb1),
585 '~o' => chr(0xc3).chr(0xb5),
586 # caron - handled specially
587 # double acute
588 # ring
589 # dot
590 '.c' => chr(0xc4).chr(0x8b),
591 '.C' => chr(0xc4).chr(0x8a),
592 '.e' => chr(0xc4).chr(0x97),
593 '.E' => chr(0xc4).chr(0x96),
594 '.g' => chr(0xc4).chr(0xa1),
595 '.G' => chr(0xc4).chr(0xa0),
596 '.I' => chr(0xc4).chr(0xb0),
597 '.z' => chr(0xc5).chr(0xbc),
598 '.Z' => chr(0xc5).chr(0xbb),
599 # macron
600 '=a' => chr(0xc4).chr(0x81),
601 '=A' => chr(0xc4).chr(0x80),
602 '=e' => chr(0xc4).chr(0x93),
603 '=E' => chr(0xc4).chr(0x92),
604 '=i' => chr(0xc4).chr(0xab),
605 '=I' => chr(0xc4).chr(0xaa),
606 '=o' => chr(0xc4).chr(0x8d),
607 '=O' => chr(0xc4).chr(0x8c),
608 '=u' => chr(0xc4).chr(0xab),
609 '=U' => chr(0xc4).chr(0xaa),
610
611 # stroke - handled specially - see below
612
613 # cedilla - handled specially
614 );
615
616# these are one letter latex commands - we make sure they're not a longer
617# command name. eg {\d} is d+stroke, so careful of \d
618 my %special_utf8_chars =
619 (
620 # breve
621 'u g' => chr(0xc4).chr(0x9f),
622 'u G' => chr(0xc4).chr(0x9e),
623 'u i' => chr(0xc4).chr(0xad),
624 'u I' => chr(0xc4).chr(0xac),
625 'u o' => chr(0xc5).chr(0x8f),
626 'u O' => chr(0xc5).chr(0x8e),
627 'u u' => chr(0xc5).chr(0xad),
628 'u U' => chr(0xc5).chr(0xac),
629 'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
630 'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
631 # caron
632 'v c' => chr(0xc4).chr(0x8d),
633 'v C' => chr(0xc4).chr(0x8c),
634 'v n' => chr(0xc5).chr(0x88),
635 'v N' => chr(0xc5).chr(0x87),
636 'v s' => chr(0xc5).chr(0xa1),
637 'v S' => chr(0xc5).chr(0xa5),
638 'v z' => chr(0xc5).chr(0xbe),
639 'v Z' => chr(0xc5).chr(0xbd),
640 # cedilla
641 'c c' => chr(0xc3).chr(0xa7),
642 'c C' => chr(0xc3).chr(0x87),
643 'c g' => chr(0xc4).chr(0xa3),
644 'c G' => chr(0xc4).chr(0xa2),
645 'c k' => chr(0xc4).chr(0xb7),
646 'c K' => chr(0xc4).chr(0xb6),
647 'c l' => chr(0xc4).chr(0xbc),
648 'c L' => chr(0xc4).chr(0xbb),
649 'c n' => chr(0xc5).chr(0x86),
650 'c N' => chr(0xc5).chr(0x85),
651 'c r' => chr(0xc5).chr(0x97),
652 'c R' => chr(0xc5).chr(0x96),
653 'c s' => chr(0xc5).chr(0x9f),
654 'c S' => chr(0xc5).chr(0x9e),
655 'c t' => chr(0xc5).chr(0xa3),
656 'c T' => chr(0xc5).chr(0xa2),
657 # double acute / Hungarian accent
658 'H O' => chr(0xc5).chr(0x90),
659 'H o' => chr(0xc5).chr(0x91),
660 'H U' => chr(0xc5).chr(0xb0),
661 'H u' => chr(0xc5).chr(0xb1),
662
663 # stroke
664 'd' => chr(0xc4).chr(0x91),
665 'D' => chr(0xc4).chr(0x90),
666 'h' => chr(0xc4).chr(0xa7),
667# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
668 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
669 'l' => chr(0xc5).chr(0x82),
670 'L' => chr(0xc5).chr(0x81),
671 'o' => chr(0xc3).chr(0xb8),
672 'O' => chr(0xc3).chr(0x98),
673 't' => chr(0xc5).chr(0xa7),
674 'T' => chr(0xc5).chr(0xa6),
675
676 # other special chars
677 'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
678 'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
679 );
680
681 my $self=shift;
682 my $textref=shift;
683
684 my $outhandle=$self->{'outhandle'};
685 my $text=$$textref;
686
687 # remove space (if any) between \ and letter to accent (eg {\' a})
688 $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
689
690 # remove {} around a single character (eg \'{e})
691 $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
692
693 ## only in bibtex... not in latex proper?!
694 ### \, is another way of doing cedilla \c
695 ##$text =~ s~\\,(.)~\\c $1~g;
696
697 # remove {} around a single character for special 1 letter commands -
698 # need to insert a space. Eg \v{s} -> {\v s}
699 $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
700
701 # only do if the text contains a '\' character.
702 if ($text =~ m|\\|) {
703 # "normal" accents - ie non-alpha latex tag
704 # xxx used to have ([\w]\b)@ (for word boundary)
705 while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
706 my $tex="$1$2"; my $char="$2";
707 my $replacement=$utf8_chars{$tex};
708 if (!defined($replacement)) {
709 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
710 print $outhandle . $self->{'plugin_type'} .
711 ": Warning: unknown latex accent \"$tex\""
712 . " in \"$1\"\n";
713 $replacement=$char;
714 }
715 $text =~ s/\\\Q$tex/$replacement/g;
716 }
717
718 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
719 # only do the change if immediately followed by a space, }, {, or \
720 # one letter accents ( + ss / aa)
721 while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
722 my $tex=$1;
723 my $replacement=$special_utf8_chars{$tex};
724 if (!defined($replacement)) {
725 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
726 print $outhandle $self->{'plugin_type'} .
727 ": Warning: unknown latex accent \"$tex\""
728 . " in \"$1\"\n";
729 $replacement=$tex;
730 }
731 ($text =~ s/{\\$tex}/$replacement/g) or
732 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
733
734 }
735
736 # one letter latex accent commands that affect following letter
737 while ($text =~ m~\\([uvcH]) ([\w])~) {
738 my $tex="$1 $2"; my $char="$2";
739 my $replacement=$special_utf8_chars{$tex};
740 if (!defined($replacement)) {
741 $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
742 print $outhandle $self->{'plugin_type'} .
743 ": Warning: unknown latex accent \"$tex\""
744 . " in \"$1\"\n";
745 $replacement=$char;
746 }
747 $text =~ s/\\$tex/$replacement/g;
748 }
749 }
750 $textref=\$text;
751}
752
753
754# modules must return true
7551;
Note: See TracBrowser for help on using the repository browser.