Context Navigation

source: trunk/gsdl/perllib/plugins/LaTeXPlug.pm@ 12169

Last change on this file since 12169 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.2 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlug.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlug;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use BasPlug;
36	use unicode;
37	use util;
38
# Plugin-specific argument descriptions.  new() appends these to the
# "ArgList" it hands to BasPlug so that they are parsed together with the
# inherited options.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Plugin description record.  new() appends it to the "OptList".
my $options = {
    'name'     => 'LaTeXPlug',
    'desc'     => '{LaTeXPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
51
# Establish the inheritance chain at compile time: LaTeXPlug is a BasPlug.
BEGIN {
    @LaTeXPlug::ISA = qw(BasPlug);
}
55
# Writes the one-line usage summary for this plugin to STDERR.
sub print_usage {
    my $usage = "\n usage: plugin LaTeXPlug [options]\n\n";
    print STDERR $usage;
}
59
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries before BasPlug sees it
#
# Returns the object built by BasPlug, re-blessed into $class, with the
# per-instance bookkeeping fields initialised.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method call.  The indirect-object form
    # ("new BasPlug(...)") is ambiguous to the parser inside a package
    # that defines its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;
    return bless $self, $class;
}
75
76
# Default regular expression (as a string) selecting the files this plugin
# processes: anything whose name ends in ".tex".
sub get_default_process_exp {
    my ($self) = @_;
    return '\.tex$';
}
81
# Default regular expression (as a string) for files to block: .eps files
# are assumed to be figures belonging to a LaTeX document.
sub get_default_block_exp {
    my $block_pattern = q{\.(?:eps)$};
    return $block_pattern;
}
86
87
# Converts one LaTeX file (already read into $$textref) into document text
# plus FileFormat, Title, Author and Creator metadata on $doc_obj.
#
# Arguments (after $self):
#   $textref  - reference to the file contents; modified in place
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name; used in messages and as the fallback title
#   $doc_obj  - document object that receives metadata and text
#   $gli      - true when progress should be reported in GLI's tag format
#
# Returns undef if the first 200 bytes do not look like LaTeX, 1 otherwise.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $start = substr($$textref, 0, 200); # first 200 bytes

    if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
                        | chapter | contents | begin) ~x) {
        # this doesn't look like latex...
        return undef;
    }
    my $outhandle = $self->{'outhandle'};
    if ($gli) {
        print STDERR "<Processing n='$file' p='LaTeXPlug'>\n";
    } elsif ($self->{'verbosity'} > 1) {
        print {$outhandle} "LaTeXPlug: processing $file\n";
    }
    my $cursection = $doc_obj->get_top_section();

    ###### clean up text ######
    $$textref =~ s/\r$//mg;          # remove dos ^M
    # Remove comments, but leave an escaped percent sign (\%) alone so that
    # text such as "50\% of" does not lose the rest of its line.
    $$textref =~ s/(?<!\\)%.*$//mg;

    # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
    $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
    # check that both sides are ascii, so we don't screw up utf-8 chars
    # NOTE(review): unicode::ascii2utf8 is defined outside this file; its
    # expected argument form (string vs reference) should be confirmed.
    $$textref =~ s@(?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]])@unicode::ascii2utf8($1)@eg;

    ###### find metadata ######

    ## FileFormat metadata ##
    $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");

    ### title metadata ###
    # Capture through list assignment: after a failed match $1 still holds
    # the capture of some earlier successful match, so it must not be read
    # unconditionally.
    my ($title) = $$textref =~ m@\\title\s*\{(.*?)\}@s;
    if (!$title) {
        # no title tag. look for the first chapter/section heading
        ($title) = $$textref =~ m@\\(?:chapter|section)\s*\{(.*?)\}@s;
    }
    if (!$title) {
        # no chapter/section heading tags either... use filename
        $title = $file;
        $title =~ s/\.tex$//i;
        $title =~ s/[-_.]/ /g; # turn punctuation into spaces
    }
    if ($title) {
        $title =~ s@\\\\@ @g; # embedded newlines
        $title = $self->process_latex($title); # no "-html" for title eg in browser
        $doc_obj->add_utf8_metadata($cursection, "Title", $title);
    }

    ### creator/individual author metadata ###
    my ($authors) = $$textref =~ m@\\author\s*\{((?:\{.*?\}|.*?)+)\}\s*$@sm;
    if ($authors) {
        # take care of "et al."...  Only trust $1 when the substitution
        # actually matched; otherwise it would still hold the author string.
        my $etal = "";
        if ($authors =~ s/(\s+et\.?\s+al\.?)\s*$//) {
            $etal = $1;
        }

        my @authorlist = parse_authors($self, $authors);

        foreach my $author (@authorlist) {
            # Add each name to set of Authors
            $doc_obj->add_utf8_metadata($cursection, "Author", $author);
        }

        # Only want at most one "and" in the Creator field
        my $creator_str = "";
        if (scalar(@authorlist) > 2) {
            my $lastauthor = pop @authorlist;
            $creator_str = join(', ', @authorlist);
            $creator_str .= " and $lastauthor";
        } else { # 1 or 2 authors...
            $creator_str = join(" and ", @authorlist);
        }
        $creator_str .= $etal; # if there was "et al."
        $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
    }
    ### end of author metadata ###

    ###### process latex for the main text ######
    $$textref =~ s/^.*?\\begin\{document\}//s;
    $$textref =~ s/\\end\{document\}.*?$//s;
    $$textref = $self->process_latex("-html", $$textref);
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
182
183
# Splits the argument of a LaTeX \author{...} command into individual names.
#
# Arguments (after $self):
#   $authors - the raw author string
#
# Names are separated by "and" (or "und" for German).  Each name may be
# written "First von Last", "von Last, First" or "von Last, Jr, First"
# (see the "BibTeXing" manual, page 16).
#
# Returns a list of names, each normalised to "First von Last Jr" with
# single spaces and no leading or trailing whitespace.  Parse problems are
# reported on $self->{'outhandle'} and processing continues.
sub parse_authors {
    my $self = shift;
    my $authors = shift;

    my $outhandle = $self->{'outhandle'};

    $authors =~ s/\n/ /g; # remove newlines

    # some people do this for affiliation footnote/dagger
    $authors =~ s@\$.*?\$@@g; # remove maths from author :(

    # "und" here for german language...
    # don't use capturing brackets in the pattern, else the matched bit
    # becomes an element in the list!
    my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
    my @formattedlist = ();
    foreach my $author (@authorlist) {
        $author =~ s/\s+$//;
        $author =~ s/^\s+//;
        next if $author eq '';

        my $first = "";
        my $vonlast = "";
        my $jr = "";

        if ($author =~ /,/) {
            # "von Last, First" or "von Last, Jr, First"
            my @parts = split(/,\s*/, $author);
            $first = pop @parts;
            $first = "" if !defined $first;
            if (scalar(@parts) == 2) {
                $jr = pop @parts;
            }
            $vonlast = shift @parts;
            $vonlast = "" if !defined $vonlast; # eg a stray trailing comma
            if (scalar(@parts) > 0) {
                print {$outhandle} $self->{'plugin_type'} .
                    ": couldn't parse name $author\n";
                # but we continue anyway...
            }
        } else {
            # "First von Last": leading words belong to the first name until
            # a word starting with two or more lowercase letters (the "von"
            # part) is reached or only one word is left.
            my @words = split(/ /, $author);
            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
                $first .= " " . shift(@words);
            }
            $first =~ s/^\s//;
            $vonlast = join(' ', @words); # whatever's left...
        }

        my $von = "";
        if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
            if ($vonlast =~ s/^((?:[a-z]\w+\s+)+)//) {
                $von = $1;
                $von =~ s/\s*$//;
            } else {
                # some non-English names do start with lowercase
                # eg "Marie desJardins". Also we can get typos...
                print {$outhandle} $self->{'plugin_type'} .
                    ": couldn't parse surname $vonlast\n";
                if ($vonlast =~ /^[a-z]+$/) {
                    # if it's all lowercase, uppercase 1st.
                    $vonlast = ucfirst($vonlast);
                }
            }
        }
        my $last = $vonlast;

        # Join only the non-empty parts so a missing first name or "von"
        # does not leave stray leading or doubled spaces.
        my $wholename = join(' ', grep { $_ ne '' } ($first, $von, $last, $jr));
        $wholename =~ s/\s+/ /g;
        $wholename =~ s/^ //;
        $wholename =~ s/ $//;
        next if $wholename eq '';

        push(@formattedlist, $wholename);
    }
    return @formattedlist;
}
263
264
## following function based on bibtex plugin ##
# Not called anywhere in this file at the moment.
#
# Replaces every occurrence of a three-letter lowercase month abbreviation
# ("jan" .. "dec") in the given text with a placeholder of the form
# _textmonthNN_, NN being the two-digit month number.  Bibtex entries can
# hold more than one month (eg ' month = jun # "-" # aug, '), so all
# occurrences are replaced.  Returns the modified text.
sub expand_month {
    my $text = shift;

    my @abbreviations = qw(jan feb mar apr may jun jul aug sep oct nov dec);
    my $month_number = 0;
    foreach my $abbreviation (@abbreviations) {
        $month_number++;
        my $placeholder = sprintf('_textmonth%02d_', $month_number);
        $text =~ s/\Q$abbreviation\E/$placeholder/g;
    }

    return $text;
}
287
288
# Helper for process_latex: turns the body of a tabbing environment into
# lines separated by <br/>.  Must run before accent handling, because \=
# and \> are tab-stop commands inside tabbing.  The text it receives has
# already been HTML-escaped, so \> arrives as "\&gt;".
sub do_tabbing {
    my $tabbing = shift;
    $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
    $tabbing =~ s~\\(?:=|>|&gt;)~\xc2\xa0~g; # replace with nbsp
    $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
    return "<br/>" . $tabbing . "<br/>\n";
}

# Helper for process_latex: turns the body of a tabular environment into an
# HTML table.  The text it receives has already been HTML-escaped, so the
# column separator "&" arrives as "&amp;".
sub do_tabular {
    my $tabular = shift;
    $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
    $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
    $tabular =~ s~\\hline~~g; # for now...
    $tabular =~ s~<td>\s*\\multicolumn\{(\d+)\}\{.*?\}~<td colspan="$1">~g;
    return "<table border=\"1\">\n <tr><td>"
        . $tabular . "</td></tr></table>\n";
}

# Converts LaTeX markup to text, or to simple HTML.
#
# Call as $self->process_latex($text) for plain text, or as
# $self->process_latex("-html", $text) for basic HTML formatting
# (eg \emph -> <em>, \section -> <H1>, paragraphs wrapped in <p>).
#
# Converts accented characters, removes { }, interprets some commands...
# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
#
# It also copes badly with complicated/nested commands, since it does not
# match { with the corresponding }, only with the nearest }.
#
# Returns the converted text; empty/undefined input is returned unchanged.
sub process_latex {
    my $self = shift;
    my $text = shift;

    my $html_markup = 0;
    if (defined $text && $text =~ /^\-html/) {
        $html_markup = 1;
        $text = shift;
    }

    if (! $text) {
        return $text;
    }

    # escape html-sensitive characters
    $text =~ s@&@&amp;@g;
    $text =~ s@<@&lt;@g;
    $text =~ s@>@&gt;@g;

    # do tabbing before accents, since \= means something different in
    # tabbing; also \> is a tab stop too, and \\ is newline
    $text =~ s@\\begin\{tabbing\}(.*?)\\end\{tabbing\}@do_tabbing($1)@ges;
    $text =~ s@\\begin\{tabular\}(?:\[.*?\])?\{.*?\}(.*?)\\end\{tabular\}@do_tabular($1)@ges;

    $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\

    # process maths mode before accents... things like \, mean different!
    # multi-line maths: $$ .... $$
    $text =~ s@\$\$(.*?)\$\$@process_latex_math($html_markup,$1)@sge;
    # inline maths: $ ... $
    $text =~ s@([^\\])\$(.*?[^\\])\$@$1.process_latex_math($html_markup,$2)@sge;

    # is this an amstext environment, or just custom for that input file?
    # remove_equals() is not defined in this file; only call it when some
    # other code has provided it, otherwise keep the environment's contents.
    $text =~ s@\\begin\{(algorithm)\}(.*?)\\end\{\1\}@defined(&remove_equals) ? remove_equals($2) : $2@ges;

    # convert latex-style accented characters.
    $self->latex_accents_to_utf8(\$text);

    # replace quotes with utf-8 (U+201C, U+201D, U+2018, U+2019)
    $text =~ s/``/\xe2\x80\x9c/g; # Latex-specific, left-dbl quote
    $text =~ s/''/\xe2\x80\x9d/g; # Latex-specific, right-dbl quote
    $text =~ s/`/\xe2\x80\x98/g;  # single left quote
    $text =~ s/'/\xe2\x80\x99/g;  # single right quote

    ###### remove/replace latex commands ######
    ### commands that expand to something that gets displayed ###
    $text =~ s~\\ldots~&hellip;~g;
    $text =~ s~\\hrule~<hr/>\n~g;
    $text =~ s~\\maketitle~ ~;
    ### space commands ###
    $text =~ s~\\[vh]skip\s+\S+~~g;
    $text =~ s~\\vspace\*?\{.*?\}~<div>&nbsp;</div>~g; # vertical space
    $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
    $text =~ s~\\noindent\b~~g;
    # newpage, etc
    $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
    ### counters, contents, environments, labels, etc ###
    $text =~ s~\\(?:addcontentsline)\{.*?\}\{.*?\}\{.*\}~~g;
    $text =~ s~\s*\\begin\{itemize\}\s*~\n<ul>\n~g;
    $text =~ s~\s*\\end\{itemize\}\s*~</li></ul>\n~g;
    $text =~ s~\s*\\begin\{enumerate\}\s*~<ol>\n~g;
    $text =~ s~\s*\\end\{enumerate\}\s*~</li></ol>\n~g;
    if ($text =~ s~\s*\\item~</li>\n<li>~g) {
        # the first item of each list has no preceding item to close
        $text =~ s~<([ou])l>\s*</li>\s*~<${1}l>~g;
    }
    $text =~ s~\\(?:label|begin|end)\{.*?\}\s*\n?~ ~g; # remove tag and contents
    $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
    ### font sizes/styles ###
    $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;

    if ($html_markup) {
        $text =~ s~\\section\*?\{([^\}]+)\}\s*\n?~<H1>$1</H1>\n~g;
        $text =~ s~\\subsection\*?\{(.*?)\}\s*\n?~<H2>$1</H2>\n~g;
        # \b after the command name stops eg "bf" from matching the start
        # of "\bfseries" and leaving "series" in the output.
        $text =~ s~\{\\tt\b\s*(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\(?:texttt|ttseries|tt)\s*\{(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\emph\{(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\(?:em|it)\b\s*(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\(?:bfseries|bf)\b\s*(.*?)\}~<strong>$1</strong>~g;
        $text =~ s~\\(?:textbf|bfseries|bf)\s*\{(.*?)\}~<strong>$1</strong>~g;
    } else {
        # remove tags for text-only
        $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
    }
    $text =~ s~\{\\sc\s+(.*?)\}~<span style="font-variant: small-caps">$1</span>~g;
    # ignore these font tags (if there are any left)
    # sf is sans-serif
    $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~g;
    #### end font-related stuff ####

    ### remove all other commands with optional arguments... ###
    # don't remove commands without { }....
    # verbatim: $1 is the delimiter, $2 the verbatim contents
    $text =~ s~\\verb(.)(.*?)\1~verb_text($2)~ge;
    # remove tags, keep contents for \tag[optional]{contents}
    while ($text =~ s~\\\w+(\[.*?\])?\{([^}]+)\}~$2 ~g) {;} # all other commands

    # remove latex groupings { } (but not \{ or \} )
    while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
    $text =~ s/^\{//; # remove { if first char

    # latex characters
    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
    $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
    # optional break "\-"
    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
        # the non-macro bits have quotes around them - we just remove quotes
        # XXX bibtex and latex differ here (for the '#' char)...
        $text =~ s/([^&])[\"\#]/$1/g;
    }
    # dashes. Convert (m|n)-dash into single dash for html.
    $text =~ s~\-\-+~\-~g;

    # quoted { } chars
    $text =~ s~\\\{~{~g;
    $text =~ s~\\\}~}~g;

    # spaces
    $text =~ s~\\ ~ ~g;

    # finally to protect against macro language...
    # greenstone-specific
    $text =~ s~\[~&#91;~g;
    $text =~ s~\]~&#93;~g;
    $text =~ s~(?<!\\)([\\_])~\\$1~g;

    if ($html_markup) {
        $text =~ s~\n{2,}~\n</p>\n<p>~g;
        return "<p>$text</p>";
    }

    return $text;
}
457
# Only used by process_latex, for \verb text.
# Returns its argument with a backslash placed in front of every "{", "}"
# and "_" character.
sub verb_text {
    my ($verbatim) = @_;
    # zero-width match just before each special character; insert "\" there
    $verbatim =~ s/(?=[{}_])/\\/g;
    return $verbatim;
}
# Only used by process_latex_math, for \frac{num}{denom}.
#
# Arguments: the numerator and the denominator text.
# Returns the UTF-8 bytes of the single-character fraction for 1/2, 1/4
# and 3/4, otherwise the plain text "num/denom".
sub math_fraction {
    my ($num, $denom) = @_;
    $num = "" if !defined $num;
    $denom = "" if !defined $denom;

    my %glyph_for = (
        '1/2' => chr(0xc2).chr(0xbd),
        '1/4' => chr(0xc2).chr(0xbc),
        '3/4' => chr(0xc2).chr(0xbe),
    );

    # Only plain integers can map to a glyph.  A numeric comparison on
    # arbitrary text would treat eg "1x" as 1.
    if ($num =~ /^\s*(\d+)\s*$/) {
        my $n = $1 + 0;
        if ($denom =~ /^\s*(\d+)\s*$/) {
            my $d = $1 + 0;
            return $glyph_for{"$n/$d"} if exists $glyph_for{"$n/$d"};
        }
    }
    return "$num/$denom";
}
475
# Converts the contents of a LaTeX maths expression ($...$ or $$...$$).
#
# Call as process_latex_math($text) or process_latex_math($html_markup, $text).
# When $html_markup is true, superscripts and subscripts become <sup>/<sub>
# and any remaining command name is shown in italics.
#
# Returns the converted text.
sub process_latex_math {

    my $text = pop;        # if given one or two args, this is the last one...
    my $html_markup = pop; # if given two args, this is the first one else undef

    $text =~ s~\\,~ ~g;             # forces a space?
    $text =~ s~\\infty~infinity~g;  # or unicode 0x221E...

    # A real union operator (unicode 0x222a) could be emitted instead once
    # more consumers can read 3-byte utf8 values.
    $text =~ s~\\cup\b~ U ~g;

    $text =~ s~\\frac\s*\{(.+?)\}\{(.+?)\}~math_fraction($1,$2)~ge;

    if ($html_markup) {
        $text =~ s~\^\{(.*?)\}~<sup>$1</sup>~g; # a^{b} superscript
        $text =~ s~\^([^\{])~<sup>$1</sup>~g;   # a^b
        $text =~ s~_\{(.*?)\}~<sub>$1</sub>~g;  # a_{b} subscript
        $text =~ s~_([^\{])~<sub>$1</sub>~g;    # a_b

        $text =~ s~\\ldots~&hellip;~g; # use html named entity for now

        # put all other command names in italics for now
        $text =~ s~\\([\w]+)~<i>$1</i> ~g;
    }

    # special cases, for some input files: a lone command is wrapped in
    # braces
    if ($text =~ m~^\\\w+$~) {
        $text = "{" . $text . "}";
    }

    return $text;
}
509
510
511
# Replaces LaTeX accent commands (eg \'e, \"{o}, \v{s}, \ss, \o) in a
# string with the UTF-8 byte sequence of the accented character.
#
# Arguments (after $self):
#   $textref - reference to the text; the referenced string is updated
#              in place
#
# Returns $textref.  An accent/letter combination that is not in the
# tables is reported on $self->{'outhandle'} and replaced by the bare
# letter.
#
# Note - this is brute force, and not complete: not every possible
# accented letter is listed, but most of the common ones are.
sub latex_accents_to_utf8 {

    # accent commands written with a punctuation character: \'a \`a etc
    my %utf8_chars =
        (
         # acutes
         "'a" => "\xc3\xa1", "'c" => "\xc4\x87", "'e" => "\xc3\xa9",
         "'i" => "\xc3\xad", "'l" => "\xc4\xba", "'n" => "\xc5\x84",
         "'o" => "\xc3\xb3", "'r" => "\xc5\x95", "'s" => "\xc5\x9b",
         "'u" => "\xc3\xba", "'y" => "\xc3\xbd", "'z" => "\xc5\xba",
         # graves
         "`a" => "\xc3\xa0", "`A" => "\xc3\x80",
         "`e" => "\xc3\xa8", "`E" => "\xc3\x88",
         "`i" => "\xc3\xac", "`I" => "\xc3\x8c",
         "`o" => "\xc3\xb2", "`O" => "\xc3\x92",
         "`u" => "\xc3\xb9", "`U" => "\xc3\x99",
         # circumflex
         '^a' => "\xc3\xa2", '^A' => "\xc3\x82",
         '^c' => "\xc4\x89", '^C' => "\xc4\x88",
         '^e' => "\xc3\xaa", '^E' => "\xc3\x8a",
         '^g' => "\xc4\x9d", '^G' => "\xc4\x9c",
         '^h' => "\xc4\xa5", '^H' => "\xc4\xa4",
         '^i' => "\xc3\xae", '^I' => "\xc3\x8e",
         '^j' => "\xc4\xb5", '^J' => "\xc4\xb4",
         '^o' => "\xc3\xb4", '^O' => "\xc3\x94",
         '^s' => "\xc5\x9d", '^S' => "\xc5\x9c",
         '^u' => "\xc3\xbb", '^U' => "\xc3\x9b",
         '^w' => "\xc5\xb5", '^W' => "\xc5\xb4",
         '^y' => "\xc5\xb7", '^Y' => "\xc5\xb6",
         # diaeresis
         '"a' => "\xc3\xa4", '"A' => "\xc3\x84",
         '"e' => "\xc3\xab", '"E' => "\xc3\x8b",
         '"i' => "\xc3\xaf", '"I' => "\xc3\x8f",
         '"o' => "\xc3\xb6", '"O' => "\xc3\x96",
         '"u' => "\xc3\xbc", '"U' => "\xc3\x9c",
         '"y' => "\xc3\xbf", '"Y' => "\xc5\xb8",
         # tilde
         '~A' => "\xc3\x83", '~N' => "\xc3\x91", '~O' => "\xc3\x95",
         '~a' => "\xc3\xa3", '~n' => "\xc3\xb1", '~o' => "\xc3\xb5",
         # caron, double acute, cedilla, stroke - handled specially, below
         # dot
         '.c' => "\xc4\x8b", '.C' => "\xc4\x8a",
         '.e' => "\xc4\x97", '.E' => "\xc4\x96",
         '.g' => "\xc4\xa1", '.G' => "\xc4\xa0",
         '.I' => "\xc4\xb0",
         '.z' => "\xc5\xbc", '.Z' => "\xc5\xbb",
         # macron
         '=a' => "\xc4\x81", '=A' => "\xc4\x80",
         '=e' => "\xc4\x93", '=E' => "\xc4\x92",
         '=i' => "\xc4\xab", '=I' => "\xc4\xaa",
         '=o' => "\xc5\x8d", '=O' => "\xc5\x8c",
         '=u' => "\xc5\xab", '=U' => "\xc5\xaa",
        );

    # these are one letter latex commands - we make sure they're not a longer
    # command name. eg {\d} is d+stroke, so careful of \d
    my %special_utf8_chars =
        (
         # breve
         'u g' => "\xc4\x9f", 'u G' => "\xc4\x9e",
         'u i' => "\xc4\xad", 'u I' => "\xc4\xac",
         'u o' => "\xc5\x8f", 'u O' => "\xc5\x8e",
         'u u' => "\xc5\xad", 'u U' => "\xc5\xac",
         'u z' => "\xc5\xbe", # !!! no such char, but common mistake
         'u Z' => "\xc5\xbd", # used instead of v Z !!!
         # caron
         'v c' => "\xc4\x8d", 'v C' => "\xc4\x8c",
         'v n' => "\xc5\x88", 'v N' => "\xc5\x87",
         'v s' => "\xc5\xa1", 'v S' => "\xc5\xa0",
         'v z' => "\xc5\xbe", 'v Z' => "\xc5\xbd",
         # cedilla
         'c c' => "\xc3\xa7", 'c C' => "\xc3\x87",
         'c g' => "\xc4\xa3", 'c G' => "\xc4\xa2",
         'c k' => "\xc4\xb7", 'c K' => "\xc4\xb6",
         'c l' => "\xc4\xbc", 'c L' => "\xc4\xbb",
         'c n' => "\xc5\x86", 'c N' => "\xc5\x85",
         'c r' => "\xc5\x97", 'c R' => "\xc5\x96",
         'c s' => "\xc5\x9f", 'c S' => "\xc5\x9e",
         'c t' => "\xc5\xa3", 'c T' => "\xc5\xa2",
         # double acute / Hungarian accent
         'H O' => "\xc5\x90", 'H o' => "\xc5\x91",
         'H U' => "\xc5\xb0", 'H u' => "\xc5\xb1",
         # stroke
         'd' => "\xc4\x91", 'D' => "\xc4\x90",
         'h' => "\xc4\xa7",
         # 'H' (U+0126) is left out: \H normally means Hungarian umlaut
         'i' => "\xc4\xb1", # dotless lowercase i
         'l' => "\xc5\x82", 'L' => "\xc5\x81",
         'o' => "\xc3\xb8", 'O' => "\xc3\x98",
         't' => "\xc5\xa7", 'T' => "\xc5\xa6",
         # other special chars
         'ss' => "\xc3\x9f", # german ss/szlig/sharp s
         'aa' => "\xc3\xa5", # scandinavian/latin a with ring
        );

    my $self = shift;
    my $textref = shift;

    my $outhandle = $self->{'outhandle'};
    my $text = $$textref;

    # Reports an accent sequence that has no table entry, with up to 20
    # characters of context on either side.
    my $warn_unknown = sub {
        my ($tex) = @_;
        my ($context) = $text =~ m~(.{0,20}\\\Q$tex\E.{0,20})~s;
        $context = "" if !defined $context;
        print {$outhandle} $self->{'plugin_type'} .
            ": Warning: unknown latex accent \"$tex\"" .
            " in \"$context\"\n";
    };

    # remove space (if any) between accent and letter (eg {\' a})
    $text =~ s!(\\[`'="^~.])\s+(\w)\b!$1$2!g;

    # remove {} around a single character (eg \'{e})
    $text =~ s!(\\[`'="^~.])\{(\w)\}!{$1$2}!g;

    # an accent over dotless i/j (eg \'{\i} or \"\i) is the accented letter
    $text =~ s!(\\[`'="^~.])\{\\([ij])\}!{$1$2}!g;
    $text =~ s!(\\[`'="^~.])\\([ij])(?![A-Za-z])[ \t]?!$1$2!g;

    # remove {} around a single character for special 1 letter commands -
    # need to insert a space. Eg \v{s} -> {\v s}
    $text =~ s~(\\[uvcH])\{(\w)\}~{$1 $2}~g;

    # only do the rest if the text contains a '\' character.
    if ($text =~ m|\\|) {
        # "normal" accents - ie non-alpha latex tag
        while ($text =~ m/\\([`'="^~.])(\w)/) {
            my ($accent, $char) = ($1, $2);
            my $tex = "$accent$char";
            my $replacement = $utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char;
            }
            $text =~ s/\\\Q$tex\E/$replacement/g;
        }

        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
        # only do the change if immediately followed by a space, }, {, ", or \
        # one letter accents ( + ss / aa)
        while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[\{\}\s"\\]~) {
            my $tex = $1;
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $tex;
            }
            ($text =~ s/\{\\\Q$tex\E\}/$replacement/g) or
                $text =~ s/\\\Q$tex\E([\{\}\s"\\])/$replacement$1/g;
        }

        # one letter latex accent commands that affect following letter
        while ($text =~ m~\\([uvcH]) (\w)~) {
            my ($accent, $char) = ($1, $2);
            my $tex = "$accent $char";
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char;
            }
            $text =~ s/\\\Q$tex\E/$replacement/g;
        }
    }

    # write the result back through the reference so the caller sees it
    $$textref = $text;
    return $textref;
}
759
760
761	# modules must return true
762	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: