Context Navigation

source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm@ 28836

Last change on this file since 28836 was 24548, checked in by ak19, 13 years ago
Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead? that defines can_process_this_file_for_metadata that MetadataPlugin? subclasses can inherit (if MetadataRead? is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin?. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead?. Other metadataPlugins also need to be committed.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.0 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlugin.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlugin;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use ReadTextFile;
36	use MetadataRead;
37	use unicode;
38	use util;
39
40	my $arguments =
41	[ { 'name' => "process_exp",
42	'desc' => "{BasePlugin.process_exp}",
43	'type' => "regexp",
44	'reqd' => "no",
45	'deft' => &get_default_process_exp() } ];
46
47	my $options = { 'name' => "LaTeXPlugin",
48	'desc' => "{LaTeXPlugin.desc}",
49	'abstract' => "no",
50	'inherits' => "yes",
51	'args' => $arguments };
52
53	sub BEGIN {
54	@LaTeXPlugin::ISA = ('MetadataRead', 'ReadTextFile');
55	}
56
57	sub new {
58	my ($class) = shift (@_);
59	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
60	push(@$pluginlist, $class);
61
62	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
63	push(@{$hashArgOptLists->{"OptList"}},$options);
64
65	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
66
67	$self->{'aux_files'} = {};
68	$self->{'dir_num'} = 0;
69	$self->{'file_num'} = 0;
70	return bless $self, $class;
71	}
72
73
74	sub get_default_process_exp {
75	my $self = shift (@_);
76	return q^\.tex$^;
77	}
78
79	sub get_default_block_exp {
80	# assume any .eps files are part of the latex stuff
81	return '\.(?:eps)$';
82	}
83
84
85	sub process {
86	my $self = shift (@_);
87	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
88
89	my $start=substr($$textref, 0, 200); # first 200 bytes
90
91	if ($start !~ m~\\ (?:documentclass \| documentstyle \| input \| section
92	\| chapter \| contents \| begin) ~x) {
93	# this doesn't look like latex...
94	return undef;
95	}
96	my $outhandle = $self->{'outhandle'};
97
98	my $cursection = $doc_obj->get_top_section();
99
100	###### clean up text ######
101	$$textref =~ s/\r$//mg; # remove dos ^M
102	$$textref =~ s/%.*$//mg; # remove comments
103
104	# convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
105
106	$$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
107	# check that both sides are ascii, so we don't screw up utf-8 chars
108	$$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
109	unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
110
111
112	###### find metadata ######
113
114	## FileFormat metadata ##
115	$doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
116
117	### title metadata ###
118	$$textref =~ m@\\title\s{(.?)}@s;
119	my $title = $1;
120	if (!$title) {
121	# no title tag. look for a chapter/section heading
122	$$textref =~ m@\\(?:chapter\|section)\s{(.?)}@s; # will get 1st match
123	$title = $1;
124	}
125	if (!$title) {
126	# no chapter/section heading tags either... use filename
127	$title = $file;
128	$title =~ s/\.tex$//i;
129	$title =~ s/[-_.]/ /g; # turn punctuation into spaces
130	}
131	if ($title) {
132	$title =~ s@\\\\@ @g; # embedded newlines
133	$title = $self->process_latex($title); # no "-html" for title eg in browser
134	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
135	}
136
137	### creator/individual author metadata ###
138	$$textref =~ m@\\author\s{((?:{.?}\|.?)+)}\s$@sm;
139	my $authors=$1;
140	if ($authors) {
141	# take care of "et al."...
142	$authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
143	my $etal=$1;
144	$etal="" if (!defined ($etal));
145
146	my @authorlist=parse_authors($self, $authors);
147
148	foreach my $author (@authorlist) {
149	# Add each name to set of Authors
150	$doc_obj->add_utf8_metadata ($cursection, "Author", $author);
151	}
152
153	# Only want at most one "and" in the Creator field
154	my $creator_str="";
155	if (scalar(@authorlist) > 2) {
156	my $lastauthor=pop @authorlist;
157	$creator_str=join(', ', @authorlist);
158	$creator_str.=" and $lastauthor";
159	} else { # 1 or 2 authors...
160	$creator_str=join(" and ",@authorlist);
161	}
162	$creator_str.=$etal; # if there was "et al."
163	$doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
164	}
165	### end of author metadata ###
166
167	###### process latex for the main text ######
168	$$textref =~ s/^.*?\\begin{document}//s;
169	$$textref =~ s/\\end{document}.*?$//s;
170	$$textref = $self->process_latex("-html",$$textref);
171	$doc_obj->add_utf8_text($cursection, $$textref);
172
173	return 1;
174	}
175
176
177	# returns a list of author names
178	sub parse_authors {
179	my $self=shift;
180	my $authors=shift;
181
182	my $outhandle=$self->{'outhandle'};
183
184	$authors =~ s/\n/ /g; # remove newlines
185
186	# some people do this for affiliation footnote/dagger
187	$authors =~ s@\$.*?\$@@g; # remove maths from author :(
188
189	# und here for german language...
190	# don't use brackets in pattern, else the matched bit becomes
191	# an element in the list!
192	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $authors);
193	my @formattedlist = ();
194	foreach my $author (@authorlist) {
195	$author =~ s/\s*$//;
196	$author =~ s/^\s*//;
197	# Reformat and add author name
198	next if $author=~ /^\s*$/;
199
200	# names are "First von Last", "von Last, First"
201	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
202	my $first="";
203	my $vonlast="";
204	my $jr="";
205
206	if ($author =~ /,/) {
207	my @parts=split(/,\s*/, $author);
208	$first = pop @parts;
209	if (scalar(@parts) == 2) {
210	$jr = pop @parts;
211	}
212	$vonlast=shift @parts;
213	if (scalar(@parts) > 0) {
214	print $outhandle $self->{'plugin_type'} .
215	": couldn't parse name $author\n";
216	# but we continue anyway...
217	}
218	} else { # First von Last
219	my @words = split(/ /, $author);
220	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
221	$first .= " " . shift (@words);
222	}
223	$first =~ s/^\s//;
224	$vonlast = join (' ', @words); # whatever's left...
225	}
226	my $von="";
227	my $last="";
228	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
229	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
230	$von = $1;
231	if (!defined ($von)) {
232	# some non-English names do start with lowercase
233	# eg "Marie desJardins". Also we can get typos...
234	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
235	$von="";
236	if ($vonlast =~ /^[a-z]+$/) {
237	# if it's all lowercase, uppercase 1st.
238	$vonlast =~ s/^(.)/\u$1/;
239	}
240	}
241	$von =~ s/\s*$//;
242	$last=$vonlast;
243	} else {
244	$last=$vonlast;
245	}
246	my $wholename="$first $von $last $jr";
247	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
248	# my $fullname = "$last";
249	# $fullname .= " $jr" if ($jr);
250	# $fullname .= ", $first";
251	# $fullname .= " $von" if ($von);
252	push (@formattedlist, $wholename);
253	}
254	return @formattedlist;
255	}
256
257
258	## following functions based on bibtex plugin ##
259	# not actually used at the moment, but might be useful in future?
260	sub expand_month {
261	my $text=shift;
262
263	# bibtex style files expand abbreviations for months.
264	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
265	$text =~ s/jan/_textmonth01_/g;
266	$text =~ s/feb/_textmonth02_/g;
267	$text =~ s/mar/_textmonth03_/g;
268	$text =~ s/apr/_textmonth04_/g;
269	$text =~ s/may/_textmonth05_/g;
270	$text =~ s/jun/_textmonth06_/g;
271	$text =~ s/jul/_textmonth07_/g;
272	$text =~ s/aug/_textmonth08_/g;
273	$text =~ s/sep/_textmonth09_/g;
274	$text =~ s/oct/_textmonth10_/g;
275	$text =~ s/nov/_textmonth11_/g;
276	$text =~ s/dec/_textmonth12_/g;
277
278	return $text;
279	}
280
281
282	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
283	# as the first argument to this function.
284	#
285	# Convert accented characters, remove { }, interprete some commands....
286	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
287
288	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
289	# match { with the corresponding }, only the nearest }
290
291	sub process_latex {
292	my $self=shift;
293	my $text=shift;
294
295	my $outhandle=$self->{'outhandle'};
296
297	my $html_markup=0;
298	if ($text =~ /^\-html/) {
299	$html_markup=1;
300	$text=shift;
301	}
302
303	if (! $text) {
304	return $text;
305	}
306	# escape html-sensitive characters
307	$text =~ s@&@&@g;
308	$text =~ s@<@<@g;
309	$text =~ s@>@>@g;
310
311	# do this before accents, since \= means something different in tabbing
312	# also \> is a tab stop too, and \\ is newline
313	sub do_tabbing {
314	my $tabbing=shift;
315	$tabbing =~ s!^.\\kill\s$!!mg; # \kill sets tab stops, kills line
316	$tabbing =~ s~\\(?:=\|>)~\xc2\xa0~g; # replace with nbsp
317	$tabbing =~ s~[\\][\\](?:\[.?\])?\s$~<br/>~mg;
318	return "<br/>" . $tabbing . "<br/>\n";
319	}
320	$text =~ s@\\begin{tabbing}(.*?)\\end{tabbing}@do_tabbing($1)@ges;
321	sub do_tabular {
322	my $tabular=shift;
323	$tabular =~ s~(?<!\\)\s&\s~</td><td>~g;
324	$tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
325	$tabular =~ s~\\hline~~g; # for now...
326	$tabular =~ s~<td>\s\\multicolumn{(\d+)}{.?}~<td colspan="$1">~g;
327	return "<table border=\"1\">\n <tr><td>"
328	. $tabular . "</td></tr></table>\n";
329	}
330	$text =~ s@\\begin{tabular}(?:\[.?\])?{.?}(.*?)\\end{tabular} @
331	do_tabular($1) @xges;
332
333	$text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
334
335	# process maths mode before accents... things like \, mean different!
336	# maths mode
337	$text =~ s@\$\$(.*?)\$\$
338	@ process_latex_math($html_markup,$1)
339	@xsge; # multi-line maths: $$ .... $$
340
341	$text =~ s@([^\\])\$(.*?[^\\])\$
342	@$1.process_latex_math($html_markup,$2)@xsge;
343
344
345	# is this an amstext environment, or just custom for that input file?
346	$text =~ s@\\begin{(algorithm)}(.*?)\\end{\1}@remove_equals($2)@ges;
347
348	# convert latex-style accented characters.
349	$self->latex_accents_to_utf8(\$text);
350
351	# replace quotes with utf-8
352
353	$text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
354	$text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
355	$text =~ s/`/\xe2\xc0\x98/g; # single left quote
356	$text =~ s/'/\xe2\xc0\x99/g; # single right quote
357
358	###### remove/replace latex commands ######
359	### commands that expand to something that gets displayed ###
360	$text =~ s~\\ldots~…~g;
361	$text =~ s~\\hrule~<hr/>\n~g;
362	$text =~ s~\\maketitle~ ~;
363	### space commands ###
364	$text =~ s~\\[vh]skip\s+\S+~~g;
365	$text =~ s~\\vspace\?{.?}~<div> </div>~g; # vertical space
366	$text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
367	$text =~ s~\\noindent\b~~g;
368	# newpage, etc
369	$text =~ s~\\(?:clearemptydoublepage\|newpage)~~g;
370	### counters, contents, environments, labels, etc ###
371	$text =~ s~\\(?:addcontentsline){.?}{.?}{.*}~~g;
372	$text =~ s~\s\\begin{itemize}\s~\n<ul>\n~g;
373	$text =~ s~\s\\end{itemize}\s~</li></ul>\n~g;
374	$text =~ s~\s\\begin{enumerate}\s~<ol>\n~g;
375	$text =~ s~\s\\end{enumerate}\s~</li></ol>\n~g;
376	if ($text =~ s~\s*\\item~</li>\n<li>~g) {
377	# (count for first list item)
378	$text =~ s~<([ou])l>\s</li>\s~<$1l>~g;
379	}
380	$text =~ s~\\(?:label\|begin\|end){.?}\s\n?~ ~g; # remove tag and contents
381	$text =~ s~\\(?:tableofcontents\|listoffigures)~ ~g;
382	### font sizes/styles ###
383	$text =~ s~\\(?:tiny\|small\|footnotesize\|normalsize\|large\|Large\|huge\|Huge)\b~~g;
384
385	if ($html_markup) {
386	$text =~ s~\\section\?{([^\}]+)}\s\n?~<H1>$1</H1>\n~g;
387	$text =~ s~\\subsection\?{(.?)}\s*\n?~<H2>$1</H2>\n~g;
388	$text =~ s~{\\tt\s(.?)}~<tt>$1</tt>~g;
389	$text =~ s~\\(?:texttt\|tt\|ttseries)\s{(.?)}~<tt>$1</tt>~g;
390	$text =~ s~\\emph{(.*?)}~<em>$1</em>~g;
391	$text =~ s~{\\(?:em\|it)\s(.?)}~<em>$1</em>~g;
392	$text =~ s~{\\(?:bf\|bfseries)\s(.?)}~<strong>$1</strong>~g;
393	$text =~ s~\\(?:textbf\|bf\|bfseries)\s{(.?)}~<strong>$1</strong>~g;
394	} else {
395	# remove tags for text-only
396	$text =~ s~\\(?:textbf\|bf\|bfseries\|em\|emph\|tt\|rm\|texttt)\b~ ~g;
397	}
398	$text =~ s ~ {\\sc\s+(.*?)} ~
399	{<span style="font-variant:\ small-caps">$1</span>} ~gx;
400	# ignore these font tags (if there are any left)
401	# sf is sans-serif
402	$text =~ s~\\(?:mdseries\|textmd\|bfseries\|textbf\|sffamily\|sf\|sc)\b~ ~;
403	#### end font-related stuff ####
404
405	### remove all other commands with optional arguments... ###
406	# don't remove commands without { }....
407	# $text =~ s~\\\w+(\[.?\])?\s~~g;
408	# $text =~ s~\\noopsort{[^}]+\}~~g;
409	# verbatim
410	$text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
411	# remove tags, keep contents for \tag[optional]{contents}
412	while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
413
414	# remove latex groupings { } (but not \{ or \} )
415	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
416	$text =~ s/^\{//; # remove { if first char
417
418	# latex characters
419	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
420	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
421	# optional break "\-"
422	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
423	# the non-macro bits have quotes around them - we just remove quotes
424	# XXX bibtex and latex differ here (for the '#' char)...
425	$text =~ s/([^&])[\"\#]/$1/g;
426	}
427	# dashes. Convert (m\|n)-dash into single dash for html.
428	$text =~ s~\-\-+~\-~g;
429
430	# quoted { } chars
431	$text =~ s~\\{~{~g;
432	$text =~ s~\\}~}~g;
433
434	# spaces
435	$text =~ s~\\ ~ ~g;
436
437	# finally to protect against macro language...
438	# greenstone-specific
439	$text =~ s~\[~&\#91;~g;
440	$text =~ s~\]~&\#93;~g;
441	$text =~ s~(?<!\\)([\\_])~\\$1~g;
442
443	if ($html_markup) {
444	$text =~ s~\n{2,}~\n</p>\n<p>~g;
445	return "<p>$text</p>";
446	}
447
448	return $text;
449	}
450
451	# only used by process_latex for \verb....
452	sub verb_text {
453	my $verbatim=shift;
454	$verbatim =~ s/([{}_])/\\$1/g;
455	return $verbatim;
456	}
457	# only used by process_latex_math
458	# returns a unicode char if applicable, otherwise ascii
459	sub math_fraction {
460	my $num=$1;
461	my $denom=$2;
462
463	if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
464	if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
465	if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
466	return "$num/$denom";
467	}
468
469	sub process_latex_math {
470
471	my $text = pop; # if given one or two args, this is the last one...
472	my $html_markup=pop; # if given two args, this is the first one else undef
473
474	$text =~ s~\\,~ ~g; # forces a space?
475	$text =~ s~\\infty~infinity~g; # or unicode 0x221E...
476
477	# use this one when more things can read 3-byte utf8 values like this!
478	# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
479	$text =~ s~\\cup\b~ U ~g;
480
481	$text =~ s~\\frac\s*{(.+?)}{(.+?)}~math_fraction($1,$2)~ge;
482
483	if ($html_markup) {
484	$text =~ s~\^{(.*?)}~<sup>$1</sup>~g; # a^b superscript
485	$text =~ s~\^([^\{])~<sup>$1</sup>~g;
486	$text =~ s~\_{(.*?)}~<sub>$1</sub>~g; # a_b subscript
487	$text =~ s~\_([^\{])~<sub>$1</sub>~g;
488
489	$text =~ s~\\ldots~…~g; # use html named entity for now
490
491	# put all other command names in italics for now
492	$text =~ s~\\([\w]+)~<i>$1</i> ~g;
493	}
494
495	# special cases, for some input files
496	if ($text =~ m~^\\\w+$~) {
497	$text="{" . $text . "}";
498	}
499
500	return $text;
501	}
502
503
504
505	sub latex_accents_to_utf8 {
506
507	# note - this is really ugly, but it works. There may be a prettier way
508	# of mapping latex accented chars to utf8, but we just brute force it here.
509	# Also, this isn't complete - not every single possible accented letter
510	# is in here yet, but most of the common ones are.
511
512	my %utf8_chars =
513	(
514	# acutes
515	'\'a' => chr(0xc3).chr(0xa1),
516	'\'c' => chr(0xc4).chr(0x87),
517	'\'e' => chr(0xc3).chr(0xa9),
518	'\'i' => chr(0xc3).chr(0xad),
519	'\'l' => chr(0xc3).chr(0xba),
520	'\'n' => chr(0xc3).chr(0x84),
521	'\'o' => chr(0xc3).chr(0xb3),
522	'\'r' => chr(0xc5).chr(0x95),
523	'\'s' => chr(0xc5).chr(0x9b),
524	'\'u' => chr(0xc3).chr(0xba),
525	'\'y' => chr(0xc3).chr(0xbd),
526	'\'z' => chr(0xc5).chr(0xba),
527	# graves
528	'`a' => chr(0xc3).chr(0xa0),
529	'`A' => chr(0xc3).chr(0x80),
530	'`e' => chr(0xc3).chr(0xa8),
531	'`E' => chr(0xc3).chr(0x88),
532	'`i' => chr(0xc3).chr(0xac),
533	'`I' => chr(0xc3).chr(0x8c),
534	'`o' => chr(0xc3).chr(0xb2),
535	'`O' => chr(0xc3).chr(0x92),
536	'`u' => chr(0xc3).chr(0xb9),
537	'`U' => chr(0xc3).chr(0x99),
538	# circumflex
539	'^a' => chr(0xc3).chr(0xa2),
540	'^A' => chr(0xc3).chr(0x82),
541	'^c' => chr(0xc4).chr(0x89),
542	'^C' => chr(0xc4).chr(0x88),
543	'^e' => chr(0xc3).chr(0xaa),
544	'^E' => chr(0xc3).chr(0x8a),
545	'^g' => chr(0xc4).chr(0x9d),
546	'^G' => chr(0xc4).chr(0x9c),
547	'^h' => chr(0xc4).chr(0xa5),
548	'^H' => chr(0xc4).chr(0xa4),
549	'^i' => chr(0xc3).chr(0xae),
550	'^I' => chr(0xc3).chr(0x8e),
551	'^j' => chr(0xc4).chr(0xb5),
552	'^J' => chr(0xc4).chr(0xb4),
553	'^o' => chr(0xc3).chr(0xb4),
554	'^O' => chr(0xc3).chr(0x94),
555	'^s' => chr(0xc5).chr(0x9d),
556	'^S' => chr(0xc5).chr(0x9c),
557	'^u' => chr(0xc3).chr(0xa2),
558	'^U' => chr(0xc3).chr(0xbb),
559	'^w' => chr(0xc5).chr(0xb5),
560	'^W' => chr(0xc5).chr(0xb4),
561	'^y' => chr(0xc5).chr(0xb7),
562	'^Y' => chr(0xc5).chr(0xb6),
563
564	# diaeresis
565	'"a' => chr(0xc3).chr(0xa4),
566	'"A' => chr(0xc3).chr(0x84),
567	'"e' => chr(0xc3).chr(0xab),
568	'"E' => chr(0xc3).chr(0x8b),
569	'"i' => chr(0xc3).chr(0xaf),
570	'"I' => chr(0xc3).chr(0x8f),
571	'"\\\\i' => chr(0xc3).chr(0xaf),
572	'"\\\\I' => chr(0xc3).chr(0x8f),
573	'"o' => chr(0xc3).chr(0xb6),
574	'"O' => chr(0xc3).chr(0x96),
575	'"u' => chr(0xc3).chr(0xbc),
576	'"U' => chr(0xc3).chr(0x9c),
577	'"y' => chr(0xc3).chr(0xbf),
578	'"Y' => chr(0xc3).chr(0xb8),
579	# tilde
580	'~A' => chr(0xc3).chr(0x83),
581	'~N' => chr(0xc3).chr(0x91),
582	'~O' => chr(0xc3).chr(0x95),
583	'~a' => chr(0xc3).chr(0xa3),
584	'~n' => chr(0xc3).chr(0xb1),
585	'~o' => chr(0xc3).chr(0xb5),
586	# caron - handled specially
587	# double acute
588	# ring
589	# dot
590	'.c' => chr(0xc4).chr(0x8b),
591	'.C' => chr(0xc4).chr(0x8a),
592	'.e' => chr(0xc4).chr(0x97),
593	'.E' => chr(0xc4).chr(0x96),
594	'.g' => chr(0xc4).chr(0xa1),
595	'.G' => chr(0xc4).chr(0xa0),
596	'.I' => chr(0xc4).chr(0xb0),
597	'.z' => chr(0xc5).chr(0xbc),
598	'.Z' => chr(0xc5).chr(0xbb),
599	# macron
600	'=a' => chr(0xc4).chr(0x81),
601	'=A' => chr(0xc4).chr(0x80),
602	'=e' => chr(0xc4).chr(0x93),
603	'=E' => chr(0xc4).chr(0x92),
604	'=i' => chr(0xc4).chr(0xab),
605	'=I' => chr(0xc4).chr(0xaa),
606	'=o' => chr(0xc4).chr(0x8d),
607	'=O' => chr(0xc4).chr(0x8c),
608	'=u' => chr(0xc4).chr(0xab),
609	'=U' => chr(0xc4).chr(0xaa),
610
611	# stroke - handled specially - see below
612
613	# cedilla - handled specially
614	);
615
616	# these are one letter latex commands - we make sure they're not a longer
617	# command name. eg {\d} is d+stroke, so careful of \d
618	my %special_utf8_chars =
619	(
620	# breve
621	'u g' => chr(0xc4).chr(0x9f),
622	'u G' => chr(0xc4).chr(0x9e),
623	'u i' => chr(0xc4).chr(0xad),
624	'u I' => chr(0xc4).chr(0xac),
625	'u o' => chr(0xc5).chr(0x8f),
626	'u O' => chr(0xc5).chr(0x8e),
627	'u u' => chr(0xc5).chr(0xad),
628	'u U' => chr(0xc5).chr(0xac),
629	'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
630	'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
631	# caron
632	'v c' => chr(0xc4).chr(0x8d),
633	'v C' => chr(0xc4).chr(0x8c),
634	'v n' => chr(0xc5).chr(0x88),
635	'v N' => chr(0xc5).chr(0x87),
636	'v s' => chr(0xc5).chr(0xa1),
637	'v S' => chr(0xc5).chr(0xa5),
638	'v z' => chr(0xc5).chr(0xbe),
639	'v Z' => chr(0xc5).chr(0xbd),
640	# cedilla
641	'c c' => chr(0xc3).chr(0xa7),
642	'c C' => chr(0xc3).chr(0x87),
643	'c g' => chr(0xc4).chr(0xa3),
644	'c G' => chr(0xc4).chr(0xa2),
645	'c k' => chr(0xc4).chr(0xb7),
646	'c K' => chr(0xc4).chr(0xb6),
647	'c l' => chr(0xc4).chr(0xbc),
648	'c L' => chr(0xc4).chr(0xbb),
649	'c n' => chr(0xc5).chr(0x86),
650	'c N' => chr(0xc5).chr(0x85),
651	'c r' => chr(0xc5).chr(0x97),
652	'c R' => chr(0xc5).chr(0x96),
653	'c s' => chr(0xc5).chr(0x9f),
654	'c S' => chr(0xc5).chr(0x9e),
655	'c t' => chr(0xc5).chr(0xa3),
656	'c T' => chr(0xc5).chr(0xa2),
657	# double acute / Hungarian accent
658	'H O' => chr(0xc5).chr(0x90),
659	'H o' => chr(0xc5).chr(0x91),
660	'H U' => chr(0xc5).chr(0xb0),
661	'H u' => chr(0xc5).chr(0xb1),
662
663	# stroke
664	'd' => chr(0xc4).chr(0x91),
665	'D' => chr(0xc4).chr(0x90),
666	'h' => chr(0xc4).chr(0xa7),
667	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
668	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
669	'l' => chr(0xc5).chr(0x82),
670	'L' => chr(0xc5).chr(0x81),
671	'o' => chr(0xc3).chr(0xb8),
672	'O' => chr(0xc3).chr(0x98),
673	't' => chr(0xc5).chr(0xa7),
674	'T' => chr(0xc5).chr(0xa6),
675
676	# other special chars
677	'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
678	'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
679	);
680
681	my $self=shift;
682	my $textref=shift;
683
684	my $outhandle=$self->{'outhandle'};
685	my $text=$$textref;
686
687	# remove space (if any) between \ and letter to accent (eg {\' a})
688	$text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
689
690	# remove {} around a single character (eg \'{e})
691	$text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
692
693	## only in bibtex... not in latex proper?!
694	### \, is another way of doing cedilla \c
695	##$text =~ s~\\,(.)~\\c $1~g;
696
697	# remove {} around a single character for special 1 letter commands -
698	# need to insert a space. Eg \v{s} -> {\v s}
699	$text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
700
701	# only do if the text contains a '\' character.
702	if ($text =~ m\|\\\|) {
703	# "normal" accents - ie non-alpha latex tag
704	# xxx used to have ([\w]\b)@ (for word boundary)
705	while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
706	my $tex="$1$2"; my $char="$2";
707	my $replacement=$utf8_chars{$tex};
708	if (!defined($replacement)) {
709	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
710	print $outhandle . $self->{'plugin_type'} .
711	": Warning: unknown latex accent \"$tex\""
712	. " in \"$1\"\n";
713	$replacement=$char;
714	}
715	$text =~ s/\\\Q$tex/$replacement/g;
716	}
717
718	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
719	# only do the change if immediately followed by a space, }, {, or \
720	# one letter accents ( + ss / aa)
721	while ($text =~ m~\\([DdhiLlOoTt]\|ss\|aa)[{}\s\"\\]~) {
722	my $tex=$1;
723	my $replacement=$special_utf8_chars{$tex};
724	if (!defined($replacement)) {
725	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
726	print $outhandle $self->{'plugin_type'} .
727	": Warning: unknown latex accent \"$tex\""
728	. " in \"$1\"\n";
729	$replacement=$tex;
730	}
731	($text =~ s/{\\$tex}/$replacement/g) or
732	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
733
734	}
735
736	# one letter latex accent commands that affect following letter
737	while ($text =~ m~\\([uvcH]) ([\w])~) {
738	my $tex="$1 $2"; my $char="$2";
739	my $replacement=$special_utf8_chars{$tex};
740	if (!defined($replacement)) {
741	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
742	print $outhandle $self->{'plugin_type'} .
743	": Warning: unknown latex accent \"$tex\""
744	. " in \"$1\"\n";
745	$replacement=$char;
746	}
747	$text =~ s/\\$tex/$replacement/g;
748	}
749	}
750	$textref=\$text;
751	}
752
753
754	# modules must return true
755	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: