Context Navigation

source: trunk/gsdl/perllib/plugins/LaTeXPlug.pm@ 7668

Last change on this file since 7668 was 7559, checked in by kjdon, 20 years ago
added use BasPLug
Property svn:keywords set to `Author Date Id Revision`
File size: 22.2 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlug.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24
25	package LaTeXPlug;
26
27	use strict;
28	no strict 'refs'; # so we can print to a handle named by a variable
29
30	# greenstone packages
31	use BasPlug;
32	use parsargv;
33	use unicode;
34	use util;
35
# Establish inheritance from BasPlug at compile time, before any method
# resolution on LaTeXPlug objects can take place.
BEGIN {
    @LaTeXPlug::ISA = qw(BasPlug);
}
39
# Write the one-line usage banner for this plugin to STDERR.
sub print_usage {
    my $usage = "\n usage: plugin LaTeXPlug [options]\n\n";
    print STDERR $usage;
}
43
# Constructor.
#
# Builds the object through the BasPlug constructor, registers this
# plugin's (currently empty) option description on the shared option list,
# validates the remaining arguments with parsargv, and initialises the
# per-instance bookkeeping fields.
#
# Dies (after printing a usage message) when parsargv rejects the options.
sub new {
    my $class = shift (@_);

    # Direct method call instead of the indirect-object form
    # "new BasPlug (...)", which Perl can mis-parse; the call itself is
    # unchanged.
    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = 'LaTeXPlug';

    my $option_list = $self->{'option_list'};
    my $options = { 'name'     => 'LaTeXPlug',
                    'desc'     => '{LaTeXPlug.desc}',
                    'abstract' => 'no',
                    'inherits' => 'yes',
                    'args'     => [
                                  ] # no arguments for now...
                  };

    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to LaTeXPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # don't specify which language bundle
        die "\n";
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;
    return bless $self, $class;
}
72
73
# Default regular expression (as a string) selecting the files this plugin
# processes: anything whose name ends in ".tex".
sub get_default_process_exp {
    my ($self) = @_;
    return '\.tex$';
}
78
# Default regular expression (as a string) for files to block from other
# plugins: .eps files are assumed to belong to the LaTeX document.
sub get_default_block_exp {
    my $block_exp = q{\.(?:eps)$};
    return $block_exp;
}
83
84
# Process one LaTeX document.
#
# Arguments (after $self):
#   $textref  - reference to the file contents; modified in place
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object that receives metadata and text
#   $gli      - true when progress should be reported in GLI's XML form
#
# Returns undef when the first 200 bytes do not look like LaTeX, 1 after the
# document has been processed.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $start = substr($$textref, 0, 200); # first 200 bytes

    # Alternation uses a real "|" (a backslash-escaped pipe would only match
    # a literal pipe character, so no LaTeX file would ever be recognised).
    if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
                        | chapter | contents | begin) ~x) {
        # this doesn't look like latex...
        # ("return undef" is kept as-is for callers that test the result)
        return undef;
    }

    my $outhandle = $self->{'outhandle'};
    if ($gli) {
        print STDERR "<Processing n='$file' p='LaTeXPlug'>\n";
    } elsif ($self->{'verbosity'} > 1) {
        print $outhandle "LaTeXPlug: processing $file\n"
    }
    my $cursection = $doc_obj->get_top_section();

    ###### clean up text ######
    $$textref =~ s/\r$//mg;          # remove dos ^M
    $$textref =~ s/(?<!\\)%.*$//mg;  # remove comments, but keep escaped \%

    # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin

    $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
    # check that both sides are ascii, so we don't screw up utf-8 chars
    # NOTE(review): confirm whether unicode::ascii2utf8 expects a plain
    # string (as passed here) or a reference.
    $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
        unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)

    ###### find metadata ######

    ### title metadata ###
    # Only read $1 when the match succeeded; otherwise it could still hold a
    # capture from an earlier, unrelated match.
    if ($$textref =~ m@\\title\s*\{(.*?)\}@s) {
        my $title = $1;
        if ($title) {
            $title =~ s@\\\\@ @g; # embedded newlines
            $title = $self->process_latex($title); # no "-html" for title eg in browser
            $doc_obj->add_utf8_metadata($cursection, "Title", $title);
        }
    }

    ### creator/individual author metadata ###
    if ($$textref =~ m@\\author\s*\{((?:\{.*?\}|.*?)+)\}\s*$@sm) {
        my $authors = $1;
        if ($authors) {
            # take care of "et al."...
            my $etal = "";
            if ($authors =~ s/(\s+et\.?\s+al\.?)\s*$//) {
                $etal = $1;
            }

            my @authorlist = parse_authors($self, $authors);

            foreach my $author (@authorlist) {
                # Add each name to set of Authors
                $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
            }

            # Only want at most one "and" in the Creator field
            my $creator_str = "";
            if (scalar(@authorlist) > 2) {
                my $lastauthor = pop @authorlist;
                $creator_str = join(', ', @authorlist);
                $creator_str .= " and $lastauthor";
            } else { # 1 or 2 authors...
                $creator_str = join(" and ", @authorlist);
            }
            $creator_str .= $etal; # if there was "et al."
            $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
        }
    }
    ### end of author metadata ###

    ###### process latex for the main text ######
    # Literal braces are escaped: modern perls reject or warn about an
    # unescaped "{" in a pattern.
    $$textref =~ s/^.*?\\begin\{document\}//s;
    $$textref =~ s/\\end\{document\}.*?$//s;
    $$textref = $self->process_latex("-html", $$textref);
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
165
166
167	# returns a list of author names
# Split a LaTeX \author{} string into individual names.
#
# Arguments:
#   $self    - plugin object (or hash ref); 'outhandle' and 'plugin_type'
#              are read for warning messages
#   $authors - raw author string; names separated by "and" (or German "und")
#
# Each name may be written "First von Last", "von Last, First" or
# "von Last, Jr, First" (see the "BibTeXing" manual, page 16).
#
# Returns a list of names, each normalised to "First von Last Jr" with
# single spaces and no leading/trailing whitespace.
sub parse_authors {
    my $self = shift;
    my $authors = shift;

    my $outhandle = $self->{'outhandle'};

    return () unless defined $authors;

    $authors =~ s/\n/ /g; # remove newlines

    # some people do this for affiliation footnote/dagger
    $authors =~ s@\$.*?\$@@g; # remove maths from author :(

    # und here for german language...
    # don't use brackets in pattern, else the matched bit becomes
    # an element in the list!
    my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
    my @formattedlist = ();

    foreach my $author (@authorlist) {
        $author =~ s/\s*$//;
        $author =~ s/^\s*//;
        next if $author =~ /^\s*$/;

        my $first = "";
        my $vonlast = "";
        my $jr = "";

        if ($author =~ /,/) {
            # "von Last, First" or "von Last, Jr, First"
            my @parts = split(/,\s*/, $author);
            $first = pop @parts;
            if (scalar(@parts) == 2) {
                $jr = pop @parts;
            }
            $vonlast = shift @parts;
            if (scalar(@parts) > 0) {
                print $outhandle $self->{'plugin_type'} .
                    ": couldn't parse name $author\n";
                # but we continue anyway...
            }
            # input such as "Smith," or "," leaves some parts undefined
            $first   = "" unless defined $first;
            $vonlast = "" unless defined $vonlast;
        } else { # First von Last
            my @words = split(/ /, $author);
            # Leading words belong to the first name until a word starting
            # with two or more lowercase letters (a "von" part) is reached.
            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
                $first .= " " . shift (@words);
            }
            $first =~ s/^\s//;
            $vonlast = join (' ', @words); # whatever's left...
        }

        my $von = "";
        if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
            if ($vonlast =~ s/^((?:[a-z]\w+\s+)+)//) {
                $von = $1;
                $von =~ s/\s*$//;
            } else {
                # some non-English names do start with lowercase
                # eg "Marie desJardins". Also we can get typos...
                print $outhandle $self->{'plugin_type'} .
                    ": couldn't parse surname $vonlast\n";
                if ($vonlast =~ /^[a-z]+$/) {
                    # if it's all lowercase, uppercase 1st.
                    $vonlast =~ s/^(.)/\u$1/;
                }
            }
        }
        my $last = $vonlast;

        # Join only the non-empty parts so a missing first name or suffix
        # does not leave stray spaces in the result.
        my $wholename = join(' ', grep { $_ ne "" } ($first, $von, $last, $jr));
        $wholename =~ s/\s+/ /g;
        $wholename =~ s/^ //;
        $wholename =~ s/ $//;

        push (@formattedlist, $wholename);
    }
    return @formattedlist;
}
246
247
248	## following functions based on bibtex plugin ##
249	# not actually used at the moment, but might be useful in future?
# Replace every three-letter month abbreviation in the given text with the
# macro-style token "_textmonthNN_" (NN = 01..12), as bibtex style files do.
# Entries can contain more than one month (eg ' month = jun # "-" # aug, ').
# The substitutions run in calendar order and match anywhere in the text.
sub expand_month {
    my ($text) = @_;

    my @abbreviations = qw(jan feb mar apr may jun jul aug sep oct nov dec);

    for my $index (0 .. $#abbreviations) {
        my $abbreviation = $abbreviations[$index];
        my $token = sprintf("_textmonth%02d_", $index + 1);
        $text =~ s/$abbreviation/$token/g;
    }

    return $text;
}
270
271
272	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
273	# as the first argument to this function.
274	#
275	# Convert accented characters, remove { }, interpret some commands....
276	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
277
278	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
279	# match { with the corresponding }, only the nearest }
280
# Helper for process_latex: convert the body of a tabbing environment.
# Expects text that has already been HTML-escaped (so "\>" arrives as
# "\&gt;"). In tabbing, \= and \> are tab stops and \\ is a newline.
sub do_tabbing {
    my $tabbing = shift;
    $tabbing =~ s!^.*\\kill\s*$!!mg;            # \kill sets tab stops, kills line
    $tabbing =~ s~\\(?:=|>|&gt;)~\xc2\xa0~g;    # replace tab stops with nbsp
    $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
    return "<br/>" . $tabbing . "<br/>\n";
}

# Helper for process_latex: convert the body of a tabular environment into
# an HTML table. Expects text that has already been HTML-escaped, so column
# separators arrive as "&amp;" (an escaped "\&" is left alone).
sub do_tabular {
    my $tabular = shift;
    $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
    $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
    $tabular =~ s~\\hline~~g; # for now...
    $tabular =~ s~<td>\s*\\multicolumn\{(\d+)\}\{.*?\}~<td colspan="$1">~g;
    return "<table border=\"1\">\n <tr><td>"
        . $tabular . "</td></tr></table>\n";
}

# Helper for process_latex: body of an "algorithm" environment.
# NOTE(review): remove_equals() is not defined in this file. It is called
# when some other code provides it; otherwise the body is passed through
# unchanged rather than dying at runtime.
sub _algorithm_body {
    my $body = shift;
    return defined(&remove_equals) ? remove_equals($body) : $body;
}

# Convert a fragment of LaTeX into plain text or simple HTML.
#
# Usage:  $self->process_latex($text)
#         $self->process_latex("-html", $text)
#
# With "-html" as the first argument basic markup is kept as HTML
# (\emph -> <em>, \section -> <H1>, ...) and the result is wrapped in <p>
# paragraphs; without it, formatting commands are simply dropped.
#
# Converts accented characters, removes { }, interprets some commands.
# This is not comprehensive, and braces are not matched with their partner,
# only with the nearest closing brace, so nested commands are handled badly.
#
# Returns the converted text (an empty or undefined input is returned as is).
sub process_latex {
    my $self = shift;
    my $text = shift;

    my $outhandle = $self->{'outhandle'};

    my $html_markup = 0;
    if (defined($text) && $text =~ /^\-html/) {
        $html_markup = 1;
        $text = shift;
    }

    if (! $text) {
        return $text;
    }

    # escape html-sensitive characters
    $text =~ s{&}{&amp;}g;
    $text =~ s{<}{&lt;}g;
    $text =~ s{>}{&gt;}g;

    # do this before accents, since \= means something different in tabbing
    # also \> is a tab stop too, and \\ is newline
    $text =~ s@\\begin\{tabbing\}(.*?)\\end\{tabbing\}@do_tabbing($1)@ges;
    $text =~ s@\\begin\{tabular\}(?:\[.*?\])?\{.*?\}(.*?)\\end\{tabular\}@do_tabular($1)@ges;

    $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\

    # process maths mode before accents... things like \, mean different!
    # multi-line maths: $$ .... $$
    $text =~ s@\$\$(.*?)\$\$@process_latex_math($html_markup,$1)@sge;
    # inline maths: $ ... $ (but not \$)
    $text =~ s@([^\\])\$(.*?[^\\])\$@$1.process_latex_math($html_markup,$2)@sge;

    # is this an amstext environment, or just custom for that input file?
    $text =~ s@\\begin\{(algorithm)\}(.*?)\\end\{\1\}@_algorithm_body($2)@ges;

    # convert latex-style accented characters.
    $self->latex_accents_to_utf8(\$text);

    # replace quotes with utf-8 (U+201C, U+201D, U+2018, U+2019); the
    # second byte of each sequence is 0x80 - 0xc0 is not valid UTF-8 there
    $text =~ s/``/\xe2\x80\x9c/g; # Latex-specific, left-dbl quote
    $text =~ s/''/\xe2\x80\x9d/g; # Latex-specific, right-dbl quote
    $text =~ s/`/\xe2\x80\x98/g;  # single left quote
    $text =~ s/'/\xe2\x80\x99/g;  # single right quote

    ###### remove/replace latex commands ######
    ### commands that expand to something that gets displayed ###
    $text =~ s~\\ldots~&hellip;~g;
    $text =~ s~\\hrule~<hr/>\n~g;
    $text =~ s~\\maketitle~ ~;
    ### space commands ###
    $text =~ s~\\[vh]skip\s+\S+~~g;
    $text =~ s~\\vspace\*?\{.*?\}~<div>&nbsp;</div>~g; # vertical space
    $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
    $text =~ s~\\noindent\b~~g;
    # newpage, etc
    $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
    ### counters, contents, environments, labels, etc ###
    $text =~ s~\\(?:addcontentsline)\{.*?\}\{.*?\}\{.*\}~~g;
    $text =~ s~\s*\\begin\{itemize\}\s*~\n<ul>\n~g;
    $text =~ s~\s*\\end\{itemize\}\s*~</li></ul>\n~g;
    $text =~ s~\s*\\begin\{enumerate\}\s*~<ol>\n~g;
    $text =~ s~\s*\\end\{enumerate\}\s*~</li></ol>\n~g;
    if ($text =~ s~\s*\\item~</li>\n<li>~g) {
        # (count for first list item)
        $text =~ s~<([ou])l>\s*</li>\s*~<${1}l>~g;
    }
    $text =~ s~\\(?:label|begin|end)\{.*?\}\s*\n?~ ~g; # remove tag and contents
    $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
    ### font sizes/styles ###
    $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;

    if ($html_markup) {
        $text =~ s~\\section\*?\{([^\}]+)\}\s*\n?~<H1>$1</H1>\n~g;
        $text =~ s~\\subsection\*?\{(.*?)\}\s*\n?~<H2>$1</H2>\n~g;
        $text =~ s~\{\\tt\s*(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\(?:texttt|tt|ttseries)\s*\{(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\emph\{(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\em\s*(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\(?:bf|bfseries)\s*(.*?)\}~<strong>$1</strong>~g;
        $text =~ s~\\(?:textbf|bf|bfseries)\s*\{(.*?)\}~<strong>$1</strong>~g;
    } else {
        # remove tags for text-only
        $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
    }
    $text =~ s~\{\\sc\s+(.*?)\}~<span style="font-variant: small-caps">$1</span>~g;
    # ignore these font tags (if there are any left)
    # sf is sans-serif
    $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~g;
    #### end font-related stuff ####

    ### remove all other commands with optional arguments... ###
    # don't remove commands without { }....
    # verbatim: \verb<delim>content<delim>; the content is capture 2
    $text =~ s~\\verb(.)(.*?)\1~verb_text($2)~ge;
    # remove tags, keep contents for \tag[optional]{contents}
    1 while ($text =~ s~\\\w+(\[.*?\])?\{([^}]+)\}~$2 ~g); # all other commands

    # remove latex groupings { } (but not \{ or \} )
    1 while ($text =~ s/([^\\])[\{\}]/$1/g); # repeat: needed for "...}{..."
    $text =~ s/^\{//; # remove { if first char

    # latex characters
    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
    $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
    # optional break "\-"
    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
        # the non-macro bits have quotes around them - we just remove quotes
        # XXX bibtex and latex differ here (for the '#' char)...
        $text =~ s/([^&])[\"\#]/$1/g;
    }
    # dashes. Convert (m|n)-dash into single dash for html.
    $text =~ s~\-\-+~-~g;

    # quoted { } chars
    $text =~ s~\\\{~{~g;
    $text =~ s~\\\}~}~g;

    # spaces
    $text =~ s~\\ ~ ~g;

    # finally to protect against macro language...
    # greenstone-specific
    $text =~ s~\[~&#91;~g;
    $text =~ s~\]~&#93;~g;
    $text =~ s~(?<!\\)([\\_])~\\$1~g;

    if ($html_markup) {
        $text =~ s~\n{2,}~\n</p>\n<p>~g;
        return "<p>$text</p>";
    }

    return $text;
}
440
441	# only used by process_latex for \verb....
# Protect the contents of a \verb command: braces and underscores are
# prefixed with a backslash. Only used by process_latex.
sub verb_text {
    my ($raw) = @_;
    (my $protected = $raw) =~ s/([{}_])/\\$1/g;
    return $protected;
}
447
448
# Convert the contents of a maths-mode section ($...$ or $$...$$).
#
# Usage:  process_latex_math($text)
#         process_latex_math($html_markup, $text)
#
# When $html_markup is true, superscripts and subscripts become <sup>/<sub>
# and remaining command names are set in italics. Returns the converted
# text; an undefined text is returned unchanged.
sub process_latex_math {

    my $text = pop;        # if given one or two args, this is the last one...
    my $html_markup = pop; # if given two args, this is the first one else undef

    return $text unless defined $text;

    $text =~ s~\\,~ ~g;             # forces a space?
    $text =~ s~\\infty~infinity~g;  # or unicode 0x221E...

    # a real union operator (unicode 0x222a, utf-8 \xe2\x88\xaa) could be
    # used here once more things can display it
    $text =~ s~\\cup\b~ U ~g;

    if ($html_markup) {
        # literal braces are escaped: modern perls reject or warn about an
        # unescaped "{" in a pattern
        $text =~ s~\^\{(.*?)\}~<sup>$1</sup>~g; # a^{b} superscript
        $text =~ s~\^([^\{])~<sup>$1</sup>~g;   # a^b
        $text =~ s~\_\{(.*?)\}~<sub>$1</sub>~g; # a_{b} subscript
        $text =~ s~\_([^\{])~<sub>$1</sub>~g;   # a_b

        $text =~ s~\\ldots~&hellip;~g; # use html named entity for now

        # put all other command names in italics for now
        $text =~ s~\\([\w]+)~<i>$1</i> ~g;
    }

    # special cases, for some input files: a lone command is wrapped in
    # braces
    if ($text =~ m~^\\\w+$~) {
        $text = "{" . $text . "}";
    }

    return $text;
}
480
481
482
# Replace LaTeX accent commands with the corresponding UTF-8 byte sequences.
#
# Arguments:
#   $self    - plugin object (or hash ref); 'outhandle' and 'plugin_type'
#              are read for warning messages
#   $textref - reference to the text; the referenced string is updated in
#              place
#
# Returns $textref.
#
# Unknown accent/letter combinations produce a warning on the output handle
# and are replaced by the bare letter. The mapping is brute force and not
# complete - not every possible accented letter is listed, but most of the
# common ones are.
sub latex_accents_to_utf8 {
    my $self = shift;
    my $textref = shift;

    # accent commands written with a non-alphabetic character: \'e \"o ...
    my %utf8_chars =
        (
         # acutes
         "'a" => "\xc3\xa1",  "'A" => "\xc3\x81",
         "'c" => "\xc4\x87",
         "'e" => "\xc3\xa9",  "'E" => "\xc3\x89",
         "'i" => "\xc3\xad",  "'I" => "\xc3\x8d",
         "'l" => "\xc4\xba",
         "'n" => "\xc5\x84",
         "'o" => "\xc3\xb3",  "'O" => "\xc3\x93",
         "'r" => "\xc5\x95",
         "'s" => "\xc5\x9b",
         "'u" => "\xc3\xba",  "'U" => "\xc3\x9a",
         "'y" => "\xc3\xbd",  "'Y" => "\xc3\x9d",
         "'z" => "\xc5\xba",
         # graves
         '`a' => "\xc3\xa0",  '`A' => "\xc3\x80",
         '`e' => "\xc3\xa8",  '`E' => "\xc3\x88",
         '`i' => "\xc3\xac",  '`I' => "\xc3\x8c",
         '`o' => "\xc3\xb2",  '`O' => "\xc3\x92",
         '`u' => "\xc3\xb9",  '`U' => "\xc3\x99",
         # circumflex
         '^a' => "\xc3\xa2",  '^A' => "\xc3\x82",
         '^c' => "\xc4\x89",  '^C' => "\xc4\x88",
         '^e' => "\xc3\xaa",  '^E' => "\xc3\x8a",
         '^g' => "\xc4\x9d",  '^G' => "\xc4\x9c",
         '^h' => "\xc4\xa5",  '^H' => "\xc4\xa4",
         '^i' => "\xc3\xae",  '^I' => "\xc3\x8e",
         '^j' => "\xc4\xb5",  '^J' => "\xc4\xb4",
         '^o' => "\xc3\xb4",  '^O' => "\xc3\x94",
         '^s' => "\xc5\x9d",  '^S' => "\xc5\x9c",
         '^u' => "\xc3\xbb",  '^U' => "\xc3\x9b",
         '^w' => "\xc5\xb5",  '^W' => "\xc5\xb4",
         '^y' => "\xc5\xb7",  '^Y' => "\xc5\xb6",
         # diaeresis
         '"a' => "\xc3\xa4",  '"A' => "\xc3\x84",
         '"e' => "\xc3\xab",  '"E' => "\xc3\x8b",
         '"i' => "\xc3\xaf",  '"I' => "\xc3\x8f",
         '"o' => "\xc3\xb6",  '"O' => "\xc3\x96",
         '"u' => "\xc3\xbc",  '"U' => "\xc3\x9c",
         '"y' => "\xc3\xbf",  '"Y' => "\xc5\xb8",
         # tilde
         '~A' => "\xc3\x83",  '~a' => "\xc3\xa3",
         '~N' => "\xc3\x91",  '~n' => "\xc3\xb1",
         '~O' => "\xc3\x95",  '~o' => "\xc3\xb5",
         # dot
         '.c' => "\xc4\x8b",  '.C' => "\xc4\x8a",
         '.e' => "\xc4\x97",  '.E' => "\xc4\x96",
         '.g' => "\xc4\xa1",  '.G' => "\xc4\xa0",
         '.I' => "\xc4\xb0",
         '.z' => "\xc5\xbc",  '.Z' => "\xc5\xbb",
         # macron
         '=a' => "\xc4\x81",  '=A' => "\xc4\x80",
         '=e' => "\xc4\x93",  '=E' => "\xc4\x92",
         '=i' => "\xc4\xab",  '=I' => "\xc4\xaa",
         '=o' => "\xc5\x8d",  '=O' => "\xc5\x8c",
         '=u' => "\xc5\xab",  '=U' => "\xc5\xaa",
         # caron, double acute, stroke, cedilla - handled specially below
         );

    # these are one letter latex commands - we make sure they're not a longer
    # command name. eg {\d} is d+stroke, so careful of \d
    my %special_utf8_chars =
        (
         # breve
         'u g' => "\xc4\x9f",  'u G' => "\xc4\x9e",
         'u i' => "\xc4\xad",  'u I' => "\xc4\xac",
         'u o' => "\xc5\x8f",  'u O' => "\xc5\x8e",
         'u u' => "\xc5\xad",  'u U' => "\xc5\xac",
         'u z' => "\xc5\xbe",  # !!! no such char, but common mistake
         'u Z' => "\xc5\xbd",  # used instead of v Z !!!
         # caron
         'v c' => "\xc4\x8d",  'v C' => "\xc4\x8c",
         'v n' => "\xc5\x88",  'v N' => "\xc5\x87",
         'v s' => "\xc5\xa1",  'v S' => "\xc5\xa0",
         'v z' => "\xc5\xbe",  'v Z' => "\xc5\xbd",
         # cedilla
         'c c' => "\xc3\xa7",  'c C' => "\xc3\x87",
         'c g' => "\xc4\xa3",  'c G' => "\xc4\xa2",
         'c k' => "\xc4\xb7",  'c K' => "\xc4\xb6",
         'c l' => "\xc4\xbc",  'c L' => "\xc4\xbb",
         'c n' => "\xc5\x86",  'c N' => "\xc5\x85",
         'c r' => "\xc5\x97",  'c R' => "\xc5\x96",
         'c s' => "\xc5\x9f",  'c S' => "\xc5\x9e",
         'c t' => "\xc5\xa3",  'c T' => "\xc5\xa2",
         # double acute / Hungarian accent
         'H O' => "\xc5\x90",  'H o' => "\xc5\x91",
         'H U' => "\xc5\xb0",  'H u' => "\xc5\xb1",
         # stroke
         'd' => "\xc4\x91",    'D' => "\xc4\x90",
         'h' => "\xc4\xa7",
         # 'H' (H with stroke, \xc4\xa6) is left out: \H normally means
         # the Hungarian umlaut
         'i' => "\xc4\xb1",    # dotless lowercase i
         'l' => "\xc5\x82",    'L' => "\xc5\x81",
         'o' => "\xc3\xb8",    'O' => "\xc3\x98",
         't' => "\xc5\xa7",    'T' => "\xc5\xa6",
         # other special chars
         'ss' => "\xc3\x9f",   # german ss/szlig/sharp s
         'aa' => "\xc3\xa5",   # scandinavian/latin a with ring
         );

    my $outhandle = $self->{'outhandle'};
    my $text = $$textref;

    # every rewrite below needs a backslash, so without one there is
    # nothing to do
    return $textref unless (defined($text) && $text =~ m|\\|);

    # Report an accent we have no mapping for, with up to 20 characters of
    # context either side.
    my $warn_unknown = sub {
        my ($tex) = @_;
        my $context = "\\$tex";
        if ($text =~ m~(.{0,20}\\\Q$tex\E.{0,20})~s) {
            $context = $1;
        }
        print {$outhandle} $self->{'plugin_type'} .
            ": Warning: unknown latex accent \"$tex\""
            . " in \"$context\"\n";
    };

    # remove space (if any) between \ and letter to accent (eg {\' a})
    $text =~ s!(\\[`'="^~.])\s+(\w)\b!$1$2!g;

    # remove {} around a single character (eg \'{e})
    $text =~ s!(\\[`'="^~.])\{(\w)\}!{$1$2}!g;

    # remove {} around a single character for special 1 letter commands -
    # need to insert a space. Eg \v{s} -> {\v s}
    $text =~ s~(\\[uvcH])\{(\w)\}~{$1 $2}~g;

    # "normal" accents - ie non-alpha latex tag
    while ($text =~ m/\\([`'="^~.])(\w)/) {
        my $tex = "$1$2";
        my $char = "$2";
        my $replacement = $utf8_chars{$tex};
        if (!defined($replacement)) {
            $warn_unknown->($tex);
            $replacement = $char;
        }
        $text =~ s/\\\Q$tex\E/$replacement/g;
    }

    # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
    # only do the change if immediately followed by a space, }, {, " or \
    # one letter accents ( + ss / aa)
    while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
        my $tex = $1;
        my $replacement = $special_utf8_chars{$tex};
        if (!defined($replacement)) {
            $warn_unknown->($tex);
            $replacement = $tex;
        }
        ($text =~ s/\{\\$tex\}/$replacement/g) or
            $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
    }

    # one letter latex accent commands that affect following letter
    while ($text =~ m~\\([uvcH]) (\w)~) {
        my $tex = "$1 $2";
        my $char = "$2";
        my $replacement = $special_utf8_chars{$tex};
        if (!defined($replacement)) {
            $warn_unknown->($tex);
            $replacement = $char;
        }
        $text =~ s/\\\Q$tex\E/$replacement/g;
    }

    # store the result in the caller's string (assigning a new reference to
    # $textref would only change the local variable)
    $$textref = $text;
    return $textref;
}
728
729
730	# modules must return true
731	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: