Context Navigation

source: trunk/gsdl/perllib/plugins/LaTeXPlug.pm@ 9143

Last change on this file since 9143 was 8121, checked in by chi, 20 years ago
Add the "FileFormat" metadata to each of the Plugins.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.1 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlug.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlug;
29
30	use strict;
31	no strict 'refs'; # so we can print to a handle named by a variable
32
33	# greenstone packages
34	use BasPlug;
35	use parsargv;
36	use unicode;
37	use util;
38
# Compile-time setup: make LaTeXPlug a subclass of BasPlug.
BEGIN {
    @LaTeXPlug::ISA = qw(BasPlug);
}
42
# Write the short usage banner for this plugin to STDERR.
sub print_usage {
    my $banner = "\n usage: plugin LaTeXPlug [options]\n\n";
    print STDERR $banner;
}
46
# Constructor.
#
# Builds the object through BasPlug's constructor, appends this plugin's
# (currently argument-less) option description to the object's option list,
# checks the remaining arguments with parsargv, and initialises the
# per-instance counters before blessing the object into $class.
# Dies (after printing a usage message) if the arguments cannot be parsed.
sub new {
    my $class = shift(@_);

    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = 'LaTeXPlug';

    # description of this plugin and its options
    my %plugin_options = (
        'name'     => 'LaTeXPlug',
        'desc'     => '{LaTeXPlug.desc}',
        'abstract' => 'no',
        'inherits' => 'yes',
        'args'     => [],    # no arguments for now...
    );
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, \%plugin_options);

    unless (parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to LaTeXPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");    # don't specify which language bundle
        die "\n";
    }

    @{$self}{ 'aux_files', 'dir_num', 'file_num' } = ({}, 0, 0);

    return bless($self, $class);
}
75
76
# Regular expression (as a string) selecting the files this plugin
# processes by default: anything ending in ".tex".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.tex$';
}
81
# Regular expression (as a string) of files to block by default.
# Any .eps file is assumed to belong to the LaTeX document.
sub get_default_block_exp {
    my $eps_pattern = '\.(?:eps)$';
    return $eps_pattern;
}
86
87
# Turn the contents of one LaTeX file into text and metadata on $doc_obj.
#
# Arguments (after $self):
#   $textref  - reference to the file contents; rewritten in place
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used in progress messages and as fallback title
#   $doc_obj  - document object that receives metadata and text
#   $gli      - when true, progress is reported to STDERR in tagged form
#
# Returns undef when the first 200 bytes do not look like LaTeX (so the
# file is left for another plugin), and 1 once the document is processed.
sub process {
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $start = substr($$textref, 0, 200);    # first 200 bytes

    if ($start !~ m~\\ (?: documentclass | documentstyle | input | section
                         | chapter | contents | begin ) ~x) {
        # this doesn't look like latex...
        return undef;
    }

    my $outhandle = $self->{'outhandle'};
    if ($gli) {
        print STDERR "<Processing n='$file' p='LaTeXPlug'>\n";
    }
    elsif ($self->{'verbosity'} > 1) {
        print $outhandle "LaTeXPlug: processing $file\n";
    }
    my $cursection = $doc_obj->get_top_section();

    ###### clean up text ######
    $$textref =~ s/\r$//mg;            # remove dos ^M
    $$textref =~ s/(?<!\\)%.*$//mg;    # remove comments, but keep an escaped \%

    # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
    $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g;    # latin nonbreaking space
    # check that both sides are ascii, so we don't screw up utf-8 chars
    $$textref =~ s@ (?<=[[:ascii:]]) ([\x80-\xff]) (?=[[:ascii:]]) @
        unicode::ascii2utf8($1) @egx;    # takes "extended ascii" (ie latin)

    ###### find metadata ######

    ## FileFormat metadata ##
    $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");

    ### title metadata ###
    # Captures are taken in list context so that a failed match yields undef
    # instead of a stale $1 left over from an earlier match.
    my ($title) = $$textref =~ m@\\title\s*\{(.*?)\}@s;
    if (!$title) {
        # no title tag. look for the first chapter/section heading
        ($title) = $$textref =~ m@\\(?:chapter|section)\*?\s*\{(.*?)\}@s;
    }
    if (!$title) {
        # no chapter/section heading tags either... use filename
        $title = $file;
        $title =~ s/\.tex$//i;
        $title =~ s/[-_.]/ /g;    # turn punctuation into spaces
    }
    if ($title) {
        $title =~ s@\\\\@ @g;                     # embedded newlines
        $title = $self->process_latex($title);    # no "-html" for title eg in browser
        $doc_obj->add_utf8_metadata($cursection, "Title", $title);
    }

    ### creator/individual author metadata ###
    my ($authors) = $$textref =~ m@\\author\s*\{((?:\{.*?\}|.*?)+)\}\s*$@sm;
    if ($authors) {
        # take care of "et al."; only trust $1 when the substitution matched
        my $etal = "";
        if ($authors =~ s/(\s+et\.?\s+al\.?)\s*$//) {
            $etal = $1;
        }

        my @authorlist = parse_authors($self, $authors);

        foreach my $author (@authorlist) {
            # Add each name to set of Authors
            $doc_obj->add_utf8_metadata($cursection, "Author", $author);
        }

        # Only want at most one "and" in the Creator field
        my $creator_str = "";
        if (scalar(@authorlist) > 2) {
            my $lastauthor = pop @authorlist;
            $creator_str = join(', ', @authorlist);
            $creator_str .= " and $lastauthor";
        }
        else {    # 1 or 2 authors...
            $creator_str = join(" and ", @authorlist);
        }
        $creator_str .= $etal;    # if there was "et al."
        $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
    }
    ### end of author metadata ###

    ###### process latex for the main text ######
    $$textref =~ s/^.*?\\begin\{document\}//s;
    $$textref =~ s/\\end\{document\}.*?$//s;
    $$textref = $self->process_latex("-html", $$textref);
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
182
183
# Split an \author{...} string into individual, normalised names.
#
# Arguments:
#   $self    - plugin object; 'outhandle' and 'plugin_type' are used for
#              warning messages only
#   $authors - raw author text; names are separated by "and" (or "und")
#
# Returns a list of names, each formatted as "First von Last Jr" with
# single spaces and no leading/trailing whitespace.
sub parse_authors {
    my $self    = shift;
    my $authors = shift;

    my $outhandle = $self->{'outhandle'};

    $authors =~ s/\n/ /g;    # remove newlines

    # some people do this for affiliation footnote/dagger
    $authors =~ s@\$.*?\$@@g;    # remove maths from author :(

    # und here for german language...
    # don't use brackets in pattern, else the matched bit becomes
    # an element in the list!
    my @authorlist    = split(/\s+and\s+|\s+und\s+/, $authors);
    my @formattedlist = ();

    foreach my $author (@authorlist) {
        $author =~ s/\s*$//;
        $author =~ s/^\s*//;
        next if $author =~ /^\s*$/;

        # names are "First von Last", "von Last, First"
        # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
        my $first   = "";
        my $vonlast = "";
        my $jr      = "";

        if ($author =~ /,/) {
            my @parts = split(/,\s*/, $author);
            $first = pop @parts;
            if (scalar(@parts) == 2) {
                $jr = pop @parts;
            }
            $vonlast = shift @parts;
            if (scalar(@parts) > 0) {
                print {$outhandle} $self->{'plugin_type'}
                    . ": couldn't parse name $author\n";
                # but we continue anyway...
            }
            # "Name," or "," leave pieces undefined
            $first   = "" unless defined $first;
            $vonlast = "" unless defined $vonlast;
        }
        else {    # First von Last
            my @words = split(/ /, $author);
            # leading words are first names until one starts with two or
            # more lowercase letters (the "von" part) or only one is left
            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
                $first .= " " . shift(@words);
            }
            $first =~ s/^\s//;
            $vonlast = join(' ', @words);    # whatever's left...
        }

        my $von = "";
        if ($vonlast =~ m/^[a-z]/) {    # lowercase implies "von"
            if ($vonlast =~ s/^((?:[a-z]\w+\s+)+)//) {
                $von = $1;
                $von =~ s/\s*$//;
            }
            else {
                # some non-English names do start with lowercase
                # eg "Marie desJardins". Also we can get typos...
                print {$outhandle} $self->{'plugin_type'}
                    . ": couldn't parse surname $vonlast\n";
                if ($vonlast =~ /^[a-z]+$/) {
                    # if it's all lowercase, uppercase 1st.
                    $vonlast =~ s/^(.)/\u$1/;
                }
            }
        }
        my $last = $vonlast;

        # join the non-empty pieces, then normalise whitespace at both ends
        my $wholename = join(' ', grep { length } ($first, $von, $last, $jr));
        $wholename =~ s/\s+/ /g;
        $wholename =~ s/^ //;
        $wholename =~ s/ $//;
        next unless length $wholename;

        push(@formattedlist, $wholename);
    }
    return @formattedlist;
}
263
264
## following functions based on bibtex plugin ##
# not actually used at the moment, but might be useful in future?

# Replace every occurrence of a three-letter lowercase month abbreviation
# (jan .. dec) in the given text with the macro name _textmonthNN_, where
# NN is the two-digit month number. Entries can contain more than one
# month (eg ' month = jun # "-" # aug, '). Returns the rewritten text.
sub expand_month {
    my ($text) = @_;

    my @abbreviations = qw(jan feb mar apr may jun jul aug sep oct nov dec);

    my $month_number = 0;
    foreach my $abbreviation (@abbreviations) {
        $month_number++;
        my $macro = sprintf('_textmonth%02d_', $month_number);
        $text =~ s/\Q$abbreviation\E/$macro/g;
    }

    return $text;
}
287
288
# Convert a piece of LaTeX into plain text or simple HTML.
#
# Call as $self->process_latex($text) for text-only output, or as
# $self->process_latex("-html", $text) for basic html formatting
# (eg \emph -> <em>, \bf, etc). Returns the converted string; an empty or
# false $text is returned unchanged.
#
# Converts accented characters, removes { }, interprets some commands....
# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
#
# Also, it copes badly with complicated/nested commands since it doesn't
# match { with the corresponding }, only the nearest }
sub process_latex {
    my $self = shift;
    my $text = shift;

    # the flag must be exactly "-html"; text that merely starts with it is text
    my $html_markup = 0;
    if (defined $text && $text eq '-html') {
        $html_markup = 1;
        $text        = shift;
    }

    if (!$text) {
        return $text;
    }

    # escape html-sensitive characters
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # do this before accents, since \= means something different in tabbing
    # also \> is a tab stop too, and \\ is newline
    $text =~ s~\\begin\{tabbing\}(.*?)\\end\{tabbing\}~do_tabbing($1)~ges;
    $text =~ s~\\begin\{tabular\}(?:\[.*?\])?\{.*?\}(.*?)\\end\{tabular\}~do_tabular($1)~ges;

    $text =~ s~[\\][\\]\s*\n~ ~g;    # fold lines ending with \\

    # process maths mode before accents... things like \, mean different!
    # multi-line maths: $$ .... $$
    $text =~ s~\$\$(.*?)\$\$~process_latex_math($html_markup, $1)~sge;
    # inline maths: $ .... $
    $text =~ s~([^\\])\$(.*?[^\\])\$~$1 . process_latex_math($html_markup, $2)~sge;

    # is this an amstext environment, or just custom for that input file?
    # NOTE(review): remove_equals is not defined in this file; the contents
    # are kept untouched unless such a function exists in this package.
    $text =~ s~\\begin\{(algorithm)\}(.*?)\\end\{\1\}~defined(&remove_equals) ? remove_equals($2) : $2~ges;

    # convert latex-style accented characters.
    $self->latex_accents_to_utf8(\$text);

    # replace quotes with utf-8 (U+201C, U+201D, U+2018, U+2019)
    $text =~ s/``/\xe2\x80\x9c/g;    # Latex-specific, left-dbl quote
    $text =~ s/''/\xe2\x80\x9d/g;    # Latex-specific, right-dbl quote
    $text =~ s/`/\xe2\x80\x98/g;     # single left quote
    $text =~ s/'/\xe2\x80\x99/g;     # single right quote

    ###### remove/replace latex commands ######
    ### commands that expand to something that gets displayed ###
    $text =~ s~\\ldots~&hellip;~g;
    $text =~ s~\\hrule~<hr/>\n~g;
    $text =~ s~\\maketitle~ ~;
    ### space commands ###
    $text =~ s~\\[vh]skip\s+\S+~~g;
    $text =~ s~\\vspace\*?\{.*?\}~<div>&nbsp;</div>~g;    # vertical space
    $text =~ s~\\\w+skip~ ~g;    # \smallskip \medskip \bigskip \baselineskip etc
    $text =~ s~\\noindent\b~~g;
    # newpage, etc
    $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
    ### counters, contents, environments, labels, etc ###
    $text =~ s~\\(?:addcontentsline)\{.*?\}\{.*?\}\{.*\}~~g;
    $text =~ s~\s*\\begin\{itemize\}\s*~\n<ul>\n~g;
    $text =~ s~\s*\\end\{itemize\}\s*~</li></ul>\n~g;
    $text =~ s~\s*\\begin\{enumerate\}\s*~<ol>\n~g;
    $text =~ s~\s*\\end\{enumerate\}\s*~</li></ol>\n~g;
    if ($text =~ s~\s*\\item~</li>\n<li>~g) {
        # the first item of each list has no previous item to close
        $text =~ s~<([ou])l>\s*</li>\s*~<${1}l>~g;
    }
    $text =~ s~\\(?:label|begin|end)\{.*?\}\s*\n?~ ~g;    # remove tag and contents
    $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
    ### font sizes/styles ###
    $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;

    if ($html_markup) {
        $text =~ s~\\section\*?\{([^\}]+)\}\s*\n?~<H1>$1</H1>\n~g;
        $text =~ s~\\subsection\*?\{(.*?)\}\s*\n?~<H2>$1</H2>\n~g;
        $text =~ s~\{\\tt\s*(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\(?:texttt|tt|ttseries)\s*\{(.*?)\}~<tt>$1</tt>~g;
        $text =~ s~\\emph\{(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\(?:em|it)\s*(.*?)\}~<em>$1</em>~g;
        $text =~ s~\{\\(?:bf|bfseries)\s*(.*?)\}~<strong>$1</strong>~g;
        $text =~ s~\\(?:textbf|bf|bfseries)\s*\{(.*?)\}~<strong>$1</strong>~g;
    }
    else {
        # remove tags for text-only
        $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
    }
    $text =~ s~\{\\sc\s+(.*?)\}~<span style="font-variant: small-caps">$1</span>~g;
    # ignore these font tags (if there are any left)
    # sf is sans-serif
    $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~g;
    #### end font-related stuff ####

    ### remove all other commands with optional arguments... ###
    # don't remove commands without { }....
    # verbatim: $1 is the delimiter character, $2 the verbatim content
    $text =~ s~\\verb(.)(.*?)\1~verb_text($2)~ge;
    # remove tags, keep contents for \tag[optional]{contents}
    while ($text =~ s~\\\w+(\[.*?\])?\{([^}]+)\}~$2 ~g) { }    # all other commands

    # remove latex groupings { } (but not \{ or \} )
    while ($text =~ s/([^\\])[\{\}]/$1/g) { }    # needed for "...}{..."
    $text =~ s/^\{//;                            # remove { if first char

    # latex characters
    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
    $text =~ s/([^\\])~+/$1 /g;    # non-breaking space "~"
    if ($text =~ m/[^&]\#/) {      # concat macros (bibtex) but not HTML codes
        # the non-macro bits have quotes around them - we just remove quotes
        # XXX bibtex and latex differ here (for the '#' char)...
        $text =~ s/([^&])[\"\#]/$1/g;
    }
    # dashes. Convert (m|n)-dash into single dash for html.
    $text =~ s/--+/-/g;

    # quoted { } chars
    $text =~ s/\\\{/{/g;
    $text =~ s/\\\}/}/g;

    # spaces
    $text =~ s/\\ / /g;

    # finally to protect against macro language...
    # greenstone-specific
    $text =~ s/\[/&\#91;/g;
    $text =~ s/\]/&\#93;/g;
    $text =~ s/(?<!\\)([\\_])/\\$1/g;

    if ($html_markup) {
        $text =~ s~\n{2,}~\n</p>\n<p>~g;
        return "<p>$text</p>";
    }

    return $text;
}

# Helper for process_latex: convert the body of a tabbing environment.
# The text has already been html-escaped, so the \> tab stop arrives
# as "\&gt;".
sub do_tabbing {
    my $tabbing = shift;
    $tabbing =~ s/^.*\\kill\s*$//mg;                   # \kill sets tab stops, kills line
    $tabbing =~ s/\\(?:=|&gt;)/\xc2\xa0/g;             # replace with nbsp
    $tabbing =~ s/[\\][\\](?:\[.*?\])?\s*$/<br\/>/mg;  # \\ or \\[len] ends a line
    return "<br/>" . $tabbing . "<br/>\n";
}

# Helper for process_latex: convert the body of a tabular environment
# into an html table. The text has already been html-escaped, so the
# column separator arrives as "&amp;".
sub do_tabular {
    my $tabular = shift;
    $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
    $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
    $tabular =~ s~\\hline~~g;    # for now...
    $tabular =~ s~<td>\s*\\multicolumn\{(\d+)\}\{.*?\}~<td colspan="$1">~g;
    return "<table border=\"1\">\n <tr><td>"
        . $tabular . "</td></tr></table>\n";
}
457
# only used by process_latex for \verb....
# Returns the given verbatim text with every {, } and _ preceded by a
# backslash.
sub verb_text {
    my ($escaped) = @_;

    $escaped =~ s/([{}_])/\\$1/g;

    return $escaped;
}
# only used by process_latex_math
#
# Render a fraction given its numerator and denominator.
# Returns the UTF-8 bytes of the single-character form for 1/2, 1/4 and
# 3/4, otherwise the ascii string "numerator/denominator".
sub math_fraction {
    # take the values from the argument list rather than from whatever
    # $1/$2 happen to hold in the caller
    my ($num, $denom) = @_;
    $num   = '' unless defined $num;
    $denom = '' unless defined $denom;

    my %vulgar_fraction = (
        '1/2' => "\xc2\xbd",    # U+00BD
        '1/4' => "\xc2\xbc",    # U+00BC
        '3/4' => "\xc2\xbe",    # U+00BE
    );

    # compare as text (ignoring surrounding blanks) so that non-numeric
    # operands such as "x" are never coerced to numbers
    (my $lookup = "$num/$denom") =~ s/\s+//g;

    return $vulgar_fraction{$lookup} if exists $vulgar_fraction{$lookup};
    return "$num/$denom";
}
475
# Convert the contents of a LaTeX maths expression.
#
# Call as process_latex_math($text) or process_latex_math($html_markup, $text).
# With a true $html_markup, superscripts/subscripts become <sup>/<sub> and
# remaining command names are put in italics. Returns the converted text.
sub process_latex_math {
    my $text        = pop;    # if given one or two args, this is the last one...
    my $html_markup = pop;    # if given two args, this is the first one else undef

    $text =~ s~\\,~ ~g;                # forces a space?
    $text =~ s~\\infty~infinity~g;     # or unicode 0x221E...

    # a real union operator is unicode 0x222a; plain "U" is used for now
    $text =~ s~\\cup\b~ U ~g;

    $text =~ s~\\frac\s*\{(.+?)\}\{(.+?)\}~math_fraction($1,$2)~ge;

    if ($html_markup) {
        $text =~ s~\^\{(.*?)\}~<sup>$1</sup>~g;    # a^{b} superscript
        $text =~ s~\^([^\{])~<sup>$1</sup>~g;      # a^b
        $text =~ s~_\{(.*?)\}~<sub>$1</sub>~g;     # a_{b} subscript
        $text =~ s~_([^\{])~<sub>$1</sub>~g;       # a_b

        $text =~ s~\\ldots~&hellip;~g;             # use html named entity for now

        # put all other command names in italics for now
        $text =~ s~\\([\w]+)~<i>$1</i> ~g;
    }

    # special cases, for some input files: a lone command is wrapped in { }
    if ($text =~ m~^\\\w+$~) {
        $text = "{" . $text . "}";
    }

    return $text;
}
509
510
511
# Replace LaTeX accent commands (eg \'e, \"{o}, \v{s}, \ss) in a string
# with the UTF-8 bytes of the accented character.
#
# Arguments:
#   $self    - plugin object; 'outhandle' and 'plugin_type' are used for
#              warnings about accents that are not in the tables below
#   $textref - reference to the text; the text is rewritten in place
#
# Returns $textref. Unknown accent commands produce a warning and are
# replaced by the bare letter (or command name).
#
# note - this is really ugly, but it works. There may be a prettier way
# of mapping latex accented chars to utf8, but we just brute force it here.
# Also, this isn't complete - not every single possible accented letter
# is in here yet, but most of the common ones are.
sub latex_accents_to_utf8 {
    my $self    = shift;
    my $textref = shift;

    my %utf8_chars = (
        # acutes
        q{'a} => "\xc3\xa1",
        q{'c} => "\xc4\x87",
        q{'e} => "\xc3\xa9",
        q{'i} => "\xc3\xad",
        q{'l} => "\xc4\xba",
        q{'n} => "\xc5\x84",
        q{'o} => "\xc3\xb3",
        q{'r} => "\xc5\x95",
        q{'s} => "\xc5\x9b",
        q{'u} => "\xc3\xba",
        q{'y} => "\xc3\xbd",
        q{'z} => "\xc5\xba",
        # graves
        '`a' => "\xc3\xa0",
        '`A' => "\xc3\x80",
        '`e' => "\xc3\xa8",
        '`E' => "\xc3\x88",
        '`i' => "\xc3\xac",
        '`I' => "\xc3\x8c",
        '`o' => "\xc3\xb2",
        '`O' => "\xc3\x92",
        '`u' => "\xc3\xb9",
        '`U' => "\xc3\x99",
        # circumflex
        '^a' => "\xc3\xa2",
        '^A' => "\xc3\x82",
        '^c' => "\xc4\x89",
        '^C' => "\xc4\x88",
        '^e' => "\xc3\xaa",
        '^E' => "\xc3\x8a",
        '^g' => "\xc4\x9d",
        '^G' => "\xc4\x9c",
        '^h' => "\xc4\xa5",
        '^H' => "\xc4\xa4",
        '^i' => "\xc3\xae",
        '^I' => "\xc3\x8e",
        '^j' => "\xc4\xb5",
        '^J' => "\xc4\xb4",
        '^o' => "\xc3\xb4",
        '^O' => "\xc3\x94",
        '^s' => "\xc5\x9d",
        '^S' => "\xc5\x9c",
        '^u' => "\xc3\xbb",
        '^U' => "\xc3\x9b",
        '^w' => "\xc5\xb5",
        '^W' => "\xc5\xb4",
        '^y' => "\xc5\xb7",
        '^Y' => "\xc5\xb6",

        # diaeresis
        '"a'      => "\xc3\xa4",
        '"A'      => "\xc3\x84",
        '"e'      => "\xc3\xab",
        '"E'      => "\xc3\x8b",
        '"i'      => "\xc3\xaf",
        '"I'      => "\xc3\x8f",
        '"\\\\i'  => "\xc3\xaf",
        '"\\\\I'  => "\xc3\x8f",
        '"o'      => "\xc3\xb6",
        '"O'      => "\xc3\x96",
        '"u'      => "\xc3\xbc",
        '"U'      => "\xc3\x9c",
        '"y'      => "\xc3\xbf",
        '"Y'      => "\xc5\xb8",
        # tilde
        '~A' => "\xc3\x83",
        '~N' => "\xc3\x91",
        '~O' => "\xc3\x95",
        '~a' => "\xc3\xa3",
        '~n' => "\xc3\xb1",
        '~o' => "\xc3\xb5",
        # caron - handled specially
        # double acute
        # ring
        # dot
        '.c' => "\xc4\x8b",
        '.C' => "\xc4\x8a",
        '.e' => "\xc4\x97",
        '.E' => "\xc4\x96",
        '.g' => "\xc4\xa1",
        '.G' => "\xc4\xa0",
        '.I' => "\xc4\xb0",
        '.z' => "\xc5\xbc",
        '.Z' => "\xc5\xbb",
        # macron
        '=a' => "\xc4\x81",
        '=A' => "\xc4\x80",
        '=e' => "\xc4\x93",
        '=E' => "\xc4\x92",
        '=i' => "\xc4\xab",
        '=I' => "\xc4\xaa",
        '=o' => "\xc5\x8d",
        '=O' => "\xc5\x8c",
        '=u' => "\xc5\xab",
        '=U' => "\xc5\xaa",

        # stroke - handled specially - see below

        # cedilla - handled specially
    );

    # these are one letter latex commands - we make sure they're not a longer
    # command name. eg {\d} is d+stroke, so careful of \d
    my %special_utf8_chars = (
        # breve
        'u g' => "\xc4\x9f",
        'u G' => "\xc4\x9e",
        'u i' => "\xc4\xad",
        'u I' => "\xc4\xac",
        'u o' => "\xc5\x8f",
        'u O' => "\xc5\x8e",
        'u u' => "\xc5\xad",
        'u U' => "\xc5\xac",
        'u z' => "\xc5\xbe",    # !!! no such char, but common mistake
        'u Z' => "\xc5\xbd",    # used instead of v Z !!!
        # caron
        'v c' => "\xc4\x8d",
        'v C' => "\xc4\x8c",
        'v n' => "\xc5\x88",
        'v N' => "\xc5\x87",
        'v s' => "\xc5\xa1",
        'v S' => "\xc5\xa0",
        'v z' => "\xc5\xbe",
        'v Z' => "\xc5\xbd",
        # cedilla
        'c c' => "\xc3\xa7",
        'c C' => "\xc3\x87",
        'c g' => "\xc4\xa3",
        'c G' => "\xc4\xa2",
        'c k' => "\xc4\xb7",
        'c K' => "\xc4\xb6",
        'c l' => "\xc4\xbc",
        'c L' => "\xc4\xbb",
        'c n' => "\xc5\x86",
        'c N' => "\xc5\x85",
        'c r' => "\xc5\x97",
        'c R' => "\xc5\x96",
        'c s' => "\xc5\x9f",
        'c S' => "\xc5\x9e",
        'c t' => "\xc5\xa3",
        'c T' => "\xc5\xa2",
        # double acute / Hungarian accent
        'H O' => "\xc5\x90",
        'H o' => "\xc5\x91",
        'H U' => "\xc5\xb0",
        'H u' => "\xc5\xb1",

        # stroke
        'd' => "\xc4\x91",
        'D' => "\xc4\x90",
        'h' => "\xc4\xa7",
        # 'H' => "\xc4\xa6", # !! this normally(!!?) means Hung. umlaut
        'i' => "\xc4\xb1",      # dotless lowercase i
        'l' => "\xc5\x82",
        'L' => "\xc5\x81",
        'o' => "\xc3\xb8",
        'O' => "\xc3\x98",
        't' => "\xc5\xa7",
        'T' => "\xc5\xa6",

        # other special chars
        'ss' => "\xc3\x9f",     # german ss/szlig/sharp s
        'aa' => "\xc3\xa5",     # scandinavian/latin a with ring
    );

    my $outhandle = $self->{'outhandle'};
    my $text      = $$textref;

    # Report an accent command that has no entry in the tables, together
    # with up to 20 characters of context on either side.
    my $warn_unknown = sub {
        my ($tex) = @_;
        my ($context) = $text =~ m/(.{0,20}\\\Q$tex\E.{0,20})/s;
        $context = "" unless defined $context;
        print {$outhandle} $self->{'plugin_type'}
            . ": Warning: unknown latex accent \"$tex\""
            . " in \"$context\"\n";
    };

    # remove space (if any) between \ and letter to accent (eg {\' a})
    $text =~ s!(\\[`'="^~\.])\s+(\w)\b!$1$2!g;

    # remove {} around a single character (eg \'{e})
    $text =~ s!(\\[`'="^~\.])\{(\w)\}!{$1$2}!g;

    ## only in bibtex... not in latex proper?!
    ### \, is another way of doing cedilla \c
    ##$text =~ s~\\,(.)~\\c $1~g;

    # remove {} around a single character for special 1 letter commands -
    # need to insert a space. Eg \v{s} -> {\v s}
    $text =~ s!(\\[uvcH])\{(\w)\}!{$1 $2}!g;

    # only do if the text contains a '\' character.
    if (index($text, '\\') >= 0) {

        # "normal" accents - ie non-alpha latex tag
        while ($text =~ m/\\([`'="^~\.])([\w])/) {
            my $tex  = "$1$2";
            my $char = "$2";
            my $replacement = $utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char;
            }
            $text =~ s/\\\Q$tex\E/$replacement/g;
        }

        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
        # only do the change if immediately followed by a space, }, {, or \
        # one letter accents ( + ss / aa)
        while ($text =~ m/\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]/) {
            my $tex = $1;
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $tex;
            }
            # prefer the braced form {\o}; otherwise keep the following char
            ($text =~ s/\{\\$tex\}/$replacement/g)
                or $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
        }

        # one letter latex accent commands that affect following letter
        while ($text =~ m/\\([uvcH]) ([\w])/) {
            my $tex  = "$1 $2";
            my $char = "$2";
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char;
            }
            $text =~ s/\\$tex/$replacement/g;
        }
    }

    # write the result back through the reference so the caller sees it
    $$textref = $text;
    return $textref;
}
759
760
761	# modules must return true
762	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: