Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

LaTeXPlugin.pm

Last change on this file was 32129, checked in by kjdon, 6 years ago
After () in a regex, {} signifies a quantifier, e.g. (xx){2,4} means 2-4 occurrences. In later Perl versions it is illegal to have an unescaped { after a ) in a regex; if you actually want to match a literal { you need to escape it. So I have escaped every { that follows a ) in a regex.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.0 KB

Line
1	###########################################################################
2	#
3	# LaTeXPlugin.pm
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Written by John McPherson
10	# Copyright (C) 2004 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	###########################################################################
23
24	# todo:
25	# \includegraphics
26	# parse/remove tex \if ... macros
27
28	package LaTeXPlugin;
29
30	# System complains about $arguments if the strict is set
31	use strict;
32	no strict 'refs'; # so we can print to a handle named by a variable
33
34	# greenstone packages
35	use ReadTextFile;
36	use MetadataRead;
37	use unicode;
38	use util;
39
# Plugin argument table: the only option declared here is process_exp,
# whose default comes from get_default_process_exp().
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
];

# Plugin description record; new() pushes it onto the "OptList".
my $options = {
    name     => 'LaTeXPlugin',
    desc     => '{LaTeXPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
52
# Set up inheritance at compile time: MetadataRead first, then ReadTextFile.
BEGIN {
    @LaTeXPlugin::ISA = qw(MetadataRead ReadTextFile);
}
56
# Constructor.
#   $class            - class name being instantiated
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to ReadTextFile's constructor
#   $hashArgOptLists  - hash ref; this plugin's argument and option tables
#                       are appended to its "ArgList" / "OptList" entries
# Returns the object built by ReadTextFile, re-blessed into $class, with the
# aux_files / dir_num / file_num fields initialised.
sub new {
    my ($class, @ctor_args) = @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @ctor_args;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'aux_files'} = {};
    $self->{'dir_num'}   = 0;
    $self->{'file_num'}  = 0;

    return bless($self, $class);
}
72
73
# Default regular expression (as a string) selecting files to process:
# anything whose name ends in ".tex".
sub get_default_process_exp {
    my ($self) = @_;
    return '\.tex$';
}
78
# Default regular expression (as a string) for files to block:
# .eps files are assumed to be figures belonging to the LaTeX document.
sub get_default_block_exp {
    return q{\.(?:eps)$};
}
83
84
# Process one LaTeX document.
#
# Arguments (after $self):
#   $textref  - reference to the file's text; it is modified in place and ends
#               up holding the converted body of the document
#   $pluginfo, $base_dir, $metadata, $gli - accepted but not used here
#   $file     - file name, used as the title of last resort
#   $doc_obj  - document object that receives metadata and text
#
# Returns undef when the first 200 bytes do not look like LaTeX, 1 otherwise.
#
# Metadata added to the top section: FileFormat, Title, and (when an \author
# is found) one Author per name plus a combined Creator string.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $start = substr($$textref, 0, 200); # first 200 bytes

    if ($start !~ m~\\ (?: documentclass | documentstyle | input | section
                         | chapter | contents | begin ) ~x) {
        # this doesn't look like latex...
        return undef;
    }

    my $cursection = $doc_obj->get_top_section();

    ###### clean up text ######
    $$textref =~ s/\r$//mg;           # remove dos ^M
    # remove comments, but leave an escaped percent sign (\%) alone
    $$textref =~ s/(?<!\\)%.*$//mg;

    # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin

    $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
    # check that both sides are ascii, so we don't screw up utf-8 chars
    # NOTE(review): confirm whether unicode::ascii2utf8 expects a plain string
    # or a reference to one; it is called here exactly as before.
    $$textref =~ s@ (?<=[[:ascii:]]) ([\x80-\xff]) (?=[[:ascii:]]) @
        unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)

    ###### find metadata ######

    ## FileFormat metadata ##
    $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");

    ### title metadata ###
    # Captures are taken through list assignment so that a failed match
    # yields undef instead of a stale $1 left over from an earlier match.
    my ($title) = $$textref =~ m@\\title\s*\{(.*?)}@s;
    if (!$title) {
        # no title tag. look for a chapter/section heading (1st match)
        ($title) = $$textref =~ m@\\(?:chapter|section)\s*\{(.*?)}@s;
    }
    if (!$title) {
        # no chapter/section heading tags either... use filename
        $title = $file;
        $title =~ s/\.tex$//i;
        $title =~ s/[-_.]/ /g; # turn punctuation into spaces
    }
    if ($title) {
        $title =~ s@\\\\@ @g; # embedded newlines
        $title = $self->process_latex($title); # no "-html" for title eg in browser
        $doc_obj->add_utf8_metadata($cursection, "Title", $title);
    }

    ### creator/individual author metadata ###
    my ($authors) = $$textref =~ m@\\author\s*\{((?:\{.*?}|.*?)+)}\s*$@sm;
    if ($authors) {
        # take care of "et al."...
        # Only trust $1 when the substitution actually matched; otherwise it
        # would still hold the author string captured above.
        my $etal = "";
        if ($authors =~ s/(\s+et\.?\s+al\.?)\s*$//) {
            $etal = $1;
        }

        my @authorlist = parse_authors($self, $authors);

        foreach my $author (@authorlist) {
            # Add each name to set of Authors
            $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
        }

        # Only want at most one "and" in the Creator field
        my $creator_str = "";
        if (scalar(@authorlist) > 2) {
            my $lastauthor = pop @authorlist;
            $creator_str = join(', ', @authorlist);
            $creator_str .= " and $lastauthor";
        } else { # 1 or 2 authors...
            $creator_str = join(" and ", @authorlist);
        }
        $creator_str .= $etal; # if there was "et al."
        $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
    }
    ### end of author metadata ###

    ###### process latex for the main text ######
    $$textref =~ s/^.*?\\begin\{document}//s;
    $$textref =~ s/\\end\{document}.*?$//s;
    $$textref = $self->process_latex("-html", $$textref);
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
175
176
# Split an \author{...} string into a list of individual names.
#
#   $self    - plugin object; its 'outhandle' and 'plugin_type' entries are
#              used for warnings
#   $authors - raw author text
#
# Names are separated by " and " (or " und " for German).  Each name may be
# written "First von Last", "von Last, First" or "von Last, Jr, First" (see
# the "BibTeXing" manual, page 16) and is returned as "First von Last Jr"
# with single spaces and no leading or trailing whitespace.
sub parse_authors {
    my $self = shift;
    my $authors = shift;

    my $outhandle = $self->{'outhandle'};

    $authors =~ s/\n/ /g; # remove newlines

    # some people do this for affiliation footnote/dagger
    $authors =~ s@\$.*?\$@@g; # remove maths from author :(

    # und here for german language...
    # don't use capturing brackets in pattern, else the matched bit becomes
    # an element in the list!
    my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
    my @formattedlist = ();
    foreach my $author (@authorlist) {
        $author =~ s/\s*$//;
        $author =~ s/^\s*//;
        next if $author =~ /^\s*$/;

        my $first = "";
        my $vonlast = "";
        my $jr = "";

        if ($author =~ /,/) {
            # "von Last, First" or "von Last, Jr, First"
            my @parts = split(/,\s*/, $author);
            $first = pop @parts;
            if (scalar(@parts) == 2) {
                $jr = pop @parts;
            }
            $vonlast = shift @parts;
            if (scalar(@parts) > 0) {
                print $outhandle $self->{'plugin_type'} .
                    ": couldn't parse name $author\n";
                # but we continue anyway...
            }
        } else { # First von Last
            my @words = split(/ /, $author);
            # leading words belong to the first name until a lowercase
            # ("von"-style) word, or the last remaining word, is reached
            while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
                $first .= " " . shift (@words);
            }
            $first =~ s/^\s//;
            $vonlast = join (' ', @words); # whatever's left...
        }
        $first   = "" unless defined $first;
        $vonlast = "" unless defined $vonlast;

        my $von = "";
        my $last = "";
        if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
            if ($vonlast =~ s/^((?:[a-z]\w+\s+)+)//) {
                $von = $1;
                $von =~ s/\s*$//;
            } else {
                # some non-English names do start with lowercase
                # eg "Marie desJardins". Also we can get typos...
                print $outhandle $self->{'plugin_type'} .
                    ": couldn't parse surname $vonlast\n";
                if ($vonlast =~ /^[a-z]+$/) {
                    # if it's all lowercase, uppercase 1st.
                    $vonlast =~ s/^(.)/\u$1/;
                }
            }
        }
        $last = $vonlast;

        # Join only the parts that are present so that a name without a
        # first name does not come back with a leading space.
        my $wholename = join(' ', grep { length } ($first, $von, $last, $jr));
        $wholename =~ s/\s+/ /g;
        push (@formattedlist, $wholename);
    }
    return @formattedlist;
}
256
257
## following function is based on the bibtex plugin ##
# Not actually used at the moment, but might be useful in future.
#
# Replaces every occurrence of a three-letter lowercase month abbreviation
# (jan .. dec) in $text with the placeholder _textmonthNN_, NN being the
# two-digit month number.  Entries can contain more than one month
# (eg ' month = jun # "-" # aug, ').  Returns the modified copy.
sub expand_month {
    my ($text) = @_;

    my @abbreviations = qw(jan feb mar apr may jun jul aug sep oct nov dec);

    # processed in calendar order, one abbreviation at a time
    my $month_number = 0;
    foreach my $abbr (@abbreviations) {
        $month_number++;
        my $placeholder = sprintf('_textmonth%02d_', $month_number);
        $text =~ s/$abbr/$placeholder/g;
    }

    return $text;
}
280
281
# Convert a chunk of LaTeX into plain text or simple HTML.
#
# Usage:
#   $self->process_latex($text)           - text only, markup commands dropped
#   $self->process_latex("-html", $text)  - basic html formatting
#                                           (eg \emph -> <em>, \bf, etc)
#
# Converts accented characters, removes { }, interprets some commands....
# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
#
# Also, it copes badly with complicated/nested commands since it doesn't
# match { with the corresponding }, only the nearest }
#
# Returns the converted text (wrapped in <p>...</p> in html mode).  An empty
# or false $text is returned unchanged.
sub process_latex {
    my $self = shift;
    my $text = shift;

    # The flag must be exactly "-html"; text that merely starts with those
    # characters is ordinary input.
    my $html_markup = 0;
    if (defined($text) && $text eq '-html') {
        $html_markup = 1;
        $text = shift;
    }

    if (! $text) {
        return $text;
    }

    # escape html-sensitive characters
    $text =~ s@&@&amp;@g;
    $text =~ s@<@&lt;@g;
    $text =~ s@>@&gt;@g;

    # do this before accents, since \= means something different in tabbing
    # also \> is a tab stop too, and \\ is newline
    $text =~ s@\\begin\{tabbing}(.*?)\\end\{tabbing}@do_tabbing($1)@ges;
    $text =~ s@\\begin\{tabular}(?:\[.*?\])?\{.*?}(.*?)\\end\{tabular}@do_tabular($1)@ges;

    $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\

    # process maths mode before accents... things like \, mean different!
    # multi-line maths: $$ .... $$
    $text =~ s@\$\$(.*?)\$\$@process_latex_math($html_markup,$1)@sge;
    # inline maths: $ .... $ (the dollars must not be escaped)
    $text =~ s@([^\\])\$(.*?[^\\])\$@$1.process_latex_math($html_markup,$2)@sge;

    # is this an amstext environment, or just custom for that input file?
    # NOTE(review): remove_equals() is not defined in this file - confirm
    # where it comes from; it is only reached when an algorithm environment
    # is present.
    $text =~ s@\\begin\{(algorithm)}(.*?)\\end\{\1}@remove_equals($2)@ges;

    # convert latex-style accented characters.
    $self->latex_accents_to_utf8(\$text);

    # replace quotes with their utf-8 encodings (U+201C, U+201D, U+2018, U+2019)
    $text =~ s/``/\xe2\x80\x9c/g; # Latex-specific, left-dbl quote
    $text =~ s/''/\xe2\x80\x9d/g; # Latex-specific, right-dbl quote
    $text =~ s/`/\xe2\x80\x98/g;  # single left quote
    $text =~ s/'/\xe2\x80\x99/g;  # single right quote

    ###### remove/replace latex commands ######
    ### commands that expand to something that gets displayed ###
    $text =~ s~\\ldots~&hellip;~g;
    $text =~ s~\\hrule~<hr/>\n~g;
    $text =~ s~\\maketitle~ ~;
    ### space commands ###
    $text =~ s~\\[vh]skip\s+\S+~~g;
    $text =~ s~\\vspace\*?\{.*?}~<div>&nbsp;</div>~g; # vertical space
    $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
    $text =~ s~\\noindent\b~~g;
    # newpage, etc
    $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
    ### counters, contents, environments, labels, etc ###
    $text =~ s~\\(?:addcontentsline)\{.*?}\{.*?}\{.*}~~g;
    $text =~ s~\s*\\begin\{itemize}\s*~\n<ul>\n~g;
    $text =~ s~\s*\\end\{itemize}\s*~</li></ul>\n~g;
    $text =~ s~\s*\\begin\{enumerate}\s*~<ol>\n~g;
    $text =~ s~\s*\\end\{enumerate}\s*~</li></ol>\n~g;
    if ($text =~ s~\s*\\item~</li>\n<li>~g) {
        # the first item of each list has no previous item to close
        $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
    }
    $text =~ s~\\(?:label|begin|end)\{.*?}\s*\n?~ ~g; # remove tag and contents
    $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
    ### font sizes/styles ###
    $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;

    if ($html_markup) {
        $text =~ s~\\section\*?\{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
        $text =~ s~\\subsection\*?\{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
        $text =~ s~\{\\tt\s*(.*?)}~<tt>$1</tt>~g;
        $text =~ s~\\(?:texttt|tt|ttseries)\s*\{(.*?)}~<tt>$1</tt>~g;
        $text =~ s~\\emph\{(.*?)}~<em>$1</em>~g;
        $text =~ s~\{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
        $text =~ s~\{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
        $text =~ s~\\(?:textbf|bf|bfseries)\s*\{(.*?)}~<strong>$1</strong>~g;
    } else {
        # remove tags for text-only
        $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
    }
    # small caps; the surrounding { } are stripped by the grouping removal below
    $text =~ s~\{\\sc\s+(.*?)}~\n {<span style="font-variant: small-caps">$1</span>} ~g;
    # ignore these font tags (if there are any left)
    # sf is sans-serif
    $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~g;
    #### end font-related stuff ####

    ### remove all other commands with optional arguments... ###
    # don't remove commands without { }....
    # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
    # $text =~ s~\\noopsort{[^}]+\}~~g;
    # verbatim: $1 is the delimiter character, $2 the verbatim content
    $text =~ s~\\verb(.)(.*?)\1~verb_text($2)~ge;
    # remove tags, keep contents for \tag[optional]{contents}
    1 while ($text =~ s~\\\w+(\[.*?\])?\{([^}]+)}~$2 ~g); # all other commands

    # remove latex groupings { } (but not \{ or \} )
    1 while ($text =~ s/([^\\])[\{\}]/$1/g); # repeated for "...}{..."
    $text =~ s/^\{//; # remove { if first char

    # latex characters
    # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
    $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
    # optional break "\-"
    if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
        # the non-macro bits have quotes around them - we just remove quotes
        # XXX bibtex and latex differ here (for the '#' char)...
        $text =~ s/([^&])[\"\#]/$1/g;
    }
    # dashes. Convert (m|n)-dash into single dash for html.
    $text =~ s~\-\-+~\-~g;

    # quoted { } chars
    $text =~ s~\\\{~{~g;
    $text =~ s~\\}~}~g;

    # spaces
    $text =~ s~\\ ~ ~g;

    # finally to protect against macro language...
    # greenstone-specific
    $text =~ s~\[~&\#91;~g;
    $text =~ s~\]~&\#93;~g;
    $text =~ s~(?<!\\)([\\_])~\\$1~g;

    if ($html_markup) {
        $text =~ s~\n{2,}~\n</p>\n<p>~g;
        return "<p>$text</p>";
    }

    return $text;
}

# Helper for process_latex: converts the body of a tabbing environment.
# Lines containing \kill are dropped, tab stops (\= and \>) become
# non-breaking spaces and \\ line ends become <br/>.
sub do_tabbing {
    my $tabbing = shift;
    $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
    # by the time this runs ">" has already been escaped to "&gt;"
    $tabbing =~ s~\\(?:=|&gt;)~\xc2\xa0~g; # replace with nbsp
    $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
    return "<br/>" . $tabbing . "<br/>\n";
}

# Helper for process_latex: converts the body of a tabular environment into
# an html table.  Unescaped column separators become cell boundaries and \\
# ends a row.
sub do_tabular {
    my $tabular = shift;
    # by the time this runs "&" has already been escaped to "&amp;"
    $tabular =~ s~(?<!\\)\s*&amp;\s*~</td><td>~g;
    $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
    $tabular =~ s~\\hline~~g; # for now...
    $tabular =~ s~<td>\s*\\multicolumn\{(\d+)}\{.*?}~<td colspan="$1">~g;
    return "<table border=\"1\">\n <tr><td>"
        . $tabular . "</td></tr></table>\n";
}
450
# Only used by process_latex for \verb....
# Returns the verbatim text with every {, } and _ preceded by a backslash.
sub verb_text {
    my ($raw) = @_;
    (my $escaped = $raw) =~ s/([{}_])/\\$1/g;
    return $escaped;
}
# Only used by process_latex_math.
#
#   math_fraction($num, $denom)
#
# Returns the utf-8 encoded vulgar-fraction character for 1/2, 1/4 and 3/4,
# otherwise the plain ascii form "$num/$denom".  The operands are taken from
# the argument list and compared as strings, so non-numeric operands such as
# "1x" are never mistaken for a number.
sub math_fraction {
    my ($num, $denom) = @_;
    $num   = "" unless defined $num;
    $denom = "" unless defined $denom;

    my %glyph_for = (
        '1/2' => chr(0xc2).chr(0xbd),
        '1/4' => chr(0xc2).chr(0xbc),
        '3/4' => chr(0xc2).chr(0xbe),
    );

    # ignore surrounding whitespace when looking for a known fraction
    my ($n, $d) = ($num, $denom);
    for ($n, $d) {
        s/^\s+//;
        s/\s+$//;
    }

    my $glyph = $glyph_for{"$n/$d"};
    return $glyph if defined $glyph;
    return "$num/$denom";
}
468
# Convert the contents of a maths-mode region.
#
#   process_latex_math($text)
#   process_latex_math($html_markup, $text)
#
# The text is always the LAST argument; the optional first argument turns on
# html markup (superscripts, subscripts, italic command names).
# Returns the converted text.
sub process_latex_math {

    my $text = pop;        # if given one or two args, this is the last one...
    my $html_markup = pop; # if given two args, this is the first one else undef

    $text =~ s~\\,~ ~g;            # forces a space?
    $text =~ s~\\infty~infinity~g; # or unicode 0x221E...

    # use this one when more things can read 3-byte utf8 values like this!
    # $text =~ s~\\cup\b~\xe2\x88\xaa~g; # union operator - unicode 0x222a
    $text =~ s~\\cup\b~ U ~g;

    $text =~ s~\\frac\s*\{(.+?)}\{(.+?)}~math_fraction($1,$2)~ge;

    if ($html_markup) {
        $text =~ s~\^\{(.*?)}~<sup>$1</sup>~g; # a^{bc} superscript
        $text =~ s~\^([^\{])~<sup>$1</sup>~g;  # a^b
        $text =~ s~\_\{(.*?)}~<sub>$1</sub>~g; # a_{bc} subscript
        $text =~ s~\_([^\{])~<sub>$1</sub>~g;  # a_b

        $text =~ s~\\ldots~&hellip;~g; # use html named entity for now

        # put all other command names in italics for now
        $text =~ s~\\([\w]+)~<i>$1</i> ~g;
    }

    # special cases, for some input files: a lone command is wrapped in { }
    if ($text =~ m~^\\\w+$~) {
        $text = "{" . $text . "}";
    }

    return $text;
}
502
503
504
# Replace LaTeX accent commands in a piece of text with utf-8 encoded
# characters.
#
#   $self->latex_accents_to_utf8(\$text)
#
#   $self    - plugin object; 'outhandle' and 'plugin_type' are used when
#              warning about an accent that is not in the tables
#   $textref - reference to the text, which is modified IN PLACE
#
# note - this is really ugly, but it works. There may be a prettier way
# of mapping latex accented chars to utf8, but we just brute force it here.
# Also, this isn't complete - not every single possible accented letter
# is in here yet, but most of the common ones are.
sub latex_accents_to_utf8 {
    my $self = shift;
    my $textref = shift;

    # accent commands that are a backslash plus a non-letter: \'e \"o \^u ...
    my %utf8_chars =
        (
         # acutes
         "'a" => "\xc3\xa1",
         "'c" => "\xc4\x87",
         "'e" => "\xc3\xa9",
         "'i" => "\xc3\xad",
         "'l" => "\xc4\xba",
         "'n" => "\xc5\x84",
         "'o" => "\xc3\xb3",
         "'r" => "\xc5\x95",
         "'s" => "\xc5\x9b",
         "'u" => "\xc3\xba",
         "'y" => "\xc3\xbd",
         "'z" => "\xc5\xba",
         # graves
         '`a' => "\xc3\xa0",
         '`A' => "\xc3\x80",
         '`e' => "\xc3\xa8",
         '`E' => "\xc3\x88",
         '`i' => "\xc3\xac",
         '`I' => "\xc3\x8c",
         '`o' => "\xc3\xb2",
         '`O' => "\xc3\x92",
         '`u' => "\xc3\xb9",
         '`U' => "\xc3\x99",
         # circumflex
         '^a' => "\xc3\xa2",
         '^A' => "\xc3\x82",
         '^c' => "\xc4\x89",
         '^C' => "\xc4\x88",
         '^e' => "\xc3\xaa",
         '^E' => "\xc3\x8a",
         '^g' => "\xc4\x9d",
         '^G' => "\xc4\x9c",
         '^h' => "\xc4\xa5",
         '^H' => "\xc4\xa4",
         '^i' => "\xc3\xae",
         '^I' => "\xc3\x8e",
         '^j' => "\xc4\xb5",
         '^J' => "\xc4\xb4",
         '^o' => "\xc3\xb4",
         '^O' => "\xc3\x94",
         '^s' => "\xc5\x9d",
         '^S' => "\xc5\x9c",
         '^u' => "\xc3\xbb",
         '^U' => "\xc3\x9b",
         '^w' => "\xc5\xb5",
         '^W' => "\xc5\xb4",
         '^y' => "\xc5\xb7",
         '^Y' => "\xc5\xb6",

         # diaeresis
         '"a' => "\xc3\xa4",
         '"A' => "\xc3\x84",
         '"e' => "\xc3\xab",
         '"E' => "\xc3\x8b",
         '"i' => "\xc3\xaf",
         '"I' => "\xc3\x8f",
         '"\\\\i' => "\xc3\xaf",
         '"\\\\I' => "\xc3\x8f",
         '"o' => "\xc3\xb6",
         '"O' => "\xc3\x96",
         '"u' => "\xc3\xbc",
         '"U' => "\xc3\x9c",
         '"y' => "\xc3\xbf",
         '"Y' => "\xc5\xb8",
         # tilde
         '~A' => "\xc3\x83",
         '~N' => "\xc3\x91",
         '~O' => "\xc3\x95",
         '~a' => "\xc3\xa3",
         '~n' => "\xc3\xb1",
         '~o' => "\xc3\xb5",
         # caron - handled specially
         # double acute
         # ring
         # dot
         '.c' => "\xc4\x8b",
         '.C' => "\xc4\x8a",
         '.e' => "\xc4\x97",
         '.E' => "\xc4\x96",
         '.g' => "\xc4\xa1",
         '.G' => "\xc4\xa0",
         '.I' => "\xc4\xb0",
         '.z' => "\xc5\xbc",
         '.Z' => "\xc5\xbb",
         # macron
         '=a' => "\xc4\x81",
         '=A' => "\xc4\x80",
         '=e' => "\xc4\x93",
         '=E' => "\xc4\x92",
         '=i' => "\xc4\xab",
         '=I' => "\xc4\xaa",
         '=o' => "\xc5\x8d",
         '=O' => "\xc5\x8c",
         '=u' => "\xc5\xab",
         '=U' => "\xc5\xaa",

         # stroke - handled specially - see below

         # cedilla - handled specially
         );

    # these are one letter latex commands - we make sure they're not a longer
    # command name. eg {\d} is d+stroke, so careful of \d
    my %special_utf8_chars =
        (
         # breve
         'u g' => "\xc4\x9f",
         'u G' => "\xc4\x9e",
         'u i' => "\xc4\xad",
         'u I' => "\xc4\xac",
         'u o' => "\xc5\x8f",
         'u O' => "\xc5\x8e",
         'u u' => "\xc5\xad",
         'u U' => "\xc5\xac",
         'u z' => "\xc5\xbe", # !!! no such char, but common mistake
         'u Z' => "\xc5\xbd", # used instead of v Z !!!
         # caron
         'v c' => "\xc4\x8d",
         'v C' => "\xc4\x8c",
         'v n' => "\xc5\x88",
         'v N' => "\xc5\x87",
         'v s' => "\xc5\xa1",
         'v S' => "\xc5\xa0",
         'v z' => "\xc5\xbe",
         'v Z' => "\xc5\xbd",
         # cedilla
         'c c' => "\xc3\xa7",
         'c C' => "\xc3\x87",
         'c g' => "\xc4\xa3",
         'c G' => "\xc4\xa2",
         'c k' => "\xc4\xb7",
         'c K' => "\xc4\xb6",
         'c l' => "\xc4\xbc",
         'c L' => "\xc4\xbb",
         'c n' => "\xc5\x86",
         'c N' => "\xc5\x85",
         'c r' => "\xc5\x97",
         'c R' => "\xc5\x96",
         'c s' => "\xc5\x9f",
         'c S' => "\xc5\x9e",
         'c t' => "\xc5\xa3",
         'c T' => "\xc5\xa2",
         # double acute / Hungarian accent
         'H O' => "\xc5\x90",
         'H o' => "\xc5\x91",
         'H U' => "\xc5\xb0",
         'H u' => "\xc5\xb1",

         # stroke
         'd' => "\xc4\x91",
         'D' => "\xc4\x90",
         'h' => "\xc4\xa7",
         # 'H' => "\xc4\xa6", # !! this normally(!!?) means Hung. umlaut
         'i' => "\xc4\xb1", # dotless lowercase i
         'l' => "\xc5\x82",
         'L' => "\xc5\x81",
         'o' => "\xc3\xb8",
         'O' => "\xc3\x98",
         't' => "\xc5\xa7",
         'T' => "\xc5\xa6",

         # other special chars
         'ss' => "\xc3\x9f", # german ss/szlig/sharp s
         'aa' => "\xc3\xa5", # scandinavian/latin a with ring
         );

    my $outhandle = $self->{'outhandle'};
    my $text = $$textref;

    # Warn about an accent command that is in neither table, quoting up to
    # 20 characters of context on each side.
    my $warn_unknown = sub {
        my ($tex) = @_;
        my ($context) = $text =~ m~(.{0,20}\\\Q$tex\E.{0,20})~s;
        $context = "" unless defined $context;
        print $outhandle $self->{'plugin_type'} .
            ": Warning: unknown latex accent \"$tex\""
            . " in \"$context\"\n";
    };

    # remove space (if any) between \ and letter to accent (eg {\' a})
    $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g;

    # remove {} around a single character (eg \'{e})
    $text =~ s!(\\[`'="^~\.])\{(\w)}!{$1$2}!g;

    ## only in bibtex... not in latex proper?!
    ### \, is another way of doing cedilla \c
    ##$text =~ s~\\,(.)~\\c $1~g;

    # remove {} around a single character for special 1 letter commands -
    # need to insert a space. Eg \v{s} -> {\v s}
    $text =~ s~(\\[uvcH])\{(\w)}~{$1 $2}~g;

    # only do if the text contains a '\' character.
    if ($text =~ m|\\|) {
        # "normal" accents - ie non-alpha latex tag
        # xxx used to have ([\w]\b)@ (for word boundary)
        while ($text =~ m/\\([`'="^~\.])([\w])/) {
            my $tex = "$1$2";
            my $char = "$2";
            my $replacement = $utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char; # fall back to the bare letter
            }
            $text =~ s/\\\Q$tex\E/$replacement/g;
        }

        # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
        # only do the change if immediately followed by a space, }, {, or \
        # one letter accents ( + ss / aa)
        while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
            my $tex = $1;
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $tex;
            }
            # prefer the braced form {\o}; otherwise keep the following char
            ($text =~ s/\{\\$tex}/$replacement/g) or
                $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
        }

        # one letter latex accent commands that affect following letter
        while ($text =~ m~\\([uvcH]) ([\w])~) {
            my $tex = "$1 $2";
            my $char = "$2";
            my $replacement = $special_utf8_chars{$tex};
            if (!defined($replacement)) {
                $warn_unknown->($tex);
                $replacement = $char;
            }
            $text =~ s/\\$tex/$replacement/g;
        }
    }

    # write the result back through the reference so the caller sees it
    $$textref = $text;
}
752
753
754	# modules must return true
755	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm

Download in other formats: