Context Navigation

source: main/trunk/greenstone2/perllib/plugins/LaTeXPlugin.pm@ 31998

Last change on this file since 31998 was 31780, checked in by ak19, 7 years ago
When testing GS3.08's GLI on Ubuntu v 16.04, found its perl v 5.22.1 issued warnings about unescaped left brace in regex. As per https://unix.stackexchange.com/questions/238539/automake-error-unescaped-left-brace-in-regex-is-deprecated, in even later versions of perl, this counts as a syntax error rather than resulting in a warning. So making the minimum number of changes that allowed perl to parse the plugins and open GLI successfully, without issuing warnings on perl 5.22.1 on this Ubuntu 16.04.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.0 KB

Rev	Line
[7541]	1	###########################################################################
	2	#
[15872]	3	# LaTeXPlugin.pm
[7541]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Written by John McPherson
	10	# Copyright (C) 2004 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	###########################################################################
	23
[8098]	24	# todo:
	25	# \includegraphics
	26	# parse/remove tex \if ... macros
[7541]	27
[15872]	28	package LaTeXPlugin;
[7541]	29
[10218]	30	# System complains about $arguments if the strict is set
[10254]	31	use strict;
	32	no strict 'refs'; # so we can print to a handle named by a variable
[7541]	33
	34	# greenstone packages
[15872]	35	use ReadTextFile;
[24548]	36	use MetadataRead;
[7541]	37	use unicode;
[7559]	38	use util;
[7541]	39
[11390]	40	my $arguments =
	41	[ { 'name' => "process_exp",
[31492]	42	'desc' => "{BaseImporter.process_exp}",
[11390]	43	'type' => "regexp",
	44	'reqd' => "no",
	45	'deft' => &get_default_process_exp() } ];
	46
[16013]	47	my $options = { 'name' => "LaTeXPlugin",
	48	'desc' => "{LaTeXPlugin.desc}",
	49	'abstract' => "no",
	50	'inherits' => "yes",
[11390]	51	'args' => $arguments };
	52
[7541]	53	sub BEGIN {
[24548]	54	@LaTeXPlugin::ISA = ('MetadataRead', 'ReadTextFile');
[7541]	55	}
	56
	57	sub new {
[10218]	58	my ($class) = shift (@_);
	59	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	60	push(@$pluginlist, $class);
[7541]	61
[15872]	62	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	63	push(@{$hashArgOptLists->{"OptList"}},$options);
[7541]	64
[15872]	65	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[7541]	66
	67	$self->{'aux_files'} = {};
	68	$self->{'dir_num'} = 0;
	69	$self->{'file_num'} = 0;
	70	return bless $self, $class;
	71	}
	72
	73
	74	sub get_default_process_exp {
	75	my $self = shift (@_);
	76	return q^\.tex$^;
	77	}
	78
	79	sub get_default_block_exp {
	80	# assume any .eps files are part of the latex stuff
	81	return '\.(?:eps)$';
	82	}
	83
	84
	85	sub process {
	86	my $self = shift (@_);
	87	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	88
	89	my $start=substr($$textref, 0, 200); # first 200 bytes
	90
	91	if ($start !~ m~\\ (?:documentclass \| documentstyle \| input \| section
	92	\| chapter \| contents \| begin) ~x) {
	93	# this doesn't look like latex...
	94	return undef;
	95	}
	96	my $outhandle = $self->{'outhandle'};
[16104]	97
[7541]	98	my $cursection = $doc_obj->get_top_section();
	99
	100	###### clean up text ######
	101	$$textref =~ s/\r$//mg; # remove dos ^M
	102	$$textref =~ s/%.*$//mg; # remove comments
	103
	104	# convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
	105
	106	$$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
	107	# check that both sides are ascii, so we don't screw up utf-8 chars
	108	$$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
	109	unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
	110
	111
	112	###### find metadata ######
	113
[8121]	114	## FileFormat metadata ##
	115	$doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
	116
[7541]	117	### title metadata ###
	118	$$textref =~ m@\\title\s{(.?)}@s;
	119	my $title = $1;
[8098]	120	if (!$title) {
	121	# no title tag. look for a chapter/section heading
	122	$$textref =~ m@\\(?:chapter\|section)\s{(.?)}@s; # will get 1st match
	123	$title = $1;
	124	}
	125	if (!$title) {
	126	# no chapter/section heading tags either... use filename
	127	$title = $file;
	128	$title =~ s/\.tex$//i;
	129	$title =~ s/[-_.]/ /g; # turn punctuation into spaces
	130	}
[7541]	131	if ($title) {
	132	$title =~ s@\\\\@ @g; # embedded newlines
	133	$title = $self->process_latex($title); # no "-html" for title eg in browser
	134	$doc_obj->add_utf8_metadata($cursection, "Title", $title);
	135	}
	136
	137	### creator/individual author metadata ###
	138	$$textref =~ m@\\author\s{((?:{.?}\|.?)+)}\s$@sm;
	139	my $authors=$1;
	140	if ($authors) {
	141	# take care of "et al."...
	142	$authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
	143	my $etal=$1;
	144	$etal="" if (!defined ($etal));
	145
	146	my @authorlist=parse_authors($self, $authors);
	147
	148	foreach my $author (@authorlist) {
	149	# Add each name to set of Authors
	150	$doc_obj->add_utf8_metadata ($cursection, "Author", $author);
	151	}
	152
	153	# Only want at most one "and" in the Creator field
	154	my $creator_str="";
	155	if (scalar(@authorlist) > 2) {
	156	my $lastauthor=pop @authorlist;
	157	$creator_str=join(', ', @authorlist);
	158	$creator_str.=" and $lastauthor";
	159	} else { # 1 or 2 authors...
	160	$creator_str=join(" and ",@authorlist);
	161	}
	162	$creator_str.=$etal; # if there was "et al."
	163	$doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
	164	}
	165	### end of author metadata ###
	166
	167	###### process latex for the main text ######
[31780]	168	$$textref =~ s/^.*?\\begin\{document}//s;
	169	$$textref =~ s/\\end\{document}.*?$//s;
[7541]	170	$$textref = $self->process_latex("-html",$$textref);
	171	$doc_obj->add_utf8_text($cursection, $$textref);
	172
	173	return 1;
	174	}
	175
	176
	177	# returns a list of author names
	178	sub parse_authors {
	179	my $self=shift;
	180	my $authors=shift;
	181
	182	my $outhandle=$self->{'outhandle'};
	183
	184	$authors =~ s/\n/ /g; # remove newlines
	185
	186	# some people do this for affiliation footnote/dagger
	187	$authors =~ s@\$.*?\$@@g; # remove maths from author :(
	188
	189	# und here for german language...
	190	# don't use brackets in pattern, else the matched bit becomes
	191	# an element in the list!
	192	my @authorlist = split(/\s+and\s+\|\s+und\s+/, $authors);
	193	my @formattedlist = ();
	194	foreach my $author (@authorlist) {
	195	$author =~ s/\s*$//;
	196	$author =~ s/^\s*//;
	197	# Reformat and add author name
	198	next if $author=~ /^\s*$/;
	199
	200	# names are "First von Last", "von Last, First"
	201	# or "von Last, Jr, First". See the "BibTeXing" manual, page 16
	202	my $first="";
	203	my $vonlast="";
	204	my $jr="";
	205
	206	if ($author =~ /,/) {
	207	my @parts=split(/,\s*/, $author);
	208	$first = pop @parts;
	209	if (scalar(@parts) == 2) {
	210	$jr = pop @parts;
	211	}
	212	$vonlast=shift @parts;
	213	if (scalar(@parts) > 0) {
	214	print $outhandle $self->{'plugin_type'} .
	215	": couldn't parse name $author\n";
	216	# but we continue anyway...
	217	}
	218	} else { # First von Last
	219	my @words = split(/ /, $author);
	220	while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
	221	$first .= " " . shift (@words);
	222	}
	223	$first =~ s/^\s//;
	224	$vonlast = join (' ', @words); # whatever's left...
	225	}
	226	my $von="";
	227	my $last="";
	228	if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
	229	$vonlast =~ s/^(([a-z]\w+\s+)+)//;
	230	$von = $1;
	231	if (!defined ($von)) {
	232	# some non-English names do start with lowercase
	233	# eg "Marie desJardins". Also we can get typos...
	234	print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
	235	$von="";
	236	if ($vonlast =~ /^[a-z]+$/) {
	237	# if it's all lowercase, uppercase 1st.
	238	$vonlast =~ s/^(.)/\u$1/;
	239	}
	240	}
	241	$von =~ s/\s*$//;
	242	$last=$vonlast;
	243	} else {
	244	$last=$vonlast;
	245	}
	246	my $wholename="$first $von $last $jr";
	247	$wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
	248	# my $fullname = "$last";
	249	# $fullname .= " $jr" if ($jr);
	250	# $fullname .= ", $first";
	251	# $fullname .= " $von" if ($von);
	252	push (@formattedlist, $wholename);
	253	}
	254	return @formattedlist;
	255	}
	256
	257
	258	## following functions based on bibtex plugin ##
	259	# not actually used at the moment, but might be useful in future?
	260	sub expand_month {
	261	my $text=shift;
	262
	263	# bibtex style files expand abbreviations for months.
	264	# Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
	265	$text =~ s/jan/_textmonth01_/g;
	266	$text =~ s/feb/_textmonth02_/g;
	267	$text =~ s/mar/_textmonth03_/g;
	268	$text =~ s/apr/_textmonth04_/g;
	269	$text =~ s/may/_textmonth05_/g;
	270	$text =~ s/jun/_textmonth06_/g;
	271	$text =~ s/jul/_textmonth07_/g;
	272	$text =~ s/aug/_textmonth08_/g;
	273	$text =~ s/sep/_textmonth09_/g;
	274	$text =~ s/oct/_textmonth10_/g;
	275	$text =~ s/nov/_textmonth11_/g;
	276	$text =~ s/dec/_textmonth12_/g;
	277
	278	return $text;
	279	}
	280
	281
	282	# If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
	283	# as the first argument to this function.
	284	#
	285	# Convert accented characters, remove { }, interprete some commands....
	286	# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
	287
	288	# Also, it sucks quite a bit for complicated/nested commands since it doesn't
	289	# match { with the corresponding }, only the nearest }
	290
	291	sub process_latex {
	292	my $self=shift;
	293	my $text=shift;
	294
	295	my $outhandle=$self->{'outhandle'};
	296
	297	my $html_markup=0;
	298	if ($text =~ /^\-html/) {
	299	$html_markup=1;
	300	$text=shift;
	301	}
	302
	303	if (! $text) {
	304	return $text;
	305	}
	306	# escape html-sensitive characters
	307	$text =~ s@&@&@g;
	308	$text =~ s@<@<@g;
	309	$text =~ s@>@>@g;
	310
	311	# do this before accents, since \= means something different in tabbing
	312	# also \> is a tab stop too, and \\ is newline
	313	sub do_tabbing {
	314	my $tabbing=shift;
	315	$tabbing =~ s!^.\\kill\s$!!mg; # \kill sets tab stops, kills line
	316	$tabbing =~ s~\\(?:=\|>)~\xc2\xa0~g; # replace with nbsp
	317	$tabbing =~ s~[\\][\\](?:\[.?\])?\s$~<br/>~mg;
	318	return "<br/>" . $tabbing . "<br/>\n";
	319	}
[31780]	320	$text =~ s@\\begin\{tabbing}(.*?)\\end\{tabbing}@do_tabbing($1)@ges;
[7541]	321	sub do_tabular {
	322	my $tabular=shift;
	323	$tabular =~ s~(?<!\\)\s&\s~</td><td>~g;
	324	$tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
	325	$tabular =~ s~\\hline~~g; # for now...
[31780]	326	$tabular =~ s~<td>\s\\multicolumn\{(\d+)}\{.?}~<td colspan="$1">~g;
[7541]	327	return "<table border=\"1\">\n <tr><td>"
	328	. $tabular . "</td></tr></table>\n";
	329	}
[31780]	330	$text =~ s@\\begin\{tabular}(?:\[.?\])?{.?}(.*?)\\end\{tabular} @
[7541]	331	do_tabular($1) @xges;
	332
	333	$text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
	334
	335	# process maths mode before accents... things like \, mean different!
	336	# maths mode
	337	$text =~ s@\$\$(.*?)\$\$
	338	@ process_latex_math($html_markup,$1)
	339	@xsge; # multi-line maths: $$ .... $$
	340
	341	$text =~ s@([^\\])\$(.*?[^\\])\$
	342	@$1.process_latex_math($html_markup,$2)@xsge;
	343
	344
	345	# is this an amstext environment, or just custom for that input file?
[31780]	346	$text =~ s@\\begin\{(algorithm)}(.*?)\\end\{\1}@remove_equals($2)@ges;
[7541]	347
	348	# convert latex-style accented characters.
	349	$self->latex_accents_to_utf8(\$text);
	350
	351	# replace quotes with utf-8
	352
	353	$text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
	354	$text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
	355	$text =~ s/`/\xe2\xc0\x98/g; # single left quote
	356	$text =~ s/'/\xe2\xc0\x99/g; # single right quote
	357
	358	###### remove/replace latex commands ######
	359	### commands that expand to something that gets displayed ###
	360	$text =~ s~\\ldots~…~g;
	361	$text =~ s~\\hrule~<hr/>\n~g;
	362	$text =~ s~\\maketitle~ ~;
	363	### space commands ###
	364	$text =~ s~\\[vh]skip\s+\S+~~g;
	365	$text =~ s~\\vspace\?{.?}~<div> </div>~g; # vertical space
	366	$text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
	367	$text =~ s~\\noindent\b~~g;
	368	# newpage, etc
	369	$text =~ s~\\(?:clearemptydoublepage\|newpage)~~g;
	370	### counters, contents, environments, labels, etc ###
[31780]	371	$text =~ s~\\(?:addcontentsline){.?}\{.?}\{.*}~~g;
	372	$text =~ s~\s\\begin\{itemize}\s~\n<ul>\n~g;
	373	$text =~ s~\s\\end\{itemize}\s~</li></ul>\n~g;
	374	$text =~ s~\s\\begin\{enumerate}\s~<ol>\n~g;
	375	$text =~ s~\s\\end\{enumerate}\s~</li></ol>\n~g;
[7541]	376	if ($text =~ s~\s*\\item~</li>\n<li>~g) {
	377	# (count for first list item)
	378	$text =~ s~<([ou])l>\s</li>\s~<$1l>~g;
	379	}
	380	$text =~ s~\\(?:label\|begin\|end){.?}\s\n?~ ~g; # remove tag and contents
	381	$text =~ s~\\(?:tableofcontents\|listoffigures)~ ~g;
	382	### font sizes/styles ###
	383	$text =~ s~\\(?:tiny\|small\|footnotesize\|normalsize\|large\|Large\|huge\|Huge)\b~~g;
	384
	385	if ($html_markup) {
	386	$text =~ s~\\section\?{([^\}]+)}\s\n?~<H1>$1</H1>\n~g;
	387	$text =~ s~\\subsection\?{(.?)}\s*\n?~<H2>$1</H2>\n~g;
	388	$text =~ s~{\\tt\s(.?)}~<tt>$1</tt>~g;
	389	$text =~ s~\\(?:texttt\|tt\|ttseries)\s{(.?)}~<tt>$1</tt>~g;
[31780]	390	$text =~ s~\\emph\{(.*?)}~<em>$1</em>~g;
[8098]	391	$text =~ s~{\\(?:em\|it)\s(.?)}~<em>$1</em>~g;
[7541]	392	$text =~ s~{\\(?:bf\|bfseries)\s(.?)}~<strong>$1</strong>~g;
	393	$text =~ s~\\(?:textbf\|bf\|bfseries)\s{(.?)}~<strong>$1</strong>~g;
	394	} else {
	395	# remove tags for text-only
	396	$text =~ s~\\(?:textbf\|bf\|bfseries\|em\|emph\|tt\|rm\|texttt)\b~ ~g;
	397	}
	398	$text =~ s ~ {\\sc\s+(.*?)} ~
	399	{<span style="font-variant:\ small-caps">$1</span>} ~gx;
	400	# ignore these font tags (if there are any left)
	401	# sf is sans-serif
	402	$text =~ s~\\(?:mdseries\|textmd\|bfseries\|textbf\|sffamily\|sf\|sc)\b~ ~;
	403	#### end font-related stuff ####
	404
	405	### remove all other commands with optional arguments... ###
	406	# don't remove commands without { }....
	407	# $text =~ s~\\\w+(\[.?\])?\s~~g;
	408	# $text =~ s~\\noopsort{[^}]+\}~~g;
	409	# verbatim
	410	$text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
	411	# remove tags, keep contents for \tag[optional]{contents}
	412	while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
	413
	414	# remove latex groupings { } (but not \{ or \} )
	415	while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
	416	$text =~ s/^\{//; # remove { if first char
	417
	418	# latex characters
	419	# spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
	420	$text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
	421	# optional break "\-"
	422	if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
	423	# the non-macro bits have quotes around them - we just remove quotes
	424	# XXX bibtex and latex differ here (for the '#' char)...
	425	$text =~ s/([^&])[\"\#]/$1/g;
	426	}
	427	# dashes. Convert (m\|n)-dash into single dash for html.
	428	$text =~ s~\-\-+~\-~g;
	429
	430	# quoted { } chars
[31780]	431	$text =~ s~\\\{~{~g;
[7541]	432	$text =~ s~\\}~}~g;
	433
	434	# spaces
	435	$text =~ s~\\ ~ ~g;
	436
	437	# finally to protect against macro language...
	438	# greenstone-specific
	439	$text =~ s~\[~&\#91;~g;
	440	$text =~ s~\]~&\#93;~g;
	441	$text =~ s~(?<!\\)([\\_])~\\$1~g;
	442
	443	if ($html_markup) {
	444	$text =~ s~\n{2,}~\n</p>\n<p>~g;
	445	return "<p>$text</p>";
	446	}
	447
	448	return $text;
	449	}
	450
	451	# only used by process_latex for \verb....
	452	sub verb_text {
	453	my $verbatim=shift;
	454	$verbatim =~ s/([{}_])/\\$1/g;
	455	return $verbatim;
	456	}
[8098]	457	# only used by process_latex_math
	458	# returns a unicode char if applicable, otherwise ascii
	459	sub math_fraction {
	460	my $num=$1;
	461	my $denom=$2;
[7541]	462
[8098]	463	if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
	464	if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
	465	if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
	466	return "$num/$denom";
	467	}
[7541]	468
	469	sub process_latex_math {
	470
	471	my $text = pop; # if given one or two args, this is the last one...
	472	my $html_markup=pop; # if given two args, this is the first one else undef
	473
	474	$text =~ s~\\,~ ~g; # forces a space?
	475	$text =~ s~\\infty~infinity~g; # or unicode 0x221E...
	476
	477	# use this one when more things can read 3-byte utf8 values like this!
	478	# $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
	479	$text =~ s~\\cup\b~ U ~g;
	480
[31780]	481	$text =~ s~\\frac\s*{(.+?)}\{(.+?)}~math_fraction($1,$2)~ge;
[8098]	482
[7541]	483	if ($html_markup) {
[31780]	484	$text =~ s~\^\{(.*?)}~<sup>$1</sup>~g; # a^b superscript
[7541]	485	$text =~ s~\^([^\{])~<sup>$1</sup>~g;
[31780]	486	$text =~ s~\_\{(.*?)}~<sub>$1</sub>~g; # a_b subscript
[7541]	487	$text =~ s~\_([^\{])~<sub>$1</sub>~g;
	488
	489	$text =~ s~\\ldots~…~g; # use html named entity for now
	490
	491	# put all other command names in italics for now
	492	$text =~ s~\\([\w]+)~<i>$1</i> ~g;
	493	}
	494
	495	# special cases, for some input files
	496	if ($text =~ m~^\\\w+$~) {
	497	$text="{" . $text . "}";
	498	}
	499
	500	return $text;
	501	}
	502
	503
	504
	505	sub latex_accents_to_utf8 {
	506
	507	# note - this is really ugly, but it works. There may be a prettier way
	508	# of mapping latex accented chars to utf8, but we just brute force it here.
	509	# Also, this isn't complete - not every single possible accented letter
	510	# is in here yet, but most of the common ones are.
	511
	512	my %utf8_chars =
	513	(
	514	# acutes
	515	'\'a' => chr(0xc3).chr(0xa1),
	516	'\'c' => chr(0xc4).chr(0x87),
	517	'\'e' => chr(0xc3).chr(0xa9),
	518	'\'i' => chr(0xc3).chr(0xad),
	519	'\'l' => chr(0xc3).chr(0xba),
	520	'\'n' => chr(0xc3).chr(0x84),
	521	'\'o' => chr(0xc3).chr(0xb3),
	522	'\'r' => chr(0xc5).chr(0x95),
	523	'\'s' => chr(0xc5).chr(0x9b),
	524	'\'u' => chr(0xc3).chr(0xba),
	525	'\'y' => chr(0xc3).chr(0xbd),
	526	'\'z' => chr(0xc5).chr(0xba),
	527	# graves
	528	'`a' => chr(0xc3).chr(0xa0),
	529	'`A' => chr(0xc3).chr(0x80),
	530	'`e' => chr(0xc3).chr(0xa8),
	531	'`E' => chr(0xc3).chr(0x88),
	532	'`i' => chr(0xc3).chr(0xac),
	533	'`I' => chr(0xc3).chr(0x8c),
	534	'`o' => chr(0xc3).chr(0xb2),
	535	'`O' => chr(0xc3).chr(0x92),
	536	'`u' => chr(0xc3).chr(0xb9),
	537	'`U' => chr(0xc3).chr(0x99),
	538	# circumflex
	539	'^a' => chr(0xc3).chr(0xa2),
	540	'^A' => chr(0xc3).chr(0x82),
	541	'^c' => chr(0xc4).chr(0x89),
	542	'^C' => chr(0xc4).chr(0x88),
	543	'^e' => chr(0xc3).chr(0xaa),
	544	'^E' => chr(0xc3).chr(0x8a),
	545	'^g' => chr(0xc4).chr(0x9d),
	546	'^G' => chr(0xc4).chr(0x9c),
	547	'^h' => chr(0xc4).chr(0xa5),
	548	'^H' => chr(0xc4).chr(0xa4),
	549	'^i' => chr(0xc3).chr(0xae),
	550	'^I' => chr(0xc3).chr(0x8e),
	551	'^j' => chr(0xc4).chr(0xb5),
	552	'^J' => chr(0xc4).chr(0xb4),
	553	'^o' => chr(0xc3).chr(0xb4),
	554	'^O' => chr(0xc3).chr(0x94),
	555	'^s' => chr(0xc5).chr(0x9d),
	556	'^S' => chr(0xc5).chr(0x9c),
	557	'^u' => chr(0xc3).chr(0xa2),
	558	'^U' => chr(0xc3).chr(0xbb),
	559	'^w' => chr(0xc5).chr(0xb5),
	560	'^W' => chr(0xc5).chr(0xb4),
	561	'^y' => chr(0xc5).chr(0xb7),
	562	'^Y' => chr(0xc5).chr(0xb6),
	563
	564	# diaeresis
	565	'"a' => chr(0xc3).chr(0xa4),
	566	'"A' => chr(0xc3).chr(0x84),
	567	'"e' => chr(0xc3).chr(0xab),
	568	'"E' => chr(0xc3).chr(0x8b),
[8097]	569	'"i' => chr(0xc3).chr(0xaf),
	570	'"I' => chr(0xc3).chr(0x8f),
[7541]	571	'"\\\\i' => chr(0xc3).chr(0xaf),
	572	'"\\\\I' => chr(0xc3).chr(0x8f),
	573	'"o' => chr(0xc3).chr(0xb6),
	574	'"O' => chr(0xc3).chr(0x96),
	575	'"u' => chr(0xc3).chr(0xbc),
	576	'"U' => chr(0xc3).chr(0x9c),
	577	'"y' => chr(0xc3).chr(0xbf),
	578	'"Y' => chr(0xc3).chr(0xb8),
	579	# tilde
	580	'~A' => chr(0xc3).chr(0x83),
	581	'~N' => chr(0xc3).chr(0x91),
	582	'~O' => chr(0xc3).chr(0x95),
	583	'~a' => chr(0xc3).chr(0xa3),
	584	'~n' => chr(0xc3).chr(0xb1),
	585	'~o' => chr(0xc3).chr(0xb5),
	586	# caron - handled specially
	587	# double acute
	588	# ring
	589	# dot
	590	'.c' => chr(0xc4).chr(0x8b),
	591	'.C' => chr(0xc4).chr(0x8a),
	592	'.e' => chr(0xc4).chr(0x97),
	593	'.E' => chr(0xc4).chr(0x96),
	594	'.g' => chr(0xc4).chr(0xa1),
	595	'.G' => chr(0xc4).chr(0xa0),
	596	'.I' => chr(0xc4).chr(0xb0),
	597	'.z' => chr(0xc5).chr(0xbc),
	598	'.Z' => chr(0xc5).chr(0xbb),
	599	# macron
	600	'=a' => chr(0xc4).chr(0x81),
	601	'=A' => chr(0xc4).chr(0x80),
	602	'=e' => chr(0xc4).chr(0x93),
	603	'=E' => chr(0xc4).chr(0x92),
	604	'=i' => chr(0xc4).chr(0xab),
	605	'=I' => chr(0xc4).chr(0xaa),
	606	'=o' => chr(0xc4).chr(0x8d),
	607	'=O' => chr(0xc4).chr(0x8c),
	608	'=u' => chr(0xc4).chr(0xab),
	609	'=U' => chr(0xc4).chr(0xaa),
	610
	611	# stroke - handled specially - see below
	612
	613	# cedilla - handled specially
	614	);
	615
	616	# these are one letter latex commands - we make sure they're not a longer
	617	# command name. eg {\d} is d+stroke, so careful of \d
	618	my %special_utf8_chars =
	619	(
	620	# breve
	621	'u g' => chr(0xc4).chr(0x9f),
	622	'u G' => chr(0xc4).chr(0x9e),
	623	'u i' => chr(0xc4).chr(0xad),
	624	'u I' => chr(0xc4).chr(0xac),
	625	'u o' => chr(0xc5).chr(0x8f),
	626	'u O' => chr(0xc5).chr(0x8e),
	627	'u u' => chr(0xc5).chr(0xad),
	628	'u U' => chr(0xc5).chr(0xac),
	629	'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
	630	'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
	631	# caron
	632	'v c' => chr(0xc4).chr(0x8d),
	633	'v C' => chr(0xc4).chr(0x8c),
	634	'v n' => chr(0xc5).chr(0x88),
	635	'v N' => chr(0xc5).chr(0x87),
	636	'v s' => chr(0xc5).chr(0xa1),
	637	'v S' => chr(0xc5).chr(0xa5),
	638	'v z' => chr(0xc5).chr(0xbe),
	639	'v Z' => chr(0xc5).chr(0xbd),
	640	# cedilla
	641	'c c' => chr(0xc3).chr(0xa7),
	642	'c C' => chr(0xc3).chr(0x87),
	643	'c g' => chr(0xc4).chr(0xa3),
	644	'c G' => chr(0xc4).chr(0xa2),
	645	'c k' => chr(0xc4).chr(0xb7),
	646	'c K' => chr(0xc4).chr(0xb6),
	647	'c l' => chr(0xc4).chr(0xbc),
	648	'c L' => chr(0xc4).chr(0xbb),
	649	'c n' => chr(0xc5).chr(0x86),
	650	'c N' => chr(0xc5).chr(0x85),
	651	'c r' => chr(0xc5).chr(0x97),
	652	'c R' => chr(0xc5).chr(0x96),
	653	'c s' => chr(0xc5).chr(0x9f),
	654	'c S' => chr(0xc5).chr(0x9e),
	655	'c t' => chr(0xc5).chr(0xa3),
	656	'c T' => chr(0xc5).chr(0xa2),
	657	# double acute / Hungarian accent
	658	'H O' => chr(0xc5).chr(0x90),
	659	'H o' => chr(0xc5).chr(0x91),
	660	'H U' => chr(0xc5).chr(0xb0),
	661	'H u' => chr(0xc5).chr(0xb1),
	662
	663	# stroke
	664	'd' => chr(0xc4).chr(0x91),
	665	'D' => chr(0xc4).chr(0x90),
	666	'h' => chr(0xc4).chr(0xa7),
	667	# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
	668	'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
	669	'l' => chr(0xc5).chr(0x82),
	670	'L' => chr(0xc5).chr(0x81),
	671	'o' => chr(0xc3).chr(0xb8),
	672	'O' => chr(0xc3).chr(0x98),
	673	't' => chr(0xc5).chr(0xa7),
	674	'T' => chr(0xc5).chr(0xa6),
	675
	676	# other special chars
	677	'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
	678	'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
	679	);
	680
	681	my $self=shift;
	682	my $textref=shift;
	683
	684	my $outhandle=$self->{'outhandle'};
	685	my $text=$$textref;
	686
	687	# remove space (if any) between \ and letter to accent (eg {\' a})
	688	$text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
	689
	690	# remove {} around a single character (eg \'{e})
	691	$text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
	692
	693	## only in bibtex... not in latex proper?!
	694	### \, is another way of doing cedilla \c
	695	##$text =~ s~\\,(.)~\\c $1~g;
	696
	697	# remove {} around a single character for special 1 letter commands -
	698	# need to insert a space. Eg \v{s} -> {\v s}
	699	$text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
	700
	701	# only do if the text contains a '\' character.
	702	if ($text =~ m\|\\\|) {
	703	# "normal" accents - ie non-alpha latex tag
	704	# xxx used to have ([\w]\b)@ (for word boundary)
	705	while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
	706	my $tex="$1$2"; my $char="$2";
	707	my $replacement=$utf8_chars{$tex};
	708	if (!defined($replacement)) {
	709	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
	710	print $outhandle . $self->{'plugin_type'} .
	711	": Warning: unknown latex accent \"$tex\""
	712	. " in \"$1\"\n";
	713	$replacement=$char;
	714	}
	715	$text =~ s/\\\Q$tex/$replacement/g;
	716	}
	717
	718	# where the following letter matters (eg "sm\o rrebr\o d", \ss{})
	719	# only do the change if immediately followed by a space, }, {, or \
	720	# one letter accents ( + ss / aa)
	721	while ($text =~ m~\\([DdhiLlOoTt]\|ss\|aa)[{}\s\"\\]~) {
	722	my $tex=$1;
	723	my $replacement=$special_utf8_chars{$tex};
	724	if (!defined($replacement)) {
	725	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
	726	print $outhandle $self->{'plugin_type'} .
	727	": Warning: unknown latex accent \"$tex\""
	728	. " in \"$1\"\n";
	729	$replacement=$tex;
	730	}
	731	($text =~ s/{\\$tex}/$replacement/g) or
	732	$text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
	733
	734	}
	735
	736	# one letter latex accent commands that affect following letter
	737	while ($text =~ m~\\([uvcH]) ([\w])~) {
	738	my $tex="$1 $2"; my $char="$2";
	739	my $replacement=$special_utf8_chars{$tex};
	740	if (!defined($replacement)) {
	741	$text =~ m~(.{20}\\\Q$tex\E.{20})~s;
	742	print $outhandle $self->{'plugin_type'} .
	743	": Warning: unknown latex accent \"$tex\""
	744	. " in \"$1\"\n";
	745	$replacement=$char;
	746	}
	747	$text =~ s/\\$tex/$replacement/g;
	748	}
	749	}
	750	$textref=\$text;
	751	}
	752
	753
	754	# modules must return true
	755	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: