[7541] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
[15872] | 3 | # LaTeXPlugin.pm
|
---|
[7541] | 4 | #
|
---|
| 5 | # A component of the Greenstone digital library software
|
---|
| 6 | # from the New Zealand Digital Library Project at the
|
---|
| 7 | # University of Waikato, New Zealand.
|
---|
| 8 | #
|
---|
| 9 | # Written by John McPherson
|
---|
| 10 | # Copyright (C) 2004 New Zealand Digital Library Project
|
---|
| 11 | #
|
---|
| 12 | # This program is free software; you can redistribute it and/or modify
|
---|
| 13 | # it under the terms of the GNU General Public License as published by
|
---|
| 14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 15 | # (at your option) any later version.
|
---|
| 16 | #
|
---|
| 17 | # This program is distributed in the hope that it will be useful,
|
---|
| 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 20 | # GNU General Public License for more details.
|
---|
| 21 | #
|
---|
| 22 | ###########################################################################
|
---|
| 23 |
|
---|
[8098] | 24 | # todo:
|
---|
| 25 | # \includegraphics
|
---|
| 26 | # parse/remove tex \if ... macros
|
---|
[7541] | 27 |
|
---|
[15872] | 28 | package LaTeXPlugin;
|
---|
[7541] | 29 |
|
---|
[10218] | 30 | # System complains about $arguments if the strict is set
|
---|
[10254] | 31 | use strict;
|
---|
| 32 | no strict 'refs'; # so we can print to a handle named by a variable
|
---|
[7541] | 33 |
|
---|
| 34 | # greenstone packages
|
---|
[15872] | 35 | use ReadTextFile;
|
---|
[24548] | 36 | use MetadataRead;
|
---|
[7541] | 37 | use unicode;
|
---|
[7559] | 38 | use util;
|
---|
[7541] | 39 |
|
---|
[11390] | 40 | my $arguments =
|
---|
| 41 | [ { 'name' => "process_exp",
|
---|
[31492] | 42 | 'desc' => "{BaseImporter.process_exp}",
|
---|
[11390] | 43 | 'type' => "regexp",
|
---|
| 44 | 'reqd' => "no",
|
---|
| 45 | 'deft' => &get_default_process_exp() } ];
|
---|
| 46 |
|
---|
[16013] | 47 | my $options = { 'name' => "LaTeXPlugin",
|
---|
| 48 | 'desc' => "{LaTeXPlugin.desc}",
|
---|
| 49 | 'abstract' => "no",
|
---|
| 50 | 'inherits' => "yes",
|
---|
[11390] | 51 | 'args' => $arguments };
|
---|
| 52 |
|
---|
[7541] | 53 | sub BEGIN {
|
---|
[24548] | 54 | @LaTeXPlugin::ISA = ('MetadataRead', 'ReadTextFile');
|
---|
[7541] | 55 | }
|
---|
| 56 |
|
---|
| 57 | sub new {
|
---|
[10218] | 58 | my ($class) = shift (@_);
|
---|
| 59 | my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
|
---|
| 60 | push(@$pluginlist, $class);
|
---|
[7541] | 61 |
|
---|
[15872] | 62 | push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
|
---|
| 63 | push(@{$hashArgOptLists->{"OptList"}},$options);
|
---|
[7541] | 64 |
|
---|
[15872] | 65 | my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
|
---|
[7541] | 66 |
|
---|
| 67 | $self->{'aux_files'} = {};
|
---|
| 68 | $self->{'dir_num'} = 0;
|
---|
| 69 | $self->{'file_num'} = 0;
|
---|
| 70 | return bless $self, $class;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 |
|
---|
| 74 | sub get_default_process_exp {
|
---|
| 75 | my $self = shift (@_);
|
---|
| 76 | return q^\.tex$^;
|
---|
| 77 | }
|
---|
| 78 |
|
---|
| 79 | sub get_default_block_exp {
|
---|
| 80 | # assume any .eps files are part of the latex stuff
|
---|
| 81 | return '\.(?:eps)$';
|
---|
| 82 | }
|
---|
| 83 |
|
---|
| 84 |
|
---|
| 85 | sub process {
|
---|
| 86 | my $self = shift (@_);
|
---|
| 87 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
|
---|
| 88 |
|
---|
| 89 | my $start=substr($$textref, 0, 200); # first 200 bytes
|
---|
| 90 |
|
---|
| 91 | if ($start !~ m~\\ (?:documentclass | documentstyle | input | section
|
---|
| 92 | | chapter | contents | begin) ~x) {
|
---|
| 93 | # this doesn't look like latex...
|
---|
| 94 | return undef;
|
---|
| 95 | }
|
---|
| 96 | my $outhandle = $self->{'outhandle'};
|
---|
[16104] | 97 |
|
---|
[7541] | 98 | my $cursection = $doc_obj->get_top_section();
|
---|
| 99 |
|
---|
| 100 | ###### clean up text ######
|
---|
| 101 | $$textref =~ s/\r$//mg; # remove dos ^M
|
---|
| 102 | $$textref =~ s/%.*$//mg; # remove comments
|
---|
| 103 |
|
---|
| 104 | # convert to utf-8 if not already - assume non ascii => iso-8859-1/latin
|
---|
| 105 |
|
---|
| 106 | $$textref =~ s@(?<=[[:ascii:]])\xA0+@\xc2\xa0@g; # latin nonbreaking space
|
---|
| 107 | # check that both sides are ascii, so we don't screw up utf-8 chars
|
---|
| 108 | $$textref =~ s@ (?<=[[:ascii:]])([\x80-\xff])(?=[[:ascii:]]) @
|
---|
| 109 | unicode::ascii2utf8($1) @egx; # takes "extended ascii" (ie latin)
|
---|
| 110 |
|
---|
| 111 |
|
---|
| 112 | ###### find metadata ######
|
---|
| 113 |
|
---|
[8121] | 114 | ## FileFormat metadata ##
|
---|
| 115 | $doc_obj->add_metadata($cursection, "FileFormat", "LaTeX");
|
---|
| 116 |
|
---|
[7541] | 117 | ### title metadata ###
|
---|
| 118 | $$textref =~ m@\\title\s*{(.*?)}@s;
|
---|
| 119 | my $title = $1;
|
---|
[8098] | 120 | if (!$title) {
|
---|
| 121 | # no title tag. look for a chapter/section heading
|
---|
| 122 | $$textref =~ m@\\(?:chapter|section)\s*{(.*?)}@s; # will get 1st match
|
---|
| 123 | $title = $1;
|
---|
| 124 | }
|
---|
| 125 | if (!$title) {
|
---|
| 126 | # no chapter/section heading tags either... use filename
|
---|
| 127 | $title = $file;
|
---|
| 128 | $title =~ s/\.tex$//i;
|
---|
| 129 | $title =~ s/[-_.]/ /g; # turn punctuation into spaces
|
---|
| 130 | }
|
---|
[7541] | 131 | if ($title) {
|
---|
| 132 | $title =~ s@\\\\@ @g; # embedded newlines
|
---|
| 133 | $title = $self->process_latex($title); # no "-html" for title eg in browser
|
---|
| 134 | $doc_obj->add_utf8_metadata($cursection, "Title", $title);
|
---|
| 135 | }
|
---|
| 136 |
|
---|
| 137 | ### creator/individual author metadata ###
|
---|
| 138 | $$textref =~ m@\\author\s*{((?:{.*?}|.*?)+)}\s*$@sm;
|
---|
| 139 | my $authors=$1;
|
---|
| 140 | if ($authors) {
|
---|
| 141 | # take care of "et al."...
|
---|
| 142 | $authors =~ s/(\s+et\.?\s+al\.?)\s*$//;
|
---|
| 143 | my $etal=$1;
|
---|
| 144 | $etal="" if (!defined ($etal));
|
---|
| 145 |
|
---|
| 146 | my @authorlist=parse_authors($self, $authors);
|
---|
| 147 |
|
---|
| 148 | foreach my $author (@authorlist) {
|
---|
| 149 | # Add each name to set of Authors
|
---|
| 150 | $doc_obj->add_utf8_metadata ($cursection, "Author", $author);
|
---|
| 151 | }
|
---|
| 152 |
|
---|
| 153 | # Only want at most one "and" in the Creator field
|
---|
| 154 | my $creator_str="";
|
---|
| 155 | if (scalar(@authorlist) > 2) {
|
---|
| 156 | my $lastauthor=pop @authorlist;
|
---|
| 157 | $creator_str=join(', ', @authorlist);
|
---|
| 158 | $creator_str.=" and $lastauthor";
|
---|
| 159 | } else { # 1 or 2 authors...
|
---|
| 160 | $creator_str=join(" and ",@authorlist);
|
---|
| 161 | }
|
---|
| 162 | $creator_str.=$etal; # if there was "et al."
|
---|
| 163 | $doc_obj->add_utf8_metadata($cursection, "Creator", $creator_str);
|
---|
| 164 | }
|
---|
| 165 | ### end of author metadata ###
|
---|
| 166 |
|
---|
| 167 | ###### process latex for the main text ######
|
---|
[31780] | 168 | $$textref =~ s/^.*?\\begin\{document}//s;
|
---|
| 169 | $$textref =~ s/\\end\{document}.*?$//s;
|
---|
[7541] | 170 | $$textref = $self->process_latex("-html",$$textref);
|
---|
| 171 | $doc_obj->add_utf8_text($cursection, $$textref);
|
---|
| 172 |
|
---|
| 173 | return 1;
|
---|
| 174 | }
|
---|
| 175 |
|
---|
| 176 |
|
---|
| 177 | # returns a list of author names
|
---|
| 178 | sub parse_authors {
|
---|
| 179 | my $self=shift;
|
---|
| 180 | my $authors=shift;
|
---|
| 181 |
|
---|
| 182 | my $outhandle=$self->{'outhandle'};
|
---|
| 183 |
|
---|
| 184 | $authors =~ s/\n/ /g; # remove newlines
|
---|
| 185 |
|
---|
| 186 | # some people do this for affiliation footnote/dagger
|
---|
| 187 | $authors =~ s@\$.*?\$@@g; # remove maths from author :(
|
---|
| 188 |
|
---|
| 189 | # und here for german language...
|
---|
| 190 | # don't use brackets in pattern, else the matched bit becomes
|
---|
| 191 | # an element in the list!
|
---|
| 192 | my @authorlist = split(/\s+and\s+|\s+und\s+/, $authors);
|
---|
| 193 | my @formattedlist = ();
|
---|
| 194 | foreach my $author (@authorlist) {
|
---|
| 195 | $author =~ s/\s*$//;
|
---|
| 196 | $author =~ s/^\s*//;
|
---|
| 197 | # Reformat and add author name
|
---|
| 198 | next if $author=~ /^\s*$/;
|
---|
| 199 |
|
---|
| 200 | # names are "First von Last", "von Last, First"
|
---|
| 201 | # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
|
---|
| 202 | my $first="";
|
---|
| 203 | my $vonlast="";
|
---|
| 204 | my $jr="";
|
---|
| 205 |
|
---|
| 206 | if ($author =~ /,/) {
|
---|
| 207 | my @parts=split(/,\s*/, $author);
|
---|
| 208 | $first = pop @parts;
|
---|
| 209 | if (scalar(@parts) == 2) {
|
---|
| 210 | $jr = pop @parts;
|
---|
| 211 | }
|
---|
| 212 | $vonlast=shift @parts;
|
---|
| 213 | if (scalar(@parts) > 0) {
|
---|
| 214 | print $outhandle $self->{'plugin_type'} .
|
---|
| 215 | ": couldn't parse name $author\n";
|
---|
| 216 | # but we continue anyway...
|
---|
| 217 | }
|
---|
| 218 | } else { # First von Last
|
---|
| 219 | my @words = split(/ /, $author);
|
---|
| 220 | while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
|
---|
| 221 | $first .= " " . shift (@words);
|
---|
| 222 | }
|
---|
| 223 | $first =~ s/^\s//;
|
---|
| 224 | $vonlast = join (' ', @words); # whatever's left...
|
---|
| 225 | }
|
---|
| 226 | my $von="";
|
---|
| 227 | my $last="";
|
---|
| 228 | if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
|
---|
| 229 | $vonlast =~ s/^(([a-z]\w+\s+)+)//;
|
---|
| 230 | $von = $1;
|
---|
| 231 | if (!defined ($von)) {
|
---|
| 232 | # some non-English names do start with lowercase
|
---|
| 233 | # eg "Marie desJardins". Also we can get typos...
|
---|
| 234 | print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
|
---|
| 235 | $von="";
|
---|
| 236 | if ($vonlast =~ /^[a-z]+$/) {
|
---|
| 237 | # if it's all lowercase, uppercase 1st.
|
---|
| 238 | $vonlast =~ s/^(.)/\u$1/;
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 | $von =~ s/\s*$//;
|
---|
| 242 | $last=$vonlast;
|
---|
| 243 | } else {
|
---|
| 244 | $last=$vonlast;
|
---|
| 245 | }
|
---|
| 246 | my $wholename="$first $von $last $jr";
|
---|
| 247 | $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
|
---|
| 248 | # my $fullname = "$last";
|
---|
| 249 | # $fullname .= " $jr" if ($jr);
|
---|
| 250 | # $fullname .= ", $first";
|
---|
| 251 | # $fullname .= " $von" if ($von);
|
---|
| 252 | push (@formattedlist, $wholename);
|
---|
| 253 | }
|
---|
| 254 | return @formattedlist;
|
---|
| 255 | }
|
---|
| 256 |
|
---|
| 257 |
|
---|
| 258 | ## following functions based on bibtex plugin ##
|
---|
| 259 | # not actually used at the moment, but might be useful in future?
|
---|
| 260 | sub expand_month {
|
---|
| 261 | my $text=shift;
|
---|
| 262 |
|
---|
| 263 | # bibtex style files expand abbreviations for months.
|
---|
| 264 | # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
|
---|
| 265 | $text =~ s/jan/_textmonth01_/g;
|
---|
| 266 | $text =~ s/feb/_textmonth02_/g;
|
---|
| 267 | $text =~ s/mar/_textmonth03_/g;
|
---|
| 268 | $text =~ s/apr/_textmonth04_/g;
|
---|
| 269 | $text =~ s/may/_textmonth05_/g;
|
---|
| 270 | $text =~ s/jun/_textmonth06_/g;
|
---|
| 271 | $text =~ s/jul/_textmonth07_/g;
|
---|
| 272 | $text =~ s/aug/_textmonth08_/g;
|
---|
| 273 | $text =~ s/sep/_textmonth09_/g;
|
---|
| 274 | $text =~ s/oct/_textmonth10_/g;
|
---|
| 275 | $text =~ s/nov/_textmonth11_/g;
|
---|
| 276 | $text =~ s/dec/_textmonth12_/g;
|
---|
| 277 |
|
---|
| 278 | return $text;
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 |
|
---|
| 282 | # If you want basic html formatting (eg \emph -> <em>, \bf, etc), give "-html"
|
---|
| 283 | # as the first argument to this function.
|
---|
| 284 | #
|
---|
| 285 | # Convert accented characters, remove { }, interprete some commands....
|
---|
| 286 | # Note!! This is not comprehensive! Also assumes Latin -> Unicode!
|
---|
| 287 |
|
---|
| 288 | # Also, it sucks quite a bit for complicated/nested commands since it doesn't
|
---|
| 289 | # match { with the corresponding }, only the nearest }
|
---|
| 290 |
|
---|
| 291 | sub process_latex {
|
---|
| 292 | my $self=shift;
|
---|
| 293 | my $text=shift;
|
---|
| 294 |
|
---|
| 295 | my $outhandle=$self->{'outhandle'};
|
---|
| 296 |
|
---|
| 297 | my $html_markup=0;
|
---|
| 298 | if ($text =~ /^\-html/) {
|
---|
| 299 | $html_markup=1;
|
---|
| 300 | $text=shift;
|
---|
| 301 | }
|
---|
| 302 |
|
---|
| 303 | if (! $text) {
|
---|
| 304 | return $text;
|
---|
| 305 | }
|
---|
| 306 | # escape html-sensitive characters
|
---|
| 307 | $text =~ s@&@&@g;
|
---|
| 308 | $text =~ s@<@<@g;
|
---|
| 309 | $text =~ s@>@>@g;
|
---|
| 310 |
|
---|
| 311 | # do this before accents, since \= means something different in tabbing
|
---|
| 312 | # also \> is a tab stop too, and \\ is newline
|
---|
| 313 | sub do_tabbing {
|
---|
| 314 | my $tabbing=shift;
|
---|
| 315 | $tabbing =~ s!^.*\\kill\s*$!!mg; # \kill sets tab stops, kills line
|
---|
| 316 | $tabbing =~ s~\\(?:=|>)~\xc2\xa0~g; # replace with nbsp
|
---|
| 317 | $tabbing =~ s~[\\][\\](?:\[.*?\])?\s*$~<br/>~mg;
|
---|
| 318 | return "<br/>" . $tabbing . "<br/>\n";
|
---|
| 319 | }
|
---|
[31780] | 320 | $text =~ s@\\begin\{tabbing}(.*?)\\end\{tabbing}@do_tabbing($1)@ges;
|
---|
[7541] | 321 | sub do_tabular {
|
---|
| 322 | my $tabular=shift;
|
---|
| 323 | $tabular =~ s~(?<!\\)\s*&\s*~</td><td>~g;
|
---|
| 324 | $tabular =~ s~[\\][\\]\s*~</td></tr>\n <tr><td>~g;
|
---|
| 325 | $tabular =~ s~\\hline~~g; # for now...
|
---|
[31780] | 326 | $tabular =~ s~<td>\s*\\multicolumn\{(\d+)}\{.*?}~<td colspan="$1">~g;
|
---|
[7541] | 327 | return "<table border=\"1\">\n <tr><td>"
|
---|
| 328 | . $tabular . "</td></tr></table>\n";
|
---|
| 329 | }
|
---|
[31780] | 330 | $text =~ s@\\begin\{tabular}(?:\[.*?\])?{.*?}(.*?)\\end\{tabular} @
|
---|
[7541] | 331 | do_tabular($1) @xges;
|
---|
| 332 |
|
---|
| 333 | $text =~ s@[\\][\\]\s*\n@ @g; # fold lines ending with \\
|
---|
| 334 |
|
---|
| 335 | # process maths mode before accents... things like \, mean different!
|
---|
| 336 | # maths mode
|
---|
| 337 | $text =~ s@\$\$(.*?)\$\$
|
---|
| 338 | @ process_latex_math($html_markup,$1)
|
---|
| 339 | @xsge; # multi-line maths: $$ .... $$
|
---|
| 340 |
|
---|
| 341 | $text =~ s@([^\\])\$(.*?[^\\])\$
|
---|
| 342 | @$1.process_latex_math($html_markup,$2)@xsge;
|
---|
| 343 |
|
---|
| 344 |
|
---|
| 345 | # is this an amstext environment, or just custom for that input file?
|
---|
[31780] | 346 | $text =~ s@\\begin\{(algorithm)}(.*?)\\end\{\1}@remove_equals($2)@ges;
|
---|
[7541] | 347 |
|
---|
| 348 | # convert latex-style accented characters.
|
---|
| 349 | $self->latex_accents_to_utf8(\$text);
|
---|
| 350 |
|
---|
| 351 | # replace quotes with utf-8
|
---|
| 352 |
|
---|
| 353 | $text =~ s/``/\xe2\xc0\x9c/g; # Latex-specific, left-dbl quote (“)
|
---|
| 354 | $text =~ s/''/\xe2\xc0\x9d/g; # Latex-specific, right-dbl quote (”)
|
---|
| 355 | $text =~ s/`/\xe2\xc0\x98/g; # single left quote
|
---|
| 356 | $text =~ s/'/\xe2\xc0\x99/g; # single right quote
|
---|
| 357 |
|
---|
| 358 | ###### remove/replace latex commands ######
|
---|
| 359 | ### commands that expand to something that gets displayed ###
|
---|
| 360 | $text =~ s~\\ldots~…~g;
|
---|
| 361 | $text =~ s~\\hrule~<hr/>\n~g;
|
---|
| 362 | $text =~ s~\\maketitle~ ~;
|
---|
| 363 | ### space commands ###
|
---|
| 364 | $text =~ s~\\[vh]skip\s+\S+~~g;
|
---|
| 365 | $text =~ s~\\vspace\*?{.*?}~<div> </div>~g; # vertical space
|
---|
| 366 | $text =~ s~\\\w+skip~ ~g; # \smallskip \medskip \bigskip \baselineskip etc
|
---|
| 367 | $text =~ s~\\noindent\b~~g;
|
---|
| 368 | # newpage, etc
|
---|
| 369 | $text =~ s~\\(?:clearemptydoublepage|newpage)~~g;
|
---|
| 370 | ### counters, contents, environments, labels, etc ###
|
---|
[31780] | 371 | $text =~ s~\\(?:addcontentsline){.*?}\{.*?}\{.*}~~g;
|
---|
| 372 | $text =~ s~\s*\\begin\{itemize}\s*~\n<ul>\n~g;
|
---|
| 373 | $text =~ s~\s*\\end\{itemize}\s*~</li></ul>\n~g;
|
---|
| 374 | $text =~ s~\s*\\begin\{enumerate}\s*~<ol>\n~g;
|
---|
| 375 | $text =~ s~\s*\\end\{enumerate}\s*~</li></ol>\n~g;
|
---|
[7541] | 376 | if ($text =~ s~\s*\\item~</li>\n<li>~g) {
|
---|
| 377 | # (count for first list item)
|
---|
| 378 | $text =~ s~<([ou])l>\s*</li>\s*~<$1l>~g;
|
---|
| 379 | }
|
---|
| 380 | $text =~ s~\\(?:label|begin|end){.*?}\s*\n?~ ~g; # remove tag and contents
|
---|
| 381 | $text =~ s~\\(?:tableofcontents|listoffigures)~ ~g;
|
---|
| 382 | ### font sizes/styles ###
|
---|
| 383 | $text =~ s~\\(?:tiny|small|footnotesize|normalsize|large|Large|huge|Huge)\b~~g;
|
---|
| 384 |
|
---|
| 385 | if ($html_markup) {
|
---|
| 386 | $text =~ s~\\section\*?{([^\}]+)}\s*\n?~<H1>$1</H1>\n~g;
|
---|
| 387 | $text =~ s~\\subsection\*?{(.*?)}\s*\n?~<H2>$1</H2>\n~g;
|
---|
| 388 | $text =~ s~{\\tt\s*(.*?)}~<tt>$1</tt>~g;
|
---|
| 389 | $text =~ s~\\(?:texttt|tt|ttseries)\s*{(.*?)}~<tt>$1</tt>~g;
|
---|
[31780] | 390 | $text =~ s~\\emph\{(.*?)}~<em>$1</em>~g;
|
---|
[8098] | 391 | $text =~ s~{\\(?:em|it)\s*(.*?)}~<em>$1</em>~g;
|
---|
[7541] | 392 | $text =~ s~{\\(?:bf|bfseries)\s*(.*?)}~<strong>$1</strong>~g;
|
---|
| 393 | $text =~ s~\\(?:textbf|bf|bfseries)\s*{(.*?)}~<strong>$1</strong>~g;
|
---|
| 394 | } else {
|
---|
| 395 | # remove tags for text-only
|
---|
| 396 | $text =~ s~\\(?:textbf|bf|bfseries|em|emph|tt|rm|texttt)\b~ ~g;
|
---|
| 397 | }
|
---|
| 398 | $text =~ s ~ {\\sc\s+(.*?)} ~
|
---|
| 399 | {<span style="font-variant:\ small-caps">$1</span>} ~gx;
|
---|
| 400 | # ignore these font tags (if there are any left)
|
---|
| 401 | # sf is sans-serif
|
---|
| 402 | $text =~ s~\\(?:mdseries|textmd|bfseries|textbf|sffamily|sf|sc)\b~ ~;
|
---|
| 403 | #### end font-related stuff ####
|
---|
| 404 |
|
---|
| 405 | ### remove all other commands with optional arguments... ###
|
---|
| 406 | # don't remove commands without { }....
|
---|
| 407 | # $text =~ s~\\\w+(\[.*?\])?\s*~~g;
|
---|
| 408 | # $text =~ s~\\noopsort{[^}]+\}~~g;
|
---|
| 409 | # verbatim
|
---|
| 410 | $text =~ s~\\verb(.)(.*?)\1~verb_text($1)~ge;
|
---|
| 411 | # remove tags, keep contents for \tag[optional]{contents}
|
---|
| 412 | while ($text =~ s~\\\w+(\[.*?\])?{([^}]+)}~$2 ~g) {;} # all other commands
|
---|
| 413 |
|
---|
| 414 | # remove latex groupings { } (but not \{ or \} )
|
---|
| 415 | while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
|
---|
| 416 | $text =~ s/^\{//; # remove { if first char
|
---|
| 417 |
|
---|
| 418 | # latex characters
|
---|
| 419 | # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
|
---|
| 420 | $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
|
---|
| 421 | # optional break "\-"
|
---|
| 422 | if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
|
---|
| 423 | # the non-macro bits have quotes around them - we just remove quotes
|
---|
| 424 | # XXX bibtex and latex differ here (for the '#' char)...
|
---|
| 425 | $text =~ s/([^&])[\"\#]/$1/g;
|
---|
| 426 | }
|
---|
| 427 | # dashes. Convert (m|n)-dash into single dash for html.
|
---|
| 428 | $text =~ s~\-\-+~\-~g;
|
---|
| 429 |
|
---|
| 430 | # quoted { } chars
|
---|
[31780] | 431 | $text =~ s~\\\{~{~g;
|
---|
[7541] | 432 | $text =~ s~\\}~}~g;
|
---|
| 433 |
|
---|
| 434 | # spaces
|
---|
| 435 | $text =~ s~\\ ~ ~g;
|
---|
| 436 |
|
---|
| 437 | # finally to protect against macro language...
|
---|
| 438 | # greenstone-specific
|
---|
| 439 | $text =~ s~\[~&\#91;~g;
|
---|
| 440 | $text =~ s~\]~&\#93;~g;
|
---|
| 441 | $text =~ s~(?<!\\)([\\_])~\\$1~g;
|
---|
| 442 |
|
---|
| 443 | if ($html_markup) {
|
---|
| 444 | $text =~ s~\n{2,}~\n</p>\n<p>~g;
|
---|
| 445 | return "<p>$text</p>";
|
---|
| 446 | }
|
---|
| 447 |
|
---|
| 448 | return $text;
|
---|
| 449 | }
|
---|
| 450 |
|
---|
| 451 | # only used by process_latex for \verb....
|
---|
| 452 | sub verb_text {
|
---|
| 453 | my $verbatim=shift;
|
---|
| 454 | $verbatim =~ s/([{}_])/\\$1/g;
|
---|
| 455 | return $verbatim;
|
---|
| 456 | }
|
---|
[8098] | 457 | # only used by process_latex_math
|
---|
| 458 | # returns a unicode char if applicable, otherwise ascii
|
---|
| 459 | sub math_fraction {
|
---|
| 460 | my $num=$1;
|
---|
| 461 | my $denom=$2;
|
---|
[7541] | 462 |
|
---|
[8098] | 463 | if ($num==1 && $denom==2) {return chr(0xc2).chr(0xbd)}
|
---|
| 464 | if ($num==1 && $denom==4) {return chr(0xc2).chr(0xbc)}
|
---|
| 465 | if ($num==3 && $denom==4) {return chr(0xc2).chr(0xbe)}
|
---|
| 466 | return "$num/$denom";
|
---|
| 467 | }
|
---|
[7541] | 468 |
|
---|
| 469 | sub process_latex_math {
|
---|
| 470 |
|
---|
| 471 | my $text = pop; # if given one or two args, this is the last one...
|
---|
| 472 | my $html_markup=pop; # if given two args, this is the first one else undef
|
---|
| 473 |
|
---|
| 474 | $text =~ s~\\,~ ~g; # forces a space?
|
---|
| 475 | $text =~ s~\\infty~infinity~g; # or unicode 0x221E...
|
---|
| 476 |
|
---|
| 477 | # use this one when more things can read 3-byte utf8 values like this!
|
---|
| 478 | # $text =~ s~\\cup\b~\xe2\xc8\xaa~g; # union operator - unicode 0x222a
|
---|
| 479 | $text =~ s~\\cup\b~ U ~g;
|
---|
| 480 |
|
---|
[31780] | 481 | $text =~ s~\\frac\s*{(.+?)}\{(.+?)}~math_fraction($1,$2)~ge;
|
---|
[8098] | 482 |
|
---|
[7541] | 483 | if ($html_markup) {
|
---|
[31780] | 484 | $text =~ s~\^\{(.*?)}~<sup>$1</sup>~g; # a^b superscript
|
---|
[7541] | 485 | $text =~ s~\^([^\{])~<sup>$1</sup>~g;
|
---|
[31780] | 486 | $text =~ s~\_\{(.*?)}~<sub>$1</sub>~g; # a_b subscript
|
---|
[7541] | 487 | $text =~ s~\_([^\{])~<sub>$1</sub>~g;
|
---|
| 488 |
|
---|
| 489 | $text =~ s~\\ldots~…~g; # use html named entity for now
|
---|
| 490 |
|
---|
| 491 | # put all other command names in italics for now
|
---|
| 492 | $text =~ s~\\([\w]+)~<i>$1</i> ~g;
|
---|
| 493 | }
|
---|
| 494 |
|
---|
| 495 | # special cases, for some input files
|
---|
| 496 | if ($text =~ m~^\\\w+$~) {
|
---|
| 497 | $text="{" . $text . "}";
|
---|
| 498 | }
|
---|
| 499 |
|
---|
| 500 | return $text;
|
---|
| 501 | }
|
---|
| 502 |
|
---|
| 503 |
|
---|
| 504 |
|
---|
| 505 | sub latex_accents_to_utf8 {
|
---|
| 506 |
|
---|
| 507 | # note - this is really ugly, but it works. There may be a prettier way
|
---|
| 508 | # of mapping latex accented chars to utf8, but we just brute force it here.
|
---|
| 509 | # Also, this isn't complete - not every single possible accented letter
|
---|
| 510 | # is in here yet, but most of the common ones are.
|
---|
| 511 |
|
---|
| 512 | my %utf8_chars =
|
---|
| 513 | (
|
---|
| 514 | # acutes
|
---|
| 515 | '\'a' => chr(0xc3).chr(0xa1),
|
---|
| 516 | '\'c' => chr(0xc4).chr(0x87),
|
---|
| 517 | '\'e' => chr(0xc3).chr(0xa9),
|
---|
| 518 | '\'i' => chr(0xc3).chr(0xad),
|
---|
| 519 | '\'l' => chr(0xc3).chr(0xba),
|
---|
| 520 | '\'n' => chr(0xc3).chr(0x84),
|
---|
| 521 | '\'o' => chr(0xc3).chr(0xb3),
|
---|
| 522 | '\'r' => chr(0xc5).chr(0x95),
|
---|
| 523 | '\'s' => chr(0xc5).chr(0x9b),
|
---|
| 524 | '\'u' => chr(0xc3).chr(0xba),
|
---|
| 525 | '\'y' => chr(0xc3).chr(0xbd),
|
---|
| 526 | '\'z' => chr(0xc5).chr(0xba),
|
---|
| 527 | # graves
|
---|
| 528 | '`a' => chr(0xc3).chr(0xa0),
|
---|
| 529 | '`A' => chr(0xc3).chr(0x80),
|
---|
| 530 | '`e' => chr(0xc3).chr(0xa8),
|
---|
| 531 | '`E' => chr(0xc3).chr(0x88),
|
---|
| 532 | '`i' => chr(0xc3).chr(0xac),
|
---|
| 533 | '`I' => chr(0xc3).chr(0x8c),
|
---|
| 534 | '`o' => chr(0xc3).chr(0xb2),
|
---|
| 535 | '`O' => chr(0xc3).chr(0x92),
|
---|
| 536 | '`u' => chr(0xc3).chr(0xb9),
|
---|
| 537 | '`U' => chr(0xc3).chr(0x99),
|
---|
| 538 | # circumflex
|
---|
| 539 | '^a' => chr(0xc3).chr(0xa2),
|
---|
| 540 | '^A' => chr(0xc3).chr(0x82),
|
---|
| 541 | '^c' => chr(0xc4).chr(0x89),
|
---|
| 542 | '^C' => chr(0xc4).chr(0x88),
|
---|
| 543 | '^e' => chr(0xc3).chr(0xaa),
|
---|
| 544 | '^E' => chr(0xc3).chr(0x8a),
|
---|
| 545 | '^g' => chr(0xc4).chr(0x9d),
|
---|
| 546 | '^G' => chr(0xc4).chr(0x9c),
|
---|
| 547 | '^h' => chr(0xc4).chr(0xa5),
|
---|
| 548 | '^H' => chr(0xc4).chr(0xa4),
|
---|
| 549 | '^i' => chr(0xc3).chr(0xae),
|
---|
| 550 | '^I' => chr(0xc3).chr(0x8e),
|
---|
| 551 | '^j' => chr(0xc4).chr(0xb5),
|
---|
| 552 | '^J' => chr(0xc4).chr(0xb4),
|
---|
| 553 | '^o' => chr(0xc3).chr(0xb4),
|
---|
| 554 | '^O' => chr(0xc3).chr(0x94),
|
---|
| 555 | '^s' => chr(0xc5).chr(0x9d),
|
---|
| 556 | '^S' => chr(0xc5).chr(0x9c),
|
---|
| 557 | '^u' => chr(0xc3).chr(0xa2),
|
---|
| 558 | '^U' => chr(0xc3).chr(0xbb),
|
---|
| 559 | '^w' => chr(0xc5).chr(0xb5),
|
---|
| 560 | '^W' => chr(0xc5).chr(0xb4),
|
---|
| 561 | '^y' => chr(0xc5).chr(0xb7),
|
---|
| 562 | '^Y' => chr(0xc5).chr(0xb6),
|
---|
| 563 |
|
---|
| 564 | # diaeresis
|
---|
| 565 | '"a' => chr(0xc3).chr(0xa4),
|
---|
| 566 | '"A' => chr(0xc3).chr(0x84),
|
---|
| 567 | '"e' => chr(0xc3).chr(0xab),
|
---|
| 568 | '"E' => chr(0xc3).chr(0x8b),
|
---|
[8097] | 569 | '"i' => chr(0xc3).chr(0xaf),
|
---|
| 570 | '"I' => chr(0xc3).chr(0x8f),
|
---|
[7541] | 571 | '"\\\\i' => chr(0xc3).chr(0xaf),
|
---|
| 572 | '"\\\\I' => chr(0xc3).chr(0x8f),
|
---|
| 573 | '"o' => chr(0xc3).chr(0xb6),
|
---|
| 574 | '"O' => chr(0xc3).chr(0x96),
|
---|
| 575 | '"u' => chr(0xc3).chr(0xbc),
|
---|
| 576 | '"U' => chr(0xc3).chr(0x9c),
|
---|
| 577 | '"y' => chr(0xc3).chr(0xbf),
|
---|
| 578 | '"Y' => chr(0xc3).chr(0xb8),
|
---|
| 579 | # tilde
|
---|
| 580 | '~A' => chr(0xc3).chr(0x83),
|
---|
| 581 | '~N' => chr(0xc3).chr(0x91),
|
---|
| 582 | '~O' => chr(0xc3).chr(0x95),
|
---|
| 583 | '~a' => chr(0xc3).chr(0xa3),
|
---|
| 584 | '~n' => chr(0xc3).chr(0xb1),
|
---|
| 585 | '~o' => chr(0xc3).chr(0xb5),
|
---|
| 586 | # caron - handled specially
|
---|
| 587 | # double acute
|
---|
| 588 | # ring
|
---|
| 589 | # dot
|
---|
| 590 | '.c' => chr(0xc4).chr(0x8b),
|
---|
| 591 | '.C' => chr(0xc4).chr(0x8a),
|
---|
| 592 | '.e' => chr(0xc4).chr(0x97),
|
---|
| 593 | '.E' => chr(0xc4).chr(0x96),
|
---|
| 594 | '.g' => chr(0xc4).chr(0xa1),
|
---|
| 595 | '.G' => chr(0xc4).chr(0xa0),
|
---|
| 596 | '.I' => chr(0xc4).chr(0xb0),
|
---|
| 597 | '.z' => chr(0xc5).chr(0xbc),
|
---|
| 598 | '.Z' => chr(0xc5).chr(0xbb),
|
---|
| 599 | # macron
|
---|
| 600 | '=a' => chr(0xc4).chr(0x81),
|
---|
| 601 | '=A' => chr(0xc4).chr(0x80),
|
---|
| 602 | '=e' => chr(0xc4).chr(0x93),
|
---|
| 603 | '=E' => chr(0xc4).chr(0x92),
|
---|
| 604 | '=i' => chr(0xc4).chr(0xab),
|
---|
| 605 | '=I' => chr(0xc4).chr(0xaa),
|
---|
| 606 | '=o' => chr(0xc4).chr(0x8d),
|
---|
| 607 | '=O' => chr(0xc4).chr(0x8c),
|
---|
| 608 | '=u' => chr(0xc4).chr(0xab),
|
---|
| 609 | '=U' => chr(0xc4).chr(0xaa),
|
---|
| 610 |
|
---|
| 611 | # stroke - handled specially - see below
|
---|
| 612 |
|
---|
| 613 | # cedilla - handled specially
|
---|
| 614 | );
|
---|
| 615 |
|
---|
| 616 | # these are one letter latex commands - we make sure they're not a longer
|
---|
| 617 | # command name. eg {\d} is d+stroke, so careful of \d
|
---|
| 618 | my %special_utf8_chars =
|
---|
| 619 | (
|
---|
| 620 | # breve
|
---|
| 621 | 'u g' => chr(0xc4).chr(0x9f),
|
---|
| 622 | 'u G' => chr(0xc4).chr(0x9e),
|
---|
| 623 | 'u i' => chr(0xc4).chr(0xad),
|
---|
| 624 | 'u I' => chr(0xc4).chr(0xac),
|
---|
| 625 | 'u o' => chr(0xc5).chr(0x8f),
|
---|
| 626 | 'u O' => chr(0xc5).chr(0x8e),
|
---|
| 627 | 'u u' => chr(0xc5).chr(0xad),
|
---|
| 628 | 'u U' => chr(0xc5).chr(0xac),
|
---|
| 629 | 'u z' => chr(0xc5).chr(0xbe), # !!! no such char, but common mistake
|
---|
| 630 | 'u Z' => chr(0xc5).chr(0xbd), # used instead of v Z !!!
|
---|
| 631 | # caron
|
---|
| 632 | 'v c' => chr(0xc4).chr(0x8d),
|
---|
| 633 | 'v C' => chr(0xc4).chr(0x8c),
|
---|
| 634 | 'v n' => chr(0xc5).chr(0x88),
|
---|
| 635 | 'v N' => chr(0xc5).chr(0x87),
|
---|
| 636 | 'v s' => chr(0xc5).chr(0xa1),
|
---|
| 637 | 'v S' => chr(0xc5).chr(0xa5),
|
---|
| 638 | 'v z' => chr(0xc5).chr(0xbe),
|
---|
| 639 | 'v Z' => chr(0xc5).chr(0xbd),
|
---|
| 640 | # cedilla
|
---|
| 641 | 'c c' => chr(0xc3).chr(0xa7),
|
---|
| 642 | 'c C' => chr(0xc3).chr(0x87),
|
---|
| 643 | 'c g' => chr(0xc4).chr(0xa3),
|
---|
| 644 | 'c G' => chr(0xc4).chr(0xa2),
|
---|
| 645 | 'c k' => chr(0xc4).chr(0xb7),
|
---|
| 646 | 'c K' => chr(0xc4).chr(0xb6),
|
---|
| 647 | 'c l' => chr(0xc4).chr(0xbc),
|
---|
| 648 | 'c L' => chr(0xc4).chr(0xbb),
|
---|
| 649 | 'c n' => chr(0xc5).chr(0x86),
|
---|
| 650 | 'c N' => chr(0xc5).chr(0x85),
|
---|
| 651 | 'c r' => chr(0xc5).chr(0x97),
|
---|
| 652 | 'c R' => chr(0xc5).chr(0x96),
|
---|
| 653 | 'c s' => chr(0xc5).chr(0x9f),
|
---|
| 654 | 'c S' => chr(0xc5).chr(0x9e),
|
---|
| 655 | 'c t' => chr(0xc5).chr(0xa3),
|
---|
| 656 | 'c T' => chr(0xc5).chr(0xa2),
|
---|
| 657 | # double acute / Hungarian accent
|
---|
| 658 | 'H O' => chr(0xc5).chr(0x90),
|
---|
| 659 | 'H o' => chr(0xc5).chr(0x91),
|
---|
| 660 | 'H U' => chr(0xc5).chr(0xb0),
|
---|
| 661 | 'H u' => chr(0xc5).chr(0xb1),
|
---|
| 662 |
|
---|
| 663 | # stroke
|
---|
| 664 | 'd' => chr(0xc4).chr(0x91),
|
---|
| 665 | 'D' => chr(0xc4).chr(0x90),
|
---|
| 666 | 'h' => chr(0xc4).chr(0xa7),
|
---|
| 667 | # 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
|
---|
| 668 | 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
|
---|
| 669 | 'l' => chr(0xc5).chr(0x82),
|
---|
| 670 | 'L' => chr(0xc5).chr(0x81),
|
---|
| 671 | 'o' => chr(0xc3).chr(0xb8),
|
---|
| 672 | 'O' => chr(0xc3).chr(0x98),
|
---|
| 673 | 't' => chr(0xc5).chr(0xa7),
|
---|
| 674 | 'T' => chr(0xc5).chr(0xa6),
|
---|
| 675 |
|
---|
| 676 | # other special chars
|
---|
| 677 | 'ss' => chr(0xc3).chr(0x9f), # german ss/szlig/sharp s
|
---|
| 678 | 'aa' =>,chr(0xc3).chr(0xa5), # scandanavian/latin a with ring
|
---|
| 679 | );
|
---|
| 680 |
|
---|
| 681 | my $self=shift;
|
---|
| 682 | my $textref=shift;
|
---|
| 683 |
|
---|
| 684 | my $outhandle=$self->{'outhandle'};
|
---|
| 685 | my $text=$$textref;
|
---|
| 686 |
|
---|
| 687 | # remove space (if any) between \ and letter to accent (eg {\' a})
|
---|
| 688 | $text =~ s!(\\[`'="^~\.])\s(\w)\b!$1$2!g; # for emacs indenting... `]);
|
---|
| 689 |
|
---|
| 690 | # remove {} around a single character (eg \'{e})
|
---|
| 691 | $text =~ s!(\\[`'="^~\.]){(\w)}!{$1$2}!g; # for emacs indenting... `]);
|
---|
| 692 |
|
---|
| 693 | ## only in bibtex... not in latex proper?!
|
---|
| 694 | ### \, is another way of doing cedilla \c
|
---|
| 695 | ##$text =~ s~\\,(.)~\\c $1~g;
|
---|
| 696 |
|
---|
| 697 | # remove {} around a single character for special 1 letter commands -
|
---|
| 698 | # need to insert a space. Eg \v{s} -> {\v s}
|
---|
| 699 | $text =~ s~(\\[uvcH]){(\w)}~{$1 $2}~g;
|
---|
| 700 |
|
---|
| 701 | # only do if the text contains a '\' character.
|
---|
| 702 | if ($text =~ m|\\|) {
|
---|
| 703 | # "normal" accents - ie non-alpha latex tag
|
---|
| 704 | # xxx used to have ([\w]\b)@ (for word boundary)
|
---|
| 705 | while ($text =~ m/\\([`'="^~\.])([\w])/) { # for emacs `])){
|
---|
| 706 | my $tex="$1$2"; my $char="$2";
|
---|
| 707 | my $replacement=$utf8_chars{$tex};
|
---|
| 708 | if (!defined($replacement)) {
|
---|
| 709 | $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
|
---|
| 710 | print $outhandle . $self->{'plugin_type'} .
|
---|
| 711 | ": Warning: unknown latex accent \"$tex\""
|
---|
| 712 | . " in \"$1\"\n";
|
---|
| 713 | $replacement=$char;
|
---|
| 714 | }
|
---|
| 715 | $text =~ s/\\\Q$tex/$replacement/g;
|
---|
| 716 | }
|
---|
| 717 |
|
---|
| 718 | # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
|
---|
| 719 | # only do the change if immediately followed by a space, }, {, or \
|
---|
| 720 | # one letter accents ( + ss / aa)
|
---|
| 721 | while ($text =~ m~\\([DdhiLlOoTt]|ss|aa)[{}\s\"\\]~) {
|
---|
| 722 | my $tex=$1;
|
---|
| 723 | my $replacement=$special_utf8_chars{$tex};
|
---|
| 724 | if (!defined($replacement)) {
|
---|
| 725 | $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
|
---|
| 726 | print $outhandle $self->{'plugin_type'} .
|
---|
| 727 | ": Warning: unknown latex accent \"$tex\""
|
---|
| 728 | . " in \"$1\"\n";
|
---|
| 729 | $replacement=$tex;
|
---|
| 730 | }
|
---|
| 731 | ($text =~ s/{\\$tex}/$replacement/g) or
|
---|
| 732 | $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
|
---|
| 733 |
|
---|
| 734 | }
|
---|
| 735 |
|
---|
| 736 | # one letter latex accent commands that affect following letter
|
---|
| 737 | while ($text =~ m~\\([uvcH]) ([\w])~) {
|
---|
| 738 | my $tex="$1 $2"; my $char="$2";
|
---|
| 739 | my $replacement=$special_utf8_chars{$tex};
|
---|
| 740 | if (!defined($replacement)) {
|
---|
| 741 | $text =~ m~(.{20}\\\Q$tex\E.{20})~s;
|
---|
| 742 | print $outhandle $self->{'plugin_type'} .
|
---|
| 743 | ": Warning: unknown latex accent \"$tex\""
|
---|
| 744 | . " in \"$1\"\n";
|
---|
| 745 | $replacement=$char;
|
---|
| 746 | }
|
---|
| 747 | $text =~ s/\\$tex/$replacement/g;
|
---|
| 748 | }
|
---|
| 749 | }
|
---|
| 750 | $textref=\$text;
|
---|
| 751 | }
|
---|
| 752 |
|
---|
| 753 |
|
---|
| 754 | # modules must return true
|
---|
| 755 | 1;
|
---|