source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 10723

Last change on this file since 10723 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
RevLine 
[1676]1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[1676]44package BibTexPlug;
45
46use SplitPlug;
[10254]47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
[10254]52 @BibTexPlug::ISA = ('SplitPlug');
[1676]53}
54
[3540]55my $arguments =
[4744]56 [ { 'name' => "process_exp",
[4873]57 'desc' => "{BasPlug.process_exp}",
[6408]58 'type' => "regexp",
[4873]59 'reqd' => "no",
[6408]60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
[3540]67
[4744]68my $options = { 'name' => "BibTexPlug",
[5680]69 'desc' => "{BibTexPlug.desc}",
[6408]70 'abstract' => "no",
71 'inherits' => "yes",
[9120]72 'explodes' => "yes",
[4744]73 'args' => $arguments };
[3540]74
[1676]75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
[3540]84sub new {
[10218]85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
93
[3540]94 return bless $self, $class;
95}
[1676]96
[2901]97
98
[1676]99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
[6332]104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]105 my $outhandle = $self->{'outhandle'};
106
[10254]107 my $cursection = $doc_obj->get_top_section();
[2484]108 $self->{'key'} = "default";
109
[1676]110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
[6332]117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
[1711]122 # This hash translates BibTex field names into metadata names. The
[1676]123 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]124 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]125
[9582]126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
[1676]129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
[2901]132
[1676]133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
[2901]136 'editor', 'Editor',
[1676]137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
[3142]146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
[4792]150 'copyright', 'Copyright',
151 'note', 'Note',
[8121]152 );
[1676]153
154 # Metadata fields
155 my %metadata;
[4792]156 my ($EntryType, $Creator, $Keywords, $text);
[1676]157
[2901]158 my $verbosity = $self->{'verbosity'};
159 $verbosity = 0 unless $verbosity;
160
[3142]161 # Make sure the text has exactly one entry per line -
162 # append line to previous if it doesn't start with " <key> = "
[2901]163
[4792]164 my @input_lines=split(/\r?\n/, $$textref);
[3142]165 my @all_lines;
166 my $entry_line=shift @input_lines;
167 foreach my $input_line (@input_lines) {
168 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
169 # this is a new key
170 push(@all_lines, $entry_line);
171 $entry_line=$input_line;
172 } else {
173 # this is a continuation of previous line
[3249]174 $entry_line .= " " . $input_line;
[3142]175 }
176
177 }
178 # add final line, removing trailing '}'
179 $entry_line =~ s/\}\s*$//;
180 push(@all_lines, $entry_line);
181 push(@all_lines, "}");
[2901]182
[1676]183 # Read and process each line in the bib file.
[2901]184 my ($entryname, $name, $value, $line);
185 foreach $line (@all_lines) {
186
[1676]187 # Add each line. Most lines consist of a field identifer and
188 # then data, and we simply store them, though we treat some
189 # of the fields a bit differently.
190
191 $line =~ s/\s+/ /g;
192 $text .= "$line\n";
193
[4792]194
[2901]195 print "Processing line = $line \n" if $verbosity>=4;
196
[1676]197 # The first line is special, it contains the reference type and OID
[4792]198 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]199 $EntryType = $1;
[4792]200 my $EntryID = (defined $2) ? $2 : "default";
[2484]201 print "** $EntryType - \"$EntryID\" \n"
202 if ($verbosity >= 4);
[4792]203
[1676]204 next;
205 }
[2484]206 if ($line =~ /\@/) {
[2901]207 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]208 if ($verbosity >= 2);
[2901]209 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]210 if ($verbosity >= 2);
211 }
[1676]212
213 # otherwise, parse the metadata out of this line
[9582]214 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]215 $entryname = lc($1);
[1676]216 $value = $2;
[3112]217 $value =~ s/,?\s*$//; # remove trailing comma and space
218 if ($value =~ /^"/ && $value =~ /"$/) {
219 # remove surrounding " marks
220 $value =~ s/^"//; $value =~ s/"$//;
221 }
[2901]222 $value = &process_latex($value);
223
[1676]224 # Add this line of metadata
[3426]225 $metadata{$entryname} .= "$value";
[2901]226
[1676]227 }
228
[1677]229 # Add the Entry type as metadata
[2901]230 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]231
232 #Add the fileformat as metadata
233 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]234
[8121]235
[1676]236 # Add the various field as metadata
[2901]237 foreach my $entryname (keys %metadata) {
238 next unless (defined $field{$entryname});
239 next unless (defined $metadata{$entryname});
[1676]240
[2901]241 $name = $field{$entryname};
242 $value = $metadata{$entryname};
[1676]243
[3142]244 if ($name =~ /^Month/) {
245 $value=expand_month($value);
246 }
[1676]247
248 # Several special operatons on metadata follow
249
250 # Add individual keywords.
251 # The full set of keywords will be added, in due course, as "Keywords".
252 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]253 if ($entryname eq "keywords") {
[1676]254 my @keywordlist = split(/,/, $value);
255 foreach my $k (@keywordlist) {
[2901]256 $k = lc($k);
[1676]257 $k =~ s/\s*$//;
258 $k =~ s/^\s*//;
259 if ($k =~ /\w/) {
[2901]260 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]261 }
262 }
263 }
264
265 # Add individual authors
266 # The author metadata will be stored as one "Creator" entry, but we
267 # also want to split it into several individual "Author" fields in
[3112]268 # "Lastname, Firstnames" format so we can browse it.
[2901]269 if ($entryname eq "author") { #added also comparison with editor
270
[3249]271 # take care of "et al."...
[9582]272 my $etal='';
273 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
274 $etal=' <em>et. al.</em>';
275 }
[2901]276 # und here for german language...
277 # don't use brackets in pattern, else the matched bit becomes
278 # an element in the list!
[3249]279 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
280 my @formattedlist = ();
[1676]281 foreach $a (@authorlist) {
282 $a =~ s/\s*$//;
283 $a =~ s/^\s*//;
284 # Reformat and add author name
[2901]285 next if $a=~ /^\s*$/;
[3249]286
287 # names are "First von Last", "von Last, First"
288 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
289 my $first="";
290 my $vonlast="";
291 my $jr="";
[2901]292
[3249]293 if ($a =~ /,/) {
294 my @parts=split(/,\s*/, $a);
295 $first = pop @parts;
296 if (scalar(@parts) == 2) {
297 $jr = pop @parts;
298 }
299 $vonlast=shift @parts;
300 if (scalar(@parts) > 0) {
301 print $outhandle "BibTexPlug: couldn't parse name $a\n";
302 # but we continue anyway...
303 }
304 } else { # First von Last
305 my @words = split(/ /, $a);
306 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
307 $first .= " " . shift (@words);
308 }
309 $first =~ s/^\s//;
310 $vonlast = join (' ', @words); # whatever's left...
311 }
312 my $von="";
313 my $last="";
314 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
315 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
316 $von = $1;
317 if (!defined ($von)) {
318 # some non-English names do start with lowercase
319 # eg "Marie desJardins". Also we can get typos...
320 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
321 $von="";
322 if ($vonlast =~ /^[a-z]+$/) {
323 # if it's all lowercase, uppercase 1st.
324 $vonlast =~ s/^(.)/\u$1/;
[2901]325
[3249]326 }
327 }
328 $von =~ s/\s*$//;
329 $last=$vonlast;
330 } else {
331 $last=$vonlast;
332 }
333 my $wholename="$first $von $last $jr";
[9582]334 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
335 $wholename =~ s/ $//;
[3249]336 push (@formattedlist, $wholename);
337 my $fullname = "$last";
338 $fullname .= " $jr" if ($jr);
339 $fullname .= ", $first";
340 $fullname .= " $von" if ($von);
341
[1676]342 # Add each name to set of Authors
[3249]343 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]344 }
[3249]345
346 # Only want at most one "and" in the Creator field
347 if (scalar(@formattedlist) > 2) {
348 my $lastauthor=pop @formattedlist;
349 $value=join(', ', @formattedlist);
350 $value.=" and $lastauthor";
351 } else { # 1 or 2 authors...
352 $value=join(" and ",@formattedlist);
353 }
354 $value.=$etal; # if there was "et al."
[1676]355 }
356
357 # Books and Journals are additionally marked for display purposes
[2901]358 if ($entryname eq "booktitle") {
359 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
360 } elsif ($entryname eq "journal") {
361 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]362 }
363
[3249]364 # Add the various fields as metadata
365 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1676]366 }
367
[9582]368 # for books and journals...
369 if (!exists $metadata{'title'}) {
370 my $name=$field{'title'}; # get Greenstone metadata name
371 my $value;
372 if (exists $metadata{'booktitle'}) {
373 $value=$metadata{'booktitle'};
374 } elsif (exists $metadata{'journal'}) {
375 $value=$metadata{'journal'};
376 }
377 if ($value) {
378 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
379 }
380 }
381
[3249]382 # Add Date (yyyymmdd) metadata
383 if (defined ($metadata{'year'}) ) {
384 my $date=$metadata{'year'};
385 chomp $date;
386 my $month=$metadata{'month'};
387 if (defined($month)) {
388 # month is currently 3 letter code or a range...
389 $month = expand_month($month);
390 # take the first month found... might not find one!
391 $month =~ m/_textmonth(\d\d)_/;
392 $month = $1;
393 }
394 if (!defined($month)) {
395 $month="00";
396 }
397 $date .= "${month}00";
398 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
399}
400
401# # Add the text in BibTex format (all fields)
[1676]402 if ($text =~ /\w/) {
[3249]403
404 $text =~ s@&@&amp;@g;
405 $text =~ s@<@&lt;@g;
406 $text =~ s@>@&gt;@g;
407 $text =~ s@\n@<br/>\n@g;
408 $text =~ s@\\@\\\\@g;
409
[2901]410 $doc_obj->add_utf8_text ($cursection, $text);
[1676]411 }
412
[1677]413 return 1;
[1676]414}
415
[1677]416
[2901]417
418
419# convert email addresses and URLs into links
420sub convert_urls_into_links{
421 my ($text) = @_;
422
423 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]424 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]425
426 return $text;
427}
428
429# Clean up whitespace and convert \n charaters to <BR> or <P>
430sub clean_up_whitespaces{
431 my ($text) = @_;
432
433 $text =~ s/%%%%%/<BR> <BR>/g;
434 $text =~ s/ +/ /g;
435 $text =~ s/\s*$//;
436 $text =~ s/^\s*//;
437 $text =~ s/\n/\n<BR>/g;
438 $text =~ s/<BR>\s*<BR>/<P>/g;
439
440 return $text;
441}
442
443
444sub convert_problem_characters_without_ampersand{
445 my ($text) = @_;
446 $text =~ s/</&lt;/g;
447 $text =~ s/>/&gt;/g;
448
449 $text =~ s/\'\'/\"/g; #Latex -specific conversion
450 $text =~ s/\`\`/\"/g; #Latex -specific conversion
451
452 $text =~ s/\"/&quot;/g;
453 $text =~ s/\'/&#8217;/g;
454 $text =~ s/\`/&#8216;/g;
455
[3249]456# $text =~ s/\+/ /g;
457# $text =~ s/\(/ /g;
458# $text =~ s/\)/ /g;
459
[2901]460 $text =~ s/\\/\\\\/g;
461
[3249]462# $text =~ s/\./\\\./g;
[2901]463
464 return $text;
465}
466
[1676]467# Convert a text string into HTML.
468
[1677]469# The HTML is going to be inserted into a GML file, so we have to be
[2901]470# careful not to use symbols like ">", which occurs frequently in email
[1677]471# messages (and use &gt instead.
472
473# This function also turns URLs and email addresses into links, and
474# replaces carriage returns with <BR> tags (and multiple carriage returns
475# with <P> tags).
476
[1676]477sub text_into_html {
478 my ($text) = @_;
479
[2901]480 # Convert problem characters into HTML symbols
[1676]481 $text =~ s/&/&amp;/g;
482
[2901]483 $text = &convert_problem_characters_without_ampersand( $text );
484
[1676]485 # convert email addresses and URLs into links
[2901]486 $text = &convert_urls_into_links( $text );
[1676]487
[2901]488 $text = &clean_up_whitespaces( $text );
[1676]489
490 return $text;
491}
492
[2901]493
[3142]494sub expand_month {
495 my $text=shift;
[2901]496
[3142]497 # bibtex style files expand abbreviations for months.
498 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
499 $text =~ s/jan/_textmonth01_/g;
500 $text =~ s/feb/_textmonth02_/g;
501 $text =~ s/mar/_textmonth03_/g;
502 $text =~ s/apr/_textmonth04_/g;
503 $text =~ s/may/_textmonth05_/g;
504 $text =~ s/jun/_textmonth06_/g;
505 $text =~ s/jul/_textmonth07_/g;
506 $text =~ s/aug/_textmonth08_/g;
507 $text =~ s/sep/_textmonth09_/g;
508 $text =~ s/oct/_textmonth10_/g;
509 $text =~ s/nov/_textmonth11_/g;
510 $text =~ s/dec/_textmonth12_/g;
[2901]511
[3142]512 return $text;
513}
514
515
[2901]516# Convert accented characters, remove { }, interprete some commands....
517# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
518sub process_latex {
519 my ($text) = @_;
520
521 # note - this is really ugly, but it works. There may be a prettier way
522 # of mapping latex accented chars to utf8, but we just brute force it here.
523 # Also, this isn't complete - not every single possible accented letter
524 # is in here yet, but most of the common ones are.
525
526 my %utf8_chars =
527 (
528 # acutes
529 '\'a' => chr(0xc3).chr(0xa1),
530 '\'c' => chr(0xc4).chr(0x87),
531 '\'e' => chr(0xc3).chr(0xa9),
532 '\'i' => chr(0xc3).chr(0xad),
533 '\'l' => chr(0xc3).chr(0xba),
534 '\'n' => chr(0xc3).chr(0x84),
535 '\'o' => chr(0xc3).chr(0xb3),
536 '\'r' => chr(0xc5).chr(0x95),
537 '\'s' => chr(0xc5).chr(0x9b),
538 '\'u' => chr(0xc3).chr(0xba),
539 '\'y' => chr(0xc3).chr(0xbd),
540 '\'z' => chr(0xc5).chr(0xba),
541 # graves
542 '`a' => chr(0xc3).chr(0xa0),
543 '`A' => chr(0xc3).chr(0x80),
544 '`e' => chr(0xc3).chr(0xa8),
545 '`E' => chr(0xc3).chr(0x88),
546 '`i' => chr(0xc3).chr(0xac),
547 '`I' => chr(0xc3).chr(0x8c),
548 '`o' => chr(0xc3).chr(0xb2),
549 '`O' => chr(0xc3).chr(0x92),
550 '`u' => chr(0xc3).chr(0xb9),
551 '`U' => chr(0xc3).chr(0x99),
552 # circumflex
553 '^a' => chr(0xc3).chr(0xa2),
554 '^A' => chr(0xc3).chr(0x82),
555 '^c' => chr(0xc4).chr(0x89),
556 '^C' => chr(0xc4).chr(0x88),
557 '^e' => chr(0xc3).chr(0xaa),
558 '^E' => chr(0xc3).chr(0x8a),
559 '^g' => chr(0xc4).chr(0x9d),
560 '^G' => chr(0xc4).chr(0x9c),
561 '^h' => chr(0xc4).chr(0xa5),
562 '^H' => chr(0xc4).chr(0xa4),
563 '^i' => chr(0xc3).chr(0xae),
564 '^I' => chr(0xc3).chr(0x8e),
565 '^j' => chr(0xc4).chr(0xb5),
566 '^J' => chr(0xc4).chr(0xb4),
567 '^o' => chr(0xc3).chr(0xb4),
568 '^O' => chr(0xc3).chr(0x94),
569 '^s' => chr(0xc5).chr(0x9d),
570 '^S' => chr(0xc5).chr(0x9c),
571 '^u' => chr(0xc3).chr(0xa2),
572 '^U' => chr(0xc3).chr(0xbb),
573 '^w' => chr(0xc5).chr(0xb5),
574 '^W' => chr(0xc5).chr(0xb4),
575 '^y' => chr(0xc5).chr(0xb7),
576 '^Y' => chr(0xc5).chr(0xb6),
577
578 # diaeresis
579 '"a' => chr(0xc3).chr(0xa4),
580 '"A' => chr(0xc3).chr(0x84),
581 '"e' => chr(0xc3).chr(0xab),
582 '"E' => chr(0xc3).chr(0x8b),
583 '"\\\\i' => chr(0xc3).chr(0xaf),
584 '"\\\\I' => chr(0xc3).chr(0x8f),
585 '"o' => chr(0xc3).chr(0xb6),
586 '"O' => chr(0xc3).chr(0x96),
587 '"u' => chr(0xc3).chr(0xbc),
588 '"U' => chr(0xc3).chr(0x9c),
589 '"y' => chr(0xc3).chr(0xbf),
590 '"Y' => chr(0xc3).chr(0xb8),
591 # tilde
[3142]592 '~A' => chr(0xc3).chr(0x83),
593 '~N' => chr(0xc3).chr(0x91),
594 '~O' => chr(0xc3).chr(0x95),
595 '~a' => chr(0xc3).chr(0xa3),
596 '~n' => chr(0xc3).chr(0xb1),
597 '~o' => chr(0xc3).chr(0xb5),
[2901]598 # caron - handled specially
599 # double acute
600 # ring
601 # dot
[3249]602 '.c' => chr(0xc4).chr(0x8b),
603 '.C' => chr(0xc4).chr(0x8a),
604 '.e' => chr(0xc4).chr(0x97),
605 '.E' => chr(0xc4).chr(0x96),
606 '.g' => chr(0xc4).chr(0xa1),
607 '.G' => chr(0xc4).chr(0xa0),
608 '.I' => chr(0xc4).chr(0xb0),
609 '.z' => chr(0xc5).chr(0xbc),
610 '.Z' => chr(0xc5).chr(0xbb),
[2901]611 # macron
612 '=a' => chr(0xc4).chr(0x81),
613 '=A' => chr(0xc4).chr(0x80),
614 '=e' => chr(0xc4).chr(0x93),
615 '=E' => chr(0xc4).chr(0x92),
616 '=i' => chr(0xc4).chr(0xab),
617 '=I' => chr(0xc4).chr(0xaa),
618 '=o' => chr(0xc4).chr(0x8d),
619 '=O' => chr(0xc4).chr(0x8c),
620 '=u' => chr(0xc4).chr(0xab),
621 '=U' => chr(0xc4).chr(0xaa),
622
623 # stroke - handled specially - see below
624
625 # cedilla - handled specially
626 );
627
628# these are one letter latex commands - we make sure they're not a longer
629# command name. eg {\d} is d+stroke, so careful of \d
630 my %special_utf8_chars =
631 (
[3156]632 # breve
633 'u g' => chr(0xc4).chr(0x9f),
634 'u G' => chr(0xc4).chr(0x9e),
635 'u i' => chr(0xc4).chr(0xad),
636 'u I' => chr(0xc4).chr(0xac),
637 'u o' => chr(0xc5).chr(0x8f),
638 'u O' => chr(0xc5).chr(0x8e),
639 'u u' => chr(0xc5).chr(0xad),
640 'u U' => chr(0xc5).chr(0xac),
[2901]641 # caron
[3156]642 'v c' => chr(0xc4).chr(0x8d),
643 'v C' => chr(0xc4).chr(0x8c),
[2901]644 'v n' => chr(0xc5).chr(0x88),
645 'v N' => chr(0xc5).chr(0x87),
646 'v s' => chr(0xc5).chr(0xa1),
647 'v S' => chr(0xc5).chr(0xa5),
[3156]648 'v z' => chr(0xc5).chr(0xbe),
649 'v Z' => chr(0xc5).chr(0xbd),
[2901]650 # cedilla
651 'c c' => chr(0xc3).chr(0xa7),
652 'c C' => chr(0xc3).chr(0x87),
653 'c g' => chr(0xc4).chr(0xa3),
654 'c G' => chr(0xc4).chr(0xa2),
655 'c k' => chr(0xc4).chr(0xb7),
656 'c K' => chr(0xc4).chr(0xb6),
657 'c l' => chr(0xc4).chr(0xbc),
658 'c L' => chr(0xc4).chr(0xbb),
659 'c n' => chr(0xc5).chr(0x86),
660 'c N' => chr(0xc5).chr(0x85),
661 'c r' => chr(0xc5).chr(0x97),
662 'c R' => chr(0xc5).chr(0x96),
663 'c s' => chr(0xc5).chr(0x9f),
664 'c S' => chr(0xc5).chr(0x9e),
665 'c t' => chr(0xc5).chr(0xa3),
666 'c T' => chr(0xc5).chr(0xa2),
667 # double acute / Hungarian accent
668 'H O' => chr(0xc5).chr(0x90),
669 'H o' => chr(0xc5).chr(0x91),
670 'H U' => chr(0xc5).chr(0xb0),
671 'H u' => chr(0xc5).chr(0xb1),
672
673 # stroke
674 'd' => chr(0xc4).chr(0x91),
675 'D' => chr(0xc4).chr(0x90),
676 'h' => chr(0xc4).chr(0xa7),
677# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]678 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]679 'l' => chr(0xc5).chr(0x82),
680 'L' => chr(0xc5).chr(0x81),
681 'o' => chr(0xc3).chr(0xb8),
682 'O' => chr(0xc3).chr(0x98),
683 't' => chr(0xc5).chr(0xa7),
684 'T' => chr(0xc5).chr(0xa6),
685 # german ss/szlig/sharp s
686 'ss' => chr(0xc3).chr(0x9f),
687 );
688
689 # convert latex-style accented characters.
[3249]690
[2901]691 # remove space (if any) between \ and letter to accent (eg {\' a})
692
[3249]693 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
[2901]694
695 # remove {} around a single character (eg \'{e})
[3249]696 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
[2901]697
[3249]698 # \, is another way of doing cedilla \c
699 $text =~ s@\\,(.)@\\c $1@g;
700
[2901]701 # remove {} around a single character for special 1 letter commands -
702 # need to insert a space. Eg \v{s} -> {\v s}
[3156]703 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]704
[2901]705 # only do if the text contains a '\' character.
706 if ($text =~ m|\\|) {
[3249]707 # "normal" accents - ie non-alpha latex tag
708 while ($text =~ m@\\([`'="^~\.])([\w])@) {
709 my $tex="$1$2"; my $char="$2";
710 my $replacement=$utf8_chars{$tex};
711 if (!defined($replacement)) {
712 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
713 $replacement=$char;
714 }
[5454]715 $text =~ s/\\\Q$tex/$replacement/g;
[3249]716 }
[2901]717
[3249]718 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
719 # only do the change if immediately followed by a space, }, {, or \
720 # one letter accents ( + ss)
721 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
722 my $tex=$1;
723 my $replacement=$special_utf8_chars{$tex};
724 if (!defined($replacement)) {
725 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
726 $replacement=$tex;
727 }
728 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
729
730 }
731
732 # one letter latex accent commands that affect following letter
733 while ($text =~ m@\\([uvcH]) ([\w])@) {
734 my $tex="$1 $2"; my $char="$2";
735 my $replacement=$special_utf8_chars{$tex};
736 if (!defined($replacement)) {
737 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
738 $replacement=$char;
739 }
740 $text =~ s/\\$tex/$replacement/g;
741 }
[2901]742 }
[3249]743
744 # escape html-sensitive characters
745 $text =~ s@&@&amp;@g;
746 $text =~ s@<@&lt;@g;
747 $text =~ s@>@&gt;@g;
748 $text =~ s/''/"/g; # Latex-specific
749 $text =~ s/``/"/g; # Latex-specific
750 # greenstone-specific
751 $text =~ s@\[@&\#91;@g;
752 $text =~ s@\]@&\#93;@g;
753
[3112]754 # remove latex commands
[3249]755
756 # explicitly recognised commands
757 $text =~ s@\\ldots@&hellip;@g;
758
759 # maths mode
760 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
761
762 # remove all other commands with optional arguments...
[3156]763 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
764 # $text =~ s@\\noopsort{[^}]+\}@@g;
765 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]766
[2901]767 # remove latex groupings { } (but not \{ or \} )
[3249]768 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]769 $text =~ s/^\{//; # remove { if first char
770
771 # latex characters
772 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]773 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]774 # optional break "\-"
[3249]775 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]776 # the non-macro bits have quotes around them - we just remove quotes
[3249]777 $text =~ s/([^&])[\"\#]/$1/g;
[3142]778 }
[3249]779 # dashes. Convert (m|n)-dash into single dash for html.
780 $text =~ s@\-\-+@\-@g;
781
[2901]782 # quoted { } chars
783 $text =~ s@\\{@{@g;
784 $text =~ s@\\}@}@g;
785
[3249]786 # finally to protect against macro language...
787 $text =~ s@\\@\\\\@g;
788
[2901]789 return $text;
790}
791
792
[3249]793sub process_latex_math {
794 my $text = shift;
795
796 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
797 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
798 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
799 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
800 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
801
802 # put all other command names in italics
803 $text =~ s@\\([\w]+)@<i>$1</i>@g;
804
805 return $text;
806}
807
[2484]808sub set_OID {
809 my $self = shift (@_);
810 my ($doc_obj, $id, $segment_number) = @_;
811
812 if ( $self->{'key'} eq "default") {
[4792]813 $doc_obj->set_OID("$id\_$segment_number");
[2484]814 } else {
815 $doc_obj->set_OID($self->{'key'});
816 }
817}
[1676]818
[2484]8191;
[4792]820
821
Note: See TracBrowser for help on using the repository browser.