source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13413

Last change on this file since 13413 was 13413, checked in by kjdon, 17 years ago

added some special handling for year so that it only gets 4 digits. Also added some comments to keep emacs colouring working properly

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
RevLine 
[1676]1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[1676]44package BibTexPlug;
45
46use SplitPlug;
[10254]47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
[10254]52 @BibTexPlug::ISA = ('SplitPlug');
[1676]53}
54
[3540]55my $arguments =
[4744]56 [ { 'name' => "process_exp",
[4873]57 'desc' => "{BasPlug.process_exp}",
[6408]58 'type' => "regexp",
[4873]59 'reqd' => "no",
[6408]60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
[3540]67
[4744]68my $options = { 'name' => "BibTexPlug",
[5680]69 'desc' => "{BibTexPlug.desc}",
[6408]70 'abstract' => "no",
71 'inherits' => "yes",
[9120]72 'explodes' => "yes",
[4744]73 'args' => $arguments };
[3540]74
[1676]75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
[3540]84sub new {
[10218]85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
[12169]92 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
[10218]93
[3540]94 return bless $self, $class;
95}
[1676]96
[2901]97
98
[1676]99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
[6332]104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]105 my $outhandle = $self->{'outhandle'};
106
[10254]107 my $cursection = $doc_obj->get_top_section();
[2484]108 $self->{'key'} = "default";
109
[1676]110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
[6332]117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
[1711]122 # This hash translates BibTex field names into metadata names. The
[1676]123 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]124 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]125
[9582]126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
[1676]129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
[2901]132
[1676]133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
[2901]136 'editor', 'Editor',
[1676]137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
[3142]146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
[4792]150 'copyright', 'Copyright',
151 'note', 'Note',
[11527]152 'url', 'URL',
[8121]153 );
[1676]154
155 # Metadata fields
156 my %metadata;
[4792]157 my ($EntryType, $Creator, $Keywords, $text);
[1676]158
[2901]159 my $verbosity = $self->{'verbosity'};
160 $verbosity = 0 unless $verbosity;
161
[3142]162 # Make sure the text has exactly one entry per line -
163 # append line to previous if it doesn't start with " <key> = "
[2901]164
[4792]165 my @input_lines=split(/\r?\n/, $$textref);
[3142]166 my @all_lines;
167 my $entry_line=shift @input_lines;
168 foreach my $input_line (@input_lines) {
169 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
170 # this is a new key
171 push(@all_lines, $entry_line);
172 $entry_line=$input_line;
173 } else {
174 # this is a continuation of previous line
[3249]175 $entry_line .= " " . $input_line;
[3142]176 }
177
178 }
179 # add final line, removing trailing '}'
180 $entry_line =~ s/\}\s*$//;
181 push(@all_lines, $entry_line);
182 push(@all_lines, "}");
[2901]183
[1676]184 # Read and process each line in the bib file.
[2901]185 my ($entryname, $name, $value, $line);
186 foreach $line (@all_lines) {
187
[1676]188 # Add each line. Most lines consist of a field identifer and
189 # then data, and we simply store them, though we treat some
190 # of the fields a bit differently.
191
192 $line =~ s/\s+/ /g;
193 $text .= "$line\n";
194
[4792]195
[2901]196 print "Processing line = $line \n" if $verbosity>=4;
197
[1676]198 # The first line is special, it contains the reference type and OID
[4792]199 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]200 $EntryType = $1;
[4792]201 my $EntryID = (defined $2) ? $2 : "default";
[2484]202 print "** $EntryType - \"$EntryID\" \n"
203 if ($verbosity >= 4);
[4792]204
[1676]205 next;
206 }
[2484]207 if ($line =~ /\@/) {
[2901]208 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]209 if ($verbosity >= 2);
[2901]210 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]211 if ($verbosity >= 2);
212 }
[1676]213
214 # otherwise, parse the metadata out of this line
[9582]215 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]216 $entryname = lc($1);
[1676]217 $value = $2;
[3112]218 $value =~ s/,?\s*$//; # remove trailing comma and space
219 if ($value =~ /^"/ && $value =~ /"$/) {
220 # remove surrounding " marks
221 $value =~ s/^"//; $value =~ s/"$//;
222 }
[13413]223
224 # special case for year - we only want a 4 digit year, not
225 # "circa 2004" etc
226 if ($entryname=~ /^year$/i) {
227 if ($value=~ /(\d{4})/) {
228 $value=$1;
229 $metadata{$entryname} .= "$value";
230 }
231 }
232 else {
233
234 $value = &process_latex($value);
235 # Add this line of metadata
236 $metadata{$entryname} .= "$value";
237 }
[1676]238 }
239
[1677]240 # Add the Entry type as metadata
[2901]241 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]242
243 #Add the fileformat as metadata
244 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]245
[8121]246
[1676]247 # Add the various field as metadata
[2901]248 foreach my $entryname (keys %metadata) {
249 next unless (defined $field{$entryname});
250 next unless (defined $metadata{$entryname});
[1676]251
[2901]252 $name = $field{$entryname};
253 $value = $metadata{$entryname};
[1676]254
[3142]255 if ($name =~ /^Month/) {
256 $value=expand_month($value);
257 }
[1676]258
259 # Several special operatons on metadata follow
260
261 # Add individual keywords.
262 # The full set of keywords will be added, in due course, as "Keywords".
263 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]264 if ($entryname eq "keywords") {
[1676]265 my @keywordlist = split(/,/, $value);
266 foreach my $k (@keywordlist) {
[2901]267 $k = lc($k);
[1676]268 $k =~ s/\s*$//;
269 $k =~ s/^\s*//;
270 if ($k =~ /\w/) {
[2901]271 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]272 }
273 }
274 }
275
276 # Add individual authors
277 # The author metadata will be stored as one "Creator" entry, but we
278 # also want to split it into several individual "Author" fields in
[3112]279 # "Lastname, Firstnames" format so we can browse it.
[2901]280 if ($entryname eq "author") { #added also comparison with editor
281
[3249]282 # take care of "et al."...
[9582]283 my $etal='';
284 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
285 $etal=' <em>et. al.</em>';
286 }
[2901]287 # und here for german language...
288 # don't use brackets in pattern, else the matched bit becomes
289 # an element in the list!
[3249]290 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
291 my @formattedlist = ();
[1676]292 foreach $a (@authorlist) {
293 $a =~ s/\s*$//;
294 $a =~ s/^\s*//;
295 # Reformat and add author name
[2901]296 next if $a=~ /^\s*$/;
[3249]297
298 # names are "First von Last", "von Last, First"
299 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
300 my $first="";
301 my $vonlast="";
302 my $jr="";
[2901]303
[3249]304 if ($a =~ /,/) {
305 my @parts=split(/,\s*/, $a);
306 $first = pop @parts;
307 if (scalar(@parts) == 2) {
308 $jr = pop @parts;
309 }
310 $vonlast=shift @parts;
311 if (scalar(@parts) > 0) {
312 print $outhandle "BibTexPlug: couldn't parse name $a\n";
313 # but we continue anyway...
314 }
315 } else { # First von Last
316 my @words = split(/ /, $a);
317 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
318 $first .= " " . shift (@words);
319 }
320 $first =~ s/^\s//;
321 $vonlast = join (' ', @words); # whatever's left...
322 }
323 my $von="";
324 my $last="";
325 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
326 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
327 $von = $1;
328 if (!defined ($von)) {
329 # some non-English names do start with lowercase
330 # eg "Marie desJardins". Also we can get typos...
331 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
332 $von="";
333 if ($vonlast =~ /^[a-z]+$/) {
334 # if it's all lowercase, uppercase 1st.
335 $vonlast =~ s/^(.)/\u$1/;
[2901]336
[3249]337 }
338 }
339 $von =~ s/\s*$//;
340 $last=$vonlast;
341 } else {
342 $last=$vonlast;
343 }
344 my $wholename="$first $von $last $jr";
[9582]345 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
346 $wholename =~ s/ $//;
[3249]347 push (@formattedlist, $wholename);
348 my $fullname = "$last";
349 $fullname .= " $jr" if ($jr);
350 $fullname .= ", $first";
351 $fullname .= " $von" if ($von);
352
[1676]353 # Add each name to set of Authors
[3249]354 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]355 }
[3249]356
357 # Only want at most one "and" in the Creator field
358 if (scalar(@formattedlist) > 2) {
359 my $lastauthor=pop @formattedlist;
360 $value=join(', ', @formattedlist);
361 $value.=" and $lastauthor";
362 } else { # 1 or 2 authors...
363 $value=join(" and ",@formattedlist);
364 }
365 $value.=$etal; # if there was "et al."
[1676]366 }
367
368 # Books and Journals are additionally marked for display purposes
[2901]369 if ($entryname eq "booktitle") {
370 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
371 } elsif ($entryname eq "journal") {
372 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]373 }
374
[3249]375 # Add the various fields as metadata
376 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]377
[1676]378 }
379
[9582]380 # for books and journals...
381 if (!exists $metadata{'title'}) {
382 my $name=$field{'title'}; # get Greenstone metadata name
383 my $value;
384 if (exists $metadata{'booktitle'}) {
385 $value=$metadata{'booktitle'};
386 } elsif (exists $metadata{'journal'}) {
387 $value=$metadata{'journal'};
388 }
389 if ($value) {
390 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
391 }
392 }
393
[3249]394 # Add Date (yyyymmdd) metadata
395 if (defined ($metadata{'year'}) ) {
396 my $date=$metadata{'year'};
397 chomp $date;
398 my $month=$metadata{'month'};
399 if (defined($month)) {
400 # month is currently 3 letter code or a range...
401 $month = expand_month($month);
402 # take the first month found... might not find one!
403 $month =~ m/_textmonth(\d\d)_/;
404 $month = $1;
405 }
406 if (!defined($month)) {
407 $month="00";
408 }
409 $date .= "${month}00";
410 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
411}
412
413# # Add the text in BibTex format (all fields)
[1676]414 if ($text =~ /\w/) {
[3249]415
416 $text =~ s@&@&amp;@g;
417 $text =~ s@<@&lt;@g;
418 $text =~ s@>@&gt;@g;
419 $text =~ s@\n@<br/>\n@g;
420 $text =~ s@\\@\\\\@g;
421
[2901]422 $doc_obj->add_utf8_text ($cursection, $text);
[1676]423 }
424
[1677]425 return 1;
[1676]426}
427
[1677]428
[2901]429
430
431# convert email addresses and URLs into links
432sub convert_urls_into_links{
433 my ($text) = @_;
434
435 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]436 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]437
438 return $text;
439}
440
441# Clean up whitespace and convert \n charaters to <BR> or <P>
442sub clean_up_whitespaces{
443 my ($text) = @_;
444
445 $text =~ s/%%%%%/<BR> <BR>/g;
446 $text =~ s/ +/ /g;
447 $text =~ s/\s*$//;
448 $text =~ s/^\s*//;
449 $text =~ s/\n/\n<BR>/g;
450 $text =~ s/<BR>\s*<BR>/<P>/g;
451
452 return $text;
453}
454
455
456sub convert_problem_characters_without_ampersand{
457 my ($text) = @_;
458 $text =~ s/</&lt;/g;
459 $text =~ s/>/&gt;/g;
460
461 $text =~ s/\'\'/\"/g; #Latex -specific conversion
462 $text =~ s/\`\`/\"/g; #Latex -specific conversion
463
464 $text =~ s/\"/&quot;/g;
465 $text =~ s/\'/&#8217;/g;
466 $text =~ s/\`/&#8216;/g;
467
[3249]468# $text =~ s/\+/ /g;
469# $text =~ s/\(/ /g;
470# $text =~ s/\)/ /g;
471
[2901]472 $text =~ s/\\/\\\\/g;
473
[3249]474# $text =~ s/\./\\\./g;
[2901]475
476 return $text;
477}
478
[1676]479# Convert a text string into HTML.
480
[1677]481# The HTML is going to be inserted into a GML file, so we have to be
[2901]482# careful not to use symbols like ">", which occurs frequently in email
[1677]483# messages (and use &gt instead.
484
485# This function also turns URLs and email addresses into links, and
486# replaces carriage returns with <BR> tags (and multiple carriage returns
487# with <P> tags).
488
[1676]489sub text_into_html {
490 my ($text) = @_;
491
[2901]492 # Convert problem characters into HTML symbols
[1676]493 $text =~ s/&/&amp;/g;
494
[2901]495 $text = &convert_problem_characters_without_ampersand( $text );
496
[1676]497 # convert email addresses and URLs into links
[2901]498 $text = &convert_urls_into_links( $text );
[1676]499
[2901]500 $text = &clean_up_whitespaces( $text );
[1676]501
502 return $text;
503}
504
[2901]505
[3142]506sub expand_month {
507 my $text=shift;
[2901]508
[3142]509 # bibtex style files expand abbreviations for months.
510 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
511 $text =~ s/jan/_textmonth01_/g;
512 $text =~ s/feb/_textmonth02_/g;
513 $text =~ s/mar/_textmonth03_/g;
514 $text =~ s/apr/_textmonth04_/g;
515 $text =~ s/may/_textmonth05_/g;
516 $text =~ s/jun/_textmonth06_/g;
517 $text =~ s/jul/_textmonth07_/g;
518 $text =~ s/aug/_textmonth08_/g;
519 $text =~ s/sep/_textmonth09_/g;
520 $text =~ s/oct/_textmonth10_/g;
521 $text =~ s/nov/_textmonth11_/g;
522 $text =~ s/dec/_textmonth12_/g;
[2901]523
[3142]524 return $text;
525}
526
527
[2901]528# Convert accented characters, remove { }, interprete some commands....
529# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
530sub process_latex {
531 my ($text) = @_;
532
533 # note - this is really ugly, but it works. There may be a prettier way
534 # of mapping latex accented chars to utf8, but we just brute force it here.
535 # Also, this isn't complete - not every single possible accented letter
536 # is in here yet, but most of the common ones are.
537
538 my %utf8_chars =
539 (
540 # acutes
541 '\'a' => chr(0xc3).chr(0xa1),
542 '\'c' => chr(0xc4).chr(0x87),
543 '\'e' => chr(0xc3).chr(0xa9),
544 '\'i' => chr(0xc3).chr(0xad),
545 '\'l' => chr(0xc3).chr(0xba),
546 '\'n' => chr(0xc3).chr(0x84),
547 '\'o' => chr(0xc3).chr(0xb3),
548 '\'r' => chr(0xc5).chr(0x95),
549 '\'s' => chr(0xc5).chr(0x9b),
550 '\'u' => chr(0xc3).chr(0xba),
551 '\'y' => chr(0xc3).chr(0xbd),
552 '\'z' => chr(0xc5).chr(0xba),
553 # graves
554 '`a' => chr(0xc3).chr(0xa0),
555 '`A' => chr(0xc3).chr(0x80),
556 '`e' => chr(0xc3).chr(0xa8),
557 '`E' => chr(0xc3).chr(0x88),
558 '`i' => chr(0xc3).chr(0xac),
559 '`I' => chr(0xc3).chr(0x8c),
560 '`o' => chr(0xc3).chr(0xb2),
561 '`O' => chr(0xc3).chr(0x92),
562 '`u' => chr(0xc3).chr(0xb9),
563 '`U' => chr(0xc3).chr(0x99),
564 # circumflex
565 '^a' => chr(0xc3).chr(0xa2),
566 '^A' => chr(0xc3).chr(0x82),
567 '^c' => chr(0xc4).chr(0x89),
568 '^C' => chr(0xc4).chr(0x88),
569 '^e' => chr(0xc3).chr(0xaa),
570 '^E' => chr(0xc3).chr(0x8a),
571 '^g' => chr(0xc4).chr(0x9d),
572 '^G' => chr(0xc4).chr(0x9c),
573 '^h' => chr(0xc4).chr(0xa5),
574 '^H' => chr(0xc4).chr(0xa4),
575 '^i' => chr(0xc3).chr(0xae),
576 '^I' => chr(0xc3).chr(0x8e),
577 '^j' => chr(0xc4).chr(0xb5),
578 '^J' => chr(0xc4).chr(0xb4),
579 '^o' => chr(0xc3).chr(0xb4),
580 '^O' => chr(0xc3).chr(0x94),
581 '^s' => chr(0xc5).chr(0x9d),
582 '^S' => chr(0xc5).chr(0x9c),
583 '^u' => chr(0xc3).chr(0xa2),
584 '^U' => chr(0xc3).chr(0xbb),
585 '^w' => chr(0xc5).chr(0xb5),
586 '^W' => chr(0xc5).chr(0xb4),
587 '^y' => chr(0xc5).chr(0xb7),
588 '^Y' => chr(0xc5).chr(0xb6),
589
590 # diaeresis
591 '"a' => chr(0xc3).chr(0xa4),
592 '"A' => chr(0xc3).chr(0x84),
593 '"e' => chr(0xc3).chr(0xab),
594 '"E' => chr(0xc3).chr(0x8b),
595 '"\\\\i' => chr(0xc3).chr(0xaf),
596 '"\\\\I' => chr(0xc3).chr(0x8f),
597 '"o' => chr(0xc3).chr(0xb6),
598 '"O' => chr(0xc3).chr(0x96),
599 '"u' => chr(0xc3).chr(0xbc),
600 '"U' => chr(0xc3).chr(0x9c),
601 '"y' => chr(0xc3).chr(0xbf),
602 '"Y' => chr(0xc3).chr(0xb8),
603 # tilde
[3142]604 '~A' => chr(0xc3).chr(0x83),
605 '~N' => chr(0xc3).chr(0x91),
606 '~O' => chr(0xc3).chr(0x95),
607 '~a' => chr(0xc3).chr(0xa3),
608 '~n' => chr(0xc3).chr(0xb1),
609 '~o' => chr(0xc3).chr(0xb5),
[2901]610 # caron - handled specially
611 # double acute
612 # ring
613 # dot
[3249]614 '.c' => chr(0xc4).chr(0x8b),
615 '.C' => chr(0xc4).chr(0x8a),
616 '.e' => chr(0xc4).chr(0x97),
617 '.E' => chr(0xc4).chr(0x96),
618 '.g' => chr(0xc4).chr(0xa1),
619 '.G' => chr(0xc4).chr(0xa0),
620 '.I' => chr(0xc4).chr(0xb0),
621 '.z' => chr(0xc5).chr(0xbc),
622 '.Z' => chr(0xc5).chr(0xbb),
[2901]623 # macron
624 '=a' => chr(0xc4).chr(0x81),
625 '=A' => chr(0xc4).chr(0x80),
626 '=e' => chr(0xc4).chr(0x93),
627 '=E' => chr(0xc4).chr(0x92),
628 '=i' => chr(0xc4).chr(0xab),
629 '=I' => chr(0xc4).chr(0xaa),
630 '=o' => chr(0xc4).chr(0x8d),
631 '=O' => chr(0xc4).chr(0x8c),
632 '=u' => chr(0xc4).chr(0xab),
633 '=U' => chr(0xc4).chr(0xaa),
634
635 # stroke - handled specially - see below
636
637 # cedilla - handled specially
638 );
639
640# these are one letter latex commands - we make sure they're not a longer
641# command name. eg {\d} is d+stroke, so careful of \d
642 my %special_utf8_chars =
643 (
[3156]644 # breve
645 'u g' => chr(0xc4).chr(0x9f),
646 'u G' => chr(0xc4).chr(0x9e),
647 'u i' => chr(0xc4).chr(0xad),
648 'u I' => chr(0xc4).chr(0xac),
649 'u o' => chr(0xc5).chr(0x8f),
650 'u O' => chr(0xc5).chr(0x8e),
651 'u u' => chr(0xc5).chr(0xad),
652 'u U' => chr(0xc5).chr(0xac),
[2901]653 # caron
[3156]654 'v c' => chr(0xc4).chr(0x8d),
655 'v C' => chr(0xc4).chr(0x8c),
[2901]656 'v n' => chr(0xc5).chr(0x88),
657 'v N' => chr(0xc5).chr(0x87),
658 'v s' => chr(0xc5).chr(0xa1),
659 'v S' => chr(0xc5).chr(0xa5),
[3156]660 'v z' => chr(0xc5).chr(0xbe),
661 'v Z' => chr(0xc5).chr(0xbd),
[2901]662 # cedilla
663 'c c' => chr(0xc3).chr(0xa7),
664 'c C' => chr(0xc3).chr(0x87),
665 'c g' => chr(0xc4).chr(0xa3),
666 'c G' => chr(0xc4).chr(0xa2),
667 'c k' => chr(0xc4).chr(0xb7),
668 'c K' => chr(0xc4).chr(0xb6),
669 'c l' => chr(0xc4).chr(0xbc),
670 'c L' => chr(0xc4).chr(0xbb),
671 'c n' => chr(0xc5).chr(0x86),
672 'c N' => chr(0xc5).chr(0x85),
673 'c r' => chr(0xc5).chr(0x97),
674 'c R' => chr(0xc5).chr(0x96),
675 'c s' => chr(0xc5).chr(0x9f),
676 'c S' => chr(0xc5).chr(0x9e),
677 'c t' => chr(0xc5).chr(0xa3),
678 'c T' => chr(0xc5).chr(0xa2),
679 # double acute / Hungarian accent
680 'H O' => chr(0xc5).chr(0x90),
681 'H o' => chr(0xc5).chr(0x91),
682 'H U' => chr(0xc5).chr(0xb0),
683 'H u' => chr(0xc5).chr(0xb1),
684
685 # stroke
686 'd' => chr(0xc4).chr(0x91),
687 'D' => chr(0xc4).chr(0x90),
688 'h' => chr(0xc4).chr(0xa7),
689# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]690 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]691 'l' => chr(0xc5).chr(0x82),
692 'L' => chr(0xc5).chr(0x81),
693 'o' => chr(0xc3).chr(0xb8),
694 'O' => chr(0xc3).chr(0x98),
695 't' => chr(0xc5).chr(0xa7),
696 'T' => chr(0xc5).chr(0xa6),
697 # german ss/szlig/sharp s
698 'ss' => chr(0xc3).chr(0x9f),
699 );
700
701 # convert latex-style accented characters.
[3249]702
[2901]703 # remove space (if any) between \ and letter to accent (eg {\' a})
704
[13413]705 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
706
[2901]707 # remove {} around a single character (eg \'{e})
[13413]708 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]709
[3249]710 # \, is another way of doing cedilla \c
711 $text =~ s@\\,(.)@\\c $1@g;
712
[2901]713 # remove {} around a single character for special 1 letter commands -
714 # need to insert a space. Eg \v{s} -> {\v s}
[3156]715 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]716
[2901]717 # only do if the text contains a '\' character.
718 if ($text =~ m|\\|) {
[3249]719 # "normal" accents - ie non-alpha latex tag
[13413]720 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]721 my $tex="$1$2"; my $char="$2";
722 my $replacement=$utf8_chars{$tex};
723 if (!defined($replacement)) {
724 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
725 $replacement=$char;
726 }
[5454]727 $text =~ s/\\\Q$tex/$replacement/g;
[3249]728 }
[2901]729
[3249]730 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
731 # only do the change if immediately followed by a space, }, {, or \
732 # one letter accents ( + ss)
733 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
734 my $tex=$1;
735 my $replacement=$special_utf8_chars{$tex};
736 if (!defined($replacement)) {
737 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
738 $replacement=$tex;
739 }
740 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
741
742 }
743
744 # one letter latex accent commands that affect following letter
745 while ($text =~ m@\\([uvcH]) ([\w])@) {
746 my $tex="$1 $2"; my $char="$2";
747 my $replacement=$special_utf8_chars{$tex};
748 if (!defined($replacement)) {
749 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
750 $replacement=$char;
751 }
752 $text =~ s/\\$tex/$replacement/g;
753 }
[2901]754 }
[3249]755
756 # escape html-sensitive characters
757 $text =~ s@&@&amp;@g;
758 $text =~ s@<@&lt;@g;
759 $text =~ s@>@&gt;@g;
760 $text =~ s/''/"/g; # Latex-specific
761 $text =~ s/``/"/g; # Latex-specific
762 # greenstone-specific
763 $text =~ s@\[@&\#91;@g;
764 $text =~ s@\]@&\#93;@g;
765
[3112]766 # remove latex commands
[3249]767
768 # explicitly recognised commands
769 $text =~ s@\\ldots@&hellip;@g;
770
771 # maths mode
772 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
773
774 # remove all other commands with optional arguments...
[3156]775 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
776 # $text =~ s@\\noopsort{[^}]+\}@@g;
777 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]778
[2901]779 # remove latex groupings { } (but not \{ or \} )
[3249]780 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]781 $text =~ s/^\{//; # remove { if first char
782
783 # latex characters
784 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]785 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]786 # optional break "\-"
[3249]787 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]788 # the non-macro bits have quotes around them - we just remove quotes
[3249]789 $text =~ s/([^&])[\"\#]/$1/g;
[3142]790 }
[3249]791 # dashes. Convert (m|n)-dash into single dash for html.
792 $text =~ s@\-\-+@\-@g;
793
[2901]794 # quoted { } chars
795 $text =~ s@\\{@{@g;
796 $text =~ s@\\}@}@g;
797
[3249]798 # finally to protect against macro language...
799 $text =~ s@\\@\\\\@g;
800
[2901]801 return $text;
802}
803
804
[3249]805sub process_latex_math {
806 my $text = shift;
807
808 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
809 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
810 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
811 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
812 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
813
814 # put all other command names in italics
815 $text =~ s@\\([\w]+)@<i>$1</i>@g;
816
817 return $text;
818}
819
[2484]820sub set_OID {
821 my $self = shift (@_);
822 my ($doc_obj, $id, $segment_number) = @_;
823
824 if ( $self->{'key'} eq "default") {
[4792]825 $doc_obj->set_OID("$id\_$segment_number");
[2484]826 } else {
827 $doc_obj->set_OID($self->{'key'});
828 }
829}
[1676]830
[2484]8311;
[4792]832
833
Note: See TracBrowser for help on using the repository browser.