source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13416

Last change on this file since 13416 was 13416, checked in by kjdon, 17 years ago

remove surrounding {} as well as surrounding "" from value

  • Property svn:keywords set to Author Date Id Revision
File size: 23.6 KB
RevLine 
[1676]1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[1676]44package BibTexPlug;
45
46use SplitPlug;
[10254]47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
[10254]52 @BibTexPlug::ISA = ('SplitPlug');
[1676]53}
54
[3540]55my $arguments =
[4744]56 [ { 'name' => "process_exp",
[4873]57 'desc' => "{BasPlug.process_exp}",
[6408]58 'type' => "regexp",
[4873]59 'reqd' => "no",
[6408]60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
[3540]67
[4744]68my $options = { 'name' => "BibTexPlug",
[5680]69 'desc' => "{BibTexPlug.desc}",
[6408]70 'abstract' => "no",
71 'inherits' => "yes",
[9120]72 'explodes' => "yes",
[4744]73 'args' => $arguments };
[3540]74
[1676]75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
[3540]84sub new {
[10218]85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
[12169]92 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
[10218]93
[3540]94 return bless $self, $class;
95}
[1676]96
[2901]97
98
[1676]99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
[6332]104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]105 my $outhandle = $self->{'outhandle'};
106
[10254]107 my $cursection = $doc_obj->get_top_section();
[2484]108 $self->{'key'} = "default";
109
[1676]110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
[6332]117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
[1676]118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
[1711]122 # This hash translates BibTex field names into metadata names. The
[1676]123 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]124 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]125
[9582]126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
[1676]129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
[2901]132
[1676]133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
[2901]136 'editor', 'Editor',
[1676]137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
[3142]146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
[4792]150 'copyright', 'Copyright',
151 'note', 'Note',
[11527]152 'url', 'URL',
[8121]153 );
[1676]154
155 # Metadata fields
156 my %metadata;
[4792]157 my ($EntryType, $Creator, $Keywords, $text);
[1676]158
[2901]159 my $verbosity = $self->{'verbosity'};
160 $verbosity = 0 unless $verbosity;
161
[3142]162 # Make sure the text has exactly one entry per line -
163 # append line to previous if it doesn't start with " <key> = "
[2901]164
[4792]165 my @input_lines=split(/\r?\n/, $$textref);
[3142]166 my @all_lines;
167 my $entry_line=shift @input_lines;
168 foreach my $input_line (@input_lines) {
169 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
170 # this is a new key
171 push(@all_lines, $entry_line);
172 $entry_line=$input_line;
173 } else {
174 # this is a continuation of previous line
[3249]175 $entry_line .= " " . $input_line;
[3142]176 }
177
178 }
179 # add final line, removing trailing '}'
180 $entry_line =~ s/\}\s*$//;
181 push(@all_lines, $entry_line);
182 push(@all_lines, "}");
[2901]183
[1676]184 # Read and process each line in the bib file.
[2901]185 my ($entryname, $name, $value, $line);
186 foreach $line (@all_lines) {
187
[1676]188 # Add each line. Most lines consist of a field identifer and
189 # then data, and we simply store them, though we treat some
190 # of the fields a bit differently.
191
192 $line =~ s/\s+/ /g;
193 $text .= "$line\n";
194
[4792]195
[2901]196 print "Processing line = $line \n" if $verbosity>=4;
197
[1676]198 # The first line is special, it contains the reference type and OID
[4792]199 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]200 $EntryType = $1;
[4792]201 my $EntryID = (defined $2) ? $2 : "default";
[2484]202 print "** $EntryType - \"$EntryID\" \n"
203 if ($verbosity >= 4);
[4792]204
[1676]205 next;
206 }
[2484]207 if ($line =~ /\@/) {
[2901]208 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]209 if ($verbosity >= 2);
[2901]210 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]211 if ($verbosity >= 2);
212 }
[1676]213
214 # otherwise, parse the metadata out of this line
[9582]215 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]216 $entryname = lc($1);
[1676]217 $value = $2;
[3112]218 $value =~ s/,?\s*$//; # remove trailing comma and space
219 if ($value =~ /^"/ && $value =~ /"$/) {
220 # remove surrounding " marks
221 $value =~ s/^"//; $value =~ s/"$//;
[13416]222 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
223 # remove surrounding {} marks
224 $value =~ s/^\{//; $value =~ s/\}$//;
[3112]225 }
[13413]226 # special case for year - we only want a 4 digit year, not
227 # "circa 2004" etc
228 if ($entryname=~ /^year$/i) {
229 if ($value=~ /(\d{4})/) {
230 $value=$1;
231 $metadata{$entryname} .= "$value";
232 }
233 }
234 else {
235
236 $value = &process_latex($value);
237 # Add this line of metadata
238 $metadata{$entryname} .= "$value";
239 }
[1676]240 }
241
[1677]242 # Add the Entry type as metadata
[2901]243 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]244
245 #Add the fileformat as metadata
246 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]247
[8121]248
[1676]249 # Add the various field as metadata
[2901]250 foreach my $entryname (keys %metadata) {
251 next unless (defined $field{$entryname});
252 next unless (defined $metadata{$entryname});
[1676]253
[2901]254 $name = $field{$entryname};
255 $value = $metadata{$entryname};
[1676]256
[3142]257 if ($name =~ /^Month/) {
258 $value=expand_month($value);
259 }
[1676]260
261 # Several special operatons on metadata follow
262
263 # Add individual keywords.
264 # The full set of keywords will be added, in due course, as "Keywords".
265 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]266 if ($entryname eq "keywords") {
[1676]267 my @keywordlist = split(/,/, $value);
268 foreach my $k (@keywordlist) {
[2901]269 $k = lc($k);
[1676]270 $k =~ s/\s*$//;
271 $k =~ s/^\s*//;
272 if ($k =~ /\w/) {
[2901]273 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]274 }
275 }
276 }
277
278 # Add individual authors
279 # The author metadata will be stored as one "Creator" entry, but we
280 # also want to split it into several individual "Author" fields in
[3112]281 # "Lastname, Firstnames" format so we can browse it.
[2901]282 if ($entryname eq "author") { #added also comparison with editor
283
[3249]284 # take care of "et al."...
[9582]285 my $etal='';
286 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
287 $etal=' <em>et. al.</em>';
288 }
[2901]289 # und here for german language...
290 # don't use brackets in pattern, else the matched bit becomes
291 # an element in the list!
[3249]292 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
293 my @formattedlist = ();
[1676]294 foreach $a (@authorlist) {
295 $a =~ s/\s*$//;
296 $a =~ s/^\s*//;
297 # Reformat and add author name
[2901]298 next if $a=~ /^\s*$/;
[3249]299
300 # names are "First von Last", "von Last, First"
301 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
302 my $first="";
303 my $vonlast="";
304 my $jr="";
[2901]305
[3249]306 if ($a =~ /,/) {
307 my @parts=split(/,\s*/, $a);
308 $first = pop @parts;
309 if (scalar(@parts) == 2) {
310 $jr = pop @parts;
311 }
312 $vonlast=shift @parts;
313 if (scalar(@parts) > 0) {
314 print $outhandle "BibTexPlug: couldn't parse name $a\n";
315 # but we continue anyway...
316 }
317 } else { # First von Last
318 my @words = split(/ /, $a);
319 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
320 $first .= " " . shift (@words);
321 }
322 $first =~ s/^\s//;
323 $vonlast = join (' ', @words); # whatever's left...
324 }
325 my $von="";
326 my $last="";
327 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
328 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
329 $von = $1;
330 if (!defined ($von)) {
331 # some non-English names do start with lowercase
332 # eg "Marie desJardins". Also we can get typos...
333 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
334 $von="";
335 if ($vonlast =~ /^[a-z]+$/) {
336 # if it's all lowercase, uppercase 1st.
337 $vonlast =~ s/^(.)/\u$1/;
[2901]338
[3249]339 }
340 }
341 $von =~ s/\s*$//;
342 $last=$vonlast;
343 } else {
344 $last=$vonlast;
345 }
346 my $wholename="$first $von $last $jr";
[9582]347 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
348 $wholename =~ s/ $//;
[3249]349 push (@formattedlist, $wholename);
350 my $fullname = "$last";
351 $fullname .= " $jr" if ($jr);
352 $fullname .= ", $first";
353 $fullname .= " $von" if ($von);
354
[1676]355 # Add each name to set of Authors
[3249]356 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]357 }
[3249]358
359 # Only want at most one "and" in the Creator field
360 if (scalar(@formattedlist) > 2) {
361 my $lastauthor=pop @formattedlist;
362 $value=join(', ', @formattedlist);
363 $value.=" and $lastauthor";
364 } else { # 1 or 2 authors...
365 $value=join(" and ",@formattedlist);
366 }
367 $value.=$etal; # if there was "et al."
[1676]368 }
369
370 # Books and Journals are additionally marked for display purposes
[2901]371 if ($entryname eq "booktitle") {
372 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
373 } elsif ($entryname eq "journal") {
374 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]375 }
376
[3249]377 # Add the various fields as metadata
378 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]379
[1676]380 }
381
[9582]382 # for books and journals...
383 if (!exists $metadata{'title'}) {
384 my $name=$field{'title'}; # get Greenstone metadata name
385 my $value;
386 if (exists $metadata{'booktitle'}) {
387 $value=$metadata{'booktitle'};
388 } elsif (exists $metadata{'journal'}) {
389 $value=$metadata{'journal'};
390 }
391 if ($value) {
392 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
393 }
394 }
395
[3249]396 # Add Date (yyyymmdd) metadata
397 if (defined ($metadata{'year'}) ) {
398 my $date=$metadata{'year'};
399 chomp $date;
400 my $month=$metadata{'month'};
401 if (defined($month)) {
402 # month is currently 3 letter code or a range...
403 $month = expand_month($month);
404 # take the first month found... might not find one!
405 $month =~ m/_textmonth(\d\d)_/;
406 $month = $1;
407 }
408 if (!defined($month)) {
409 $month="00";
410 }
411 $date .= "${month}00";
412 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
413}
414
415# # Add the text in BibTex format (all fields)
[1676]416 if ($text =~ /\w/) {
[3249]417
418 $text =~ s@&@&amp;@g;
419 $text =~ s@<@&lt;@g;
420 $text =~ s@>@&gt;@g;
421 $text =~ s@\n@<br/>\n@g;
422 $text =~ s@\\@\\\\@g;
423
[2901]424 $doc_obj->add_utf8_text ($cursection, $text);
[1676]425 }
426
[1677]427 return 1;
[1676]428}
429
[1677]430
[2901]431
432
433# convert email addresses and URLs into links
434sub convert_urls_into_links{
435 my ($text) = @_;
436
437 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]438 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]439
440 return $text;
441}
442
443# Clean up whitespace and convert \n charaters to <BR> or <P>
444sub clean_up_whitespaces{
445 my ($text) = @_;
446
447 $text =~ s/%%%%%/<BR> <BR>/g;
448 $text =~ s/ +/ /g;
449 $text =~ s/\s*$//;
450 $text =~ s/^\s*//;
451 $text =~ s/\n/\n<BR>/g;
452 $text =~ s/<BR>\s*<BR>/<P>/g;
453
454 return $text;
455}
456
457
458sub convert_problem_characters_without_ampersand{
459 my ($text) = @_;
460 $text =~ s/</&lt;/g;
461 $text =~ s/>/&gt;/g;
462
463 $text =~ s/\'\'/\"/g; #Latex -specific conversion
464 $text =~ s/\`\`/\"/g; #Latex -specific conversion
465
466 $text =~ s/\"/&quot;/g;
467 $text =~ s/\'/&#8217;/g;
468 $text =~ s/\`/&#8216;/g;
469
[3249]470# $text =~ s/\+/ /g;
471# $text =~ s/\(/ /g;
472# $text =~ s/\)/ /g;
473
[2901]474 $text =~ s/\\/\\\\/g;
475
[3249]476# $text =~ s/\./\\\./g;
[2901]477
478 return $text;
479}
480
[1676]481# Convert a text string into HTML.
482
[1677]483# The HTML is going to be inserted into a GML file, so we have to be
[2901]484# careful not to use symbols like ">", which occurs frequently in email
[1677]485# messages (and use &gt instead.
486
487# This function also turns URLs and email addresses into links, and
488# replaces carriage returns with <BR> tags (and multiple carriage returns
489# with <P> tags).
490
[1676]491sub text_into_html {
492 my ($text) = @_;
493
[2901]494 # Convert problem characters into HTML symbols
[1676]495 $text =~ s/&/&amp;/g;
496
[2901]497 $text = &convert_problem_characters_without_ampersand( $text );
498
[1676]499 # convert email addresses and URLs into links
[2901]500 $text = &convert_urls_into_links( $text );
[1676]501
[2901]502 $text = &clean_up_whitespaces( $text );
[1676]503
504 return $text;
505}
506
[2901]507
[3142]508sub expand_month {
509 my $text=shift;
[2901]510
[3142]511 # bibtex style files expand abbreviations for months.
512 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
513 $text =~ s/jan/_textmonth01_/g;
514 $text =~ s/feb/_textmonth02_/g;
515 $text =~ s/mar/_textmonth03_/g;
516 $text =~ s/apr/_textmonth04_/g;
517 $text =~ s/may/_textmonth05_/g;
518 $text =~ s/jun/_textmonth06_/g;
519 $text =~ s/jul/_textmonth07_/g;
520 $text =~ s/aug/_textmonth08_/g;
521 $text =~ s/sep/_textmonth09_/g;
522 $text =~ s/oct/_textmonth10_/g;
523 $text =~ s/nov/_textmonth11_/g;
524 $text =~ s/dec/_textmonth12_/g;
[2901]525
[3142]526 return $text;
527}
528
529
[2901]530# Convert accented characters, remove { }, interprete some commands....
531# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
532sub process_latex {
533 my ($text) = @_;
534
535 # note - this is really ugly, but it works. There may be a prettier way
536 # of mapping latex accented chars to utf8, but we just brute force it here.
537 # Also, this isn't complete - not every single possible accented letter
538 # is in here yet, but most of the common ones are.
539
540 my %utf8_chars =
541 (
542 # acutes
543 '\'a' => chr(0xc3).chr(0xa1),
544 '\'c' => chr(0xc4).chr(0x87),
545 '\'e' => chr(0xc3).chr(0xa9),
546 '\'i' => chr(0xc3).chr(0xad),
547 '\'l' => chr(0xc3).chr(0xba),
548 '\'n' => chr(0xc3).chr(0x84),
549 '\'o' => chr(0xc3).chr(0xb3),
550 '\'r' => chr(0xc5).chr(0x95),
551 '\'s' => chr(0xc5).chr(0x9b),
552 '\'u' => chr(0xc3).chr(0xba),
553 '\'y' => chr(0xc3).chr(0xbd),
554 '\'z' => chr(0xc5).chr(0xba),
555 # graves
556 '`a' => chr(0xc3).chr(0xa0),
557 '`A' => chr(0xc3).chr(0x80),
558 '`e' => chr(0xc3).chr(0xa8),
559 '`E' => chr(0xc3).chr(0x88),
560 '`i' => chr(0xc3).chr(0xac),
561 '`I' => chr(0xc3).chr(0x8c),
562 '`o' => chr(0xc3).chr(0xb2),
563 '`O' => chr(0xc3).chr(0x92),
564 '`u' => chr(0xc3).chr(0xb9),
565 '`U' => chr(0xc3).chr(0x99),
566 # circumflex
567 '^a' => chr(0xc3).chr(0xa2),
568 '^A' => chr(0xc3).chr(0x82),
569 '^c' => chr(0xc4).chr(0x89),
570 '^C' => chr(0xc4).chr(0x88),
571 '^e' => chr(0xc3).chr(0xaa),
572 '^E' => chr(0xc3).chr(0x8a),
573 '^g' => chr(0xc4).chr(0x9d),
574 '^G' => chr(0xc4).chr(0x9c),
575 '^h' => chr(0xc4).chr(0xa5),
576 '^H' => chr(0xc4).chr(0xa4),
577 '^i' => chr(0xc3).chr(0xae),
578 '^I' => chr(0xc3).chr(0x8e),
579 '^j' => chr(0xc4).chr(0xb5),
580 '^J' => chr(0xc4).chr(0xb4),
581 '^o' => chr(0xc3).chr(0xb4),
582 '^O' => chr(0xc3).chr(0x94),
583 '^s' => chr(0xc5).chr(0x9d),
584 '^S' => chr(0xc5).chr(0x9c),
585 '^u' => chr(0xc3).chr(0xa2),
586 '^U' => chr(0xc3).chr(0xbb),
587 '^w' => chr(0xc5).chr(0xb5),
588 '^W' => chr(0xc5).chr(0xb4),
589 '^y' => chr(0xc5).chr(0xb7),
590 '^Y' => chr(0xc5).chr(0xb6),
591
592 # diaeresis
593 '"a' => chr(0xc3).chr(0xa4),
594 '"A' => chr(0xc3).chr(0x84),
595 '"e' => chr(0xc3).chr(0xab),
596 '"E' => chr(0xc3).chr(0x8b),
597 '"\\\\i' => chr(0xc3).chr(0xaf),
598 '"\\\\I' => chr(0xc3).chr(0x8f),
599 '"o' => chr(0xc3).chr(0xb6),
600 '"O' => chr(0xc3).chr(0x96),
601 '"u' => chr(0xc3).chr(0xbc),
602 '"U' => chr(0xc3).chr(0x9c),
603 '"y' => chr(0xc3).chr(0xbf),
604 '"Y' => chr(0xc3).chr(0xb8),
605 # tilde
[3142]606 '~A' => chr(0xc3).chr(0x83),
607 '~N' => chr(0xc3).chr(0x91),
608 '~O' => chr(0xc3).chr(0x95),
609 '~a' => chr(0xc3).chr(0xa3),
610 '~n' => chr(0xc3).chr(0xb1),
611 '~o' => chr(0xc3).chr(0xb5),
[2901]612 # caron - handled specially
613 # double acute
614 # ring
615 # dot
[3249]616 '.c' => chr(0xc4).chr(0x8b),
617 '.C' => chr(0xc4).chr(0x8a),
618 '.e' => chr(0xc4).chr(0x97),
619 '.E' => chr(0xc4).chr(0x96),
620 '.g' => chr(0xc4).chr(0xa1),
621 '.G' => chr(0xc4).chr(0xa0),
622 '.I' => chr(0xc4).chr(0xb0),
623 '.z' => chr(0xc5).chr(0xbc),
624 '.Z' => chr(0xc5).chr(0xbb),
[2901]625 # macron
626 '=a' => chr(0xc4).chr(0x81),
627 '=A' => chr(0xc4).chr(0x80),
628 '=e' => chr(0xc4).chr(0x93),
629 '=E' => chr(0xc4).chr(0x92),
630 '=i' => chr(0xc4).chr(0xab),
631 '=I' => chr(0xc4).chr(0xaa),
632 '=o' => chr(0xc4).chr(0x8d),
633 '=O' => chr(0xc4).chr(0x8c),
634 '=u' => chr(0xc4).chr(0xab),
635 '=U' => chr(0xc4).chr(0xaa),
636
637 # stroke - handled specially - see below
638
639 # cedilla - handled specially
640 );
641
642# these are one letter latex commands - we make sure they're not a longer
643# command name. eg {\d} is d+stroke, so careful of \d
644 my %special_utf8_chars =
645 (
[3156]646 # breve
647 'u g' => chr(0xc4).chr(0x9f),
648 'u G' => chr(0xc4).chr(0x9e),
649 'u i' => chr(0xc4).chr(0xad),
650 'u I' => chr(0xc4).chr(0xac),
651 'u o' => chr(0xc5).chr(0x8f),
652 'u O' => chr(0xc5).chr(0x8e),
653 'u u' => chr(0xc5).chr(0xad),
654 'u U' => chr(0xc5).chr(0xac),
[2901]655 # caron
[3156]656 'v c' => chr(0xc4).chr(0x8d),
657 'v C' => chr(0xc4).chr(0x8c),
[2901]658 'v n' => chr(0xc5).chr(0x88),
659 'v N' => chr(0xc5).chr(0x87),
660 'v s' => chr(0xc5).chr(0xa1),
661 'v S' => chr(0xc5).chr(0xa5),
[3156]662 'v z' => chr(0xc5).chr(0xbe),
663 'v Z' => chr(0xc5).chr(0xbd),
[2901]664 # cedilla
665 'c c' => chr(0xc3).chr(0xa7),
666 'c C' => chr(0xc3).chr(0x87),
667 'c g' => chr(0xc4).chr(0xa3),
668 'c G' => chr(0xc4).chr(0xa2),
669 'c k' => chr(0xc4).chr(0xb7),
670 'c K' => chr(0xc4).chr(0xb6),
671 'c l' => chr(0xc4).chr(0xbc),
672 'c L' => chr(0xc4).chr(0xbb),
673 'c n' => chr(0xc5).chr(0x86),
674 'c N' => chr(0xc5).chr(0x85),
675 'c r' => chr(0xc5).chr(0x97),
676 'c R' => chr(0xc5).chr(0x96),
677 'c s' => chr(0xc5).chr(0x9f),
678 'c S' => chr(0xc5).chr(0x9e),
679 'c t' => chr(0xc5).chr(0xa3),
680 'c T' => chr(0xc5).chr(0xa2),
681 # double acute / Hungarian accent
682 'H O' => chr(0xc5).chr(0x90),
683 'H o' => chr(0xc5).chr(0x91),
684 'H U' => chr(0xc5).chr(0xb0),
685 'H u' => chr(0xc5).chr(0xb1),
686
687 # stroke
688 'd' => chr(0xc4).chr(0x91),
689 'D' => chr(0xc4).chr(0x90),
690 'h' => chr(0xc4).chr(0xa7),
691# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]692 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]693 'l' => chr(0xc5).chr(0x82),
694 'L' => chr(0xc5).chr(0x81),
695 'o' => chr(0xc3).chr(0xb8),
696 'O' => chr(0xc3).chr(0x98),
697 't' => chr(0xc5).chr(0xa7),
698 'T' => chr(0xc5).chr(0xa6),
699 # german ss/szlig/sharp s
700 'ss' => chr(0xc3).chr(0x9f),
701 );
702
703 # convert latex-style accented characters.
[3249]704
[2901]705 # remove space (if any) between \ and letter to accent (eg {\' a})
706
[13413]707 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
708
[2901]709 # remove {} around a single character (eg \'{e})
[13413]710 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]711
[3249]712 # \, is another way of doing cedilla \c
713 $text =~ s@\\,(.)@\\c $1@g;
714
[2901]715 # remove {} around a single character for special 1 letter commands -
716 # need to insert a space. Eg \v{s} -> {\v s}
[3156]717 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]718
[2901]719 # only do if the text contains a '\' character.
720 if ($text =~ m|\\|) {
[3249]721 # "normal" accents - ie non-alpha latex tag
[13413]722 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]723 my $tex="$1$2"; my $char="$2";
724 my $replacement=$utf8_chars{$tex};
725 if (!defined($replacement)) {
726 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
727 $replacement=$char;
728 }
[5454]729 $text =~ s/\\\Q$tex/$replacement/g;
[3249]730 }
[2901]731
[3249]732 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
733 # only do the change if immediately followed by a space, }, {, or \
734 # one letter accents ( + ss)
735 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
736 my $tex=$1;
737 my $replacement=$special_utf8_chars{$tex};
738 if (!defined($replacement)) {
739 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
740 $replacement=$tex;
741 }
742 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
743
744 }
745
746 # one letter latex accent commands that affect following letter
747 while ($text =~ m@\\([uvcH]) ([\w])@) {
748 my $tex="$1 $2"; my $char="$2";
749 my $replacement=$special_utf8_chars{$tex};
750 if (!defined($replacement)) {
751 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
752 $replacement=$char;
753 }
754 $text =~ s/\\$tex/$replacement/g;
755 }
[2901]756 }
[3249]757
758 # escape html-sensitive characters
759 $text =~ s@&@&amp;@g;
760 $text =~ s@<@&lt;@g;
761 $text =~ s@>@&gt;@g;
762 $text =~ s/''/"/g; # Latex-specific
763 $text =~ s/``/"/g; # Latex-specific
764 # greenstone-specific
765 $text =~ s@\[@&\#91;@g;
766 $text =~ s@\]@&\#93;@g;
767
[3112]768 # remove latex commands
[3249]769
770 # explicitly recognised commands
771 $text =~ s@\\ldots@&hellip;@g;
772
773 # maths mode
774 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
775
776 # remove all other commands with optional arguments...
[3156]777 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
778 # $text =~ s@\\noopsort{[^}]+\}@@g;
779 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]780
[2901]781 # remove latex groupings { } (but not \{ or \} )
[3249]782 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]783 $text =~ s/^\{//; # remove { if first char
784
785 # latex characters
786 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]787 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]788 # optional break "\-"
[3249]789 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]790 # the non-macro bits have quotes around them - we just remove quotes
[3249]791 $text =~ s/([^&])[\"\#]/$1/g;
[3142]792 }
[3249]793 # dashes. Convert (m|n)-dash into single dash for html.
794 $text =~ s@\-\-+@\-@g;
795
[2901]796 # quoted { } chars
797 $text =~ s@\\{@{@g;
798 $text =~ s@\\}@}@g;
799
[3249]800 # finally to protect against macro language...
801 $text =~ s@\\@\\\\@g;
802
[2901]803 return $text;
804}
805
806
[3249]807sub process_latex_math {
808 my $text = shift;
809
810 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
811 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
812 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
813 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
814 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
815
816 # put all other command names in italics
817 $text =~ s@\\([\w]+)@<i>$1</i>@g;
818
819 return $text;
820}
821
[2484]822sub set_OID {
823 my $self = shift (@_);
824 my ($doc_obj, $id, $segment_number) = @_;
825
826 if ( $self->{'key'} eq "default") {
[4792]827 $doc_obj->set_OID("$id\_$segment_number");
[2484]828 } else {
829 $doc_obj->set_OID($self->{'key'});
830 }
831}
[1676]832
[2484]8331;
[4792]834
835
Note: See TracBrowser for help on using the repository browser.