source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm@ 32129

Last change on this file since 32129 was 32129, checked in by kjdon, 6 years ago

After () in a regex, {} signifys quantifiers. eg (xx){2,4} - 2-4 occurrences. In later perl versions, it is illegal to have an unescaped { after a ) in a regex. If you actually want to match { you need to escape it. So I have escaped all { following ) in regex

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
RevLine 
[1676]1###########################################################################
2#
[15872]3# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
[15872]29# BibTexPlugin reads bibliography files in BibTex format.
[1676]30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
[15872]34# BibTexPlugin creates a document object for every reference a the file.
35# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[15872]44package BibTexPlugin;
[1676]45
[15872]46use SplitTextFile;
[24548]47use MetadataRead;
[10254]48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]50
[15872]51# BibTexPlugin is a sub-class of SplitTextFile.
[1676]52sub BEGIN {
[24548]53 @BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
[1676]54}
55
[3540]56my $arguments =
[4744]57 [ { 'name' => "process_exp",
[31492]58 'desc' => "{BaseImporter.process_exp}",
[6408]59 'type' => "regexp",
[4873]60 'reqd' => "no",
[6408]61 'deft' => &get_default_process_exp() },
62 { 'name' => "split_exp",
[15872]63 'desc' => "{SplitTextFile.split_exp}",
[6408]64 'type' => "regexp",
65 'deft' => &get_default_split_exp(),
66 'reqd' => "no" }
67 ];
[3540]68
[15872]69my $options = { 'name' => "BibTexPlugin",
70 'desc' => "{BibTexPlugin.desc}",
[6408]71 'abstract' => "no",
72 'inherits' => "yes",
[9120]73 'explodes' => "yes",
[4744]74 'args' => $arguments };
[3540]75
[1676]76# This plugin processes files with the suffix ".bib"
77sub get_default_process_exp {
78 return q^(?i)\.bib$^;
79}
80
81# This plugin splits the input text at blank lines
82sub get_default_split_exp {
83 return q^\n+(?=@)^;
84}
[15872]85
[3540]86sub new {
[10218]87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
[15872]91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]93
[15872]94 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]95
[3540]96 return bless $self, $class;
97}
[1676]98
[2901]99
100
[1676]101# The process function reads a single bibliographic record and stores
102# it as a new document.
103
104sub process {
105 my $self = shift (@_);
[6332]106 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]107 my $outhandle = $self->{'outhandle'};
108
[10254]109 my $cursection = $doc_obj->get_top_section();
[2484]110 $self->{'key'} = "default";
111
[1676]112 # Check that we're dealing with a valid BibTex record
113 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
114
115 # Ignore things we can't use
116 return 0 if ($$textref =~ /^\@String/);
117
[1711]118 # This hash translates BibTex field names into metadata names. The
[1676]119 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]120 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]121
[9582]122 # The author metadata will be stored as one "Creator" entry, but will
123 # also be split up into several individual "Author" fields.
124
[1676]125 my %field = (
126 'address', 'PublisherAddress',
127 'author', 'Creator',
[2901]128
[1676]129 'booktitle', 'Booktitle',
130 'chapter', 'Chapter',
131 'edition', 'Edition',
[2901]132 'editor', 'Editor',
[1676]133 'institution', 'Publisher',
134 'journal', 'Journal',
135 'month', 'Month',
136 'number', 'Number',
137 'pages', 'Pages',
138 'publisher', 'Publisher',
139 'school', 'Publisher',
140 'title', 'Title',
141 'volume', 'Volume',
[3142]142 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]143
144 'keywords', 'Keywords',
145 'abstract', 'Abstract',
[4792]146 'copyright', 'Copyright',
147 'note', 'Note',
[11527]148 'url', 'URL',
[8121]149 );
[1676]150
151 # Metadata fields
152 my %metadata;
[4792]153 my ($EntryType, $Creator, $Keywords, $text);
[1676]154
[2901]155 my $verbosity = $self->{'verbosity'};
156 $verbosity = 0 unless $verbosity;
157
[3142]158 # Make sure the text has exactly one entry per line -
159 # append line to previous if it doesn't start with " <key> = "
[2901]160
[4792]161 my @input_lines=split(/\r?\n/, $$textref);
[3142]162 my @all_lines;
163 my $entry_line=shift @input_lines;
164 foreach my $input_line (@input_lines) {
165 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
166 # this is a new key
167 push(@all_lines, $entry_line);
168 $entry_line=$input_line;
169 } else {
170 # this is a continuation of previous line
[3249]171 $entry_line .= " " . $input_line;
[3142]172 }
173
174 }
175 # add final line, removing trailing '}'
176 $entry_line =~ s/\}\s*$//;
177 push(@all_lines, $entry_line);
178 push(@all_lines, "}");
[2901]179
[1676]180 # Read and process each line in the bib file.
[2901]181 my ($entryname, $name, $value, $line);
182 foreach $line (@all_lines) {
183
[1676]184 # Add each line. Most lines consist of a field identifer and
185 # then data, and we simply store them, though we treat some
186 # of the fields a bit differently.
187
188 $line =~ s/\s+/ /g;
189 $text .= "$line\n";
190
[4792]191
[2901]192 print "Processing line = $line \n" if $verbosity>=4;
193
[1676]194 # The first line is special, it contains the reference type and OID
[4792]195 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]196 $EntryType = $1;
[4792]197 my $EntryID = (defined $2) ? $2 : "default";
[2484]198 print "** $EntryType - \"$EntryID\" \n"
199 if ($verbosity >= 4);
[4792]200
[1676]201 next;
202 }
[2484]203 if ($line =~ /\@/) {
[2901]204 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]205 if ($verbosity >= 2);
[2901]206 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]207 if ($verbosity >= 2);
208 }
[1676]209
210 # otherwise, parse the metadata out of this line
[9582]211 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]212 $entryname = lc($1);
[1676]213 $value = $2;
[3112]214 $value =~ s/,?\s*$//; # remove trailing comma and space
215 if ($value =~ /^"/ && $value =~ /"$/) {
216 # remove surrounding " marks
217 $value =~ s/^"//; $value =~ s/"$//;
[13416]218 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
219 # remove surrounding {} marks
220 $value =~ s/^\{//; $value =~ s/\}$//;
[3112]221 }
[13413]222 # special case for year - we only want a 4 digit year, not
223 # "circa 2004" etc
224 if ($entryname=~ /^year$/i) {
225 if ($value=~ /(\d{4})/) {
226 $value=$1;
227 $metadata{$entryname} .= "$value";
228 }
229 }
230 else {
231
232 $value = &process_latex($value);
233 # Add this line of metadata
234 $metadata{$entryname} .= "$value";
235 }
[1676]236 }
237
[1677]238 # Add the Entry type as metadata
[2901]239 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]240
241 #Add the fileformat as metadata
242 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]243
[8121]244
[1676]245 # Add the various field as metadata
[2901]246 foreach my $entryname (keys %metadata) {
247 next unless (defined $field{$entryname});
248 next unless (defined $metadata{$entryname});
[1676]249
[2901]250 $name = $field{$entryname};
251 $value = $metadata{$entryname};
[1676]252
[3142]253 if ($name =~ /^Month/) {
254 $value=expand_month($value);
255 }
[1676]256
257 # Several special operatons on metadata follow
258
259 # Add individual keywords.
260 # The full set of keywords will be added, in due course, as "Keywords".
261 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]262 if ($entryname eq "keywords") {
[1676]263 my @keywordlist = split(/,/, $value);
264 foreach my $k (@keywordlist) {
[2901]265 $k = lc($k);
[1676]266 $k =~ s/\s*$//;
267 $k =~ s/^\s*//;
268 if ($k =~ /\w/) {
[2901]269 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]270 }
271 }
272 }
273
274 # Add individual authors
275 # The author metadata will be stored as one "Creator" entry, but we
276 # also want to split it into several individual "Author" fields in
[3112]277 # "Lastname, Firstnames" format so we can browse it.
[2901]278 if ($entryname eq "author") { #added also comparison with editor
279
[3249]280 # take care of "et al."...
[9582]281 my $etal='';
282 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
283 $etal=' <em>et. al.</em>';
284 }
[2901]285 # und here for german language...
286 # don't use brackets in pattern, else the matched bit becomes
287 # an element in the list!
[3249]288 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
289 my @formattedlist = ();
[1676]290 foreach $a (@authorlist) {
291 $a =~ s/\s*$//;
292 $a =~ s/^\s*//;
293 # Reformat and add author name
[2901]294 next if $a=~ /^\s*$/;
[3249]295
296 # names are "First von Last", "von Last, First"
297 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
298 my $first="";
299 my $vonlast="";
300 my $jr="";
[2901]301
[3249]302 if ($a =~ /,/) {
303 my @parts=split(/,\s*/, $a);
304 $first = pop @parts;
305 if (scalar(@parts) == 2) {
306 $jr = pop @parts;
307 }
308 $vonlast=shift @parts;
309 if (scalar(@parts) > 0) {
[15872]310 print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]311 # but we continue anyway...
312 }
313 } else { # First von Last
314 my @words = split(/ /, $a);
[32122]315 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2,}/) {
[3249]316 $first .= " " . shift (@words);
317 }
318 $first =~ s/^\s//;
319 $vonlast = join (' ', @words); # whatever's left...
320 }
321 my $von="";
322 my $last="";
323 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
324 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
325 $von = $1;
326 if (!defined ($von)) {
327 # some non-English names do start with lowercase
328 # eg "Marie desJardins". Also we can get typos...
[15872]329 print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]330 $von="";
331 if ($vonlast =~ /^[a-z]+$/) {
332 # if it's all lowercase, uppercase 1st.
333 $vonlast =~ s/^(.)/\u$1/;
[2901]334
[3249]335 }
336 }
337 $von =~ s/\s*$//;
338 $last=$vonlast;
339 } else {
340 $last=$vonlast;
341 }
342 my $wholename="$first $von $last $jr";
[9582]343 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
344 $wholename =~ s/ $//;
[3249]345 push (@formattedlist, $wholename);
346 my $fullname = "$last";
347 $fullname .= " $jr" if ($jr);
348 $fullname .= ", $first";
349 $fullname .= " $von" if ($von);
350
[1676]351 # Add each name to set of Authors
[3249]352 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]353 }
[3249]354
355 # Only want at most one "and" in the Creator field
356 if (scalar(@formattedlist) > 2) {
357 my $lastauthor=pop @formattedlist;
358 $value=join(', ', @formattedlist);
359 $value.=" and $lastauthor";
360 } else { # 1 or 2 authors...
361 $value=join(" and ",@formattedlist);
362 }
363 $value.=$etal; # if there was "et al."
[1676]364 }
365
366 # Books and Journals are additionally marked for display purposes
[2901]367 if ($entryname eq "booktitle") {
368 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
369 } elsif ($entryname eq "journal") {
370 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]371 }
372
[3249]373 # Add the various fields as metadata
374 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]375
[1676]376 }
377
[9582]378 # for books and journals...
379 if (!exists $metadata{'title'}) {
380 my $name=$field{'title'}; # get Greenstone metadata name
381 my $value;
382 if (exists $metadata{'booktitle'}) {
383 $value=$metadata{'booktitle'};
384 } elsif (exists $metadata{'journal'}) {
385 $value=$metadata{'journal'};
386 }
387 if ($value) {
388 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
389 }
390 }
391
[3249]392 # Add Date (yyyymmdd) metadata
393 if (defined ($metadata{'year'}) ) {
394 my $date=$metadata{'year'};
395 chomp $date;
396 my $month=$metadata{'month'};
397 if (defined($month)) {
398 # month is currently 3 letter code or a range...
399 $month = expand_month($month);
400 # take the first month found... might not find one!
401 $month =~ m/_textmonth(\d\d)_/;
402 $month = $1;
403 }
404 if (!defined($month)) {
405 $month="00";
406 }
407 $date .= "${month}00";
408 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
409}
410
411# # Add the text in BibTex format (all fields)
[1676]412 if ($text =~ /\w/) {
[3249]413
414 $text =~ s@&@&amp;@g;
415 $text =~ s@<@&lt;@g;
416 $text =~ s@>@&gt;@g;
417 $text =~ s@\n@<br/>\n@g;
418 $text =~ s@\\@\\\\@g;
419
[2901]420 $doc_obj->add_utf8_text ($cursection, $text);
[1676]421 }
422
[1677]423 return 1;
[1676]424}
425
[1677]426
[2901]427
428
429# convert email addresses and URLs into links
430sub convert_urls_into_links{
431 my ($text) = @_;
432
433 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]434 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]435
436 return $text;
437}
438
439# Clean up whitespace and convert \n charaters to <BR> or <P>
440sub clean_up_whitespaces{
441 my ($text) = @_;
442
443 $text =~ s/%%%%%/<BR> <BR>/g;
444 $text =~ s/ +/ /g;
445 $text =~ s/\s*$//;
446 $text =~ s/^\s*//;
447 $text =~ s/\n/\n<BR>/g;
448 $text =~ s/<BR>\s*<BR>/<P>/g;
449
450 return $text;
451}
452
453
454sub convert_problem_characters_without_ampersand{
455 my ($text) = @_;
456 $text =~ s/</&lt;/g;
457 $text =~ s/>/&gt;/g;
458
459 $text =~ s/\'\'/\"/g; #Latex -specific conversion
460 $text =~ s/\`\`/\"/g; #Latex -specific conversion
461
462 $text =~ s/\"/&quot;/g;
463 $text =~ s/\'/&#8217;/g;
464 $text =~ s/\`/&#8216;/g;
465
[3249]466# $text =~ s/\+/ /g;
467# $text =~ s/\(/ /g;
468# $text =~ s/\)/ /g;
469
[2901]470 $text =~ s/\\/\\\\/g;
471
[3249]472# $text =~ s/\./\\\./g;
[2901]473
474 return $text;
475}
476
[1676]477# Convert a text string into HTML.
478
[1677]479# The HTML is going to be inserted into a GML file, so we have to be
[2901]480# careful not to use symbols like ">", which occurs frequently in email
[1677]481# messages (and use &gt instead.
482
483# This function also turns URLs and email addresses into links, and
484# replaces carriage returns with <BR> tags (and multiple carriage returns
485# with <P> tags).
486
[1676]487sub text_into_html {
488 my ($text) = @_;
489
[2901]490 # Convert problem characters into HTML symbols
[1676]491 $text =~ s/&/&amp;/g;
492
[2901]493 $text = &convert_problem_characters_without_ampersand( $text );
494
[1676]495 # convert email addresses and URLs into links
[2901]496 $text = &convert_urls_into_links( $text );
[1676]497
[2901]498 $text = &clean_up_whitespaces( $text );
[1676]499
500 return $text;
501}
502
[2901]503
[3142]504sub expand_month {
505 my $text=shift;
[2901]506
[3142]507 # bibtex style files expand abbreviations for months.
508 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
509 $text =~ s/jan/_textmonth01_/g;
510 $text =~ s/feb/_textmonth02_/g;
511 $text =~ s/mar/_textmonth03_/g;
512 $text =~ s/apr/_textmonth04_/g;
513 $text =~ s/may/_textmonth05_/g;
514 $text =~ s/jun/_textmonth06_/g;
515 $text =~ s/jul/_textmonth07_/g;
516 $text =~ s/aug/_textmonth08_/g;
517 $text =~ s/sep/_textmonth09_/g;
518 $text =~ s/oct/_textmonth10_/g;
519 $text =~ s/nov/_textmonth11_/g;
520 $text =~ s/dec/_textmonth12_/g;
[2901]521
[3142]522 return $text;
523}
524
525
[2901]526# Convert accented characters, remove { }, interprete some commands....
527# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
528sub process_latex {
529 my ($text) = @_;
530
531 # note - this is really ugly, but it works. There may be a prettier way
532 # of mapping latex accented chars to utf8, but we just brute force it here.
533 # Also, this isn't complete - not every single possible accented letter
534 # is in here yet, but most of the common ones are.
535
536 my %utf8_chars =
537 (
538 # acutes
539 '\'a' => chr(0xc3).chr(0xa1),
540 '\'c' => chr(0xc4).chr(0x87),
541 '\'e' => chr(0xc3).chr(0xa9),
542 '\'i' => chr(0xc3).chr(0xad),
543 '\'l' => chr(0xc3).chr(0xba),
544 '\'n' => chr(0xc3).chr(0x84),
545 '\'o' => chr(0xc3).chr(0xb3),
546 '\'r' => chr(0xc5).chr(0x95),
547 '\'s' => chr(0xc5).chr(0x9b),
548 '\'u' => chr(0xc3).chr(0xba),
549 '\'y' => chr(0xc3).chr(0xbd),
550 '\'z' => chr(0xc5).chr(0xba),
551 # graves
552 '`a' => chr(0xc3).chr(0xa0),
553 '`A' => chr(0xc3).chr(0x80),
554 '`e' => chr(0xc3).chr(0xa8),
555 '`E' => chr(0xc3).chr(0x88),
556 '`i' => chr(0xc3).chr(0xac),
557 '`I' => chr(0xc3).chr(0x8c),
558 '`o' => chr(0xc3).chr(0xb2),
559 '`O' => chr(0xc3).chr(0x92),
560 '`u' => chr(0xc3).chr(0xb9),
561 '`U' => chr(0xc3).chr(0x99),
562 # circumflex
563 '^a' => chr(0xc3).chr(0xa2),
564 '^A' => chr(0xc3).chr(0x82),
565 '^c' => chr(0xc4).chr(0x89),
566 '^C' => chr(0xc4).chr(0x88),
567 '^e' => chr(0xc3).chr(0xaa),
568 '^E' => chr(0xc3).chr(0x8a),
569 '^g' => chr(0xc4).chr(0x9d),
570 '^G' => chr(0xc4).chr(0x9c),
571 '^h' => chr(0xc4).chr(0xa5),
572 '^H' => chr(0xc4).chr(0xa4),
573 '^i' => chr(0xc3).chr(0xae),
574 '^I' => chr(0xc3).chr(0x8e),
575 '^j' => chr(0xc4).chr(0xb5),
576 '^J' => chr(0xc4).chr(0xb4),
577 '^o' => chr(0xc3).chr(0xb4),
578 '^O' => chr(0xc3).chr(0x94),
579 '^s' => chr(0xc5).chr(0x9d),
580 '^S' => chr(0xc5).chr(0x9c),
581 '^u' => chr(0xc3).chr(0xa2),
582 '^U' => chr(0xc3).chr(0xbb),
583 '^w' => chr(0xc5).chr(0xb5),
584 '^W' => chr(0xc5).chr(0xb4),
585 '^y' => chr(0xc5).chr(0xb7),
586 '^Y' => chr(0xc5).chr(0xb6),
587
588 # diaeresis
589 '"a' => chr(0xc3).chr(0xa4),
590 '"A' => chr(0xc3).chr(0x84),
591 '"e' => chr(0xc3).chr(0xab),
592 '"E' => chr(0xc3).chr(0x8b),
593 '"\\\\i' => chr(0xc3).chr(0xaf),
594 '"\\\\I' => chr(0xc3).chr(0x8f),
595 '"o' => chr(0xc3).chr(0xb6),
596 '"O' => chr(0xc3).chr(0x96),
597 '"u' => chr(0xc3).chr(0xbc),
598 '"U' => chr(0xc3).chr(0x9c),
599 '"y' => chr(0xc3).chr(0xbf),
600 '"Y' => chr(0xc3).chr(0xb8),
601 # tilde
[3142]602 '~A' => chr(0xc3).chr(0x83),
603 '~N' => chr(0xc3).chr(0x91),
604 '~O' => chr(0xc3).chr(0x95),
605 '~a' => chr(0xc3).chr(0xa3),
606 '~n' => chr(0xc3).chr(0xb1),
607 '~o' => chr(0xc3).chr(0xb5),
[2901]608 # caron - handled specially
609 # double acute
610 # ring
611 # dot
[3249]612 '.c' => chr(0xc4).chr(0x8b),
613 '.C' => chr(0xc4).chr(0x8a),
614 '.e' => chr(0xc4).chr(0x97),
615 '.E' => chr(0xc4).chr(0x96),
616 '.g' => chr(0xc4).chr(0xa1),
617 '.G' => chr(0xc4).chr(0xa0),
618 '.I' => chr(0xc4).chr(0xb0),
619 '.z' => chr(0xc5).chr(0xbc),
620 '.Z' => chr(0xc5).chr(0xbb),
[2901]621 # macron
622 '=a' => chr(0xc4).chr(0x81),
623 '=A' => chr(0xc4).chr(0x80),
624 '=e' => chr(0xc4).chr(0x93),
625 '=E' => chr(0xc4).chr(0x92),
626 '=i' => chr(0xc4).chr(0xab),
627 '=I' => chr(0xc4).chr(0xaa),
628 '=o' => chr(0xc4).chr(0x8d),
629 '=O' => chr(0xc4).chr(0x8c),
630 '=u' => chr(0xc4).chr(0xab),
631 '=U' => chr(0xc4).chr(0xaa),
632
633 # stroke - handled specially - see below
634
635 # cedilla - handled specially
636 );
637
638# these are one letter latex commands - we make sure they're not a longer
639# command name. eg {\d} is d+stroke, so careful of \d
640 my %special_utf8_chars =
641 (
[3156]642 # breve
643 'u g' => chr(0xc4).chr(0x9f),
644 'u G' => chr(0xc4).chr(0x9e),
645 'u i' => chr(0xc4).chr(0xad),
646 'u I' => chr(0xc4).chr(0xac),
647 'u o' => chr(0xc5).chr(0x8f),
648 'u O' => chr(0xc5).chr(0x8e),
649 'u u' => chr(0xc5).chr(0xad),
650 'u U' => chr(0xc5).chr(0xac),
[2901]651 # caron
[3156]652 'v c' => chr(0xc4).chr(0x8d),
653 'v C' => chr(0xc4).chr(0x8c),
[2901]654 'v n' => chr(0xc5).chr(0x88),
655 'v N' => chr(0xc5).chr(0x87),
656 'v s' => chr(0xc5).chr(0xa1),
657 'v S' => chr(0xc5).chr(0xa5),
[3156]658 'v z' => chr(0xc5).chr(0xbe),
659 'v Z' => chr(0xc5).chr(0xbd),
[2901]660 # cedilla
661 'c c' => chr(0xc3).chr(0xa7),
662 'c C' => chr(0xc3).chr(0x87),
663 'c g' => chr(0xc4).chr(0xa3),
664 'c G' => chr(0xc4).chr(0xa2),
665 'c k' => chr(0xc4).chr(0xb7),
666 'c K' => chr(0xc4).chr(0xb6),
667 'c l' => chr(0xc4).chr(0xbc),
668 'c L' => chr(0xc4).chr(0xbb),
669 'c n' => chr(0xc5).chr(0x86),
670 'c N' => chr(0xc5).chr(0x85),
671 'c r' => chr(0xc5).chr(0x97),
672 'c R' => chr(0xc5).chr(0x96),
673 'c s' => chr(0xc5).chr(0x9f),
674 'c S' => chr(0xc5).chr(0x9e),
675 'c t' => chr(0xc5).chr(0xa3),
676 'c T' => chr(0xc5).chr(0xa2),
677 # double acute / Hungarian accent
678 'H O' => chr(0xc5).chr(0x90),
679 'H o' => chr(0xc5).chr(0x91),
680 'H U' => chr(0xc5).chr(0xb0),
681 'H u' => chr(0xc5).chr(0xb1),
682
683 # stroke
684 'd' => chr(0xc4).chr(0x91),
685 'D' => chr(0xc4).chr(0x90),
686 'h' => chr(0xc4).chr(0xa7),
687# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]688 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]689 'l' => chr(0xc5).chr(0x82),
690 'L' => chr(0xc5).chr(0x81),
691 'o' => chr(0xc3).chr(0xb8),
692 'O' => chr(0xc3).chr(0x98),
693 't' => chr(0xc5).chr(0xa7),
694 'T' => chr(0xc5).chr(0xa6),
695 # german ss/szlig/sharp s
696 'ss' => chr(0xc3).chr(0x9f),
697 );
698
699 # convert latex-style accented characters.
[3249]700
[2901]701 # remove space (if any) between \ and letter to accent (eg {\' a})
702
[13413]703 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
704
[2901]705 # remove {} around a single character (eg \'{e})
[32129]706 $text =~ s@(\\[`'="^~\.])\{(\w)}@{$1$2}@g; #`
[2901]707
[3249]708 # \, is another way of doing cedilla \c
709 $text =~ s@\\,(.)@\\c $1@g;
710
[2901]711 # remove {} around a single character for special 1 letter commands -
712 # need to insert a space. Eg \v{s} -> {\v s}
[32129]713 $text =~ s@(\\[uvcH])\{(\w)}@{$1 $2}@g;
[3249]714
[2901]715 # only do if the text contains a '\' character.
716 if ($text =~ m|\\|) {
[3249]717 # "normal" accents - ie non-alpha latex tag
[13413]718 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]719 my $tex="$1$2"; my $char="$2";
720 my $replacement=$utf8_chars{$tex};
721 if (!defined($replacement)) {
[15872]722 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]723 $replacement=$char;
724 }
[5454]725 $text =~ s/\\\Q$tex/$replacement/g;
[3249]726 }
[2901]727
[3249]728 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
729 # only do the change if immediately followed by a space, }, {, or \
730 # one letter accents ( + ss)
731 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
732 my $tex=$1;
733 my $replacement=$special_utf8_chars{$tex};
734 if (!defined($replacement)) {
[15872]735 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]736 $replacement=$tex;
737 }
738 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
739
740 }
741
742 # one letter latex accent commands that affect following letter
743 while ($text =~ m@\\([uvcH]) ([\w])@) {
744 my $tex="$1 $2"; my $char="$2";
745 my $replacement=$special_utf8_chars{$tex};
746 if (!defined($replacement)) {
[15872]747 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]748 $replacement=$char;
749 }
750 $text =~ s/\\$tex/$replacement/g;
751 }
[2901]752 }
[3249]753
754 # escape html-sensitive characters
755 $text =~ s@&@&amp;@g;
756 $text =~ s@<@&lt;@g;
757 $text =~ s@>@&gt;@g;
758 $text =~ s/''/"/g; # Latex-specific
759 $text =~ s/``/"/g; # Latex-specific
760 # greenstone-specific
761 $text =~ s@\[@&\#91;@g;
762 $text =~ s@\]@&\#93;@g;
763
[3112]764 # remove latex commands
[3249]765
766 # explicitly recognised commands
767 $text =~ s@\\ldots@&hellip;@g;
768
769 # maths mode
770 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
771
772 # remove all other commands with optional arguments...
[3156]773 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
774 # $text =~ s@\\noopsort{[^}]+\}@@g;
775 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]776
[2901]777 # remove latex groupings { } (but not \{ or \} )
[3249]778 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]779 $text =~ s/^\{//; # remove { if first char
780
781 # latex characters
782 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]783 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]784 # optional break "\-"
[3249]785 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]786 # the non-macro bits have quotes around them - we just remove quotes
[3249]787 $text =~ s/([^&])[\"\#]/$1/g;
[3142]788 }
[3249]789 # dashes. Convert (m|n)-dash into single dash for html.
790 $text =~ s@\-\-+@\-@g;
791
[2901]792 # quoted { } chars
[31780]793 $text =~ s@\\\{@{@g;
[2901]794 $text =~ s@\\}@}@g;
795
[3249]796 # finally to protect against macro language...
797 $text =~ s@\\@\\\\@g;
798
[2901]799 return $text;
800}
801
802
[3249]803sub process_latex_math {
804 my $text = shift;
805
806 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
[31780]807 $text =~ s@\^\{(.*?)}@<sup>$1</sup>@g; # superscript
[3249]808 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
[31780]809 $text =~ s@\_\{(.*?)}@<sub>$1</sub>@g; # subscript
[3249]810 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
811
812 # put all other command names in italics
813 $text =~ s@\\([\w]+)@<i>$1</i>@g;
814
815 return $text;
816}
817
[17026]818sub add_OID {
[2484]819 my $self = shift (@_);
820 my ($doc_obj, $id, $segment_number) = @_;
821
822 if ( $self->{'key'} eq "default") {
[17026]823 $self->SUPER::add_OID(@_);
824# $doc_obj->set_OID("$id\_$segment_number");
[2484]825 } else {
826 $doc_obj->set_OID($self->{'key'});
827 }
828}
[1676]829
[2484]8301;
[4792]831
832
Note: See TracBrowser for help on using the repository browser.