source: main/trunk/greenstone2/perllib/plugins/BibTexPlugin.pm@ 29745

Last change on this file since 29745 was 24548, checked in by ak19, 13 years ago

Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead? that defines can_process_this_file_for_metadata that MetadataPlugin? subclasses can inherit (if MetadataRead? is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin?. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead?. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
RevLine 
[1676]1###########################################################################
2#
[15872]3# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
[15872]29# BibTexPlugin reads bibliography files in BibTex format.
[1676]30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
[15872]34# BibTexPlugin creates a document object for every reference a the file.
35# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[15872]44package BibTexPlugin;
[1676]45
[15872]46use SplitTextFile;
[24548]47use MetadataRead;
[10254]48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]50
[15872]51# BibTexPlugin is a sub-class of SplitTextFile.
[1676]52sub BEGIN {
[24548]53 @BibTexPlugin::ISA = ('MetadataRead', 'SplitTextFile');
[1676]54}
55
[3540]56my $arguments =
[4744]57 [ { 'name' => "process_exp",
[15872]58 'desc' => "{BasePlugin.process_exp}",
[6408]59 'type' => "regexp",
[4873]60 'reqd' => "no",
[6408]61 'deft' => &get_default_process_exp() },
62 { 'name' => "split_exp",
[15872]63 'desc' => "{SplitTextFile.split_exp}",
[6408]64 'type' => "regexp",
65 'deft' => &get_default_split_exp(),
66 'reqd' => "no" }
67 ];
[3540]68
[15872]69my $options = { 'name' => "BibTexPlugin",
70 'desc' => "{BibTexPlugin.desc}",
[6408]71 'abstract' => "no",
72 'inherits' => "yes",
[9120]73 'explodes' => "yes",
[4744]74 'args' => $arguments };
[3540]75
[1676]76# This plugin processes files with the suffix ".bib"
77sub get_default_process_exp {
78 return q^(?i)\.bib$^;
79}
80
81# This plugin splits the input text at blank lines
82sub get_default_split_exp {
83 return q^\n+(?=@)^;
84}
[15872]85
[3540]86sub new {
[10218]87 my ($class) = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
89 push(@$pluginlist, $class);
90
[15872]91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]93
[15872]94 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]95
[3540]96 return bless $self, $class;
97}
[1676]98
[2901]99
100
[1676]101# The process function reads a single bibliographic record and stores
102# it as a new document.
103
104sub process {
105 my $self = shift (@_);
[6332]106 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]107 my $outhandle = $self->{'outhandle'};
108
[10254]109 my $cursection = $doc_obj->get_top_section();
[2484]110 $self->{'key'} = "default";
111
[1676]112 # Check that we're dealing with a valid BibTex record
113 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
114
115 # Ignore things we can't use
116 return 0 if ($$textref =~ /^\@String/);
117
[1711]118 # This hash translates BibTex field names into metadata names. The
[1676]119 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]120 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]121
[9582]122 # The author metadata will be stored as one "Creator" entry, but will
123 # also be split up into several individual "Author" fields.
124
[1676]125 my %field = (
126 'address', 'PublisherAddress',
127 'author', 'Creator',
[2901]128
[1676]129 'booktitle', 'Booktitle',
130 'chapter', 'Chapter',
131 'edition', 'Edition',
[2901]132 'editor', 'Editor',
[1676]133 'institution', 'Publisher',
134 'journal', 'Journal',
135 'month', 'Month',
136 'number', 'Number',
137 'pages', 'Pages',
138 'publisher', 'Publisher',
139 'school', 'Publisher',
140 'title', 'Title',
141 'volume', 'Volume',
[3142]142 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]143
144 'keywords', 'Keywords',
145 'abstract', 'Abstract',
[4792]146 'copyright', 'Copyright',
147 'note', 'Note',
[11527]148 'url', 'URL',
[8121]149 );
[1676]150
151 # Metadata fields
152 my %metadata;
[4792]153 my ($EntryType, $Creator, $Keywords, $text);
[1676]154
[2901]155 my $verbosity = $self->{'verbosity'};
156 $verbosity = 0 unless $verbosity;
157
[3142]158 # Make sure the text has exactly one entry per line -
159 # append line to previous if it doesn't start with " <key> = "
[2901]160
[4792]161 my @input_lines=split(/\r?\n/, $$textref);
[3142]162 my @all_lines;
163 my $entry_line=shift @input_lines;
164 foreach my $input_line (@input_lines) {
165 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
166 # this is a new key
167 push(@all_lines, $entry_line);
168 $entry_line=$input_line;
169 } else {
170 # this is a continuation of previous line
[3249]171 $entry_line .= " " . $input_line;
[3142]172 }
173
174 }
175 # add final line, removing trailing '}'
176 $entry_line =~ s/\}\s*$//;
177 push(@all_lines, $entry_line);
178 push(@all_lines, "}");
[2901]179
[1676]180 # Read and process each line in the bib file.
[2901]181 my ($entryname, $name, $value, $line);
182 foreach $line (@all_lines) {
183
[1676]184 # Add each line. Most lines consist of a field identifer and
185 # then data, and we simply store them, though we treat some
186 # of the fields a bit differently.
187
188 $line =~ s/\s+/ /g;
189 $text .= "$line\n";
190
[4792]191
[2901]192 print "Processing line = $line \n" if $verbosity>=4;
193
[1676]194 # The first line is special, it contains the reference type and OID
[4792]195 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]196 $EntryType = $1;
[4792]197 my $EntryID = (defined $2) ? $2 : "default";
[2484]198 print "** $EntryType - \"$EntryID\" \n"
199 if ($verbosity >= 4);
[4792]200
[1676]201 next;
202 }
[2484]203 if ($line =~ /\@/) {
[2901]204 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]205 if ($verbosity >= 2);
[2901]206 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]207 if ($verbosity >= 2);
208 }
[1676]209
210 # otherwise, parse the metadata out of this line
[9582]211 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]212 $entryname = lc($1);
[1676]213 $value = $2;
[3112]214 $value =~ s/,?\s*$//; # remove trailing comma and space
215 if ($value =~ /^"/ && $value =~ /"$/) {
216 # remove surrounding " marks
217 $value =~ s/^"//; $value =~ s/"$//;
[13416]218 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
219 # remove surrounding {} marks
220 $value =~ s/^\{//; $value =~ s/\}$//;
[3112]221 }
[13413]222 # special case for year - we only want a 4 digit year, not
223 # "circa 2004" etc
224 if ($entryname=~ /^year$/i) {
225 if ($value=~ /(\d{4})/) {
226 $value=$1;
227 $metadata{$entryname} .= "$value";
228 }
229 }
230 else {
231
232 $value = &process_latex($value);
233 # Add this line of metadata
234 $metadata{$entryname} .= "$value";
235 }
[1676]236 }
237
[1677]238 # Add the Entry type as metadata
[2901]239 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]240
241 #Add the fileformat as metadata
242 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]243
[8121]244
[1676]245 # Add the various field as metadata
[2901]246 foreach my $entryname (keys %metadata) {
247 next unless (defined $field{$entryname});
248 next unless (defined $metadata{$entryname});
[1676]249
[2901]250 $name = $field{$entryname};
251 $value = $metadata{$entryname};
[1676]252
[3142]253 if ($name =~ /^Month/) {
254 $value=expand_month($value);
255 }
[1676]256
257 # Several special operatons on metadata follow
258
259 # Add individual keywords.
260 # The full set of keywords will be added, in due course, as "Keywords".
261 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]262 if ($entryname eq "keywords") {
[1676]263 my @keywordlist = split(/,/, $value);
264 foreach my $k (@keywordlist) {
[2901]265 $k = lc($k);
[1676]266 $k =~ s/\s*$//;
267 $k =~ s/^\s*//;
268 if ($k =~ /\w/) {
[2901]269 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]270 }
271 }
272 }
273
274 # Add individual authors
275 # The author metadata will be stored as one "Creator" entry, but we
276 # also want to split it into several individual "Author" fields in
[3112]277 # "Lastname, Firstnames" format so we can browse it.
[2901]278 if ($entryname eq "author") { #added also comparison with editor
279
[3249]280 # take care of "et al."...
[9582]281 my $etal='';
282 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
283 $etal=' <em>et. al.</em>';
284 }
[2901]285 # und here for german language...
286 # don't use brackets in pattern, else the matched bit becomes
287 # an element in the list!
[3249]288 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
289 my @formattedlist = ();
[1676]290 foreach $a (@authorlist) {
291 $a =~ s/\s*$//;
292 $a =~ s/^\s*//;
293 # Reformat and add author name
[2901]294 next if $a=~ /^\s*$/;
[3249]295
296 # names are "First von Last", "von Last, First"
297 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
298 my $first="";
299 my $vonlast="";
300 my $jr="";
[2901]301
[3249]302 if ($a =~ /,/) {
303 my @parts=split(/,\s*/, $a);
304 $first = pop @parts;
305 if (scalar(@parts) == 2) {
306 $jr = pop @parts;
307 }
308 $vonlast=shift @parts;
309 if (scalar(@parts) > 0) {
[15872]310 print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]311 # but we continue anyway...
312 }
313 } else { # First von Last
314 my @words = split(/ /, $a);
315 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
316 $first .= " " . shift (@words);
317 }
318 $first =~ s/^\s//;
319 $vonlast = join (' ', @words); # whatever's left...
320 }
321 my $von="";
322 my $last="";
323 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
324 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
325 $von = $1;
326 if (!defined ($von)) {
327 # some non-English names do start with lowercase
328 # eg "Marie desJardins". Also we can get typos...
[15872]329 print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]330 $von="";
331 if ($vonlast =~ /^[a-z]+$/) {
332 # if it's all lowercase, uppercase 1st.
333 $vonlast =~ s/^(.)/\u$1/;
[2901]334
[3249]335 }
336 }
337 $von =~ s/\s*$//;
338 $last=$vonlast;
339 } else {
340 $last=$vonlast;
341 }
342 my $wholename="$first $von $last $jr";
[9582]343 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
344 $wholename =~ s/ $//;
[3249]345 push (@formattedlist, $wholename);
346 my $fullname = "$last";
347 $fullname .= " $jr" if ($jr);
348 $fullname .= ", $first";
349 $fullname .= " $von" if ($von);
350
[1676]351 # Add each name to set of Authors
[3249]352 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]353 }
[3249]354
355 # Only want at most one "and" in the Creator field
356 if (scalar(@formattedlist) > 2) {
357 my $lastauthor=pop @formattedlist;
358 $value=join(', ', @formattedlist);
359 $value.=" and $lastauthor";
360 } else { # 1 or 2 authors...
361 $value=join(" and ",@formattedlist);
362 }
363 $value.=$etal; # if there was "et al."
[1676]364 }
365
366 # Books and Journals are additionally marked for display purposes
[2901]367 if ($entryname eq "booktitle") {
368 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
369 } elsif ($entryname eq "journal") {
370 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]371 }
372
[3249]373 # Add the various fields as metadata
374 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]375
[1676]376 }
377
[9582]378 # for books and journals...
379 if (!exists $metadata{'title'}) {
380 my $name=$field{'title'}; # get Greenstone metadata name
381 my $value;
382 if (exists $metadata{'booktitle'}) {
383 $value=$metadata{'booktitle'};
384 } elsif (exists $metadata{'journal'}) {
385 $value=$metadata{'journal'};
386 }
387 if ($value) {
388 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
389 }
390 }
391
[3249]392 # Add Date (yyyymmdd) metadata
393 if (defined ($metadata{'year'}) ) {
394 my $date=$metadata{'year'};
395 chomp $date;
396 my $month=$metadata{'month'};
397 if (defined($month)) {
398 # month is currently 3 letter code or a range...
399 $month = expand_month($month);
400 # take the first month found... might not find one!
401 $month =~ m/_textmonth(\d\d)_/;
402 $month = $1;
403 }
404 if (!defined($month)) {
405 $month="00";
406 }
407 $date .= "${month}00";
408 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
409}
410
411# # Add the text in BibTex format (all fields)
[1676]412 if ($text =~ /\w/) {
[3249]413
414 $text =~ s@&@&amp;@g;
415 $text =~ s@<@&lt;@g;
416 $text =~ s@>@&gt;@g;
417 $text =~ s@\n@<br/>\n@g;
418 $text =~ s@\\@\\\\@g;
419
[2901]420 $doc_obj->add_utf8_text ($cursection, $text);
[1676]421 }
422
[1677]423 return 1;
[1676]424}
425
[1677]426
[2901]427
428
429# convert email addresses and URLs into links
430sub convert_urls_into_links{
431 my ($text) = @_;
432
433 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]434 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]435
436 return $text;
437}
438
439# Clean up whitespace and convert \n charaters to <BR> or <P>
440sub clean_up_whitespaces{
441 my ($text) = @_;
442
443 $text =~ s/%%%%%/<BR> <BR>/g;
444 $text =~ s/ +/ /g;
445 $text =~ s/\s*$//;
446 $text =~ s/^\s*//;
447 $text =~ s/\n/\n<BR>/g;
448 $text =~ s/<BR>\s*<BR>/<P>/g;
449
450 return $text;
451}
452
453
454sub convert_problem_characters_without_ampersand{
455 my ($text) = @_;
456 $text =~ s/</&lt;/g;
457 $text =~ s/>/&gt;/g;
458
459 $text =~ s/\'\'/\"/g; #Latex -specific conversion
460 $text =~ s/\`\`/\"/g; #Latex -specific conversion
461
462 $text =~ s/\"/&quot;/g;
463 $text =~ s/\'/&#8217;/g;
464 $text =~ s/\`/&#8216;/g;
465
[3249]466# $text =~ s/\+/ /g;
467# $text =~ s/\(/ /g;
468# $text =~ s/\)/ /g;
469
[2901]470 $text =~ s/\\/\\\\/g;
471
[3249]472# $text =~ s/\./\\\./g;
[2901]473
474 return $text;
475}
476
[1676]477# Convert a text string into HTML.
478
[1677]479# The HTML is going to be inserted into a GML file, so we have to be
[2901]480# careful not to use symbols like ">", which occurs frequently in email
[1677]481# messages (and use &gt instead.
482
483# This function also turns URLs and email addresses into links, and
484# replaces carriage returns with <BR> tags (and multiple carriage returns
485# with <P> tags).
486
[1676]487sub text_into_html {
488 my ($text) = @_;
489
[2901]490 # Convert problem characters into HTML symbols
[1676]491 $text =~ s/&/&amp;/g;
492
[2901]493 $text = &convert_problem_characters_without_ampersand( $text );
494
[1676]495 # convert email addresses and URLs into links
[2901]496 $text = &convert_urls_into_links( $text );
[1676]497
[2901]498 $text = &clean_up_whitespaces( $text );
[1676]499
500 return $text;
501}
502
[2901]503
[3142]504sub expand_month {
505 my $text=shift;
[2901]506
[3142]507 # bibtex style files expand abbreviations for months.
508 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
509 $text =~ s/jan/_textmonth01_/g;
510 $text =~ s/feb/_textmonth02_/g;
511 $text =~ s/mar/_textmonth03_/g;
512 $text =~ s/apr/_textmonth04_/g;
513 $text =~ s/may/_textmonth05_/g;
514 $text =~ s/jun/_textmonth06_/g;
515 $text =~ s/jul/_textmonth07_/g;
516 $text =~ s/aug/_textmonth08_/g;
517 $text =~ s/sep/_textmonth09_/g;
518 $text =~ s/oct/_textmonth10_/g;
519 $text =~ s/nov/_textmonth11_/g;
520 $text =~ s/dec/_textmonth12_/g;
[2901]521
[3142]522 return $text;
523}
524
525
[2901]526# Convert accented characters, remove { }, interprete some commands....
527# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
528sub process_latex {
529 my ($text) = @_;
530
531 # note - this is really ugly, but it works. There may be a prettier way
532 # of mapping latex accented chars to utf8, but we just brute force it here.
533 # Also, this isn't complete - not every single possible accented letter
534 # is in here yet, but most of the common ones are.
535
536 my %utf8_chars =
537 (
538 # acutes
539 '\'a' => chr(0xc3).chr(0xa1),
540 '\'c' => chr(0xc4).chr(0x87),
541 '\'e' => chr(0xc3).chr(0xa9),
542 '\'i' => chr(0xc3).chr(0xad),
543 '\'l' => chr(0xc3).chr(0xba),
544 '\'n' => chr(0xc3).chr(0x84),
545 '\'o' => chr(0xc3).chr(0xb3),
546 '\'r' => chr(0xc5).chr(0x95),
547 '\'s' => chr(0xc5).chr(0x9b),
548 '\'u' => chr(0xc3).chr(0xba),
549 '\'y' => chr(0xc3).chr(0xbd),
550 '\'z' => chr(0xc5).chr(0xba),
551 # graves
552 '`a' => chr(0xc3).chr(0xa0),
553 '`A' => chr(0xc3).chr(0x80),
554 '`e' => chr(0xc3).chr(0xa8),
555 '`E' => chr(0xc3).chr(0x88),
556 '`i' => chr(0xc3).chr(0xac),
557 '`I' => chr(0xc3).chr(0x8c),
558 '`o' => chr(0xc3).chr(0xb2),
559 '`O' => chr(0xc3).chr(0x92),
560 '`u' => chr(0xc3).chr(0xb9),
561 '`U' => chr(0xc3).chr(0x99),
562 # circumflex
563 '^a' => chr(0xc3).chr(0xa2),
564 '^A' => chr(0xc3).chr(0x82),
565 '^c' => chr(0xc4).chr(0x89),
566 '^C' => chr(0xc4).chr(0x88),
567 '^e' => chr(0xc3).chr(0xaa),
568 '^E' => chr(0xc3).chr(0x8a),
569 '^g' => chr(0xc4).chr(0x9d),
570 '^G' => chr(0xc4).chr(0x9c),
571 '^h' => chr(0xc4).chr(0xa5),
572 '^H' => chr(0xc4).chr(0xa4),
573 '^i' => chr(0xc3).chr(0xae),
574 '^I' => chr(0xc3).chr(0x8e),
575 '^j' => chr(0xc4).chr(0xb5),
576 '^J' => chr(0xc4).chr(0xb4),
577 '^o' => chr(0xc3).chr(0xb4),
578 '^O' => chr(0xc3).chr(0x94),
579 '^s' => chr(0xc5).chr(0x9d),
580 '^S' => chr(0xc5).chr(0x9c),
581 '^u' => chr(0xc3).chr(0xa2),
582 '^U' => chr(0xc3).chr(0xbb),
583 '^w' => chr(0xc5).chr(0xb5),
584 '^W' => chr(0xc5).chr(0xb4),
585 '^y' => chr(0xc5).chr(0xb7),
586 '^Y' => chr(0xc5).chr(0xb6),
587
588 # diaeresis
589 '"a' => chr(0xc3).chr(0xa4),
590 '"A' => chr(0xc3).chr(0x84),
591 '"e' => chr(0xc3).chr(0xab),
592 '"E' => chr(0xc3).chr(0x8b),
593 '"\\\\i' => chr(0xc3).chr(0xaf),
594 '"\\\\I' => chr(0xc3).chr(0x8f),
595 '"o' => chr(0xc3).chr(0xb6),
596 '"O' => chr(0xc3).chr(0x96),
597 '"u' => chr(0xc3).chr(0xbc),
598 '"U' => chr(0xc3).chr(0x9c),
599 '"y' => chr(0xc3).chr(0xbf),
600 '"Y' => chr(0xc3).chr(0xb8),
601 # tilde
[3142]602 '~A' => chr(0xc3).chr(0x83),
603 '~N' => chr(0xc3).chr(0x91),
604 '~O' => chr(0xc3).chr(0x95),
605 '~a' => chr(0xc3).chr(0xa3),
606 '~n' => chr(0xc3).chr(0xb1),
607 '~o' => chr(0xc3).chr(0xb5),
[2901]608 # caron - handled specially
609 # double acute
610 # ring
611 # dot
[3249]612 '.c' => chr(0xc4).chr(0x8b),
613 '.C' => chr(0xc4).chr(0x8a),
614 '.e' => chr(0xc4).chr(0x97),
615 '.E' => chr(0xc4).chr(0x96),
616 '.g' => chr(0xc4).chr(0xa1),
617 '.G' => chr(0xc4).chr(0xa0),
618 '.I' => chr(0xc4).chr(0xb0),
619 '.z' => chr(0xc5).chr(0xbc),
620 '.Z' => chr(0xc5).chr(0xbb),
[2901]621 # macron
622 '=a' => chr(0xc4).chr(0x81),
623 '=A' => chr(0xc4).chr(0x80),
624 '=e' => chr(0xc4).chr(0x93),
625 '=E' => chr(0xc4).chr(0x92),
626 '=i' => chr(0xc4).chr(0xab),
627 '=I' => chr(0xc4).chr(0xaa),
628 '=o' => chr(0xc4).chr(0x8d),
629 '=O' => chr(0xc4).chr(0x8c),
630 '=u' => chr(0xc4).chr(0xab),
631 '=U' => chr(0xc4).chr(0xaa),
632
633 # stroke - handled specially - see below
634
635 # cedilla - handled specially
636 );
637
638# these are one letter latex commands - we make sure they're not a longer
639# command name. eg {\d} is d+stroke, so careful of \d
640 my %special_utf8_chars =
641 (
[3156]642 # breve
643 'u g' => chr(0xc4).chr(0x9f),
644 'u G' => chr(0xc4).chr(0x9e),
645 'u i' => chr(0xc4).chr(0xad),
646 'u I' => chr(0xc4).chr(0xac),
647 'u o' => chr(0xc5).chr(0x8f),
648 'u O' => chr(0xc5).chr(0x8e),
649 'u u' => chr(0xc5).chr(0xad),
650 'u U' => chr(0xc5).chr(0xac),
[2901]651 # caron
[3156]652 'v c' => chr(0xc4).chr(0x8d),
653 'v C' => chr(0xc4).chr(0x8c),
[2901]654 'v n' => chr(0xc5).chr(0x88),
655 'v N' => chr(0xc5).chr(0x87),
656 'v s' => chr(0xc5).chr(0xa1),
657 'v S' => chr(0xc5).chr(0xa5),
[3156]658 'v z' => chr(0xc5).chr(0xbe),
659 'v Z' => chr(0xc5).chr(0xbd),
[2901]660 # cedilla
661 'c c' => chr(0xc3).chr(0xa7),
662 'c C' => chr(0xc3).chr(0x87),
663 'c g' => chr(0xc4).chr(0xa3),
664 'c G' => chr(0xc4).chr(0xa2),
665 'c k' => chr(0xc4).chr(0xb7),
666 'c K' => chr(0xc4).chr(0xb6),
667 'c l' => chr(0xc4).chr(0xbc),
668 'c L' => chr(0xc4).chr(0xbb),
669 'c n' => chr(0xc5).chr(0x86),
670 'c N' => chr(0xc5).chr(0x85),
671 'c r' => chr(0xc5).chr(0x97),
672 'c R' => chr(0xc5).chr(0x96),
673 'c s' => chr(0xc5).chr(0x9f),
674 'c S' => chr(0xc5).chr(0x9e),
675 'c t' => chr(0xc5).chr(0xa3),
676 'c T' => chr(0xc5).chr(0xa2),
677 # double acute / Hungarian accent
678 'H O' => chr(0xc5).chr(0x90),
679 'H o' => chr(0xc5).chr(0x91),
680 'H U' => chr(0xc5).chr(0xb0),
681 'H u' => chr(0xc5).chr(0xb1),
682
683 # stroke
684 'd' => chr(0xc4).chr(0x91),
685 'D' => chr(0xc4).chr(0x90),
686 'h' => chr(0xc4).chr(0xa7),
687# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]688 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]689 'l' => chr(0xc5).chr(0x82),
690 'L' => chr(0xc5).chr(0x81),
691 'o' => chr(0xc3).chr(0xb8),
692 'O' => chr(0xc3).chr(0x98),
693 't' => chr(0xc5).chr(0xa7),
694 'T' => chr(0xc5).chr(0xa6),
695 # german ss/szlig/sharp s
696 'ss' => chr(0xc3).chr(0x9f),
697 );
698
699 # convert latex-style accented characters.
[3249]700
[2901]701 # remove space (if any) between \ and letter to accent (eg {\' a})
702
[13413]703 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
704
[2901]705 # remove {} around a single character (eg \'{e})
[13413]706 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]707
[3249]708 # \, is another way of doing cedilla \c
709 $text =~ s@\\,(.)@\\c $1@g;
710
[2901]711 # remove {} around a single character for special 1 letter commands -
712 # need to insert a space. Eg \v{s} -> {\v s}
[3156]713 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]714
[2901]715 # only do if the text contains a '\' character.
716 if ($text =~ m|\\|) {
[3249]717 # "normal" accents - ie non-alpha latex tag
[13413]718 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]719 my $tex="$1$2"; my $char="$2";
720 my $replacement=$utf8_chars{$tex};
721 if (!defined($replacement)) {
[15872]722 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]723 $replacement=$char;
724 }
[5454]725 $text =~ s/\\\Q$tex/$replacement/g;
[3249]726 }
[2901]727
[3249]728 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
729 # only do the change if immediately followed by a space, }, {, or \
730 # one letter accents ( + ss)
731 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
732 my $tex=$1;
733 my $replacement=$special_utf8_chars{$tex};
734 if (!defined($replacement)) {
[15872]735 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]736 $replacement=$tex;
737 }
738 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
739
740 }
741
742 # one letter latex accent commands that affect following letter
743 while ($text =~ m@\\([uvcH]) ([\w])@) {
744 my $tex="$1 $2"; my $char="$2";
745 my $replacement=$special_utf8_chars{$tex};
746 if (!defined($replacement)) {
[15872]747 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]748 $replacement=$char;
749 }
750 $text =~ s/\\$tex/$replacement/g;
751 }
[2901]752 }
[3249]753
754 # escape html-sensitive characters
755 $text =~ s@&@&amp;@g;
756 $text =~ s@<@&lt;@g;
757 $text =~ s@>@&gt;@g;
758 $text =~ s/''/"/g; # Latex-specific
759 $text =~ s/``/"/g; # Latex-specific
760 # greenstone-specific
761 $text =~ s@\[@&\#91;@g;
762 $text =~ s@\]@&\#93;@g;
763
[3112]764 # remove latex commands
[3249]765
766 # explicitly recognised commands
767 $text =~ s@\\ldots@&hellip;@g;
768
769 # maths mode
770 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
771
772 # remove all other commands with optional arguments...
[3156]773 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
774 # $text =~ s@\\noopsort{[^}]+\}@@g;
775 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]776
[2901]777 # remove latex groupings { } (but not \{ or \} )
[3249]778 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]779 $text =~ s/^\{//; # remove { if first char
780
781 # latex characters
782 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]783 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]784 # optional break "\-"
[3249]785 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]786 # the non-macro bits have quotes around them - we just remove quotes
[3249]787 $text =~ s/([^&])[\"\#]/$1/g;
[3142]788 }
[3249]789 # dashes. Convert (m|n)-dash into single dash for html.
790 $text =~ s@\-\-+@\-@g;
791
[2901]792 # quoted { } chars
793 $text =~ s@\\{@{@g;
794 $text =~ s@\\}@}@g;
795
[3249]796 # finally to protect against macro language...
797 $text =~ s@\\@\\\\@g;
798
[2901]799 return $text;
800}
801
802
[3249]803sub process_latex_math {
804 my $text = shift;
805
806 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
807 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
808 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
809 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
810 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
811
812 # put all other command names in italics
813 $text =~ s@\\([\w]+)@<i>$1</i>@g;
814
815 return $text;
816}
817
[17026]818sub add_OID {
[2484]819 my $self = shift (@_);
820 my ($doc_obj, $id, $segment_number) = @_;
821
822 if ( $self->{'key'} eq "default") {
[17026]823 $self->SUPER::add_OID(@_);
824# $doc_obj->set_OID("$id\_$segment_number");
[2484]825 } else {
826 $doc_obj->set_OID($self->{'key'});
827 }
828}
[1676]829
[2484]8301;
[4792]831
832
Note: See TracBrowser for help on using the repository browser.