source: gsdl/trunk/perllib/plugins/BibTexPlugin.pm@ 20605

Last change on this file since 20605 was 17026, checked in by kjdon, 16 years ago

OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis...)

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
RevLine 
[1676]1###########################################################################
2#
[15872]3# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
[1676]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
[2901]10# Copyright 1999-2001 New Zealand Digital Library Project
[1676]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
[15872]29# BibTexPlugin reads bibliography files in BibTex format.
[1676]30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
[15872]34# BibTexPlugin creates a document object for every reference a the file.
35# It is a subclass of SplitTextFile, so if there are multiple records, all
[1676]36# are read.
[2901]37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
[1676]42
[3587]43
[15872]44package BibTexPlugin;
[1676]45
[15872]46use SplitTextFile;
[10254]47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
[1676]49
[15872]50# BibTexPlugin is a sub-class of SplitTextFile.
[1676]51sub BEGIN {
[15872]52 @BibTexPlugin::ISA = ('SplitTextFile');
[1676]53}
54
[3540]55my $arguments =
[4744]56 [ { 'name' => "process_exp",
[15872]57 'desc' => "{BasePlugin.process_exp}",
[6408]58 'type' => "regexp",
[4873]59 'reqd' => "no",
[6408]60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
[15872]62 'desc' => "{SplitTextFile.split_exp}",
[6408]63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
[3540]67
[15872]68my $options = { 'name' => "BibTexPlugin",
69 'desc' => "{BibTexPlugin.desc}",
[6408]70 'abstract' => "no",
71 'inherits' => "yes",
[9120]72 'explodes' => "yes",
[4744]73 'args' => $arguments };
[3540]74
[1676]75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
[15872]84
[3540]85sub new {
[10218]86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
[15872]90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]92
[15872]93 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]94
[3540]95 return bless $self, $class;
96}
[1676]97
[2901]98
99
[1676]100# The process function reads a single bibliographic record and stores
101# it as a new document.
102
103sub process {
104 my $self = shift (@_);
[6332]105 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1676]106 my $outhandle = $self->{'outhandle'};
107
[10254]108 my $cursection = $doc_obj->get_top_section();
[2484]109 $self->{'key'} = "default";
110
[1676]111 # Check that we're dealing with a valid BibTex record
112 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
113
114 # Ignore things we can't use
115 return 0 if ($$textref =~ /^\@String/);
116
[1711]117 # This hash translates BibTex field names into metadata names. The
[1676]118 # BibTex names are taken from the "Local Guide to Latex" Graeme
[2901]119 # McKinstry. Metadata names are consistent with ReferPlug.
[1676]120
[9582]121 # The author metadata will be stored as one "Creator" entry, but will
122 # also be split up into several individual "Author" fields.
123
[1676]124 my %field = (
125 'address', 'PublisherAddress',
126 'author', 'Creator',
[2901]127
[1676]128 'booktitle', 'Booktitle',
129 'chapter', 'Chapter',
130 'edition', 'Edition',
[2901]131 'editor', 'Editor',
[1676]132 'institution', 'Publisher',
133 'journal', 'Journal',
134 'month', 'Month',
135 'number', 'Number',
136 'pages', 'Pages',
137 'publisher', 'Publisher',
138 'school', 'Publisher',
139 'title', 'Title',
140 'volume', 'Volume',
[3142]141 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
[1676]142
143 'keywords', 'Keywords',
144 'abstract', 'Abstract',
[4792]145 'copyright', 'Copyright',
146 'note', 'Note',
[11527]147 'url', 'URL',
[8121]148 );
[1676]149
150 # Metadata fields
151 my %metadata;
[4792]152 my ($EntryType, $Creator, $Keywords, $text);
[1676]153
[2901]154 my $verbosity = $self->{'verbosity'};
155 $verbosity = 0 unless $verbosity;
156
[3142]157 # Make sure the text has exactly one entry per line -
158 # append line to previous if it doesn't start with " <key> = "
[2901]159
[4792]160 my @input_lines=split(/\r?\n/, $$textref);
[3142]161 my @all_lines;
162 my $entry_line=shift @input_lines;
163 foreach my $input_line (@input_lines) {
164 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
165 # this is a new key
166 push(@all_lines, $entry_line);
167 $entry_line=$input_line;
168 } else {
169 # this is a continuation of previous line
[3249]170 $entry_line .= " " . $input_line;
[3142]171 }
172
173 }
174 # add final line, removing trailing '}'
175 $entry_line =~ s/\}\s*$//;
176 push(@all_lines, $entry_line);
177 push(@all_lines, "}");
[2901]178
[1676]179 # Read and process each line in the bib file.
[2901]180 my ($entryname, $name, $value, $line);
181 foreach $line (@all_lines) {
182
[1676]183 # Add each line. Most lines consist of a field identifer and
184 # then data, and we simply store them, though we treat some
185 # of the fields a bit differently.
186
187 $line =~ s/\s+/ /g;
188 $text .= "$line\n";
189
[4792]190
[2901]191 print "Processing line = $line \n" if $verbosity>=4;
192
[1676]193 # The first line is special, it contains the reference type and OID
[4792]194 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
[1676]195 $EntryType = $1;
[4792]196 my $EntryID = (defined $2) ? $2 : "default";
[2484]197 print "** $EntryType - \"$EntryID\" \n"
198 if ($verbosity >= 4);
[4792]199
[1676]200 next;
201 }
[2484]202 if ($line =~ /\@/) {
[2901]203 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
[2484]204 if ($verbosity >= 2);
[2901]205 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
[2484]206 if ($verbosity >= 2);
207 }
[1676]208
209 # otherwise, parse the metadata out of this line
[9582]210 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
[2901]211 $entryname = lc($1);
[1676]212 $value = $2;
[3112]213 $value =~ s/,?\s*$//; # remove trailing comma and space
214 if ($value =~ /^"/ && $value =~ /"$/) {
215 # remove surrounding " marks
216 $value =~ s/^"//; $value =~ s/"$//;
[13416]217 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
218 # remove surrounding {} marks
219 $value =~ s/^\{//; $value =~ s/\}$//;
[3112]220 }
[13413]221 # special case for year - we only want a 4 digit year, not
222 # "circa 2004" etc
223 if ($entryname=~ /^year$/i) {
224 if ($value=~ /(\d{4})/) {
225 $value=$1;
226 $metadata{$entryname} .= "$value";
227 }
228 }
229 else {
230
231 $value = &process_latex($value);
232 # Add this line of metadata
233 $metadata{$entryname} .= "$value";
234 }
[1676]235 }
236
[1677]237 # Add the Entry type as metadata
[2901]238 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
[8121]239
240 #Add the fileformat as metadata
241 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
[1676]242
[8121]243
[1676]244 # Add the various field as metadata
[2901]245 foreach my $entryname (keys %metadata) {
246 next unless (defined $field{$entryname});
247 next unless (defined $metadata{$entryname});
[1676]248
[2901]249 $name = $field{$entryname};
250 $value = $metadata{$entryname};
[1676]251
[3142]252 if ($name =~ /^Month/) {
253 $value=expand_month($value);
254 }
[1676]255
256 # Several special operatons on metadata follow
257
258 # Add individual keywords.
259 # The full set of keywords will be added, in due course, as "Keywords".
260 # However, we also want to add them as individual "Keyword" metadata elements.
[2901]261 if ($entryname eq "keywords") {
[1676]262 my @keywordlist = split(/,/, $value);
263 foreach my $k (@keywordlist) {
[2901]264 $k = lc($k);
[1676]265 $k =~ s/\s*$//;
266 $k =~ s/^\s*//;
267 if ($k =~ /\w/) {
[2901]268 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
[1676]269 }
270 }
271 }
272
273 # Add individual authors
274 # The author metadata will be stored as one "Creator" entry, but we
275 # also want to split it into several individual "Author" fields in
[3112]276 # "Lastname, Firstnames" format so we can browse it.
[2901]277 if ($entryname eq "author") { #added also comparison with editor
278
[3249]279 # take care of "et al."...
[9582]280 my $etal='';
281 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
282 $etal=' <em>et. al.</em>';
283 }
[2901]284 # und here for german language...
285 # don't use brackets in pattern, else the matched bit becomes
286 # an element in the list!
[3249]287 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
288 my @formattedlist = ();
[1676]289 foreach $a (@authorlist) {
290 $a =~ s/\s*$//;
291 $a =~ s/^\s*//;
292 # Reformat and add author name
[2901]293 next if $a=~ /^\s*$/;
[3249]294
295 # names are "First von Last", "von Last, First"
296 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
297 my $first="";
298 my $vonlast="";
299 my $jr="";
[2901]300
[3249]301 if ($a =~ /,/) {
302 my @parts=split(/,\s*/, $a);
303 $first = pop @parts;
304 if (scalar(@parts) == 2) {
305 $jr = pop @parts;
306 }
307 $vonlast=shift @parts;
308 if (scalar(@parts) > 0) {
[15872]309 print $outhandle "BibTexPlugin: couldn't parse name $a\n";
[3249]310 # but we continue anyway...
311 }
312 } else { # First von Last
313 my @words = split(/ /, $a);
314 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
315 $first .= " " . shift (@words);
316 }
317 $first =~ s/^\s//;
318 $vonlast = join (' ', @words); # whatever's left...
319 }
320 my $von="";
321 my $last="";
322 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
323 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
324 $von = $1;
325 if (!defined ($von)) {
326 # some non-English names do start with lowercase
327 # eg "Marie desJardins". Also we can get typos...
[15872]328 print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
[3249]329 $von="";
330 if ($vonlast =~ /^[a-z]+$/) {
331 # if it's all lowercase, uppercase 1st.
332 $vonlast =~ s/^(.)/\u$1/;
[2901]333
[3249]334 }
335 }
336 $von =~ s/\s*$//;
337 $last=$vonlast;
338 } else {
339 $last=$vonlast;
340 }
341 my $wholename="$first $von $last $jr";
[9582]342 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
343 $wholename =~ s/ $//;
[3249]344 push (@formattedlist, $wholename);
345 my $fullname = "$last";
346 $fullname .= " $jr" if ($jr);
347 $fullname .= ", $first";
348 $fullname .= " $von" if ($von);
349
[1676]350 # Add each name to set of Authors
[3249]351 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
[1676]352 }
[3249]353
354 # Only want at most one "and" in the Creator field
355 if (scalar(@formattedlist) > 2) {
356 my $lastauthor=pop @formattedlist;
357 $value=join(', ', @formattedlist);
358 $value.=" and $lastauthor";
359 } else { # 1 or 2 authors...
360 $value=join(" and ",@formattedlist);
361 }
362 $value.=$etal; # if there was "et al."
[1676]363 }
364
365 # Books and Journals are additionally marked for display purposes
[2901]366 if ($entryname eq "booktitle") {
367 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
368 } elsif ($entryname eq "journal") {
369 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
[1676]370 }
371
[3249]372 # Add the various fields as metadata
373 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[13413]374
[1676]375 }
376
[9582]377 # for books and journals...
378 if (!exists $metadata{'title'}) {
379 my $name=$field{'title'}; # get Greenstone metadata name
380 my $value;
381 if (exists $metadata{'booktitle'}) {
382 $value=$metadata{'booktitle'};
383 } elsif (exists $metadata{'journal'}) {
384 $value=$metadata{'journal'};
385 }
386 if ($value) {
387 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
388 }
389 }
390
[3249]391 # Add Date (yyyymmdd) metadata
392 if (defined ($metadata{'year'}) ) {
393 my $date=$metadata{'year'};
394 chomp $date;
395 my $month=$metadata{'month'};
396 if (defined($month)) {
397 # month is currently 3 letter code or a range...
398 $month = expand_month($month);
399 # take the first month found... might not find one!
400 $month =~ m/_textmonth(\d\d)_/;
401 $month = $1;
402 }
403 if (!defined($month)) {
404 $month="00";
405 }
406 $date .= "${month}00";
407 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
408}
409
410# # Add the text in BibTex format (all fields)
[1676]411 if ($text =~ /\w/) {
[3249]412
413 $text =~ s@&@&amp;@g;
414 $text =~ s@<@&lt;@g;
415 $text =~ s@>@&gt;@g;
416 $text =~ s@\n@<br/>\n@g;
417 $text =~ s@\\@\\\\@g;
418
[2901]419 $doc_obj->add_utf8_text ($cursection, $text);
[1676]420 }
421
[1677]422 return 1;
[1676]423}
424
[1677]425
[2901]426
427
428# convert email addresses and URLs into links
429sub convert_urls_into_links{
430 my ($text) = @_;
431
432 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[3540]433 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
[2901]434
435 return $text;
436}
437
438# Clean up whitespace and convert \n charaters to <BR> or <P>
439sub clean_up_whitespaces{
440 my ($text) = @_;
441
442 $text =~ s/%%%%%/<BR> <BR>/g;
443 $text =~ s/ +/ /g;
444 $text =~ s/\s*$//;
445 $text =~ s/^\s*//;
446 $text =~ s/\n/\n<BR>/g;
447 $text =~ s/<BR>\s*<BR>/<P>/g;
448
449 return $text;
450}
451
452
453sub convert_problem_characters_without_ampersand{
454 my ($text) = @_;
455 $text =~ s/</&lt;/g;
456 $text =~ s/>/&gt;/g;
457
458 $text =~ s/\'\'/\"/g; #Latex -specific conversion
459 $text =~ s/\`\`/\"/g; #Latex -specific conversion
460
461 $text =~ s/\"/&quot;/g;
462 $text =~ s/\'/&#8217;/g;
463 $text =~ s/\`/&#8216;/g;
464
[3249]465# $text =~ s/\+/ /g;
466# $text =~ s/\(/ /g;
467# $text =~ s/\)/ /g;
468
[2901]469 $text =~ s/\\/\\\\/g;
470
[3249]471# $text =~ s/\./\\\./g;
[2901]472
473 return $text;
474}
475
[1676]476# Convert a text string into HTML.
477
[1677]478# The HTML is going to be inserted into a GML file, so we have to be
[2901]479# careful not to use symbols like ">", which occurs frequently in email
[1677]480# messages (and use &gt instead.
481
482# This function also turns URLs and email addresses into links, and
483# replaces carriage returns with <BR> tags (and multiple carriage returns
484# with <P> tags).
485
[1676]486sub text_into_html {
487 my ($text) = @_;
488
[2901]489 # Convert problem characters into HTML symbols
[1676]490 $text =~ s/&/&amp;/g;
491
[2901]492 $text = &convert_problem_characters_without_ampersand( $text );
493
[1676]494 # convert email addresses and URLs into links
[2901]495 $text = &convert_urls_into_links( $text );
[1676]496
[2901]497 $text = &clean_up_whitespaces( $text );
[1676]498
499 return $text;
500}
501
[2901]502
[3142]503sub expand_month {
504 my $text=shift;
[2901]505
[3142]506 # bibtex style files expand abbreviations for months.
507 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
508 $text =~ s/jan/_textmonth01_/g;
509 $text =~ s/feb/_textmonth02_/g;
510 $text =~ s/mar/_textmonth03_/g;
511 $text =~ s/apr/_textmonth04_/g;
512 $text =~ s/may/_textmonth05_/g;
513 $text =~ s/jun/_textmonth06_/g;
514 $text =~ s/jul/_textmonth07_/g;
515 $text =~ s/aug/_textmonth08_/g;
516 $text =~ s/sep/_textmonth09_/g;
517 $text =~ s/oct/_textmonth10_/g;
518 $text =~ s/nov/_textmonth11_/g;
519 $text =~ s/dec/_textmonth12_/g;
[2901]520
[3142]521 return $text;
522}
523
524
[2901]525# Convert accented characters, remove { }, interprete some commands....
526# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
527sub process_latex {
528 my ($text) = @_;
529
530 # note - this is really ugly, but it works. There may be a prettier way
531 # of mapping latex accented chars to utf8, but we just brute force it here.
532 # Also, this isn't complete - not every single possible accented letter
533 # is in here yet, but most of the common ones are.
534
535 my %utf8_chars =
536 (
537 # acutes
538 '\'a' => chr(0xc3).chr(0xa1),
539 '\'c' => chr(0xc4).chr(0x87),
540 '\'e' => chr(0xc3).chr(0xa9),
541 '\'i' => chr(0xc3).chr(0xad),
542 '\'l' => chr(0xc3).chr(0xba),
543 '\'n' => chr(0xc3).chr(0x84),
544 '\'o' => chr(0xc3).chr(0xb3),
545 '\'r' => chr(0xc5).chr(0x95),
546 '\'s' => chr(0xc5).chr(0x9b),
547 '\'u' => chr(0xc3).chr(0xba),
548 '\'y' => chr(0xc3).chr(0xbd),
549 '\'z' => chr(0xc5).chr(0xba),
550 # graves
551 '`a' => chr(0xc3).chr(0xa0),
552 '`A' => chr(0xc3).chr(0x80),
553 '`e' => chr(0xc3).chr(0xa8),
554 '`E' => chr(0xc3).chr(0x88),
555 '`i' => chr(0xc3).chr(0xac),
556 '`I' => chr(0xc3).chr(0x8c),
557 '`o' => chr(0xc3).chr(0xb2),
558 '`O' => chr(0xc3).chr(0x92),
559 '`u' => chr(0xc3).chr(0xb9),
560 '`U' => chr(0xc3).chr(0x99),
561 # circumflex
562 '^a' => chr(0xc3).chr(0xa2),
563 '^A' => chr(0xc3).chr(0x82),
564 '^c' => chr(0xc4).chr(0x89),
565 '^C' => chr(0xc4).chr(0x88),
566 '^e' => chr(0xc3).chr(0xaa),
567 '^E' => chr(0xc3).chr(0x8a),
568 '^g' => chr(0xc4).chr(0x9d),
569 '^G' => chr(0xc4).chr(0x9c),
570 '^h' => chr(0xc4).chr(0xa5),
571 '^H' => chr(0xc4).chr(0xa4),
572 '^i' => chr(0xc3).chr(0xae),
573 '^I' => chr(0xc3).chr(0x8e),
574 '^j' => chr(0xc4).chr(0xb5),
575 '^J' => chr(0xc4).chr(0xb4),
576 '^o' => chr(0xc3).chr(0xb4),
577 '^O' => chr(0xc3).chr(0x94),
578 '^s' => chr(0xc5).chr(0x9d),
579 '^S' => chr(0xc5).chr(0x9c),
580 '^u' => chr(0xc3).chr(0xa2),
581 '^U' => chr(0xc3).chr(0xbb),
582 '^w' => chr(0xc5).chr(0xb5),
583 '^W' => chr(0xc5).chr(0xb4),
584 '^y' => chr(0xc5).chr(0xb7),
585 '^Y' => chr(0xc5).chr(0xb6),
586
587 # diaeresis
588 '"a' => chr(0xc3).chr(0xa4),
589 '"A' => chr(0xc3).chr(0x84),
590 '"e' => chr(0xc3).chr(0xab),
591 '"E' => chr(0xc3).chr(0x8b),
592 '"\\\\i' => chr(0xc3).chr(0xaf),
593 '"\\\\I' => chr(0xc3).chr(0x8f),
594 '"o' => chr(0xc3).chr(0xb6),
595 '"O' => chr(0xc3).chr(0x96),
596 '"u' => chr(0xc3).chr(0xbc),
597 '"U' => chr(0xc3).chr(0x9c),
598 '"y' => chr(0xc3).chr(0xbf),
599 '"Y' => chr(0xc3).chr(0xb8),
600 # tilde
[3142]601 '~A' => chr(0xc3).chr(0x83),
602 '~N' => chr(0xc3).chr(0x91),
603 '~O' => chr(0xc3).chr(0x95),
604 '~a' => chr(0xc3).chr(0xa3),
605 '~n' => chr(0xc3).chr(0xb1),
606 '~o' => chr(0xc3).chr(0xb5),
[2901]607 # caron - handled specially
608 # double acute
609 # ring
610 # dot
[3249]611 '.c' => chr(0xc4).chr(0x8b),
612 '.C' => chr(0xc4).chr(0x8a),
613 '.e' => chr(0xc4).chr(0x97),
614 '.E' => chr(0xc4).chr(0x96),
615 '.g' => chr(0xc4).chr(0xa1),
616 '.G' => chr(0xc4).chr(0xa0),
617 '.I' => chr(0xc4).chr(0xb0),
618 '.z' => chr(0xc5).chr(0xbc),
619 '.Z' => chr(0xc5).chr(0xbb),
[2901]620 # macron
621 '=a' => chr(0xc4).chr(0x81),
622 '=A' => chr(0xc4).chr(0x80),
623 '=e' => chr(0xc4).chr(0x93),
624 '=E' => chr(0xc4).chr(0x92),
625 '=i' => chr(0xc4).chr(0xab),
626 '=I' => chr(0xc4).chr(0xaa),
627 '=o' => chr(0xc4).chr(0x8d),
628 '=O' => chr(0xc4).chr(0x8c),
629 '=u' => chr(0xc4).chr(0xab),
630 '=U' => chr(0xc4).chr(0xaa),
631
632 # stroke - handled specially - see below
633
634 # cedilla - handled specially
635 );
636
637# these are one letter latex commands - we make sure they're not a longer
638# command name. eg {\d} is d+stroke, so careful of \d
639 my %special_utf8_chars =
640 (
[3156]641 # breve
642 'u g' => chr(0xc4).chr(0x9f),
643 'u G' => chr(0xc4).chr(0x9e),
644 'u i' => chr(0xc4).chr(0xad),
645 'u I' => chr(0xc4).chr(0xac),
646 'u o' => chr(0xc5).chr(0x8f),
647 'u O' => chr(0xc5).chr(0x8e),
648 'u u' => chr(0xc5).chr(0xad),
649 'u U' => chr(0xc5).chr(0xac),
[2901]650 # caron
[3156]651 'v c' => chr(0xc4).chr(0x8d),
652 'v C' => chr(0xc4).chr(0x8c),
[2901]653 'v n' => chr(0xc5).chr(0x88),
654 'v N' => chr(0xc5).chr(0x87),
655 'v s' => chr(0xc5).chr(0xa1),
656 'v S' => chr(0xc5).chr(0xa5),
[3156]657 'v z' => chr(0xc5).chr(0xbe),
658 'v Z' => chr(0xc5).chr(0xbd),
[2901]659 # cedilla
660 'c c' => chr(0xc3).chr(0xa7),
661 'c C' => chr(0xc3).chr(0x87),
662 'c g' => chr(0xc4).chr(0xa3),
663 'c G' => chr(0xc4).chr(0xa2),
664 'c k' => chr(0xc4).chr(0xb7),
665 'c K' => chr(0xc4).chr(0xb6),
666 'c l' => chr(0xc4).chr(0xbc),
667 'c L' => chr(0xc4).chr(0xbb),
668 'c n' => chr(0xc5).chr(0x86),
669 'c N' => chr(0xc5).chr(0x85),
670 'c r' => chr(0xc5).chr(0x97),
671 'c R' => chr(0xc5).chr(0x96),
672 'c s' => chr(0xc5).chr(0x9f),
673 'c S' => chr(0xc5).chr(0x9e),
674 'c t' => chr(0xc5).chr(0xa3),
675 'c T' => chr(0xc5).chr(0xa2),
676 # double acute / Hungarian accent
677 'H O' => chr(0xc5).chr(0x90),
678 'H o' => chr(0xc5).chr(0x91),
679 'H U' => chr(0xc5).chr(0xb0),
680 'H u' => chr(0xc5).chr(0xb1),
681
682 # stroke
683 'd' => chr(0xc4).chr(0x91),
684 'D' => chr(0xc4).chr(0x90),
685 'h' => chr(0xc4).chr(0xa7),
686# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
[3112]687 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
[2901]688 'l' => chr(0xc5).chr(0x82),
689 'L' => chr(0xc5).chr(0x81),
690 'o' => chr(0xc3).chr(0xb8),
691 'O' => chr(0xc3).chr(0x98),
692 't' => chr(0xc5).chr(0xa7),
693 'T' => chr(0xc5).chr(0xa6),
694 # german ss/szlig/sharp s
695 'ss' => chr(0xc3).chr(0x9f),
696 );
697
698 # convert latex-style accented characters.
[3249]699
[2901]700 # remove space (if any) between \ and letter to accent (eg {\' a})
701
[13413]702 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
703
[2901]704 # remove {} around a single character (eg \'{e})
[13413]705 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
[2901]706
[3249]707 # \, is another way of doing cedilla \c
708 $text =~ s@\\,(.)@\\c $1@g;
709
[2901]710 # remove {} around a single character for special 1 letter commands -
711 # need to insert a space. Eg \v{s} -> {\v s}
[3156]712 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
[3249]713
[2901]714 # only do if the text contains a '\' character.
715 if ($text =~ m|\\|) {
[3249]716 # "normal" accents - ie non-alpha latex tag
[13413]717 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
[3249]718 my $tex="$1$2"; my $char="$2";
719 my $replacement=$utf8_chars{$tex};
720 if (!defined($replacement)) {
[15872]721 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]722 $replacement=$char;
723 }
[5454]724 $text =~ s/\\\Q$tex/$replacement/g;
[3249]725 }
[2901]726
[3249]727 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
728 # only do the change if immediately followed by a space, }, {, or \
729 # one letter accents ( + ss)
730 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
731 my $tex=$1;
732 my $replacement=$special_utf8_chars{$tex};
733 if (!defined($replacement)) {
[15872]734 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]735 $replacement=$tex;
736 }
737 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
738
739 }
740
741 # one letter latex accent commands that affect following letter
742 while ($text =~ m@\\([uvcH]) ([\w])@) {
743 my $tex="$1 $2"; my $char="$2";
744 my $replacement=$special_utf8_chars{$tex};
745 if (!defined($replacement)) {
[15872]746 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
[3249]747 $replacement=$char;
748 }
749 $text =~ s/\\$tex/$replacement/g;
750 }
[2901]751 }
[3249]752
753 # escape html-sensitive characters
754 $text =~ s@&@&amp;@g;
755 $text =~ s@<@&lt;@g;
756 $text =~ s@>@&gt;@g;
757 $text =~ s/''/"/g; # Latex-specific
758 $text =~ s/``/"/g; # Latex-specific
759 # greenstone-specific
760 $text =~ s@\[@&\#91;@g;
761 $text =~ s@\]@&\#93;@g;
762
[3112]763 # remove latex commands
[3249]764
765 # explicitly recognised commands
766 $text =~ s@\\ldots@&hellip;@g;
767
768 # maths mode
769 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
770
771 # remove all other commands with optional arguments...
[3156]772 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
773 # $text =~ s@\\noopsort{[^}]+\}@@g;
774 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
[3142]775
[2901]776 # remove latex groupings { } (but not \{ or \} )
[3249]777 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
[3142]778 $text =~ s/^\{//; # remove { if first char
779
780 # latex characters
781 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
[3249]782 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
[3142]783 # optional break "\-"
[3249]784 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
[3142]785 # the non-macro bits have quotes around them - we just remove quotes
[3249]786 $text =~ s/([^&])[\"\#]/$1/g;
[3142]787 }
[3249]788 # dashes. Convert (m|n)-dash into single dash for html.
789 $text =~ s@\-\-+@\-@g;
790
[2901]791 # quoted { } chars
792 $text =~ s@\\{@{@g;
793 $text =~ s@\\}@}@g;
794
[3249]795 # finally to protect against macro language...
796 $text =~ s@\\@\\\\@g;
797
[2901]798 return $text;
799}
800
801
[3249]802sub process_latex_math {
803 my $text = shift;
804
805 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
806 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
807 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
808 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
809 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
810
811 # put all other command names in italics
812 $text =~ s@\\([\w]+)@<i>$1</i>@g;
813
814 return $text;
815}
816
[17026]817sub add_OID {
[2484]818 my $self = shift (@_);
819 my ($doc_obj, $id, $segment_number) = @_;
820
821 if ( $self->{'key'} eq "default") {
[17026]822 $self->SUPER::add_OID(@_);
823# $doc_obj->set_OID("$id\_$segment_number");
[2484]824 } else {
825 $doc_obj->set_OID($self->{'key'});
826 }
827}
[1676]828
[2484]8291;
[4792]830
831
Note: See TracBrowser for help on using the repository browser.