source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 12969

Last change on this file since 12969 was 12169, checked in by mdewsnip, 18 years ago

Tidied up that horrible long line in the new() function of every plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
52 @BibTexPlug::ISA = ('SplitPlug');
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasPlug.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
67
68my $options = { 'name' => "BibTexPlug",
69 'desc' => "{BibTexPlug.desc}",
70 'abstract' => "no",
71 'inherits' => "yes",
72 'explodes' => "yes",
73 'args' => $arguments };
74
75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
84sub new {
85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
93
94 return bless $self, $class;
95}
96
97
98
99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105 my $outhandle = $self->{'outhandle'};
106
107 my $cursection = $doc_obj->get_top_section();
108 $self->{'key'} = "default";
109
110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
122 # This hash translates BibTex field names into metadata names. The
123 # BibTex names are taken from the "Local Guide to Latex" Graeme
124 # McKinstry. Metadata names are consistent with ReferPlug.
125
126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
132
133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
136 'editor', 'Editor',
137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
150 'copyright', 'Copyright',
151 'note', 'Note',
152 'url', 'URL',
153 );
154
155 # Metadata fields
156 my %metadata;
157 my ($EntryType, $Creator, $Keywords, $text);
158
159 my $verbosity = $self->{'verbosity'};
160 $verbosity = 0 unless $verbosity;
161
162 # Make sure the text has exactly one entry per line -
163 # append line to previous if it doesn't start with " <key> = "
164
165 my @input_lines=split(/\r?\n/, $$textref);
166 my @all_lines;
167 my $entry_line=shift @input_lines;
168 foreach my $input_line (@input_lines) {
169 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
170 # this is a new key
171 push(@all_lines, $entry_line);
172 $entry_line=$input_line;
173 } else {
174 # this is a continuation of previous line
175 $entry_line .= " " . $input_line;
176 }
177
178 }
179 # add final line, removing trailing '}'
180 $entry_line =~ s/\}\s*$//;
181 push(@all_lines, $entry_line);
182 push(@all_lines, "}");
183
184 # Read and process each line in the bib file.
185 my ($entryname, $name, $value, $line);
186 foreach $line (@all_lines) {
187
188 # Add each line. Most lines consist of a field identifer and
189 # then data, and we simply store them, though we treat some
190 # of the fields a bit differently.
191
192 $line =~ s/\s+/ /g;
193 $text .= "$line\n";
194
195
196 print "Processing line = $line \n" if $verbosity>=4;
197
198 # The first line is special, it contains the reference type and OID
199 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
200 $EntryType = $1;
201 my $EntryID = (defined $2) ? $2 : "default";
202 print "** $EntryType - \"$EntryID\" \n"
203 if ($verbosity >= 4);
204
205 next;
206 }
207 if ($line =~ /\@/) {
208 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
209 if ($verbosity >= 2);
210 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
211 if ($verbosity >= 2);
212 }
213
214 # otherwise, parse the metadata out of this line
215 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
216 $entryname = lc($1);
217 $value = $2;
218 $value =~ s/,?\s*$//; # remove trailing comma and space
219 if ($value =~ /^"/ && $value =~ /"$/) {
220 # remove surrounding " marks
221 $value =~ s/^"//; $value =~ s/"$//;
222 }
223 $value = &process_latex($value);
224
225 # Add this line of metadata
226 $metadata{$entryname} .= "$value";
227
228 }
229
230 # Add the Entry type as metadata
231 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
232
233 #Add the fileformat as metadata
234 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
235
236
237 # Add the various field as metadata
238 foreach my $entryname (keys %metadata) {
239 next unless (defined $field{$entryname});
240 next unless (defined $metadata{$entryname});
241
242 $name = $field{$entryname};
243 $value = $metadata{$entryname};
244
245 if ($name =~ /^Month/) {
246 $value=expand_month($value);
247 }
248
249 # Several special operatons on metadata follow
250
251 # Add individual keywords.
252 # The full set of keywords will be added, in due course, as "Keywords".
253 # However, we also want to add them as individual "Keyword" metadata elements.
254 if ($entryname eq "keywords") {
255 my @keywordlist = split(/,/, $value);
256 foreach my $k (@keywordlist) {
257 $k = lc($k);
258 $k =~ s/\s*$//;
259 $k =~ s/^\s*//;
260 if ($k =~ /\w/) {
261 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
262 }
263 }
264 }
265
266 # Add individual authors
267 # The author metadata will be stored as one "Creator" entry, but we
268 # also want to split it into several individual "Author" fields in
269 # "Lastname, Firstnames" format so we can browse it.
270 if ($entryname eq "author") { #added also comparison with editor
271
272 # take care of "et al."...
273 my $etal='';
274 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
275 $etal=' <em>et. al.</em>';
276 }
277 # und here for german language...
278 # don't use brackets in pattern, else the matched bit becomes
279 # an element in the list!
280 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
281 my @formattedlist = ();
282 foreach $a (@authorlist) {
283 $a =~ s/\s*$//;
284 $a =~ s/^\s*//;
285 # Reformat and add author name
286 next if $a=~ /^\s*$/;
287
288 # names are "First von Last", "von Last, First"
289 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
290 my $first="";
291 my $vonlast="";
292 my $jr="";
293
294 if ($a =~ /,/) {
295 my @parts=split(/,\s*/, $a);
296 $first = pop @parts;
297 if (scalar(@parts) == 2) {
298 $jr = pop @parts;
299 }
300 $vonlast=shift @parts;
301 if (scalar(@parts) > 0) {
302 print $outhandle "BibTexPlug: couldn't parse name $a\n";
303 # but we continue anyway...
304 }
305 } else { # First von Last
306 my @words = split(/ /, $a);
307 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
308 $first .= " " . shift (@words);
309 }
310 $first =~ s/^\s//;
311 $vonlast = join (' ', @words); # whatever's left...
312 }
313 my $von="";
314 my $last="";
315 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
316 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
317 $von = $1;
318 if (!defined ($von)) {
319 # some non-English names do start with lowercase
320 # eg "Marie desJardins". Also we can get typos...
321 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
322 $von="";
323 if ($vonlast =~ /^[a-z]+$/) {
324 # if it's all lowercase, uppercase 1st.
325 $vonlast =~ s/^(.)/\u$1/;
326
327 }
328 }
329 $von =~ s/\s*$//;
330 $last=$vonlast;
331 } else {
332 $last=$vonlast;
333 }
334 my $wholename="$first $von $last $jr";
335 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
336 $wholename =~ s/ $//;
337 push (@formattedlist, $wholename);
338 my $fullname = "$last";
339 $fullname .= " $jr" if ($jr);
340 $fullname .= ", $first";
341 $fullname .= " $von" if ($von);
342
343 # Add each name to set of Authors
344 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
345 }
346
347 # Only want at most one "and" in the Creator field
348 if (scalar(@formattedlist) > 2) {
349 my $lastauthor=pop @formattedlist;
350 $value=join(', ', @formattedlist);
351 $value.=" and $lastauthor";
352 } else { # 1 or 2 authors...
353 $value=join(" and ",@formattedlist);
354 }
355 $value.=$etal; # if there was "et al."
356 }
357
358 # Books and Journals are additionally marked for display purposes
359 if ($entryname eq "booktitle") {
360 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
361 } elsif ($entryname eq "journal") {
362 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
363 }
364
365 # Add the various fields as metadata
366 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
367 }
368
369 # for books and journals...
370 if (!exists $metadata{'title'}) {
371 my $name=$field{'title'}; # get Greenstone metadata name
372 my $value;
373 if (exists $metadata{'booktitle'}) {
374 $value=$metadata{'booktitle'};
375 } elsif (exists $metadata{'journal'}) {
376 $value=$metadata{'journal'};
377 }
378 if ($value) {
379 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
380 }
381 }
382
383 # Add Date (yyyymmdd) metadata
384 if (defined ($metadata{'year'}) ) {
385 my $date=$metadata{'year'};
386 chomp $date;
387 my $month=$metadata{'month'};
388 if (defined($month)) {
389 # month is currently 3 letter code or a range...
390 $month = expand_month($month);
391 # take the first month found... might not find one!
392 $month =~ m/_textmonth(\d\d)_/;
393 $month = $1;
394 }
395 if (!defined($month)) {
396 $month="00";
397 }
398 $date .= "${month}00";
399 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
400}
401
402# # Add the text in BibTex format (all fields)
403 if ($text =~ /\w/) {
404
405 $text =~ s@&@&amp;@g;
406 $text =~ s@<@&lt;@g;
407 $text =~ s@>@&gt;@g;
408 $text =~ s@\n@<br/>\n@g;
409 $text =~ s@\\@\\\\@g;
410
411 $doc_obj->add_utf8_text ($cursection, $text);
412 }
413
414 return 1;
415}
416
417
418
419
420# convert email addresses and URLs into links
421sub convert_urls_into_links{
422 my ($text) = @_;
423
424 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
425 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
426
427 return $text;
428}
429
430# Clean up whitespace and convert \n charaters to <BR> or <P>
431sub clean_up_whitespaces{
432 my ($text) = @_;
433
434 $text =~ s/%%%%%/<BR> <BR>/g;
435 $text =~ s/ +/ /g;
436 $text =~ s/\s*$//;
437 $text =~ s/^\s*//;
438 $text =~ s/\n/\n<BR>/g;
439 $text =~ s/<BR>\s*<BR>/<P>/g;
440
441 return $text;
442}
443
444
445sub convert_problem_characters_without_ampersand{
446 my ($text) = @_;
447 $text =~ s/</&lt;/g;
448 $text =~ s/>/&gt;/g;
449
450 $text =~ s/\'\'/\"/g; #Latex -specific conversion
451 $text =~ s/\`\`/\"/g; #Latex -specific conversion
452
453 $text =~ s/\"/&quot;/g;
454 $text =~ s/\'/&#8217;/g;
455 $text =~ s/\`/&#8216;/g;
456
457# $text =~ s/\+/ /g;
458# $text =~ s/\(/ /g;
459# $text =~ s/\)/ /g;
460
461 $text =~ s/\\/\\\\/g;
462
463# $text =~ s/\./\\\./g;
464
465 return $text;
466}
467
468# Convert a text string into HTML.
469
470# The HTML is going to be inserted into a GML file, so we have to be
471# careful not to use symbols like ">", which occurs frequently in email
472# messages (and use &gt instead.
473
474# This function also turns URLs and email addresses into links, and
475# replaces carriage returns with <BR> tags (and multiple carriage returns
476# with <P> tags).
477
478sub text_into_html {
479 my ($text) = @_;
480
481 # Convert problem characters into HTML symbols
482 $text =~ s/&/&amp;/g;
483
484 $text = &convert_problem_characters_without_ampersand( $text );
485
486 # convert email addresses and URLs into links
487 $text = &convert_urls_into_links( $text );
488
489 $text = &clean_up_whitespaces( $text );
490
491 return $text;
492}
493
494
495sub expand_month {
496 my $text=shift;
497
498 # bibtex style files expand abbreviations for months.
499 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
500 $text =~ s/jan/_textmonth01_/g;
501 $text =~ s/feb/_textmonth02_/g;
502 $text =~ s/mar/_textmonth03_/g;
503 $text =~ s/apr/_textmonth04_/g;
504 $text =~ s/may/_textmonth05_/g;
505 $text =~ s/jun/_textmonth06_/g;
506 $text =~ s/jul/_textmonth07_/g;
507 $text =~ s/aug/_textmonth08_/g;
508 $text =~ s/sep/_textmonth09_/g;
509 $text =~ s/oct/_textmonth10_/g;
510 $text =~ s/nov/_textmonth11_/g;
511 $text =~ s/dec/_textmonth12_/g;
512
513 return $text;
514}
515
516
517# Convert accented characters, remove { }, interprete some commands....
518# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
519sub process_latex {
520 my ($text) = @_;
521
522 # note - this is really ugly, but it works. There may be a prettier way
523 # of mapping latex accented chars to utf8, but we just brute force it here.
524 # Also, this isn't complete - not every single possible accented letter
525 # is in here yet, but most of the common ones are.
526
527 my %utf8_chars =
528 (
529 # acutes
530 '\'a' => chr(0xc3).chr(0xa1),
531 '\'c' => chr(0xc4).chr(0x87),
532 '\'e' => chr(0xc3).chr(0xa9),
533 '\'i' => chr(0xc3).chr(0xad),
534 '\'l' => chr(0xc3).chr(0xba),
535 '\'n' => chr(0xc3).chr(0x84),
536 '\'o' => chr(0xc3).chr(0xb3),
537 '\'r' => chr(0xc5).chr(0x95),
538 '\'s' => chr(0xc5).chr(0x9b),
539 '\'u' => chr(0xc3).chr(0xba),
540 '\'y' => chr(0xc3).chr(0xbd),
541 '\'z' => chr(0xc5).chr(0xba),
542 # graves
543 '`a' => chr(0xc3).chr(0xa0),
544 '`A' => chr(0xc3).chr(0x80),
545 '`e' => chr(0xc3).chr(0xa8),
546 '`E' => chr(0xc3).chr(0x88),
547 '`i' => chr(0xc3).chr(0xac),
548 '`I' => chr(0xc3).chr(0x8c),
549 '`o' => chr(0xc3).chr(0xb2),
550 '`O' => chr(0xc3).chr(0x92),
551 '`u' => chr(0xc3).chr(0xb9),
552 '`U' => chr(0xc3).chr(0x99),
553 # circumflex
554 '^a' => chr(0xc3).chr(0xa2),
555 '^A' => chr(0xc3).chr(0x82),
556 '^c' => chr(0xc4).chr(0x89),
557 '^C' => chr(0xc4).chr(0x88),
558 '^e' => chr(0xc3).chr(0xaa),
559 '^E' => chr(0xc3).chr(0x8a),
560 '^g' => chr(0xc4).chr(0x9d),
561 '^G' => chr(0xc4).chr(0x9c),
562 '^h' => chr(0xc4).chr(0xa5),
563 '^H' => chr(0xc4).chr(0xa4),
564 '^i' => chr(0xc3).chr(0xae),
565 '^I' => chr(0xc3).chr(0x8e),
566 '^j' => chr(0xc4).chr(0xb5),
567 '^J' => chr(0xc4).chr(0xb4),
568 '^o' => chr(0xc3).chr(0xb4),
569 '^O' => chr(0xc3).chr(0x94),
570 '^s' => chr(0xc5).chr(0x9d),
571 '^S' => chr(0xc5).chr(0x9c),
572 '^u' => chr(0xc3).chr(0xa2),
573 '^U' => chr(0xc3).chr(0xbb),
574 '^w' => chr(0xc5).chr(0xb5),
575 '^W' => chr(0xc5).chr(0xb4),
576 '^y' => chr(0xc5).chr(0xb7),
577 '^Y' => chr(0xc5).chr(0xb6),
578
579 # diaeresis
580 '"a' => chr(0xc3).chr(0xa4),
581 '"A' => chr(0xc3).chr(0x84),
582 '"e' => chr(0xc3).chr(0xab),
583 '"E' => chr(0xc3).chr(0x8b),
584 '"\\\\i' => chr(0xc3).chr(0xaf),
585 '"\\\\I' => chr(0xc3).chr(0x8f),
586 '"o' => chr(0xc3).chr(0xb6),
587 '"O' => chr(0xc3).chr(0x96),
588 '"u' => chr(0xc3).chr(0xbc),
589 '"U' => chr(0xc3).chr(0x9c),
590 '"y' => chr(0xc3).chr(0xbf),
591 '"Y' => chr(0xc3).chr(0xb8),
592 # tilde
593 '~A' => chr(0xc3).chr(0x83),
594 '~N' => chr(0xc3).chr(0x91),
595 '~O' => chr(0xc3).chr(0x95),
596 '~a' => chr(0xc3).chr(0xa3),
597 '~n' => chr(0xc3).chr(0xb1),
598 '~o' => chr(0xc3).chr(0xb5),
599 # caron - handled specially
600 # double acute
601 # ring
602 # dot
603 '.c' => chr(0xc4).chr(0x8b),
604 '.C' => chr(0xc4).chr(0x8a),
605 '.e' => chr(0xc4).chr(0x97),
606 '.E' => chr(0xc4).chr(0x96),
607 '.g' => chr(0xc4).chr(0xa1),
608 '.G' => chr(0xc4).chr(0xa0),
609 '.I' => chr(0xc4).chr(0xb0),
610 '.z' => chr(0xc5).chr(0xbc),
611 '.Z' => chr(0xc5).chr(0xbb),
612 # macron
613 '=a' => chr(0xc4).chr(0x81),
614 '=A' => chr(0xc4).chr(0x80),
615 '=e' => chr(0xc4).chr(0x93),
616 '=E' => chr(0xc4).chr(0x92),
617 '=i' => chr(0xc4).chr(0xab),
618 '=I' => chr(0xc4).chr(0xaa),
619 '=o' => chr(0xc4).chr(0x8d),
620 '=O' => chr(0xc4).chr(0x8c),
621 '=u' => chr(0xc4).chr(0xab),
622 '=U' => chr(0xc4).chr(0xaa),
623
624 # stroke - handled specially - see below
625
626 # cedilla - handled specially
627 );
628
629# these are one letter latex commands - we make sure they're not a longer
630# command name. eg {\d} is d+stroke, so careful of \d
631 my %special_utf8_chars =
632 (
633 # breve
634 'u g' => chr(0xc4).chr(0x9f),
635 'u G' => chr(0xc4).chr(0x9e),
636 'u i' => chr(0xc4).chr(0xad),
637 'u I' => chr(0xc4).chr(0xac),
638 'u o' => chr(0xc5).chr(0x8f),
639 'u O' => chr(0xc5).chr(0x8e),
640 'u u' => chr(0xc5).chr(0xad),
641 'u U' => chr(0xc5).chr(0xac),
642 # caron
643 'v c' => chr(0xc4).chr(0x8d),
644 'v C' => chr(0xc4).chr(0x8c),
645 'v n' => chr(0xc5).chr(0x88),
646 'v N' => chr(0xc5).chr(0x87),
647 'v s' => chr(0xc5).chr(0xa1),
648 'v S' => chr(0xc5).chr(0xa5),
649 'v z' => chr(0xc5).chr(0xbe),
650 'v Z' => chr(0xc5).chr(0xbd),
651 # cedilla
652 'c c' => chr(0xc3).chr(0xa7),
653 'c C' => chr(0xc3).chr(0x87),
654 'c g' => chr(0xc4).chr(0xa3),
655 'c G' => chr(0xc4).chr(0xa2),
656 'c k' => chr(0xc4).chr(0xb7),
657 'c K' => chr(0xc4).chr(0xb6),
658 'c l' => chr(0xc4).chr(0xbc),
659 'c L' => chr(0xc4).chr(0xbb),
660 'c n' => chr(0xc5).chr(0x86),
661 'c N' => chr(0xc5).chr(0x85),
662 'c r' => chr(0xc5).chr(0x97),
663 'c R' => chr(0xc5).chr(0x96),
664 'c s' => chr(0xc5).chr(0x9f),
665 'c S' => chr(0xc5).chr(0x9e),
666 'c t' => chr(0xc5).chr(0xa3),
667 'c T' => chr(0xc5).chr(0xa2),
668 # double acute / Hungarian accent
669 'H O' => chr(0xc5).chr(0x90),
670 'H o' => chr(0xc5).chr(0x91),
671 'H U' => chr(0xc5).chr(0xb0),
672 'H u' => chr(0xc5).chr(0xb1),
673
674 # stroke
675 'd' => chr(0xc4).chr(0x91),
676 'D' => chr(0xc4).chr(0x90),
677 'h' => chr(0xc4).chr(0xa7),
678# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
679 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
680 'l' => chr(0xc5).chr(0x82),
681 'L' => chr(0xc5).chr(0x81),
682 'o' => chr(0xc3).chr(0xb8),
683 'O' => chr(0xc3).chr(0x98),
684 't' => chr(0xc5).chr(0xa7),
685 'T' => chr(0xc5).chr(0xa6),
686 # german ss/szlig/sharp s
687 'ss' => chr(0xc3).chr(0x9f),
688 );
689
690 # convert latex-style accented characters.
691
692 # remove space (if any) between \ and letter to accent (eg {\' a})
693
694 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
695
696 # remove {} around a single character (eg \'{e})
697 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
698
699 # \, is another way of doing cedilla \c
700 $text =~ s@\\,(.)@\\c $1@g;
701
702 # remove {} around a single character for special 1 letter commands -
703 # need to insert a space. Eg \v{s} -> {\v s}
704 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
705
706 # only do if the text contains a '\' character.
707 if ($text =~ m|\\|) {
708 # "normal" accents - ie non-alpha latex tag
709 while ($text =~ m@\\([`'="^~\.])([\w])@) {
710 my $tex="$1$2"; my $char="$2";
711 my $replacement=$utf8_chars{$tex};
712 if (!defined($replacement)) {
713 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
714 $replacement=$char;
715 }
716 $text =~ s/\\\Q$tex/$replacement/g;
717 }
718
719 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
720 # only do the change if immediately followed by a space, }, {, or \
721 # one letter accents ( + ss)
722 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
723 my $tex=$1;
724 my $replacement=$special_utf8_chars{$tex};
725 if (!defined($replacement)) {
726 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
727 $replacement=$tex;
728 }
729 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
730
731 }
732
733 # one letter latex accent commands that affect following letter
734 while ($text =~ m@\\([uvcH]) ([\w])@) {
735 my $tex="$1 $2"; my $char="$2";
736 my $replacement=$special_utf8_chars{$tex};
737 if (!defined($replacement)) {
738 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
739 $replacement=$char;
740 }
741 $text =~ s/\\$tex/$replacement/g;
742 }
743 }
744
745 # escape html-sensitive characters
746 $text =~ s@&@&amp;@g;
747 $text =~ s@<@&lt;@g;
748 $text =~ s@>@&gt;@g;
749 $text =~ s/''/"/g; # Latex-specific
750 $text =~ s/``/"/g; # Latex-specific
751 # greenstone-specific
752 $text =~ s@\[@&\#91;@g;
753 $text =~ s@\]@&\#93;@g;
754
755 # remove latex commands
756
757 # explicitly recognised commands
758 $text =~ s@\\ldots@&hellip;@g;
759
760 # maths mode
761 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
762
763 # remove all other commands with optional arguments...
764 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
765 # $text =~ s@\\noopsort{[^}]+\}@@g;
766 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
767
768 # remove latex groupings { } (but not \{ or \} )
769 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
770 $text =~ s/^\{//; # remove { if first char
771
772 # latex characters
773 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
774 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
775 # optional break "\-"
776 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
777 # the non-macro bits have quotes around them - we just remove quotes
778 $text =~ s/([^&])[\"\#]/$1/g;
779 }
780 # dashes. Convert (m|n)-dash into single dash for html.
781 $text =~ s@\-\-+@\-@g;
782
783 # quoted { } chars
784 $text =~ s@\\{@{@g;
785 $text =~ s@\\}@}@g;
786
787 # finally to protect against macro language...
788 $text =~ s@\\@\\\\@g;
789
790 return $text;
791}
792
793
794sub process_latex_math {
795 my $text = shift;
796
797 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
798 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
799 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
800 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
801 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
802
803 # put all other command names in italics
804 $text =~ s@\\([\w]+)@<i>$1</i>@g;
805
806 return $text;
807}
808
809sub set_OID {
810 my $self = shift (@_);
811 my ($doc_obj, $id, $segment_number) = @_;
812
813 if ( $self->{'key'} eq "default") {
814 $doc_obj->set_OID("$id\_$segment_number");
815 } else {
816 $doc_obj->set_OID($self->{'key'});
817 }
818}
819
8201;
821
822
Note: See TracBrowser for help on using the repository browser.