source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 9403

Last change on this file since 9403 was 9120, checked in by kjdon, 19 years ago

BibTex plug can do exploding - set 'explodes' to yes in xml description

  • Property svn:keywords set to Author Date Id Revision
File size: 22.5 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53my $arguments =
54 [ { 'name' => "process_exp",
55 'desc' => "{BasPlug.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitPlug.split_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_split_exp(),
63 'reqd' => "no" }
64 ];
65
66my $options = { 'name' => "BibTexPlug",
67 'desc' => "{BibTexPlug.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'explodes' => "yes",
71 'args' => $arguments };
72
73# This plugin processes files with the suffix ".bib"
74sub get_default_process_exp {
75 return q^(?i)\.bib$^;
76}
77
78# This plugin splits the input text at blank lines
79sub get_default_split_exp {
80 return q^\n+(?=@)^;
81}
82sub new {
83 my $class = shift (@_);
84 my $self = new SplitPlug ($class, @_);
85 $self->{'plugin_type'} = "BibTexPlug";
86 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
87 my $option_list = $self->{'option_list'};
88 push( @{$option_list}, $options );
89 return bless $self, $class;
90}
91
92
93
94# The process function reads a single bibliographic record and stores
95# it as a new document.
96
97sub process {
98 my $self = shift (@_);
99 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
100 my $outhandle = $self->{'outhandle'};
101
102 $self->{'key'} = "default";
103
104 # Check that we're dealing with a valid BibTex record
105 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
106
107 # Ignore things we can't use
108 return 0 if ($$textref =~ /^\@String/);
109
110 # Report that we're processing the file
111 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
112 print $outhandle "BibTexPlug: processing $file\n"
113 if ($self->{'verbosity'}) > 1;
114
115
116 # This hash translates BibTex field names into metadata names. The
117 # BibTex names are taken from the "Local Guide to Latex" Graeme
118 # McKinstry. Metadata names are consistent with ReferPlug.
119
120 my %field = (
121 'address', 'PublisherAddress',
122 'author', 'Creator',
123
124 'booktitle', 'Booktitle',
125 'chapter', 'Chapter',
126 'edition', 'Edition',
127 'editor', 'Editor',
128 'institution', 'Publisher',
129 'journal', 'Journal',
130 'month', 'Month',
131 'number', 'Number',
132 'pages', 'Pages',
133 'publisher', 'Publisher',
134 'school', 'Publisher',
135 'title', 'Title',
136 'volume', 'Volume',
137 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
138
139 'keywords', 'Keywords',
140 'abstract', 'Abstract',
141 'copyright', 'Copyright',
142 'note', 'Note',
143 );
144
145 # Metadata fields
146 my %metadata;
147 my ($EntryType, $Creator, $Keywords, $text);
148
149 my $verbosity = $self->{'verbosity'};
150 $verbosity = 0 unless $verbosity;
151
152 # Make sure the text has exactly one entry per line -
153 # append line to previous if it doesn't start with " <key> = "
154
155 my @input_lines=split(/\r?\n/, $$textref);
156 my @all_lines;
157 my $entry_line=shift @input_lines;
158 foreach my $input_line (@input_lines) {
159 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
160 # this is a new key
161 push(@all_lines, $entry_line);
162 $entry_line=$input_line;
163 } else {
164 # this is a continuation of previous line
165 $entry_line .= " " . $input_line;
166 }
167
168 }
169 # add final line, removing trailing '}'
170 $entry_line =~ s/\}\s*$//;
171 push(@all_lines, $entry_line);
172 push(@all_lines, "}");
173
174 # Read and process each line in the bib file.
175 my ($entryname, $name, $value, $line);
176 foreach $line (@all_lines) {
177
178 # Add each line. Most lines consist of a field identifer and
179 # then data, and we simply store them, though we treat some
180 # of the fields a bit differently.
181
182 $line =~ s/\s+/ /g;
183 $text .= "$line\n";
184
185
186 print "Processing line = $line \n" if $verbosity>=4;
187
188 # The first line is special, it contains the reference type and OID
189 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
190 $EntryType = $1;
191 my $EntryID = (defined $2) ? $2 : "default";
192 print "** $EntryType - \"$EntryID\" \n"
193 if ($verbosity >= 4);
194
195 next;
196 }
197 if ($line =~ /\@/) {
198 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
199 if ($verbosity >= 2);
200 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
201 if ($verbosity >= 2);
202 }
203
204 # otherwise, parse the metadata out of this line
205 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
206 $entryname = lc($1);
207 $value = $2;
208 $value =~ s/,?\s*$//; # remove trailing comma and space
209 if ($value =~ /^"/ && $value =~ /"$/) {
210 # remove surrounding " marks
211 $value =~ s/^"//; $value =~ s/"$//;
212 }
213 $value = &process_latex($value);
214
215 # Add this line of metadata
216 $metadata{$entryname} .= "$value";
217
218 }
219
220 # Add the Entry type as metadata
221 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
222
223 #Add the fileformat as metadata
224 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
225
226
227 # Add the various field as metadata
228 foreach my $entryname (keys %metadata) {
229 next unless (defined $field{$entryname});
230 next unless (defined $metadata{$entryname});
231
232 $name = $field{$entryname};
233 $value = $metadata{$entryname};
234
235 if ($name =~ /^Month/) {
236 $value=expand_month($value);
237 }
238
239 # Several special operatons on metadata follow
240
241 # Add individual keywords.
242 # The full set of keywords will be added, in due course, as "Keywords".
243 # However, we also want to add them as individual "Keyword" metadata elements.
244 if ($entryname eq "keywords") {
245 my @keywordlist = split(/,/, $value);
246 foreach my $k (@keywordlist) {
247 $k = lc($k);
248 $k =~ s/\s*$//;
249 $k =~ s/^\s*//;
250 if ($k =~ /\w/) {
251 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
252 }
253 }
254 }
255
256 # Add individual authors
257 # The author metadata will be stored as one "Creator" entry, but we
258 # also want to split it into several individual "Author" fields in
259 # "Lastname, Firstnames" format so we can browse it.
260 if ($entryname eq "author") { #added also comparison with editor
261
262 # take care of "et al."...
263 $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
264 my $etal=$1;
265 $etal="" if (!defined ($etal));
266 # und here for german language...
267 # don't use brackets in pattern, else the matched bit becomes
268 # an element in the list!
269 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
270 my @formattedlist = ();
271 foreach $a (@authorlist) {
272 $a =~ s/\s*$//;
273 $a =~ s/^\s*//;
274 # Reformat and add author name
275 next if $a=~ /^\s*$/;
276
277 # names are "First von Last", "von Last, First"
278 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
279 my $first="";
280 my $vonlast="";
281 my $jr="";
282
283 if ($a =~ /,/) {
284 my @parts=split(/,\s*/, $a);
285 $first = pop @parts;
286 if (scalar(@parts) == 2) {
287 $jr = pop @parts;
288 }
289 $vonlast=shift @parts;
290 if (scalar(@parts) > 0) {
291 print $outhandle "BibTexPlug: couldn't parse name $a\n";
292 # but we continue anyway...
293 }
294 } else { # First von Last
295 my @words = split(/ /, $a);
296 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
297 $first .= " " . shift (@words);
298 }
299 $first =~ s/^\s//;
300 $vonlast = join (' ', @words); # whatever's left...
301 }
302 my $von="";
303 my $last="";
304 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
305 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
306 $von = $1;
307 if (!defined ($von)) {
308 # some non-English names do start with lowercase
309 # eg "Marie desJardins". Also we can get typos...
310 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
311 $von="";
312 if ($vonlast =~ /^[a-z]+$/) {
313 # if it's all lowercase, uppercase 1st.
314 $vonlast =~ s/^(.)/\u$1/;
315
316 }
317 }
318 $von =~ s/\s*$//;
319 $last=$vonlast;
320 } else {
321 $last=$vonlast;
322 }
323 my $wholename="$first $von $last $jr";
324 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
325 push (@formattedlist, $wholename);
326 my $fullname = "$last";
327 $fullname .= " $jr" if ($jr);
328 $fullname .= ", $first";
329 $fullname .= " $von" if ($von);
330
331 # Add each name to set of Authors
332 # force utf8 pragma so that \w matches in this scope
333 use utf8;
334 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
335 }
336
337 # Only want at most one "and" in the Creator field
338 if (scalar(@formattedlist) > 2) {
339 my $lastauthor=pop @formattedlist;
340 $value=join(', ', @formattedlist);
341 $value.=" and $lastauthor";
342 } else { # 1 or 2 authors...
343 $value=join(" and ",@formattedlist);
344 }
345 $value.=$etal; # if there was "et al."
346 }
347
348 # Books and Journals are additionally marked for display purposes
349 if ($entryname eq "booktitle") {
350 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
351 } elsif ($entryname eq "journal") {
352 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
353 }
354
355
356 # Add the various fields as metadata
357 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
358 }
359
360 # Add Date (yyyymmdd) metadata
361 if (defined ($metadata{'year'}) ) {
362 my $date=$metadata{'year'};
363 chomp $date;
364 my $month=$metadata{'month'};
365 if (defined($month)) {
366 # month is currently 3 letter code or a range...
367 $month = expand_month($month);
368 # take the first month found... might not find one!
369 $month =~ m/_textmonth(\d\d)_/;
370 $month = $1;
371 }
372 if (!defined($month)) {
373 $month="00";
374 }
375 $date .= "${month}00";
376 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
377}
378
379# # Add the text in BibTex format (all fields)
380 if ($text =~ /\w/) {
381
382 $text =~ s@&@&amp;@g;
383 $text =~ s@<@&lt;@g;
384 $text =~ s@>@&gt;@g;
385 $text =~ s@\n@<br/>\n@g;
386 $text =~ s@\\@\\\\@g;
387
388 $doc_obj->add_utf8_text ($cursection, $text);
389 }
390
391 return 1;
392}
393
394
395
396
397# convert email addresses and URLs into links
398sub convert_urls_into_links{
399 my ($text) = @_;
400
401 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
402 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
403
404 return $text;
405}
406
407# Clean up whitespace and convert \n charaters to <BR> or <P>
408sub clean_up_whitespaces{
409 my ($text) = @_;
410
411 $text =~ s/%%%%%/<BR> <BR>/g;
412 $text =~ s/ +/ /g;
413 $text =~ s/\s*$//;
414 $text =~ s/^\s*//;
415 $text =~ s/\n/\n<BR>/g;
416 $text =~ s/<BR>\s*<BR>/<P>/g;
417
418 return $text;
419}
420
421
422sub convert_problem_characters_without_ampersand{
423 my ($text) = @_;
424 $text =~ s/</&lt;/g;
425 $text =~ s/>/&gt;/g;
426
427 $text =~ s/\'\'/\"/g; #Latex -specific conversion
428 $text =~ s/\`\`/\"/g; #Latex -specific conversion
429
430 $text =~ s/\"/&quot;/g;
431 $text =~ s/\'/&#8217;/g;
432 $text =~ s/\`/&#8216;/g;
433
434# $text =~ s/\+/ /g;
435# $text =~ s/\(/ /g;
436# $text =~ s/\)/ /g;
437
438 $text =~ s/\\/\\\\/g;
439
440# $text =~ s/\./\\\./g;
441
442 return $text;
443}
444
445# Convert a text string into HTML.
446
447# The HTML is going to be inserted into a GML file, so we have to be
448# careful not to use symbols like ">", which occurs frequently in email
449# messages (and use &gt instead.
450
451# This function also turns URLs and email addresses into links, and
452# replaces carriage returns with <BR> tags (and multiple carriage returns
453# with <P> tags).
454
455sub text_into_html {
456 my ($text) = @_;
457
458 # Convert problem characters into HTML symbols
459 $text =~ s/&/&amp;/g;
460
461 $text = &convert_problem_characters_without_ampersand( $text );
462
463 # convert email addresses and URLs into links
464 $text = &convert_urls_into_links( $text );
465
466 $text = &clean_up_whitespaces( $text );
467
468 return $text;
469}
470
471
472sub expand_month {
473 my $text=shift;
474
475 # bibtex style files expand abbreviations for months.
476 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
477 $text =~ s/jan/_textmonth01_/g;
478 $text =~ s/feb/_textmonth02_/g;
479 $text =~ s/mar/_textmonth03_/g;
480 $text =~ s/apr/_textmonth04_/g;
481 $text =~ s/may/_textmonth05_/g;
482 $text =~ s/jun/_textmonth06_/g;
483 $text =~ s/jul/_textmonth07_/g;
484 $text =~ s/aug/_textmonth08_/g;
485 $text =~ s/sep/_textmonth09_/g;
486 $text =~ s/oct/_textmonth10_/g;
487 $text =~ s/nov/_textmonth11_/g;
488 $text =~ s/dec/_textmonth12_/g;
489
490 return $text;
491}
492
493
494# Convert accented characters, remove { }, interprete some commands....
495# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
496sub process_latex {
497 my ($text) = @_;
498
499 # note - this is really ugly, but it works. There may be a prettier way
500 # of mapping latex accented chars to utf8, but we just brute force it here.
501 # Also, this isn't complete - not every single possible accented letter
502 # is in here yet, but most of the common ones are.
503
504 my %utf8_chars =
505 (
506 # acutes
507 '\'a' => chr(0xc3).chr(0xa1),
508 '\'c' => chr(0xc4).chr(0x87),
509 '\'e' => chr(0xc3).chr(0xa9),
510 '\'i' => chr(0xc3).chr(0xad),
511 '\'l' => chr(0xc3).chr(0xba),
512 '\'n' => chr(0xc3).chr(0x84),
513 '\'o' => chr(0xc3).chr(0xb3),
514 '\'r' => chr(0xc5).chr(0x95),
515 '\'s' => chr(0xc5).chr(0x9b),
516 '\'u' => chr(0xc3).chr(0xba),
517 '\'y' => chr(0xc3).chr(0xbd),
518 '\'z' => chr(0xc5).chr(0xba),
519 # graves
520 '`a' => chr(0xc3).chr(0xa0),
521 '`A' => chr(0xc3).chr(0x80),
522 '`e' => chr(0xc3).chr(0xa8),
523 '`E' => chr(0xc3).chr(0x88),
524 '`i' => chr(0xc3).chr(0xac),
525 '`I' => chr(0xc3).chr(0x8c),
526 '`o' => chr(0xc3).chr(0xb2),
527 '`O' => chr(0xc3).chr(0x92),
528 '`u' => chr(0xc3).chr(0xb9),
529 '`U' => chr(0xc3).chr(0x99),
530 # circumflex
531 '^a' => chr(0xc3).chr(0xa2),
532 '^A' => chr(0xc3).chr(0x82),
533 '^c' => chr(0xc4).chr(0x89),
534 '^C' => chr(0xc4).chr(0x88),
535 '^e' => chr(0xc3).chr(0xaa),
536 '^E' => chr(0xc3).chr(0x8a),
537 '^g' => chr(0xc4).chr(0x9d),
538 '^G' => chr(0xc4).chr(0x9c),
539 '^h' => chr(0xc4).chr(0xa5),
540 '^H' => chr(0xc4).chr(0xa4),
541 '^i' => chr(0xc3).chr(0xae),
542 '^I' => chr(0xc3).chr(0x8e),
543 '^j' => chr(0xc4).chr(0xb5),
544 '^J' => chr(0xc4).chr(0xb4),
545 '^o' => chr(0xc3).chr(0xb4),
546 '^O' => chr(0xc3).chr(0x94),
547 '^s' => chr(0xc5).chr(0x9d),
548 '^S' => chr(0xc5).chr(0x9c),
549 '^u' => chr(0xc3).chr(0xa2),
550 '^U' => chr(0xc3).chr(0xbb),
551 '^w' => chr(0xc5).chr(0xb5),
552 '^W' => chr(0xc5).chr(0xb4),
553 '^y' => chr(0xc5).chr(0xb7),
554 '^Y' => chr(0xc5).chr(0xb6),
555
556 # diaeresis
557 '"a' => chr(0xc3).chr(0xa4),
558 '"A' => chr(0xc3).chr(0x84),
559 '"e' => chr(0xc3).chr(0xab),
560 '"E' => chr(0xc3).chr(0x8b),
561 '"\\\\i' => chr(0xc3).chr(0xaf),
562 '"\\\\I' => chr(0xc3).chr(0x8f),
563 '"o' => chr(0xc3).chr(0xb6),
564 '"O' => chr(0xc3).chr(0x96),
565 '"u' => chr(0xc3).chr(0xbc),
566 '"U' => chr(0xc3).chr(0x9c),
567 '"y' => chr(0xc3).chr(0xbf),
568 '"Y' => chr(0xc3).chr(0xb8),
569 # tilde
570 '~A' => chr(0xc3).chr(0x83),
571 '~N' => chr(0xc3).chr(0x91),
572 '~O' => chr(0xc3).chr(0x95),
573 '~a' => chr(0xc3).chr(0xa3),
574 '~n' => chr(0xc3).chr(0xb1),
575 '~o' => chr(0xc3).chr(0xb5),
576 # caron - handled specially
577 # double acute
578 # ring
579 # dot
580 '.c' => chr(0xc4).chr(0x8b),
581 '.C' => chr(0xc4).chr(0x8a),
582 '.e' => chr(0xc4).chr(0x97),
583 '.E' => chr(0xc4).chr(0x96),
584 '.g' => chr(0xc4).chr(0xa1),
585 '.G' => chr(0xc4).chr(0xa0),
586 '.I' => chr(0xc4).chr(0xb0),
587 '.z' => chr(0xc5).chr(0xbc),
588 '.Z' => chr(0xc5).chr(0xbb),
589 # macron
590 '=a' => chr(0xc4).chr(0x81),
591 '=A' => chr(0xc4).chr(0x80),
592 '=e' => chr(0xc4).chr(0x93),
593 '=E' => chr(0xc4).chr(0x92),
594 '=i' => chr(0xc4).chr(0xab),
595 '=I' => chr(0xc4).chr(0xaa),
596 '=o' => chr(0xc4).chr(0x8d),
597 '=O' => chr(0xc4).chr(0x8c),
598 '=u' => chr(0xc4).chr(0xab),
599 '=U' => chr(0xc4).chr(0xaa),
600
601 # stroke - handled specially - see below
602
603 # cedilla - handled specially
604 );
605
606# these are one letter latex commands - we make sure they're not a longer
607# command name. eg {\d} is d+stroke, so careful of \d
608 my %special_utf8_chars =
609 (
610 # breve
611 'u g' => chr(0xc4).chr(0x9f),
612 'u G' => chr(0xc4).chr(0x9e),
613 'u i' => chr(0xc4).chr(0xad),
614 'u I' => chr(0xc4).chr(0xac),
615 'u o' => chr(0xc5).chr(0x8f),
616 'u O' => chr(0xc5).chr(0x8e),
617 'u u' => chr(0xc5).chr(0xad),
618 'u U' => chr(0xc5).chr(0xac),
619 # caron
620 'v c' => chr(0xc4).chr(0x8d),
621 'v C' => chr(0xc4).chr(0x8c),
622 'v n' => chr(0xc5).chr(0x88),
623 'v N' => chr(0xc5).chr(0x87),
624 'v s' => chr(0xc5).chr(0xa1),
625 'v S' => chr(0xc5).chr(0xa5),
626 'v z' => chr(0xc5).chr(0xbe),
627 'v Z' => chr(0xc5).chr(0xbd),
628 # cedilla
629 'c c' => chr(0xc3).chr(0xa7),
630 'c C' => chr(0xc3).chr(0x87),
631 'c g' => chr(0xc4).chr(0xa3),
632 'c G' => chr(0xc4).chr(0xa2),
633 'c k' => chr(0xc4).chr(0xb7),
634 'c K' => chr(0xc4).chr(0xb6),
635 'c l' => chr(0xc4).chr(0xbc),
636 'c L' => chr(0xc4).chr(0xbb),
637 'c n' => chr(0xc5).chr(0x86),
638 'c N' => chr(0xc5).chr(0x85),
639 'c r' => chr(0xc5).chr(0x97),
640 'c R' => chr(0xc5).chr(0x96),
641 'c s' => chr(0xc5).chr(0x9f),
642 'c S' => chr(0xc5).chr(0x9e),
643 'c t' => chr(0xc5).chr(0xa3),
644 'c T' => chr(0xc5).chr(0xa2),
645 # double acute / Hungarian accent
646 'H O' => chr(0xc5).chr(0x90),
647 'H o' => chr(0xc5).chr(0x91),
648 'H U' => chr(0xc5).chr(0xb0),
649 'H u' => chr(0xc5).chr(0xb1),
650
651 # stroke
652 'd' => chr(0xc4).chr(0x91),
653 'D' => chr(0xc4).chr(0x90),
654 'h' => chr(0xc4).chr(0xa7),
655# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
656 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
657 'l' => chr(0xc5).chr(0x82),
658 'L' => chr(0xc5).chr(0x81),
659 'o' => chr(0xc3).chr(0xb8),
660 'O' => chr(0xc3).chr(0x98),
661 't' => chr(0xc5).chr(0xa7),
662 'T' => chr(0xc5).chr(0xa6),
663 # german ss/szlig/sharp s
664 'ss' => chr(0xc3).chr(0x9f),
665 );
666
667 # convert latex-style accented characters.
668
669 # remove space (if any) between \ and letter to accent (eg {\' a})
670
671 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
672
673 # remove {} around a single character (eg \'{e})
674 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
675
676 # \, is another way of doing cedilla \c
677 $text =~ s@\\,(.)@\\c $1@g;
678
679 # remove {} around a single character for special 1 letter commands -
680 # need to insert a space. Eg \v{s} -> {\v s}
681 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
682
683 # only do if the text contains a '\' character.
684 if ($text =~ m|\\|) {
685 # "normal" accents - ie non-alpha latex tag
686 while ($text =~ m@\\([`'="^~\.])([\w])@) {
687 my $tex="$1$2"; my $char="$2";
688 my $replacement=$utf8_chars{$tex};
689 if (!defined($replacement)) {
690 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
691 $replacement=$char;
692 }
693 $text =~ s/\\\Q$tex/$replacement/g;
694 }
695
696 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
697 # only do the change if immediately followed by a space, }, {, or \
698 # one letter accents ( + ss)
699 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
700 my $tex=$1;
701 my $replacement=$special_utf8_chars{$tex};
702 if (!defined($replacement)) {
703 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
704 $replacement=$tex;
705 }
706 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
707
708 }
709
710 # one letter latex accent commands that affect following letter
711 while ($text =~ m@\\([uvcH]) ([\w])@) {
712 my $tex="$1 $2"; my $char="$2";
713 my $replacement=$special_utf8_chars{$tex};
714 if (!defined($replacement)) {
715 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
716 $replacement=$char;
717 }
718 $text =~ s/\\$tex/$replacement/g;
719 }
720 }
721
722 # escape html-sensitive characters
723 $text =~ s@&@&amp;@g;
724 $text =~ s@<@&lt;@g;
725 $text =~ s@>@&gt;@g;
726 $text =~ s/''/"/g; # Latex-specific
727 $text =~ s/``/"/g; # Latex-specific
728 # greenstone-specific
729 $text =~ s@\[@&\#91;@g;
730 $text =~ s@\]@&\#93;@g;
731
732 # remove latex commands
733
734 # explicitly recognised commands
735 $text =~ s@\\ldots@&hellip;@g;
736
737 # maths mode
738 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
739
740 # remove all other commands with optional arguments...
741 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
742 # $text =~ s@\\noopsort{[^}]+\}@@g;
743 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
744
745 # remove latex groupings { } (but not \{ or \} )
746 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
747 $text =~ s/^\{//; # remove { if first char
748
749 # latex characters
750 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
751 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
752 # optional break "\-"
753 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
754 # the non-macro bits have quotes around them - we just remove quotes
755 $text =~ s/([^&])[\"\#]/$1/g;
756 }
757 # dashes. Convert (m|n)-dash into single dash for html.
758 $text =~ s@\-\-+@\-@g;
759
760 # quoted { } chars
761 $text =~ s@\\{@{@g;
762 $text =~ s@\\}@}@g;
763
764 # finally to protect against macro language...
765 $text =~ s@\\@\\\\@g;
766
767 return $text;
768}
769
770
771sub process_latex_math {
772 my $text = shift;
773
774 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
775 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
776 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
777 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
778 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
779
780 # put all other command names in italics
781 $text =~ s@\\([\w]+)@<i>$1</i>@g;
782
783 return $text;
784}
785
786sub set_OID {
787 my $self = shift (@_);
788 my ($doc_obj, $id, $segment_number) = @_;
789
790 if ( $self->{'key'} eq "default") {
791 $doc_obj->set_OID("$id\_$segment_number");
792 } else {
793 $doc_obj->set_OID($self->{'key'});
794 }
795}
796
7971;
798
799
Note: See TracBrowser for help on using the repository browser.