source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 9582

Last change on this file since 9582 was 9582, checked in by jrm21, 19 years ago

couple of fixes:

" and others" in the author field => 'et. al.'
make whitespace optional around the = sign

use the booktitle or journal name if the entry has no 'title'

  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53my $arguments =
54 [ { 'name' => "process_exp",
55 'desc' => "{BasPlug.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitPlug.split_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_split_exp(),
63 'reqd' => "no" }
64 ];
65
66my $options = { 'name' => "BibTexPlug",
67 'desc' => "{BibTexPlug.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'explodes' => "yes",
71 'args' => $arguments };
72
73# This plugin processes files with the suffix ".bib"
74sub get_default_process_exp {
75 return q^(?i)\.bib$^;
76}
77
78# This plugin splits the input text at blank lines
79sub get_default_split_exp {
80 return q^\n+(?=@)^;
81}
82sub new {
83 my $class = shift (@_);
84 my $self = new SplitPlug ($class, @_);
85 $self->{'plugin_type'} = "BibTexPlug";
86 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
87 my $option_list = $self->{'option_list'};
88 push( @{$option_list}, $options );
89 return bless $self, $class;
90}
91
92
93
94# The process function reads a single bibliographic record and stores
95# it as a new document.
96
97sub process {
98 my $self = shift (@_);
99 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
100 my $outhandle = $self->{'outhandle'};
101
102 $self->{'key'} = "default";
103
104 # Check that we're dealing with a valid BibTex record
105 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
106
107 # Ignore things we can't use
108 return 0 if ($$textref =~ /^\@String/);
109
110 # Report that we're processing the file
111 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
112 print $outhandle "BibTexPlug: processing $file\n"
113 if ($self->{'verbosity'}) > 1;
114
115
116 # This hash translates BibTex field names into metadata names. The
117 # BibTex names are taken from the "Local Guide to Latex" Graeme
118 # McKinstry. Metadata names are consistent with ReferPlug.
119
120 # The author metadata will be stored as one "Creator" entry, but will
121 # also be split up into several individual "Author" fields.
122
123 my %field = (
124 'address', 'PublisherAddress',
125 'author', 'Creator',
126
127 'booktitle', 'Booktitle',
128 'chapter', 'Chapter',
129 'edition', 'Edition',
130 'editor', 'Editor',
131 'institution', 'Publisher',
132 'journal', 'Journal',
133 'month', 'Month',
134 'number', 'Number',
135 'pages', 'Pages',
136 'publisher', 'Publisher',
137 'school', 'Publisher',
138 'title', 'Title',
139 'volume', 'Volume',
140 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
141
142 'keywords', 'Keywords',
143 'abstract', 'Abstract',
144 'copyright', 'Copyright',
145 'note', 'Note',
146 );
147
148 # Metadata fields
149 my %metadata;
150 my ($EntryType, $Creator, $Keywords, $text);
151
152 my $verbosity = $self->{'verbosity'};
153 $verbosity = 0 unless $verbosity;
154
155 # Make sure the text has exactly one entry per line -
156 # append line to previous if it doesn't start with " <key> = "
157
158 my @input_lines=split(/\r?\n/, $$textref);
159 my @all_lines;
160 my $entry_line=shift @input_lines;
161 foreach my $input_line (@input_lines) {
162 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
163 # this is a new key
164 push(@all_lines, $entry_line);
165 $entry_line=$input_line;
166 } else {
167 # this is a continuation of previous line
168 $entry_line .= " " . $input_line;
169 }
170
171 }
172 # add final line, removing trailing '}'
173 $entry_line =~ s/\}\s*$//;
174 push(@all_lines, $entry_line);
175 push(@all_lines, "}");
176
177 # Read and process each line in the bib file.
178 my ($entryname, $name, $value, $line);
179 foreach $line (@all_lines) {
180
181 # Add each line. Most lines consist of a field identifer and
182 # then data, and we simply store them, though we treat some
183 # of the fields a bit differently.
184
185 $line =~ s/\s+/ /g;
186 $text .= "$line\n";
187
188
189 print "Processing line = $line \n" if $verbosity>=4;
190
191 # The first line is special, it contains the reference type and OID
192 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
193 $EntryType = $1;
194 my $EntryID = (defined $2) ? $2 : "default";
195 print "** $EntryType - \"$EntryID\" \n"
196 if ($verbosity >= 4);
197
198 next;
199 }
200 if ($line =~ /\@/) {
201 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
202 if ($verbosity >= 2);
203 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
204 if ($verbosity >= 2);
205 }
206
207 # otherwise, parse the metadata out of this line
208 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
209 $entryname = lc($1);
210 $value = $2;
211 $value =~ s/,?\s*$//; # remove trailing comma and space
212 if ($value =~ /^"/ && $value =~ /"$/) {
213 # remove surrounding " marks
214 $value =~ s/^"//; $value =~ s/"$//;
215 }
216 $value = &process_latex($value);
217
218 # Add this line of metadata
219 $metadata{$entryname} .= "$value";
220
221 }
222
223 # Add the Entry type as metadata
224 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
225
226 #Add the fileformat as metadata
227 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
228
229
230 # Add the various field as metadata
231 foreach my $entryname (keys %metadata) {
232 next unless (defined $field{$entryname});
233 next unless (defined $metadata{$entryname});
234
235 $name = $field{$entryname};
236 $value = $metadata{$entryname};
237
238 if ($name =~ /^Month/) {
239 $value=expand_month($value);
240 }
241
242 # Several special operatons on metadata follow
243
244 # Add individual keywords.
245 # The full set of keywords will be added, in due course, as "Keywords".
246 # However, we also want to add them as individual "Keyword" metadata elements.
247 if ($entryname eq "keywords") {
248 my @keywordlist = split(/,/, $value);
249 foreach my $k (@keywordlist) {
250 $k = lc($k);
251 $k =~ s/\s*$//;
252 $k =~ s/^\s*//;
253 if ($k =~ /\w/) {
254 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
255 }
256 }
257 }
258
259 # Add individual authors
260 # The author metadata will be stored as one "Creator" entry, but we
261 # also want to split it into several individual "Author" fields in
262 # "Lastname, Firstnames" format so we can browse it.
263 if ($entryname eq "author") { #added also comparison with editor
264
265 # take care of "et al."...
266 my $etal='';
267 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
268 $etal=' <em>et. al.</em>';
269 }
270 # und here for german language...
271 # don't use brackets in pattern, else the matched bit becomes
272 # an element in the list!
273 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
274 my @formattedlist = ();
275 foreach $a (@authorlist) {
276 $a =~ s/\s*$//;
277 $a =~ s/^\s*//;
278 # Reformat and add author name
279 next if $a=~ /^\s*$/;
280
281 # names are "First von Last", "von Last, First"
282 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
283 my $first="";
284 my $vonlast="";
285 my $jr="";
286
287 if ($a =~ /,/) {
288 my @parts=split(/,\s*/, $a);
289 $first = pop @parts;
290 if (scalar(@parts) == 2) {
291 $jr = pop @parts;
292 }
293 $vonlast=shift @parts;
294 if (scalar(@parts) > 0) {
295 print $outhandle "BibTexPlug: couldn't parse name $a\n";
296 # but we continue anyway...
297 }
298 } else { # First von Last
299 my @words = split(/ /, $a);
300 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
301 $first .= " " . shift (@words);
302 }
303 $first =~ s/^\s//;
304 $vonlast = join (' ', @words); # whatever's left...
305 }
306 my $von="";
307 my $last="";
308 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
309 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
310 $von = $1;
311 if (!defined ($von)) {
312 # some non-English names do start with lowercase
313 # eg "Marie desJardins". Also we can get typos...
314 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
315 $von="";
316 if ($vonlast =~ /^[a-z]+$/) {
317 # if it's all lowercase, uppercase 1st.
318 $vonlast =~ s/^(.)/\u$1/;
319
320 }
321 }
322 $von =~ s/\s*$//;
323 $last=$vonlast;
324 } else {
325 $last=$vonlast;
326 }
327 my $wholename="$first $von $last $jr";
328 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
329 $wholename =~ s/ $//;
330 push (@formattedlist, $wholename);
331 my $fullname = "$last";
332 $fullname .= " $jr" if ($jr);
333 $fullname .= ", $first";
334 $fullname .= " $von" if ($von);
335
336 # Add each name to set of Authors
337 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
338 }
339
340 # Only want at most one "and" in the Creator field
341 if (scalar(@formattedlist) > 2) {
342 my $lastauthor=pop @formattedlist;
343 $value=join(', ', @formattedlist);
344 $value.=" and $lastauthor";
345 } else { # 1 or 2 authors...
346 $value=join(" and ",@formattedlist);
347 }
348 $value.=$etal; # if there was "et al."
349 }
350
351 # Books and Journals are additionally marked for display purposes
352 if ($entryname eq "booktitle") {
353 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
354 } elsif ($entryname eq "journal") {
355 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
356 }
357
358 # Add the various fields as metadata
359 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
360 }
361
362 # for books and journals...
363 if (!exists $metadata{'title'}) {
364 my $name=$field{'title'}; # get Greenstone metadata name
365 my $value;
366 if (exists $metadata{'booktitle'}) {
367 $value=$metadata{'booktitle'};
368 } elsif (exists $metadata{'journal'}) {
369 $value=$metadata{'journal'};
370 }
371 if ($value) {
372 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
373 }
374 }
375
376 # Add Date (yyyymmdd) metadata
377 if (defined ($metadata{'year'}) ) {
378 my $date=$metadata{'year'};
379 chomp $date;
380 my $month=$metadata{'month'};
381 if (defined($month)) {
382 # month is currently 3 letter code or a range...
383 $month = expand_month($month);
384 # take the first month found... might not find one!
385 $month =~ m/_textmonth(\d\d)_/;
386 $month = $1;
387 }
388 if (!defined($month)) {
389 $month="00";
390 }
391 $date .= "${month}00";
392 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
393}
394
395# # Add the text in BibTex format (all fields)
396 if ($text =~ /\w/) {
397
398 $text =~ s@&@&amp;@g;
399 $text =~ s@<@&lt;@g;
400 $text =~ s@>@&gt;@g;
401 $text =~ s@\n@<br/>\n@g;
402 $text =~ s@\\@\\\\@g;
403
404 $doc_obj->add_utf8_text ($cursection, $text);
405 }
406
407 return 1;
408}
409
410
411
412
413# convert email addresses and URLs into links
414sub convert_urls_into_links{
415 my ($text) = @_;
416
417 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
418 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
419
420 return $text;
421}
422
423# Clean up whitespace and convert \n charaters to <BR> or <P>
424sub clean_up_whitespaces{
425 my ($text) = @_;
426
427 $text =~ s/%%%%%/<BR> <BR>/g;
428 $text =~ s/ +/ /g;
429 $text =~ s/\s*$//;
430 $text =~ s/^\s*//;
431 $text =~ s/\n/\n<BR>/g;
432 $text =~ s/<BR>\s*<BR>/<P>/g;
433
434 return $text;
435}
436
437
438sub convert_problem_characters_without_ampersand{
439 my ($text) = @_;
440 $text =~ s/</&lt;/g;
441 $text =~ s/>/&gt;/g;
442
443 $text =~ s/\'\'/\"/g; #Latex -specific conversion
444 $text =~ s/\`\`/\"/g; #Latex -specific conversion
445
446 $text =~ s/\"/&quot;/g;
447 $text =~ s/\'/&#8217;/g;
448 $text =~ s/\`/&#8216;/g;
449
450# $text =~ s/\+/ /g;
451# $text =~ s/\(/ /g;
452# $text =~ s/\)/ /g;
453
454 $text =~ s/\\/\\\\/g;
455
456# $text =~ s/\./\\\./g;
457
458 return $text;
459}
460
461# Convert a text string into HTML.
462
463# The HTML is going to be inserted into a GML file, so we have to be
464# careful not to use symbols like ">", which occurs frequently in email
465# messages (and use &gt instead.
466
467# This function also turns URLs and email addresses into links, and
468# replaces carriage returns with <BR> tags (and multiple carriage returns
469# with <P> tags).
470
471sub text_into_html {
472 my ($text) = @_;
473
474 # Convert problem characters into HTML symbols
475 $text =~ s/&/&amp;/g;
476
477 $text = &convert_problem_characters_without_ampersand( $text );
478
479 # convert email addresses and URLs into links
480 $text = &convert_urls_into_links( $text );
481
482 $text = &clean_up_whitespaces( $text );
483
484 return $text;
485}
486
487
488sub expand_month {
489 my $text=shift;
490
491 # bibtex style files expand abbreviations for months.
492 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
493 $text =~ s/jan/_textmonth01_/g;
494 $text =~ s/feb/_textmonth02_/g;
495 $text =~ s/mar/_textmonth03_/g;
496 $text =~ s/apr/_textmonth04_/g;
497 $text =~ s/may/_textmonth05_/g;
498 $text =~ s/jun/_textmonth06_/g;
499 $text =~ s/jul/_textmonth07_/g;
500 $text =~ s/aug/_textmonth08_/g;
501 $text =~ s/sep/_textmonth09_/g;
502 $text =~ s/oct/_textmonth10_/g;
503 $text =~ s/nov/_textmonth11_/g;
504 $text =~ s/dec/_textmonth12_/g;
505
506 return $text;
507}
508
509
510# Convert accented characters, remove { }, interprete some commands....
511# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
512sub process_latex {
513 my ($text) = @_;
514
515 # note - this is really ugly, but it works. There may be a prettier way
516 # of mapping latex accented chars to utf8, but we just brute force it here.
517 # Also, this isn't complete - not every single possible accented letter
518 # is in here yet, but most of the common ones are.
519
520 my %utf8_chars =
521 (
522 # acutes
523 '\'a' => chr(0xc3).chr(0xa1),
524 '\'c' => chr(0xc4).chr(0x87),
525 '\'e' => chr(0xc3).chr(0xa9),
526 '\'i' => chr(0xc3).chr(0xad),
527 '\'l' => chr(0xc3).chr(0xba),
528 '\'n' => chr(0xc3).chr(0x84),
529 '\'o' => chr(0xc3).chr(0xb3),
530 '\'r' => chr(0xc5).chr(0x95),
531 '\'s' => chr(0xc5).chr(0x9b),
532 '\'u' => chr(0xc3).chr(0xba),
533 '\'y' => chr(0xc3).chr(0xbd),
534 '\'z' => chr(0xc5).chr(0xba),
535 # graves
536 '`a' => chr(0xc3).chr(0xa0),
537 '`A' => chr(0xc3).chr(0x80),
538 '`e' => chr(0xc3).chr(0xa8),
539 '`E' => chr(0xc3).chr(0x88),
540 '`i' => chr(0xc3).chr(0xac),
541 '`I' => chr(0xc3).chr(0x8c),
542 '`o' => chr(0xc3).chr(0xb2),
543 '`O' => chr(0xc3).chr(0x92),
544 '`u' => chr(0xc3).chr(0xb9),
545 '`U' => chr(0xc3).chr(0x99),
546 # circumflex
547 '^a' => chr(0xc3).chr(0xa2),
548 '^A' => chr(0xc3).chr(0x82),
549 '^c' => chr(0xc4).chr(0x89),
550 '^C' => chr(0xc4).chr(0x88),
551 '^e' => chr(0xc3).chr(0xaa),
552 '^E' => chr(0xc3).chr(0x8a),
553 '^g' => chr(0xc4).chr(0x9d),
554 '^G' => chr(0xc4).chr(0x9c),
555 '^h' => chr(0xc4).chr(0xa5),
556 '^H' => chr(0xc4).chr(0xa4),
557 '^i' => chr(0xc3).chr(0xae),
558 '^I' => chr(0xc3).chr(0x8e),
559 '^j' => chr(0xc4).chr(0xb5),
560 '^J' => chr(0xc4).chr(0xb4),
561 '^o' => chr(0xc3).chr(0xb4),
562 '^O' => chr(0xc3).chr(0x94),
563 '^s' => chr(0xc5).chr(0x9d),
564 '^S' => chr(0xc5).chr(0x9c),
565 '^u' => chr(0xc3).chr(0xa2),
566 '^U' => chr(0xc3).chr(0xbb),
567 '^w' => chr(0xc5).chr(0xb5),
568 '^W' => chr(0xc5).chr(0xb4),
569 '^y' => chr(0xc5).chr(0xb7),
570 '^Y' => chr(0xc5).chr(0xb6),
571
572 # diaeresis
573 '"a' => chr(0xc3).chr(0xa4),
574 '"A' => chr(0xc3).chr(0x84),
575 '"e' => chr(0xc3).chr(0xab),
576 '"E' => chr(0xc3).chr(0x8b),
577 '"\\\\i' => chr(0xc3).chr(0xaf),
578 '"\\\\I' => chr(0xc3).chr(0x8f),
579 '"o' => chr(0xc3).chr(0xb6),
580 '"O' => chr(0xc3).chr(0x96),
581 '"u' => chr(0xc3).chr(0xbc),
582 '"U' => chr(0xc3).chr(0x9c),
583 '"y' => chr(0xc3).chr(0xbf),
584 '"Y' => chr(0xc3).chr(0xb8),
585 # tilde
586 '~A' => chr(0xc3).chr(0x83),
587 '~N' => chr(0xc3).chr(0x91),
588 '~O' => chr(0xc3).chr(0x95),
589 '~a' => chr(0xc3).chr(0xa3),
590 '~n' => chr(0xc3).chr(0xb1),
591 '~o' => chr(0xc3).chr(0xb5),
592 # caron - handled specially
593 # double acute
594 # ring
595 # dot
596 '.c' => chr(0xc4).chr(0x8b),
597 '.C' => chr(0xc4).chr(0x8a),
598 '.e' => chr(0xc4).chr(0x97),
599 '.E' => chr(0xc4).chr(0x96),
600 '.g' => chr(0xc4).chr(0xa1),
601 '.G' => chr(0xc4).chr(0xa0),
602 '.I' => chr(0xc4).chr(0xb0),
603 '.z' => chr(0xc5).chr(0xbc),
604 '.Z' => chr(0xc5).chr(0xbb),
605 # macron
606 '=a' => chr(0xc4).chr(0x81),
607 '=A' => chr(0xc4).chr(0x80),
608 '=e' => chr(0xc4).chr(0x93),
609 '=E' => chr(0xc4).chr(0x92),
610 '=i' => chr(0xc4).chr(0xab),
611 '=I' => chr(0xc4).chr(0xaa),
612 '=o' => chr(0xc4).chr(0x8d),
613 '=O' => chr(0xc4).chr(0x8c),
614 '=u' => chr(0xc4).chr(0xab),
615 '=U' => chr(0xc4).chr(0xaa),
616
617 # stroke - handled specially - see below
618
619 # cedilla - handled specially
620 );
621
622# these are one letter latex commands - we make sure they're not a longer
623# command name. eg {\d} is d+stroke, so careful of \d
624 my %special_utf8_chars =
625 (
626 # breve
627 'u g' => chr(0xc4).chr(0x9f),
628 'u G' => chr(0xc4).chr(0x9e),
629 'u i' => chr(0xc4).chr(0xad),
630 'u I' => chr(0xc4).chr(0xac),
631 'u o' => chr(0xc5).chr(0x8f),
632 'u O' => chr(0xc5).chr(0x8e),
633 'u u' => chr(0xc5).chr(0xad),
634 'u U' => chr(0xc5).chr(0xac),
635 # caron
636 'v c' => chr(0xc4).chr(0x8d),
637 'v C' => chr(0xc4).chr(0x8c),
638 'v n' => chr(0xc5).chr(0x88),
639 'v N' => chr(0xc5).chr(0x87),
640 'v s' => chr(0xc5).chr(0xa1),
641 'v S' => chr(0xc5).chr(0xa5),
642 'v z' => chr(0xc5).chr(0xbe),
643 'v Z' => chr(0xc5).chr(0xbd),
644 # cedilla
645 'c c' => chr(0xc3).chr(0xa7),
646 'c C' => chr(0xc3).chr(0x87),
647 'c g' => chr(0xc4).chr(0xa3),
648 'c G' => chr(0xc4).chr(0xa2),
649 'c k' => chr(0xc4).chr(0xb7),
650 'c K' => chr(0xc4).chr(0xb6),
651 'c l' => chr(0xc4).chr(0xbc),
652 'c L' => chr(0xc4).chr(0xbb),
653 'c n' => chr(0xc5).chr(0x86),
654 'c N' => chr(0xc5).chr(0x85),
655 'c r' => chr(0xc5).chr(0x97),
656 'c R' => chr(0xc5).chr(0x96),
657 'c s' => chr(0xc5).chr(0x9f),
658 'c S' => chr(0xc5).chr(0x9e),
659 'c t' => chr(0xc5).chr(0xa3),
660 'c T' => chr(0xc5).chr(0xa2),
661 # double acute / Hungarian accent
662 'H O' => chr(0xc5).chr(0x90),
663 'H o' => chr(0xc5).chr(0x91),
664 'H U' => chr(0xc5).chr(0xb0),
665 'H u' => chr(0xc5).chr(0xb1),
666
667 # stroke
668 'd' => chr(0xc4).chr(0x91),
669 'D' => chr(0xc4).chr(0x90),
670 'h' => chr(0xc4).chr(0xa7),
671# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
672 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
673 'l' => chr(0xc5).chr(0x82),
674 'L' => chr(0xc5).chr(0x81),
675 'o' => chr(0xc3).chr(0xb8),
676 'O' => chr(0xc3).chr(0x98),
677 't' => chr(0xc5).chr(0xa7),
678 'T' => chr(0xc5).chr(0xa6),
679 # german ss/szlig/sharp s
680 'ss' => chr(0xc3).chr(0x9f),
681 );
682
683 # convert latex-style accented characters.
684
685 # remove space (if any) between \ and letter to accent (eg {\' a})
686
687 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
688
689 # remove {} around a single character (eg \'{e})
690 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
691
692 # \, is another way of doing cedilla \c
693 $text =~ s@\\,(.)@\\c $1@g;
694
695 # remove {} around a single character for special 1 letter commands -
696 # need to insert a space. Eg \v{s} -> {\v s}
697 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
698
699 # only do if the text contains a '\' character.
700 if ($text =~ m|\\|) {
701 # "normal" accents - ie non-alpha latex tag
702 while ($text =~ m@\\([`'="^~\.])([\w])@) {
703 my $tex="$1$2"; my $char="$2";
704 my $replacement=$utf8_chars{$tex};
705 if (!defined($replacement)) {
706 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
707 $replacement=$char;
708 }
709 $text =~ s/\\\Q$tex/$replacement/g;
710 }
711
712 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
713 # only do the change if immediately followed by a space, }, {, or \
714 # one letter accents ( + ss)
715 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
716 my $tex=$1;
717 my $replacement=$special_utf8_chars{$tex};
718 if (!defined($replacement)) {
719 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
720 $replacement=$tex;
721 }
722 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
723
724 }
725
726 # one letter latex accent commands that affect following letter
727 while ($text =~ m@\\([uvcH]) ([\w])@) {
728 my $tex="$1 $2"; my $char="$2";
729 my $replacement=$special_utf8_chars{$tex};
730 if (!defined($replacement)) {
731 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
732 $replacement=$char;
733 }
734 $text =~ s/\\$tex/$replacement/g;
735 }
736 }
737
738 # escape html-sensitive characters
739 $text =~ s@&@&amp;@g;
740 $text =~ s@<@&lt;@g;
741 $text =~ s@>@&gt;@g;
742 $text =~ s/''/"/g; # Latex-specific
743 $text =~ s/``/"/g; # Latex-specific
744 # greenstone-specific
745 $text =~ s@\[@&\#91;@g;
746 $text =~ s@\]@&\#93;@g;
747
748 # remove latex commands
749
750 # explicitly recognised commands
751 $text =~ s@\\ldots@&hellip;@g;
752
753 # maths mode
754 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
755
756 # remove all other commands with optional arguments...
757 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
758 # $text =~ s@\\noopsort{[^}]+\}@@g;
759 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
760
761 # remove latex groupings { } (but not \{ or \} )
762 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
763 $text =~ s/^\{//; # remove { if first char
764
765 # latex characters
766 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
767 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
768 # optional break "\-"
769 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
770 # the non-macro bits have quotes around them - we just remove quotes
771 $text =~ s/([^&])[\"\#]/$1/g;
772 }
773 # dashes. Convert (m|n)-dash into single dash for html.
774 $text =~ s@\-\-+@\-@g;
775
776 # quoted { } chars
777 $text =~ s@\\{@{@g;
778 $text =~ s@\\}@}@g;
779
780 # finally to protect against macro language...
781 $text =~ s@\\@\\\\@g;
782
783 return $text;
784}
785
786
787sub process_latex_math {
788 my $text = shift;
789
790 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
791 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
792 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
793 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
794 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
795
796 # put all other command names in italics
797 $text =~ s@\\([\w]+)@<i>$1</i>@g;
798
799 return $text;
800}
801
802sub set_OID {
803 my $self = shift (@_);
804 my ($doc_obj, $id, $segment_number) = @_;
805
806 if ( $self->{'key'} eq "default") {
807 $doc_obj->set_OID("$id\_$segment_number");
808 } else {
809 $doc_obj->set_OID($self->{'key'});
810 }
811}
812
8131;
814
815
Note: See TracBrowser for help on using the repository browser.