source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 23.3 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
52 @BibTexPlug::ISA = ('SplitPlug');
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasPlug.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
67
68my $options = { 'name' => "BibTexPlug",
69 'desc' => "{BibTexPlug.desc}",
70 'abstract' => "no",
71 'inherits' => "yes",
72 'explodes' => "yes",
73 'args' => $arguments };
74
75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
84sub new {
85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
93
94 return bless $self, $class;
95}
96
97
98
99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105 my $outhandle = $self->{'outhandle'};
106
107 my $cursection = $doc_obj->get_top_section();
108 $self->{'key'} = "default";
109
110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
122 # This hash translates BibTex field names into metadata names. The
123 # BibTex names are taken from the "Local Guide to Latex" Graeme
124 # McKinstry. Metadata names are consistent with ReferPlug.
125
126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
132
133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
136 'editor', 'Editor',
137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
150 'copyright', 'Copyright',
151 'note', 'Note',
152 );
153
154 # Metadata fields
155 my %metadata;
156 my ($EntryType, $Creator, $Keywords, $text);
157
158 my $verbosity = $self->{'verbosity'};
159 $verbosity = 0 unless $verbosity;
160
161 # Make sure the text has exactly one entry per line -
162 # append line to previous if it doesn't start with " <key> = "
163
164 my @input_lines=split(/\r?\n/, $$textref);
165 my @all_lines;
166 my $entry_line=shift @input_lines;
167 foreach my $input_line (@input_lines) {
168 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
169 # this is a new key
170 push(@all_lines, $entry_line);
171 $entry_line=$input_line;
172 } else {
173 # this is a continuation of previous line
174 $entry_line .= " " . $input_line;
175 }
176
177 }
178 # add final line, removing trailing '}'
179 $entry_line =~ s/\}\s*$//;
180 push(@all_lines, $entry_line);
181 push(@all_lines, "}");
182
183 # Read and process each line in the bib file.
184 my ($entryname, $name, $value, $line);
185 foreach $line (@all_lines) {
186
187 # Add each line. Most lines consist of a field identifer and
188 # then data, and we simply store them, though we treat some
189 # of the fields a bit differently.
190
191 $line =~ s/\s+/ /g;
192 $text .= "$line\n";
193
194
195 print "Processing line = $line \n" if $verbosity>=4;
196
197 # The first line is special, it contains the reference type and OID
198 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
199 $EntryType = $1;
200 my $EntryID = (defined $2) ? $2 : "default";
201 print "** $EntryType - \"$EntryID\" \n"
202 if ($verbosity >= 4);
203
204 next;
205 }
206 if ($line =~ /\@/) {
207 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
208 if ($verbosity >= 2);
209 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
210 if ($verbosity >= 2);
211 }
212
213 # otherwise, parse the metadata out of this line
214 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
215 $entryname = lc($1);
216 $value = $2;
217 $value =~ s/,?\s*$//; # remove trailing comma and space
218 if ($value =~ /^"/ && $value =~ /"$/) {
219 # remove surrounding " marks
220 $value =~ s/^"//; $value =~ s/"$//;
221 }
222 $value = &process_latex($value);
223
224 # Add this line of metadata
225 $metadata{$entryname} .= "$value";
226
227 }
228
229 # Add the Entry type as metadata
230 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
231
232 #Add the fileformat as metadata
233 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
234
235
236 # Add the various field as metadata
237 foreach my $entryname (keys %metadata) {
238 next unless (defined $field{$entryname});
239 next unless (defined $metadata{$entryname});
240
241 $name = $field{$entryname};
242 $value = $metadata{$entryname};
243
244 if ($name =~ /^Month/) {
245 $value=expand_month($value);
246 }
247
248 # Several special operatons on metadata follow
249
250 # Add individual keywords.
251 # The full set of keywords will be added, in due course, as "Keywords".
252 # However, we also want to add them as individual "Keyword" metadata elements.
253 if ($entryname eq "keywords") {
254 my @keywordlist = split(/,/, $value);
255 foreach my $k (@keywordlist) {
256 $k = lc($k);
257 $k =~ s/\s*$//;
258 $k =~ s/^\s*//;
259 if ($k =~ /\w/) {
260 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
261 }
262 }
263 }
264
265 # Add individual authors
266 # The author metadata will be stored as one "Creator" entry, but we
267 # also want to split it into several individual "Author" fields in
268 # "Lastname, Firstnames" format so we can browse it.
269 if ($entryname eq "author") { #added also comparison with editor
270
271 # take care of "et al."...
272 my $etal='';
273 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
274 $etal=' <em>et. al.</em>';
275 }
276 # und here for german language...
277 # don't use brackets in pattern, else the matched bit becomes
278 # an element in the list!
279 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
280 my @formattedlist = ();
281 foreach $a (@authorlist) {
282 $a =~ s/\s*$//;
283 $a =~ s/^\s*//;
284 # Reformat and add author name
285 next if $a=~ /^\s*$/;
286
287 # names are "First von Last", "von Last, First"
288 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
289 my $first="";
290 my $vonlast="";
291 my $jr="";
292
293 if ($a =~ /,/) {
294 my @parts=split(/,\s*/, $a);
295 $first = pop @parts;
296 if (scalar(@parts) == 2) {
297 $jr = pop @parts;
298 }
299 $vonlast=shift @parts;
300 if (scalar(@parts) > 0) {
301 print $outhandle "BibTexPlug: couldn't parse name $a\n";
302 # but we continue anyway...
303 }
304 } else { # First von Last
305 my @words = split(/ /, $a);
306 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
307 $first .= " " . shift (@words);
308 }
309 $first =~ s/^\s//;
310 $vonlast = join (' ', @words); # whatever's left...
311 }
312 my $von="";
313 my $last="";
314 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
315 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
316 $von = $1;
317 if (!defined ($von)) {
318 # some non-English names do start with lowercase
319 # eg "Marie desJardins". Also we can get typos...
320 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
321 $von="";
322 if ($vonlast =~ /^[a-z]+$/) {
323 # if it's all lowercase, uppercase 1st.
324 $vonlast =~ s/^(.)/\u$1/;
325
326 }
327 }
328 $von =~ s/\s*$//;
329 $last=$vonlast;
330 } else {
331 $last=$vonlast;
332 }
333 my $wholename="$first $von $last $jr";
334 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
335 $wholename =~ s/ $//;
336 push (@formattedlist, $wholename);
337 my $fullname = "$last";
338 $fullname .= " $jr" if ($jr);
339 $fullname .= ", $first";
340 $fullname .= " $von" if ($von);
341
342 # Add each name to set of Authors
343 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
344 }
345
346 # Only want at most one "and" in the Creator field
347 if (scalar(@formattedlist) > 2) {
348 my $lastauthor=pop @formattedlist;
349 $value=join(', ', @formattedlist);
350 $value.=" and $lastauthor";
351 } else { # 1 or 2 authors...
352 $value=join(" and ",@formattedlist);
353 }
354 $value.=$etal; # if there was "et al."
355 }
356
357 # Books and Journals are additionally marked for display purposes
358 if ($entryname eq "booktitle") {
359 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
360 } elsif ($entryname eq "journal") {
361 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
362 }
363
364 # Add the various fields as metadata
365 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
366 }
367
368 # for books and journals...
369 if (!exists $metadata{'title'}) {
370 my $name=$field{'title'}; # get Greenstone metadata name
371 my $value;
372 if (exists $metadata{'booktitle'}) {
373 $value=$metadata{'booktitle'};
374 } elsif (exists $metadata{'journal'}) {
375 $value=$metadata{'journal'};
376 }
377 if ($value) {
378 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
379 }
380 }
381
382 # Add Date (yyyymmdd) metadata
383 if (defined ($metadata{'year'}) ) {
384 my $date=$metadata{'year'};
385 chomp $date;
386 my $month=$metadata{'month'};
387 if (defined($month)) {
388 # month is currently 3 letter code or a range...
389 $month = expand_month($month);
390 # take the first month found... might not find one!
391 $month =~ m/_textmonth(\d\d)_/;
392 $month = $1;
393 }
394 if (!defined($month)) {
395 $month="00";
396 }
397 $date .= "${month}00";
398 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
399}
400
401# # Add the text in BibTex format (all fields)
402 if ($text =~ /\w/) {
403
404 $text =~ s@&@&amp;@g;
405 $text =~ s@<@&lt;@g;
406 $text =~ s@>@&gt;@g;
407 $text =~ s@\n@<br/>\n@g;
408 $text =~ s@\\@\\\\@g;
409
410 $doc_obj->add_utf8_text ($cursection, $text);
411 }
412
413 return 1;
414}
415
416
417
418
419# convert email addresses and URLs into links
420sub convert_urls_into_links{
421 my ($text) = @_;
422
423 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
424 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
425
426 return $text;
427}
428
429# Clean up whitespace and convert \n charaters to <BR> or <P>
430sub clean_up_whitespaces{
431 my ($text) = @_;
432
433 $text =~ s/%%%%%/<BR> <BR>/g;
434 $text =~ s/ +/ /g;
435 $text =~ s/\s*$//;
436 $text =~ s/^\s*//;
437 $text =~ s/\n/\n<BR>/g;
438 $text =~ s/<BR>\s*<BR>/<P>/g;
439
440 return $text;
441}
442
443
444sub convert_problem_characters_without_ampersand{
445 my ($text) = @_;
446 $text =~ s/</&lt;/g;
447 $text =~ s/>/&gt;/g;
448
449 $text =~ s/\'\'/\"/g; #Latex -specific conversion
450 $text =~ s/\`\`/\"/g; #Latex -specific conversion
451
452 $text =~ s/\"/&quot;/g;
453 $text =~ s/\'/&#8217;/g;
454 $text =~ s/\`/&#8216;/g;
455
456# $text =~ s/\+/ /g;
457# $text =~ s/\(/ /g;
458# $text =~ s/\)/ /g;
459
460 $text =~ s/\\/\\\\/g;
461
462# $text =~ s/\./\\\./g;
463
464 return $text;
465}
466
467# Convert a text string into HTML.
468
469# The HTML is going to be inserted into a GML file, so we have to be
470# careful not to use symbols like ">", which occurs frequently in email
471# messages (and use &gt instead.
472
473# This function also turns URLs and email addresses into links, and
474# replaces carriage returns with <BR> tags (and multiple carriage returns
475# with <P> tags).
476
477sub text_into_html {
478 my ($text) = @_;
479
480 # Convert problem characters into HTML symbols
481 $text =~ s/&/&amp;/g;
482
483 $text = &convert_problem_characters_without_ampersand( $text );
484
485 # convert email addresses and URLs into links
486 $text = &convert_urls_into_links( $text );
487
488 $text = &clean_up_whitespaces( $text );
489
490 return $text;
491}
492
493
494sub expand_month {
495 my $text=shift;
496
497 # bibtex style files expand abbreviations for months.
498 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
499 $text =~ s/jan/_textmonth01_/g;
500 $text =~ s/feb/_textmonth02_/g;
501 $text =~ s/mar/_textmonth03_/g;
502 $text =~ s/apr/_textmonth04_/g;
503 $text =~ s/may/_textmonth05_/g;
504 $text =~ s/jun/_textmonth06_/g;
505 $text =~ s/jul/_textmonth07_/g;
506 $text =~ s/aug/_textmonth08_/g;
507 $text =~ s/sep/_textmonth09_/g;
508 $text =~ s/oct/_textmonth10_/g;
509 $text =~ s/nov/_textmonth11_/g;
510 $text =~ s/dec/_textmonth12_/g;
511
512 return $text;
513}
514
515
516# Convert accented characters, remove { }, interprete some commands....
517# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
518sub process_latex {
519 my ($text) = @_;
520
521 # note - this is really ugly, but it works. There may be a prettier way
522 # of mapping latex accented chars to utf8, but we just brute force it here.
523 # Also, this isn't complete - not every single possible accented letter
524 # is in here yet, but most of the common ones are.
525
526 my %utf8_chars =
527 (
528 # acutes
529 '\'a' => chr(0xc3).chr(0xa1),
530 '\'c' => chr(0xc4).chr(0x87),
531 '\'e' => chr(0xc3).chr(0xa9),
532 '\'i' => chr(0xc3).chr(0xad),
533 '\'l' => chr(0xc3).chr(0xba),
534 '\'n' => chr(0xc3).chr(0x84),
535 '\'o' => chr(0xc3).chr(0xb3),
536 '\'r' => chr(0xc5).chr(0x95),
537 '\'s' => chr(0xc5).chr(0x9b),
538 '\'u' => chr(0xc3).chr(0xba),
539 '\'y' => chr(0xc3).chr(0xbd),
540 '\'z' => chr(0xc5).chr(0xba),
541 # graves
542 '`a' => chr(0xc3).chr(0xa0),
543 '`A' => chr(0xc3).chr(0x80),
544 '`e' => chr(0xc3).chr(0xa8),
545 '`E' => chr(0xc3).chr(0x88),
546 '`i' => chr(0xc3).chr(0xac),
547 '`I' => chr(0xc3).chr(0x8c),
548 '`o' => chr(0xc3).chr(0xb2),
549 '`O' => chr(0xc3).chr(0x92),
550 '`u' => chr(0xc3).chr(0xb9),
551 '`U' => chr(0xc3).chr(0x99),
552 # circumflex
553 '^a' => chr(0xc3).chr(0xa2),
554 '^A' => chr(0xc3).chr(0x82),
555 '^c' => chr(0xc4).chr(0x89),
556 '^C' => chr(0xc4).chr(0x88),
557 '^e' => chr(0xc3).chr(0xaa),
558 '^E' => chr(0xc3).chr(0x8a),
559 '^g' => chr(0xc4).chr(0x9d),
560 '^G' => chr(0xc4).chr(0x9c),
561 '^h' => chr(0xc4).chr(0xa5),
562 '^H' => chr(0xc4).chr(0xa4),
563 '^i' => chr(0xc3).chr(0xae),
564 '^I' => chr(0xc3).chr(0x8e),
565 '^j' => chr(0xc4).chr(0xb5),
566 '^J' => chr(0xc4).chr(0xb4),
567 '^o' => chr(0xc3).chr(0xb4),
568 '^O' => chr(0xc3).chr(0x94),
569 '^s' => chr(0xc5).chr(0x9d),
570 '^S' => chr(0xc5).chr(0x9c),
571 '^u' => chr(0xc3).chr(0xa2),
572 '^U' => chr(0xc3).chr(0xbb),
573 '^w' => chr(0xc5).chr(0xb5),
574 '^W' => chr(0xc5).chr(0xb4),
575 '^y' => chr(0xc5).chr(0xb7),
576 '^Y' => chr(0xc5).chr(0xb6),
577
578 # diaeresis
579 '"a' => chr(0xc3).chr(0xa4),
580 '"A' => chr(0xc3).chr(0x84),
581 '"e' => chr(0xc3).chr(0xab),
582 '"E' => chr(0xc3).chr(0x8b),
583 '"\\\\i' => chr(0xc3).chr(0xaf),
584 '"\\\\I' => chr(0xc3).chr(0x8f),
585 '"o' => chr(0xc3).chr(0xb6),
586 '"O' => chr(0xc3).chr(0x96),
587 '"u' => chr(0xc3).chr(0xbc),
588 '"U' => chr(0xc3).chr(0x9c),
589 '"y' => chr(0xc3).chr(0xbf),
590 '"Y' => chr(0xc3).chr(0xb8),
591 # tilde
592 '~A' => chr(0xc3).chr(0x83),
593 '~N' => chr(0xc3).chr(0x91),
594 '~O' => chr(0xc3).chr(0x95),
595 '~a' => chr(0xc3).chr(0xa3),
596 '~n' => chr(0xc3).chr(0xb1),
597 '~o' => chr(0xc3).chr(0xb5),
598 # caron - handled specially
599 # double acute
600 # ring
601 # dot
602 '.c' => chr(0xc4).chr(0x8b),
603 '.C' => chr(0xc4).chr(0x8a),
604 '.e' => chr(0xc4).chr(0x97),
605 '.E' => chr(0xc4).chr(0x96),
606 '.g' => chr(0xc4).chr(0xa1),
607 '.G' => chr(0xc4).chr(0xa0),
608 '.I' => chr(0xc4).chr(0xb0),
609 '.z' => chr(0xc5).chr(0xbc),
610 '.Z' => chr(0xc5).chr(0xbb),
611 # macron
612 '=a' => chr(0xc4).chr(0x81),
613 '=A' => chr(0xc4).chr(0x80),
614 '=e' => chr(0xc4).chr(0x93),
615 '=E' => chr(0xc4).chr(0x92),
616 '=i' => chr(0xc4).chr(0xab),
617 '=I' => chr(0xc4).chr(0xaa),
618 '=o' => chr(0xc4).chr(0x8d),
619 '=O' => chr(0xc4).chr(0x8c),
620 '=u' => chr(0xc4).chr(0xab),
621 '=U' => chr(0xc4).chr(0xaa),
622
623 # stroke - handled specially - see below
624
625 # cedilla - handled specially
626 );
627
628# these are one letter latex commands - we make sure they're not a longer
629# command name. eg {\d} is d+stroke, so careful of \d
630 my %special_utf8_chars =
631 (
632 # breve
633 'u g' => chr(0xc4).chr(0x9f),
634 'u G' => chr(0xc4).chr(0x9e),
635 'u i' => chr(0xc4).chr(0xad),
636 'u I' => chr(0xc4).chr(0xac),
637 'u o' => chr(0xc5).chr(0x8f),
638 'u O' => chr(0xc5).chr(0x8e),
639 'u u' => chr(0xc5).chr(0xad),
640 'u U' => chr(0xc5).chr(0xac),
641 # caron
642 'v c' => chr(0xc4).chr(0x8d),
643 'v C' => chr(0xc4).chr(0x8c),
644 'v n' => chr(0xc5).chr(0x88),
645 'v N' => chr(0xc5).chr(0x87),
646 'v s' => chr(0xc5).chr(0xa1),
647 'v S' => chr(0xc5).chr(0xa5),
648 'v z' => chr(0xc5).chr(0xbe),
649 'v Z' => chr(0xc5).chr(0xbd),
650 # cedilla
651 'c c' => chr(0xc3).chr(0xa7),
652 'c C' => chr(0xc3).chr(0x87),
653 'c g' => chr(0xc4).chr(0xa3),
654 'c G' => chr(0xc4).chr(0xa2),
655 'c k' => chr(0xc4).chr(0xb7),
656 'c K' => chr(0xc4).chr(0xb6),
657 'c l' => chr(0xc4).chr(0xbc),
658 'c L' => chr(0xc4).chr(0xbb),
659 'c n' => chr(0xc5).chr(0x86),
660 'c N' => chr(0xc5).chr(0x85),
661 'c r' => chr(0xc5).chr(0x97),
662 'c R' => chr(0xc5).chr(0x96),
663 'c s' => chr(0xc5).chr(0x9f),
664 'c S' => chr(0xc5).chr(0x9e),
665 'c t' => chr(0xc5).chr(0xa3),
666 'c T' => chr(0xc5).chr(0xa2),
667 # double acute / Hungarian accent
668 'H O' => chr(0xc5).chr(0x90),
669 'H o' => chr(0xc5).chr(0x91),
670 'H U' => chr(0xc5).chr(0xb0),
671 'H u' => chr(0xc5).chr(0xb1),
672
673 # stroke
674 'd' => chr(0xc4).chr(0x91),
675 'D' => chr(0xc4).chr(0x90),
676 'h' => chr(0xc4).chr(0xa7),
677# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
678 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
679 'l' => chr(0xc5).chr(0x82),
680 'L' => chr(0xc5).chr(0x81),
681 'o' => chr(0xc3).chr(0xb8),
682 'O' => chr(0xc3).chr(0x98),
683 't' => chr(0xc5).chr(0xa7),
684 'T' => chr(0xc5).chr(0xa6),
685 # german ss/szlig/sharp s
686 'ss' => chr(0xc3).chr(0x9f),
687 );
688
689 # convert latex-style accented characters.
690
691 # remove space (if any) between \ and letter to accent (eg {\' a})
692
693 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
694
695 # remove {} around a single character (eg \'{e})
696 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
697
698 # \, is another way of doing cedilla \c
699 $text =~ s@\\,(.)@\\c $1@g;
700
701 # remove {} around a single character for special 1 letter commands -
702 # need to insert a space. Eg \v{s} -> {\v s}
703 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
704
705 # only do if the text contains a '\' character.
706 if ($text =~ m|\\|) {
707 # "normal" accents - ie non-alpha latex tag
708 while ($text =~ m@\\([`'="^~\.])([\w])@) {
709 my $tex="$1$2"; my $char="$2";
710 my $replacement=$utf8_chars{$tex};
711 if (!defined($replacement)) {
712 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
713 $replacement=$char;
714 }
715 $text =~ s/\\\Q$tex/$replacement/g;
716 }
717
718 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
719 # only do the change if immediately followed by a space, }, {, or \
720 # one letter accents ( + ss)
721 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
722 my $tex=$1;
723 my $replacement=$special_utf8_chars{$tex};
724 if (!defined($replacement)) {
725 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
726 $replacement=$tex;
727 }
728 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
729
730 }
731
732 # one letter latex accent commands that affect following letter
733 while ($text =~ m@\\([uvcH]) ([\w])@) {
734 my $tex="$1 $2"; my $char="$2";
735 my $replacement=$special_utf8_chars{$tex};
736 if (!defined($replacement)) {
737 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
738 $replacement=$char;
739 }
740 $text =~ s/\\$tex/$replacement/g;
741 }
742 }
743
744 # escape html-sensitive characters
745 $text =~ s@&@&amp;@g;
746 $text =~ s@<@&lt;@g;
747 $text =~ s@>@&gt;@g;
748 $text =~ s/''/"/g; # Latex-specific
749 $text =~ s/``/"/g; # Latex-specific
750 # greenstone-specific
751 $text =~ s@\[@&\#91;@g;
752 $text =~ s@\]@&\#93;@g;
753
754 # remove latex commands
755
756 # explicitly recognised commands
757 $text =~ s@\\ldots@&hellip;@g;
758
759 # maths mode
760 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
761
762 # remove all other commands with optional arguments...
763 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
764 # $text =~ s@\\noopsort{[^}]+\}@@g;
765 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
766
767 # remove latex groupings { } (but not \{ or \} )
768 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
769 $text =~ s/^\{//; # remove { if first char
770
771 # latex characters
772 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
773 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
774 # optional break "\-"
775 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
776 # the non-macro bits have quotes around them - we just remove quotes
777 $text =~ s/([^&])[\"\#]/$1/g;
778 }
779 # dashes. Convert (m|n)-dash into single dash for html.
780 $text =~ s@\-\-+@\-@g;
781
782 # quoted { } chars
783 $text =~ s@\\{@{@g;
784 $text =~ s@\\}@}@g;
785
786 # finally to protect against macro language...
787 $text =~ s@\\@\\\\@g;
788
789 return $text;
790}
791
792
793sub process_latex_math {
794 my $text = shift;
795
796 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
797 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
798 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
799 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
800 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
801
802 # put all other command names in italics
803 $text =~ s@\\([\w]+)@<i>$1</i>@g;
804
805 return $text;
806}
807
808sub set_OID {
809 my $self = shift (@_);
810 my ($doc_obj, $id, $segment_number) = @_;
811
812 if ( $self->{'key'} eq "default") {
813 $doc_obj->set_OID("$id\_$segment_number");
814 } else {
815 $doc_obj->set_OID($self->{'key'});
816 }
817}
818
8191;
820
821
Note: See TracBrowser for help on using the repository browser.