source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 13416

Last change on this file since 13416 was 13416, checked in by kjdon, 17 years ago

remove surrounding {} as well as surrounding "" from value

  • Property svn:keywords set to Author Date Id Revision
File size: 23.6 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
49
50# BibTexPlug is a sub-class of BasPlug.
51sub BEGIN {
52 @BibTexPlug::ISA = ('SplitPlug');
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasPlug.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitPlug.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
67
68my $options = { 'name' => "BibTexPlug",
69 'desc' => "{BibTexPlug.desc}",
70 'abstract' => "no",
71 'inherits' => "yes",
72 'explodes' => "yes",
73 'args' => $arguments };
74
75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
84sub new {
85 my ($class) = shift (@_);
86 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
87 push(@$pluginlist, $class);
88
89 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
90 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
91
92 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
93
94 return bless $self, $class;
95}
96
97
98
99# The process function reads a single bibliographic record and stores
100# it as a new document.
101
102sub process {
103 my $self = shift (@_);
104 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105 my $outhandle = $self->{'outhandle'};
106
107 my $cursection = $doc_obj->get_top_section();
108 $self->{'key'} = "default";
109
110 # Check that we're dealing with a valid BibTex record
111 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
112
113 # Ignore things we can't use
114 return 0 if ($$textref =~ /^\@String/);
115
116 # Report that we're processing the file
117 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
118 print $outhandle "BibTexPlug: processing $file\n"
119 if ($self->{'verbosity'}) > 1;
120
121
122 # This hash translates BibTex field names into metadata names. The
123 # BibTex names are taken from the "Local Guide to Latex" Graeme
124 # McKinstry. Metadata names are consistent with ReferPlug.
125
126 # The author metadata will be stored as one "Creator" entry, but will
127 # also be split up into several individual "Author" fields.
128
129 my %field = (
130 'address', 'PublisherAddress',
131 'author', 'Creator',
132
133 'booktitle', 'Booktitle',
134 'chapter', 'Chapter',
135 'edition', 'Edition',
136 'editor', 'Editor',
137 'institution', 'Publisher',
138 'journal', 'Journal',
139 'month', 'Month',
140 'number', 'Number',
141 'pages', 'Pages',
142 'publisher', 'Publisher',
143 'school', 'Publisher',
144 'title', 'Title',
145 'volume', 'Volume',
146 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
147
148 'keywords', 'Keywords',
149 'abstract', 'Abstract',
150 'copyright', 'Copyright',
151 'note', 'Note',
152 'url', 'URL',
153 );
154
155 # Metadata fields
156 my %metadata;
157 my ($EntryType, $Creator, $Keywords, $text);
158
159 my $verbosity = $self->{'verbosity'};
160 $verbosity = 0 unless $verbosity;
161
162 # Make sure the text has exactly one entry per line -
163 # append line to previous if it doesn't start with " <key> = "
164
165 my @input_lines=split(/\r?\n/, $$textref);
166 my @all_lines;
167 my $entry_line=shift @input_lines;
168 foreach my $input_line (@input_lines) {
169 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
170 # this is a new key
171 push(@all_lines, $entry_line);
172 $entry_line=$input_line;
173 } else {
174 # this is a continuation of previous line
175 $entry_line .= " " . $input_line;
176 }
177
178 }
179 # add final line, removing trailing '}'
180 $entry_line =~ s/\}\s*$//;
181 push(@all_lines, $entry_line);
182 push(@all_lines, "}");
183
184 # Read and process each line in the bib file.
185 my ($entryname, $name, $value, $line);
186 foreach $line (@all_lines) {
187
188 # Add each line. Most lines consist of a field identifer and
189 # then data, and we simply store them, though we treat some
190 # of the fields a bit differently.
191
192 $line =~ s/\s+/ /g;
193 $text .= "$line\n";
194
195
196 print "Processing line = $line \n" if $verbosity>=4;
197
198 # The first line is special, it contains the reference type and OID
199 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
200 $EntryType = $1;
201 my $EntryID = (defined $2) ? $2 : "default";
202 print "** $EntryType - \"$EntryID\" \n"
203 if ($verbosity >= 4);
204
205 next;
206 }
207 if ($line =~ /\@/) {
208 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
209 if ($verbosity >= 2);
210 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
211 if ($verbosity >= 2);
212 }
213
214 # otherwise, parse the metadata out of this line
215 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
216 $entryname = lc($1);
217 $value = $2;
218 $value =~ s/,?\s*$//; # remove trailing comma and space
219 if ($value =~ /^"/ && $value =~ /"$/) {
220 # remove surrounding " marks
221 $value =~ s/^"//; $value =~ s/"$//;
222 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
223 # remove surrounding {} marks
224 $value =~ s/^\{//; $value =~ s/\}$//;
225 }
226 # special case for year - we only want a 4 digit year, not
227 # "circa 2004" etc
228 if ($entryname=~ /^year$/i) {
229 if ($value=~ /(\d{4})/) {
230 $value=$1;
231 $metadata{$entryname} .= "$value";
232 }
233 }
234 else {
235
236 $value = &process_latex($value);
237 # Add this line of metadata
238 $metadata{$entryname} .= "$value";
239 }
240 }
241
242 # Add the Entry type as metadata
243 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
244
245 #Add the fileformat as metadata
246 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
247
248
249 # Add the various field as metadata
250 foreach my $entryname (keys %metadata) {
251 next unless (defined $field{$entryname});
252 next unless (defined $metadata{$entryname});
253
254 $name = $field{$entryname};
255 $value = $metadata{$entryname};
256
257 if ($name =~ /^Month/) {
258 $value=expand_month($value);
259 }
260
261 # Several special operatons on metadata follow
262
263 # Add individual keywords.
264 # The full set of keywords will be added, in due course, as "Keywords".
265 # However, we also want to add them as individual "Keyword" metadata elements.
266 if ($entryname eq "keywords") {
267 my @keywordlist = split(/,/, $value);
268 foreach my $k (@keywordlist) {
269 $k = lc($k);
270 $k =~ s/\s*$//;
271 $k =~ s/^\s*//;
272 if ($k =~ /\w/) {
273 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
274 }
275 }
276 }
277
278 # Add individual authors
279 # The author metadata will be stored as one "Creator" entry, but we
280 # also want to split it into several individual "Author" fields in
281 # "Lastname, Firstnames" format so we can browse it.
282 if ($entryname eq "author") { #added also comparison with editor
283
284 # take care of "et al."...
285 my $etal='';
286 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
287 $etal=' <em>et. al.</em>';
288 }
289 # und here for german language...
290 # don't use brackets in pattern, else the matched bit becomes
291 # an element in the list!
292 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
293 my @formattedlist = ();
294 foreach $a (@authorlist) {
295 $a =~ s/\s*$//;
296 $a =~ s/^\s*//;
297 # Reformat and add author name
298 next if $a=~ /^\s*$/;
299
300 # names are "First von Last", "von Last, First"
301 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
302 my $first="";
303 my $vonlast="";
304 my $jr="";
305
306 if ($a =~ /,/) {
307 my @parts=split(/,\s*/, $a);
308 $first = pop @parts;
309 if (scalar(@parts) == 2) {
310 $jr = pop @parts;
311 }
312 $vonlast=shift @parts;
313 if (scalar(@parts) > 0) {
314 print $outhandle "BibTexPlug: couldn't parse name $a\n";
315 # but we continue anyway...
316 }
317 } else { # First von Last
318 my @words = split(/ /, $a);
319 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
320 $first .= " " . shift (@words);
321 }
322 $first =~ s/^\s//;
323 $vonlast = join (' ', @words); # whatever's left...
324 }
325 my $von="";
326 my $last="";
327 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
328 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
329 $von = $1;
330 if (!defined ($von)) {
331 # some non-English names do start with lowercase
332 # eg "Marie desJardins". Also we can get typos...
333 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
334 $von="";
335 if ($vonlast =~ /^[a-z]+$/) {
336 # if it's all lowercase, uppercase 1st.
337 $vonlast =~ s/^(.)/\u$1/;
338
339 }
340 }
341 $von =~ s/\s*$//;
342 $last=$vonlast;
343 } else {
344 $last=$vonlast;
345 }
346 my $wholename="$first $von $last $jr";
347 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
348 $wholename =~ s/ $//;
349 push (@formattedlist, $wholename);
350 my $fullname = "$last";
351 $fullname .= " $jr" if ($jr);
352 $fullname .= ", $first";
353 $fullname .= " $von" if ($von);
354
355 # Add each name to set of Authors
356 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
357 }
358
359 # Only want at most one "and" in the Creator field
360 if (scalar(@formattedlist) > 2) {
361 my $lastauthor=pop @formattedlist;
362 $value=join(', ', @formattedlist);
363 $value.=" and $lastauthor";
364 } else { # 1 or 2 authors...
365 $value=join(" and ",@formattedlist);
366 }
367 $value.=$etal; # if there was "et al."
368 }
369
370 # Books and Journals are additionally marked for display purposes
371 if ($entryname eq "booktitle") {
372 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
373 } elsif ($entryname eq "journal") {
374 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
375 }
376
377 # Add the various fields as metadata
378 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
379
380 }
381
382 # for books and journals...
383 if (!exists $metadata{'title'}) {
384 my $name=$field{'title'}; # get Greenstone metadata name
385 my $value;
386 if (exists $metadata{'booktitle'}) {
387 $value=$metadata{'booktitle'};
388 } elsif (exists $metadata{'journal'}) {
389 $value=$metadata{'journal'};
390 }
391 if ($value) {
392 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
393 }
394 }
395
396 # Add Date (yyyymmdd) metadata
397 if (defined ($metadata{'year'}) ) {
398 my $date=$metadata{'year'};
399 chomp $date;
400 my $month=$metadata{'month'};
401 if (defined($month)) {
402 # month is currently 3 letter code or a range...
403 $month = expand_month($month);
404 # take the first month found... might not find one!
405 $month =~ m/_textmonth(\d\d)_/;
406 $month = $1;
407 }
408 if (!defined($month)) {
409 $month="00";
410 }
411 $date .= "${month}00";
412 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
413}
414
415# # Add the text in BibTex format (all fields)
416 if ($text =~ /\w/) {
417
418 $text =~ s@&@&amp;@g;
419 $text =~ s@<@&lt;@g;
420 $text =~ s@>@&gt;@g;
421 $text =~ s@\n@<br/>\n@g;
422 $text =~ s@\\@\\\\@g;
423
424 $doc_obj->add_utf8_text ($cursection, $text);
425 }
426
427 return 1;
428}
429
430
431
432
433# convert email addresses and URLs into links
434sub convert_urls_into_links{
435 my ($text) = @_;
436
437 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
438 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
439
440 return $text;
441}
442
443# Clean up whitespace and convert \n charaters to <BR> or <P>
444sub clean_up_whitespaces{
445 my ($text) = @_;
446
447 $text =~ s/%%%%%/<BR> <BR>/g;
448 $text =~ s/ +/ /g;
449 $text =~ s/\s*$//;
450 $text =~ s/^\s*//;
451 $text =~ s/\n/\n<BR>/g;
452 $text =~ s/<BR>\s*<BR>/<P>/g;
453
454 return $text;
455}
456
457
458sub convert_problem_characters_without_ampersand{
459 my ($text) = @_;
460 $text =~ s/</&lt;/g;
461 $text =~ s/>/&gt;/g;
462
463 $text =~ s/\'\'/\"/g; #Latex -specific conversion
464 $text =~ s/\`\`/\"/g; #Latex -specific conversion
465
466 $text =~ s/\"/&quot;/g;
467 $text =~ s/\'/&#8217;/g;
468 $text =~ s/\`/&#8216;/g;
469
470# $text =~ s/\+/ /g;
471# $text =~ s/\(/ /g;
472# $text =~ s/\)/ /g;
473
474 $text =~ s/\\/\\\\/g;
475
476# $text =~ s/\./\\\./g;
477
478 return $text;
479}
480
481# Convert a text string into HTML.
482
483# The HTML is going to be inserted into a GML file, so we have to be
484# careful not to use symbols like ">", which occurs frequently in email
485# messages (and use &gt instead.
486
487# This function also turns URLs and email addresses into links, and
488# replaces carriage returns with <BR> tags (and multiple carriage returns
489# with <P> tags).
490
491sub text_into_html {
492 my ($text) = @_;
493
494 # Convert problem characters into HTML symbols
495 $text =~ s/&/&amp;/g;
496
497 $text = &convert_problem_characters_without_ampersand( $text );
498
499 # convert email addresses and URLs into links
500 $text = &convert_urls_into_links( $text );
501
502 $text = &clean_up_whitespaces( $text );
503
504 return $text;
505}
506
507
508sub expand_month {
509 my $text=shift;
510
511 # bibtex style files expand abbreviations for months.
512 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
513 $text =~ s/jan/_textmonth01_/g;
514 $text =~ s/feb/_textmonth02_/g;
515 $text =~ s/mar/_textmonth03_/g;
516 $text =~ s/apr/_textmonth04_/g;
517 $text =~ s/may/_textmonth05_/g;
518 $text =~ s/jun/_textmonth06_/g;
519 $text =~ s/jul/_textmonth07_/g;
520 $text =~ s/aug/_textmonth08_/g;
521 $text =~ s/sep/_textmonth09_/g;
522 $text =~ s/oct/_textmonth10_/g;
523 $text =~ s/nov/_textmonth11_/g;
524 $text =~ s/dec/_textmonth12_/g;
525
526 return $text;
527}
528
529
530# Convert accented characters, remove { }, interprete some commands....
531# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
532sub process_latex {
533 my ($text) = @_;
534
535 # note - this is really ugly, but it works. There may be a prettier way
536 # of mapping latex accented chars to utf8, but we just brute force it here.
537 # Also, this isn't complete - not every single possible accented letter
538 # is in here yet, but most of the common ones are.
539
540 my %utf8_chars =
541 (
542 # acutes
543 '\'a' => chr(0xc3).chr(0xa1),
544 '\'c' => chr(0xc4).chr(0x87),
545 '\'e' => chr(0xc3).chr(0xa9),
546 '\'i' => chr(0xc3).chr(0xad),
547 '\'l' => chr(0xc3).chr(0xba),
548 '\'n' => chr(0xc3).chr(0x84),
549 '\'o' => chr(0xc3).chr(0xb3),
550 '\'r' => chr(0xc5).chr(0x95),
551 '\'s' => chr(0xc5).chr(0x9b),
552 '\'u' => chr(0xc3).chr(0xba),
553 '\'y' => chr(0xc3).chr(0xbd),
554 '\'z' => chr(0xc5).chr(0xba),
555 # graves
556 '`a' => chr(0xc3).chr(0xa0),
557 '`A' => chr(0xc3).chr(0x80),
558 '`e' => chr(0xc3).chr(0xa8),
559 '`E' => chr(0xc3).chr(0x88),
560 '`i' => chr(0xc3).chr(0xac),
561 '`I' => chr(0xc3).chr(0x8c),
562 '`o' => chr(0xc3).chr(0xb2),
563 '`O' => chr(0xc3).chr(0x92),
564 '`u' => chr(0xc3).chr(0xb9),
565 '`U' => chr(0xc3).chr(0x99),
566 # circumflex
567 '^a' => chr(0xc3).chr(0xa2),
568 '^A' => chr(0xc3).chr(0x82),
569 '^c' => chr(0xc4).chr(0x89),
570 '^C' => chr(0xc4).chr(0x88),
571 '^e' => chr(0xc3).chr(0xaa),
572 '^E' => chr(0xc3).chr(0x8a),
573 '^g' => chr(0xc4).chr(0x9d),
574 '^G' => chr(0xc4).chr(0x9c),
575 '^h' => chr(0xc4).chr(0xa5),
576 '^H' => chr(0xc4).chr(0xa4),
577 '^i' => chr(0xc3).chr(0xae),
578 '^I' => chr(0xc3).chr(0x8e),
579 '^j' => chr(0xc4).chr(0xb5),
580 '^J' => chr(0xc4).chr(0xb4),
581 '^o' => chr(0xc3).chr(0xb4),
582 '^O' => chr(0xc3).chr(0x94),
583 '^s' => chr(0xc5).chr(0x9d),
584 '^S' => chr(0xc5).chr(0x9c),
585 '^u' => chr(0xc3).chr(0xa2),
586 '^U' => chr(0xc3).chr(0xbb),
587 '^w' => chr(0xc5).chr(0xb5),
588 '^W' => chr(0xc5).chr(0xb4),
589 '^y' => chr(0xc5).chr(0xb7),
590 '^Y' => chr(0xc5).chr(0xb6),
591
592 # diaeresis
593 '"a' => chr(0xc3).chr(0xa4),
594 '"A' => chr(0xc3).chr(0x84),
595 '"e' => chr(0xc3).chr(0xab),
596 '"E' => chr(0xc3).chr(0x8b),
597 '"\\\\i' => chr(0xc3).chr(0xaf),
598 '"\\\\I' => chr(0xc3).chr(0x8f),
599 '"o' => chr(0xc3).chr(0xb6),
600 '"O' => chr(0xc3).chr(0x96),
601 '"u' => chr(0xc3).chr(0xbc),
602 '"U' => chr(0xc3).chr(0x9c),
603 '"y' => chr(0xc3).chr(0xbf),
604 '"Y' => chr(0xc3).chr(0xb8),
605 # tilde
606 '~A' => chr(0xc3).chr(0x83),
607 '~N' => chr(0xc3).chr(0x91),
608 '~O' => chr(0xc3).chr(0x95),
609 '~a' => chr(0xc3).chr(0xa3),
610 '~n' => chr(0xc3).chr(0xb1),
611 '~o' => chr(0xc3).chr(0xb5),
612 # caron - handled specially
613 # double acute
614 # ring
615 # dot
616 '.c' => chr(0xc4).chr(0x8b),
617 '.C' => chr(0xc4).chr(0x8a),
618 '.e' => chr(0xc4).chr(0x97),
619 '.E' => chr(0xc4).chr(0x96),
620 '.g' => chr(0xc4).chr(0xa1),
621 '.G' => chr(0xc4).chr(0xa0),
622 '.I' => chr(0xc4).chr(0xb0),
623 '.z' => chr(0xc5).chr(0xbc),
624 '.Z' => chr(0xc5).chr(0xbb),
625 # macron
626 '=a' => chr(0xc4).chr(0x81),
627 '=A' => chr(0xc4).chr(0x80),
628 '=e' => chr(0xc4).chr(0x93),
629 '=E' => chr(0xc4).chr(0x92),
630 '=i' => chr(0xc4).chr(0xab),
631 '=I' => chr(0xc4).chr(0xaa),
632 '=o' => chr(0xc4).chr(0x8d),
633 '=O' => chr(0xc4).chr(0x8c),
634 '=u' => chr(0xc4).chr(0xab),
635 '=U' => chr(0xc4).chr(0xaa),
636
637 # stroke - handled specially - see below
638
639 # cedilla - handled specially
640 );
641
642# these are one letter latex commands - we make sure they're not a longer
643# command name. eg {\d} is d+stroke, so careful of \d
644 my %special_utf8_chars =
645 (
646 # breve
647 'u g' => chr(0xc4).chr(0x9f),
648 'u G' => chr(0xc4).chr(0x9e),
649 'u i' => chr(0xc4).chr(0xad),
650 'u I' => chr(0xc4).chr(0xac),
651 'u o' => chr(0xc5).chr(0x8f),
652 'u O' => chr(0xc5).chr(0x8e),
653 'u u' => chr(0xc5).chr(0xad),
654 'u U' => chr(0xc5).chr(0xac),
655 # caron
656 'v c' => chr(0xc4).chr(0x8d),
657 'v C' => chr(0xc4).chr(0x8c),
658 'v n' => chr(0xc5).chr(0x88),
659 'v N' => chr(0xc5).chr(0x87),
660 'v s' => chr(0xc5).chr(0xa1),
661 'v S' => chr(0xc5).chr(0xa5),
662 'v z' => chr(0xc5).chr(0xbe),
663 'v Z' => chr(0xc5).chr(0xbd),
664 # cedilla
665 'c c' => chr(0xc3).chr(0xa7),
666 'c C' => chr(0xc3).chr(0x87),
667 'c g' => chr(0xc4).chr(0xa3),
668 'c G' => chr(0xc4).chr(0xa2),
669 'c k' => chr(0xc4).chr(0xb7),
670 'c K' => chr(0xc4).chr(0xb6),
671 'c l' => chr(0xc4).chr(0xbc),
672 'c L' => chr(0xc4).chr(0xbb),
673 'c n' => chr(0xc5).chr(0x86),
674 'c N' => chr(0xc5).chr(0x85),
675 'c r' => chr(0xc5).chr(0x97),
676 'c R' => chr(0xc5).chr(0x96),
677 'c s' => chr(0xc5).chr(0x9f),
678 'c S' => chr(0xc5).chr(0x9e),
679 'c t' => chr(0xc5).chr(0xa3),
680 'c T' => chr(0xc5).chr(0xa2),
681 # double acute / Hungarian accent
682 'H O' => chr(0xc5).chr(0x90),
683 'H o' => chr(0xc5).chr(0x91),
684 'H U' => chr(0xc5).chr(0xb0),
685 'H u' => chr(0xc5).chr(0xb1),
686
687 # stroke
688 'd' => chr(0xc4).chr(0x91),
689 'D' => chr(0xc4).chr(0x90),
690 'h' => chr(0xc4).chr(0xa7),
691# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
692 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
693 'l' => chr(0xc5).chr(0x82),
694 'L' => chr(0xc5).chr(0x81),
695 'o' => chr(0xc3).chr(0xb8),
696 'O' => chr(0xc3).chr(0x98),
697 't' => chr(0xc5).chr(0xa7),
698 'T' => chr(0xc5).chr(0xa6),
699 # german ss/szlig/sharp s
700 'ss' => chr(0xc3).chr(0x9f),
701 );
702
703 # convert latex-style accented characters.
704
705 # remove space (if any) between \ and letter to accent (eg {\' a})
706
707 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
708
709 # remove {} around a single character (eg \'{e})
710 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
711
712 # \, is another way of doing cedilla \c
713 $text =~ s@\\,(.)@\\c $1@g;
714
715 # remove {} around a single character for special 1 letter commands -
716 # need to insert a space. Eg \v{s} -> {\v s}
717 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
718
719 # only do if the text contains a '\' character.
720 if ($text =~ m|\\|) {
721 # "normal" accents - ie non-alpha latex tag
722 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
723 my $tex="$1$2"; my $char="$2";
724 my $replacement=$utf8_chars{$tex};
725 if (!defined($replacement)) {
726 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
727 $replacement=$char;
728 }
729 $text =~ s/\\\Q$tex/$replacement/g;
730 }
731
732 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
733 # only do the change if immediately followed by a space, }, {, or \
734 # one letter accents ( + ss)
735 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
736 my $tex=$1;
737 my $replacement=$special_utf8_chars{$tex};
738 if (!defined($replacement)) {
739 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
740 $replacement=$tex;
741 }
742 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
743
744 }
745
746 # one letter latex accent commands that affect following letter
747 while ($text =~ m@\\([uvcH]) ([\w])@) {
748 my $tex="$1 $2"; my $char="$2";
749 my $replacement=$special_utf8_chars{$tex};
750 if (!defined($replacement)) {
751 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
752 $replacement=$char;
753 }
754 $text =~ s/\\$tex/$replacement/g;
755 }
756 }
757
758 # escape html-sensitive characters
759 $text =~ s@&@&amp;@g;
760 $text =~ s@<@&lt;@g;
761 $text =~ s@>@&gt;@g;
762 $text =~ s/''/"/g; # Latex-specific
763 $text =~ s/``/"/g; # Latex-specific
764 # greenstone-specific
765 $text =~ s@\[@&\#91;@g;
766 $text =~ s@\]@&\#93;@g;
767
768 # remove latex commands
769
770 # explicitly recognised commands
771 $text =~ s@\\ldots@&hellip;@g;
772
773 # maths mode
774 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
775
776 # remove all other commands with optional arguments...
777 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
778 # $text =~ s@\\noopsort{[^}]+\}@@g;
779 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
780
781 # remove latex groupings { } (but not \{ or \} )
782 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
783 $text =~ s/^\{//; # remove { if first char
784
785 # latex characters
786 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
787 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
788 # optional break "\-"
789 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
790 # the non-macro bits have quotes around them - we just remove quotes
791 $text =~ s/([^&])[\"\#]/$1/g;
792 }
793 # dashes. Convert (m|n)-dash into single dash for html.
794 $text =~ s@\-\-+@\-@g;
795
796 # quoted { } chars
797 $text =~ s@\\{@{@g;
798 $text =~ s@\\}@}@g;
799
800 # finally to protect against macro language...
801 $text =~ s@\\@\\\\@g;
802
803 return $text;
804}
805
806
807sub process_latex_math {
808 my $text = shift;
809
810 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
811 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
812 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
813 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
814 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
815
816 # put all other command names in italics
817 $text =~ s@\\([\w]+)@<i>$1</i>@g;
818
819 return $text;
820}
821
822sub set_OID {
823 my $self = shift (@_);
824 my ($doc_obj, $id, $segment_number) = @_;
825
826 if ( $self->{'key'} eq "default") {
827 $doc_obj->set_OID("$id\_$segment_number");
828 } else {
829 $doc_obj->set_OID($self->{'key'});
830 }
831}
832
8331;
834
835
Note: See TracBrowser for help on using the repository browser.