source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 4785

Last change on this file since 4785 was 4744, checked in by mdewsnip, 21 years ago

Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53my $arguments =
54 [ { 'name' => "process_exp",
55 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
56 'type' => "string",
57 'reqd' => "no" ,
58 'deft' => q^(?i)\.bib$^ } ];
59
60my $options = { 'name' => "BibTexPlug",
61 'desc' => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
62 'inherits' => "Yes",
63 'args' => $arguments };
64
65# This plugin processes files with the suffix ".bib"
66sub get_default_process_exp {
67 return q^(?i)\.bib$^;
68}
69
70# This plugin splits the input text at blank lines
71sub get_default_split_exp {
72 return q^\n+(?=@)^;
73}
74sub new {
75 my $class = shift (@_);
76 my $self = new SplitPlug ($class, @_);
77 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
78 my $option_list = $self->{'option_list'};
79 push( @{$option_list}, $options );
80 return bless $self, $class;
81}
82
83
84
85# The process function reads a single bibliographic record and stores
86# it as a new document.
87
88sub process {
89 my $self = shift (@_);
90 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
91 my $outhandle = $self->{'outhandle'};
92
93 $self->{'key'} = "default";
94
95 # Check that we're dealing with a valid BibTex record
96 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
97
98 # Ignore things we can't use
99 return 0 if ($$textref =~ /^\@String/);
100
101 # Report that we're processing the file
102 print $outhandle "BibTexPlug: processing $file\n"
103 if ($self->{'verbosity'}) > 1;
104
105
106 # This hash translates BibTex field names into metadata names. The
107 # BibTex names are taken from the "Local Guide to Latex" Graeme
108 # McKinstry. Metadata names are consistent with ReferPlug.
109
110 my %field = (
111 'address', 'PublisherAddress',
112 'author', 'Creator',
113
114 'booktitle', 'Booktitle',
115 'chapter', 'Chapter',
116 'edition', 'Edition',
117 'editor', 'Editor',
118 'institution', 'Publisher',
119 'journal', 'Journal',
120 'month', 'Month',
121 'number', 'Number',
122 'pages', 'Pages',
123 'publisher', 'Publisher',
124 'school', 'Publisher',
125 'title', 'Title',
126 'volume', 'Volume',
127 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
128
129 'keywords', 'Keywords',
130 'abstract', 'Abstract',
131 'copyright', 'Copyright'
132);
133
134 # Metadata fields
135 my %metadata;
136 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
137
138 my $verbosity = $self->{'verbosity'};
139 $verbosity = 0 unless $verbosity;
140
141 # Make sure the text has exactly one entry per line -
142 # append line to previous if it doesn't start with " <key> = "
143
144 my @input_lines=split('\n', $$textref);
145 my @all_lines;
146 my $entry_line=shift @input_lines;
147 foreach my $input_line (@input_lines) {
148 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
149 # this is a new key
150 push(@all_lines, $entry_line);
151 $entry_line=$input_line;
152 } else {
153 # this is a continuation of previous line
154 $entry_line .= " " . $input_line;
155 }
156
157 }
158 # add final line, removing trailing '}'
159 $entry_line =~ s/\}\s*$//;
160 push(@all_lines, $entry_line);
161 push(@all_lines, "}");
162
163
164
165 # Read and process each line in the bib file.
166 my ($entryname, $name, $value, $line);
167 foreach $line (@all_lines) {
168
169 # Add each line. Most lines consist of a field identifer and
170 # then data, and we simply store them, though we treat some
171 # of the fields a bit differently.
172
173 $line =~ s/\s+/ /g;
174 $text .= "$line\n";
175
176 print "Processing line = $line \n" if $verbosity>=4;
177
178 # The first line is special, it contains the reference type and OID
179 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
180 $EntryType = $1;
181 $EntryID = $2;
182 print "** $EntryType - \"$EntryID\" \n"
183 if ($verbosity >= 4);
184 $self->{'key'} = $EntryID;
185 next;
186 }
187 if ($line =~ /\@/) {
188 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
189 if ($verbosity >= 2);
190 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
191 if ($verbosity >= 2);
192 }
193
194 # otherwise, parse the metadata out of this line
195 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
196 $entryname = lc($1);
197 $value = $2;
198 $value =~ s/,?\s*$//; # remove trailing comma and space
199 if ($value =~ /^"/ && $value =~ /"$/) {
200 # remove surrounding " marks
201 $value =~ s/^"//; $value =~ s/"$//;
202 }
203 $value = &process_latex($value);
204
205 # Add this line of metadata
206 $metadata{$entryname} .= "$value";
207
208 }
209
210 # Add the Entry type as metadata
211 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
212
213 # Add the various field as metadata
214 foreach my $entryname (keys %metadata) {
215 next unless (defined $field{$entryname});
216 next unless (defined $metadata{$entryname});
217
218 $name = $field{$entryname};
219 $value = $metadata{$entryname};
220
221 if ($name =~ /^Month/) {
222 $value=expand_month($value);
223 }
224
225 # Several special operatons on metadata follow
226
227 # Add individual keywords.
228 # The full set of keywords will be added, in due course, as "Keywords".
229 # However, we also want to add them as individual "Keyword" metadata elements.
230 if ($entryname eq "keywords") {
231 my @keywordlist = split(/,/, $value);
232 foreach my $k (@keywordlist) {
233 $k = lc($k);
234 $k =~ s/\s*$//;
235 $k =~ s/^\s*//;
236 if ($k =~ /\w/) {
237 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
238 }
239 }
240 }
241
242 # Add individual authors
243 # The author metadata will be stored as one "Creator" entry, but we
244 # also want to split it into several individual "Author" fields in
245 # "Lastname, Firstnames" format so we can browse it.
246 if ($entryname eq "author") { #added also comparison with editor
247
248 # take care of "et al."...
249 $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
250 my $etal=$1;
251 $etal="" if (!defined ($etal));
252 # und here for german language...
253 # don't use brackets in pattern, else the matched bit becomes
254 # an element in the list!
255 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
256 my @formattedlist = ();
257 foreach $a (@authorlist) {
258 $a =~ s/\s*$//;
259 $a =~ s/^\s*//;
260 # Reformat and add author name
261 next if $a=~ /^\s*$/;
262
263 # names are "First von Last", "von Last, First"
264 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
265 my $first="";
266 my $vonlast="";
267 my $jr="";
268
269 if ($a =~ /,/) {
270 my @parts=split(/,\s*/, $a);
271 $first = pop @parts;
272 if (scalar(@parts) == 2) {
273 $jr = pop @parts;
274 }
275 $vonlast=shift @parts;
276 if (scalar(@parts) > 0) {
277 print $outhandle "BibTexPlug: couldn't parse name $a\n";
278 # but we continue anyway...
279 }
280 } else { # First von Last
281 my @words = split(/ /, $a);
282 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
283 $first .= " " . shift (@words);
284 }
285 $first =~ s/^\s//;
286 $vonlast = join (' ', @words); # whatever's left...
287 }
288 my $von="";
289 my $last="";
290 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
291 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
292 $von = $1;
293 if (!defined ($von)) {
294 # some non-English names do start with lowercase
295 # eg "Marie desJardins". Also we can get typos...
296 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
297 $von="";
298 if ($vonlast =~ /^[a-z]+$/) {
299 # if it's all lowercase, uppercase 1st.
300 $vonlast =~ s/^(.)/\u$1/;
301
302 }
303 }
304 $von =~ s/\s*$//;
305 $last=$vonlast;
306 } else {
307 $last=$vonlast;
308 }
309 my $wholename="$first $von $last $jr";
310 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
311 push (@formattedlist, $wholename);
312 my $fullname = "$last";
313 $fullname .= " $jr" if ($jr);
314 $fullname .= ", $first";
315 $fullname .= " $von" if ($von);
316
317 # Add each name to set of Authors
318 # force utf8 pragma so that \w matches in this scope
319 use utf8;
320 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
321 }
322
323 # Only want at most one "and" in the Creator field
324 if (scalar(@formattedlist) > 2) {
325 my $lastauthor=pop @formattedlist;
326 $value=join(', ', @formattedlist);
327 $value.=" and $lastauthor";
328 } else { # 1 or 2 authors...
329 $value=join(" and ",@formattedlist);
330 }
331 $value.=$etal; # if there was "et al."
332 }
333
334 # Books and Journals are additionally marked for display purposes
335 if ($entryname eq "booktitle") {
336 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
337 } elsif ($entryname eq "journal") {
338 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
339 }
340
341
342 # Add the various fields as metadata
343 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
344 }
345
346 # Add Date (yyyymmdd) metadata
347 if (defined ($metadata{'year'}) ) {
348 my $date=$metadata{'year'};
349 chomp $date;
350 my $month=$metadata{'month'};
351 if (defined($month)) {
352 # month is currently 3 letter code or a range...
353 $month = expand_month($month);
354 # take the first month found... might not find one!
355 $month =~ m/_textmonth(\d\d)_/;
356 $month = $1;
357 }
358 if (!defined($month)) {
359 $month="00";
360 }
361 $date .= "${month}00";
362 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
363}
364
365# # Add the text in BibTex format (all fields)
366 if ($text =~ /\w/) {
367
368 $text =~ s@&@&amp;@g;
369 $text =~ s@<@&lt;@g;
370 $text =~ s@>@&gt;@g;
371 $text =~ s@\n@<br/>\n@g;
372 $text =~ s@\\@\\\\@g;
373
374 $doc_obj->add_utf8_text ($cursection, $text);
375 }
376
377 return 1;
378}
379
380
381
382
383# convert email addresses and URLs into links
384sub convert_urls_into_links{
385 my ($text) = @_;
386
387 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
388 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
389
390 return $text;
391}
392
393# Clean up whitespace and convert \n charaters to <BR> or <P>
394sub clean_up_whitespaces{
395 my ($text) = @_;
396
397 $text =~ s/%%%%%/<BR> <BR>/g;
398 $text =~ s/ +/ /g;
399 $text =~ s/\s*$//;
400 $text =~ s/^\s*//;
401 $text =~ s/\n/\n<BR>/g;
402 $text =~ s/<BR>\s*<BR>/<P>/g;
403
404 return $text;
405}
406
407
408sub convert_problem_characters_without_ampersand{
409 my ($text) = @_;
410 $text =~ s/</&lt;/g;
411 $text =~ s/>/&gt;/g;
412
413 $text =~ s/\'\'/\"/g; #Latex -specific conversion
414 $text =~ s/\`\`/\"/g; #Latex -specific conversion
415
416 $text =~ s/\"/&quot;/g;
417 $text =~ s/\'/&#8217;/g;
418 $text =~ s/\`/&#8216;/g;
419
420# $text =~ s/\+/ /g;
421# $text =~ s/\(/ /g;
422# $text =~ s/\)/ /g;
423
424 $text =~ s/\\/\\\\/g;
425
426# $text =~ s/\./\\\./g;
427
428 return $text;
429}
430
431# Convert a text string into HTML.
432
433# The HTML is going to be inserted into a GML file, so we have to be
434# careful not to use symbols like ">", which occurs frequently in email
435# messages (and use &gt instead.
436
437# This function also turns URLs and email addresses into links, and
438# replaces carriage returns with <BR> tags (and multiple carriage returns
439# with <P> tags).
440
441sub text_into_html {
442 my ($text) = @_;
443
444 # Convert problem characters into HTML symbols
445 $text =~ s/&/&amp;/g;
446
447 $text = &convert_problem_characters_without_ampersand( $text );
448
449 # convert email addresses and URLs into links
450 $text = &convert_urls_into_links( $text );
451
452 $text = &clean_up_whitespaces( $text );
453
454 return $text;
455}
456
457
458sub expand_month {
459 my $text=shift;
460
461 # bibtex style files expand abbreviations for months.
462 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
463 $text =~ s/jan/_textmonth01_/g;
464 $text =~ s/feb/_textmonth02_/g;
465 $text =~ s/mar/_textmonth03_/g;
466 $text =~ s/apr/_textmonth04_/g;
467 $text =~ s/may/_textmonth05_/g;
468 $text =~ s/jun/_textmonth06_/g;
469 $text =~ s/jul/_textmonth07_/g;
470 $text =~ s/aug/_textmonth08_/g;
471 $text =~ s/sep/_textmonth09_/g;
472 $text =~ s/oct/_textmonth10_/g;
473 $text =~ s/nov/_textmonth11_/g;
474 $text =~ s/dec/_textmonth12_/g;
475
476 return $text;
477}
478
479
480# Convert accented characters, remove { }, interprete some commands....
481# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
482sub process_latex {
483 my ($text) = @_;
484
485 # note - this is really ugly, but it works. There may be a prettier way
486 # of mapping latex accented chars to utf8, but we just brute force it here.
487 # Also, this isn't complete - not every single possible accented letter
488 # is in here yet, but most of the common ones are.
489
490 my %utf8_chars =
491 (
492 # acutes
493 '\'a' => chr(0xc3).chr(0xa1),
494 '\'c' => chr(0xc4).chr(0x87),
495 '\'e' => chr(0xc3).chr(0xa9),
496 '\'i' => chr(0xc3).chr(0xad),
497 '\'l' => chr(0xc3).chr(0xba),
498 '\'n' => chr(0xc3).chr(0x84),
499 '\'o' => chr(0xc3).chr(0xb3),
500 '\'r' => chr(0xc5).chr(0x95),
501 '\'s' => chr(0xc5).chr(0x9b),
502 '\'u' => chr(0xc3).chr(0xba),
503 '\'y' => chr(0xc3).chr(0xbd),
504 '\'z' => chr(0xc5).chr(0xba),
505 # graves
506 '`a' => chr(0xc3).chr(0xa0),
507 '`A' => chr(0xc3).chr(0x80),
508 '`e' => chr(0xc3).chr(0xa8),
509 '`E' => chr(0xc3).chr(0x88),
510 '`i' => chr(0xc3).chr(0xac),
511 '`I' => chr(0xc3).chr(0x8c),
512 '`o' => chr(0xc3).chr(0xb2),
513 '`O' => chr(0xc3).chr(0x92),
514 '`u' => chr(0xc3).chr(0xb9),
515 '`U' => chr(0xc3).chr(0x99),
516 # circumflex
517 '^a' => chr(0xc3).chr(0xa2),
518 '^A' => chr(0xc3).chr(0x82),
519 '^c' => chr(0xc4).chr(0x89),
520 '^C' => chr(0xc4).chr(0x88),
521 '^e' => chr(0xc3).chr(0xaa),
522 '^E' => chr(0xc3).chr(0x8a),
523 '^g' => chr(0xc4).chr(0x9d),
524 '^G' => chr(0xc4).chr(0x9c),
525 '^h' => chr(0xc4).chr(0xa5),
526 '^H' => chr(0xc4).chr(0xa4),
527 '^i' => chr(0xc3).chr(0xae),
528 '^I' => chr(0xc3).chr(0x8e),
529 '^j' => chr(0xc4).chr(0xb5),
530 '^J' => chr(0xc4).chr(0xb4),
531 '^o' => chr(0xc3).chr(0xb4),
532 '^O' => chr(0xc3).chr(0x94),
533 '^s' => chr(0xc5).chr(0x9d),
534 '^S' => chr(0xc5).chr(0x9c),
535 '^u' => chr(0xc3).chr(0xa2),
536 '^U' => chr(0xc3).chr(0xbb),
537 '^w' => chr(0xc5).chr(0xb5),
538 '^W' => chr(0xc5).chr(0xb4),
539 '^y' => chr(0xc5).chr(0xb7),
540 '^Y' => chr(0xc5).chr(0xb6),
541
542 # diaeresis
543 '"a' => chr(0xc3).chr(0xa4),
544 '"A' => chr(0xc3).chr(0x84),
545 '"e' => chr(0xc3).chr(0xab),
546 '"E' => chr(0xc3).chr(0x8b),
547 '"\\\\i' => chr(0xc3).chr(0xaf),
548 '"\\\\I' => chr(0xc3).chr(0x8f),
549 '"o' => chr(0xc3).chr(0xb6),
550 '"O' => chr(0xc3).chr(0x96),
551 '"u' => chr(0xc3).chr(0xbc),
552 '"U' => chr(0xc3).chr(0x9c),
553 '"y' => chr(0xc3).chr(0xbf),
554 '"Y' => chr(0xc3).chr(0xb8),
555 # tilde
556 '~A' => chr(0xc3).chr(0x83),
557 '~N' => chr(0xc3).chr(0x91),
558 '~O' => chr(0xc3).chr(0x95),
559 '~a' => chr(0xc3).chr(0xa3),
560 '~n' => chr(0xc3).chr(0xb1),
561 '~o' => chr(0xc3).chr(0xb5),
562 # caron - handled specially
563 # double acute
564 # ring
565 # dot
566 '.c' => chr(0xc4).chr(0x8b),
567 '.C' => chr(0xc4).chr(0x8a),
568 '.e' => chr(0xc4).chr(0x97),
569 '.E' => chr(0xc4).chr(0x96),
570 '.g' => chr(0xc4).chr(0xa1),
571 '.G' => chr(0xc4).chr(0xa0),
572 '.I' => chr(0xc4).chr(0xb0),
573 '.z' => chr(0xc5).chr(0xbc),
574 '.Z' => chr(0xc5).chr(0xbb),
575 # macron
576 '=a' => chr(0xc4).chr(0x81),
577 '=A' => chr(0xc4).chr(0x80),
578 '=e' => chr(0xc4).chr(0x93),
579 '=E' => chr(0xc4).chr(0x92),
580 '=i' => chr(0xc4).chr(0xab),
581 '=I' => chr(0xc4).chr(0xaa),
582 '=o' => chr(0xc4).chr(0x8d),
583 '=O' => chr(0xc4).chr(0x8c),
584 '=u' => chr(0xc4).chr(0xab),
585 '=U' => chr(0xc4).chr(0xaa),
586
587 # stroke - handled specially - see below
588
589 # cedilla - handled specially
590 );
591
592# these are one letter latex commands - we make sure they're not a longer
593# command name. eg {\d} is d+stroke, so careful of \d
594 my %special_utf8_chars =
595 (
596 # breve
597 'u g' => chr(0xc4).chr(0x9f),
598 'u G' => chr(0xc4).chr(0x9e),
599 'u i' => chr(0xc4).chr(0xad),
600 'u I' => chr(0xc4).chr(0xac),
601 'u o' => chr(0xc5).chr(0x8f),
602 'u O' => chr(0xc5).chr(0x8e),
603 'u u' => chr(0xc5).chr(0xad),
604 'u U' => chr(0xc5).chr(0xac),
605 # caron
606 'v c' => chr(0xc4).chr(0x8d),
607 'v C' => chr(0xc4).chr(0x8c),
608 'v n' => chr(0xc5).chr(0x88),
609 'v N' => chr(0xc5).chr(0x87),
610 'v s' => chr(0xc5).chr(0xa1),
611 'v S' => chr(0xc5).chr(0xa5),
612 'v z' => chr(0xc5).chr(0xbe),
613 'v Z' => chr(0xc5).chr(0xbd),
614 # cedilla
615 'c c' => chr(0xc3).chr(0xa7),
616 'c C' => chr(0xc3).chr(0x87),
617 'c g' => chr(0xc4).chr(0xa3),
618 'c G' => chr(0xc4).chr(0xa2),
619 'c k' => chr(0xc4).chr(0xb7),
620 'c K' => chr(0xc4).chr(0xb6),
621 'c l' => chr(0xc4).chr(0xbc),
622 'c L' => chr(0xc4).chr(0xbb),
623 'c n' => chr(0xc5).chr(0x86),
624 'c N' => chr(0xc5).chr(0x85),
625 'c r' => chr(0xc5).chr(0x97),
626 'c R' => chr(0xc5).chr(0x96),
627 'c s' => chr(0xc5).chr(0x9f),
628 'c S' => chr(0xc5).chr(0x9e),
629 'c t' => chr(0xc5).chr(0xa3),
630 'c T' => chr(0xc5).chr(0xa2),
631 # double acute / Hungarian accent
632 'H O' => chr(0xc5).chr(0x90),
633 'H o' => chr(0xc5).chr(0x91),
634 'H U' => chr(0xc5).chr(0xb0),
635 'H u' => chr(0xc5).chr(0xb1),
636
637 # stroke
638 'd' => chr(0xc4).chr(0x91),
639 'D' => chr(0xc4).chr(0x90),
640 'h' => chr(0xc4).chr(0xa7),
641# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
642 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
643 'l' => chr(0xc5).chr(0x82),
644 'L' => chr(0xc5).chr(0x81),
645 'o' => chr(0xc3).chr(0xb8),
646 'O' => chr(0xc3).chr(0x98),
647 't' => chr(0xc5).chr(0xa7),
648 'T' => chr(0xc5).chr(0xa6),
649 # german ss/szlig/sharp s
650 'ss' => chr(0xc3).chr(0x9f),
651 );
652
653 # convert latex-style accented characters.
654
655 # remove space (if any) between \ and letter to accent (eg {\' a})
656
657 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
658
659 # remove {} around a single character (eg \'{e})
660 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
661
662 # \, is another way of doing cedilla \c
663 $text =~ s@\\,(.)@\\c $1@g;
664
665 # remove {} around a single character for special 1 letter commands -
666 # need to insert a space. Eg \v{s} -> {\v s}
667 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
668
669 # only do if the text contains a '\' character.
670 if ($text =~ m|\\|) {
671 # "normal" accents - ie non-alpha latex tag
672 while ($text =~ m@\\([`'="^~\.])([\w])@) {
673 my $tex="$1$2"; my $char="$2";
674 my $replacement=$utf8_chars{$tex};
675 if (!defined($replacement)) {
676 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
677 $replacement=$char;
678 }
679 $text =~ s/\\$tex/$replacement/g;
680 }
681
682 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
683 # only do the change if immediately followed by a space, }, {, or \
684 # one letter accents ( + ss)
685 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
686 my $tex=$1;
687 my $replacement=$special_utf8_chars{$tex};
688 if (!defined($replacement)) {
689 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
690 $replacement=$tex;
691 }
692 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
693
694 }
695
696 # one letter latex accent commands that affect following letter
697 while ($text =~ m@\\([uvcH]) ([\w])@) {
698 my $tex="$1 $2"; my $char="$2";
699 my $replacement=$special_utf8_chars{$tex};
700 if (!defined($replacement)) {
701 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
702 $replacement=$char;
703 }
704 $text =~ s/\\$tex/$replacement/g;
705 }
706 }
707
708 # escape html-sensitive characters
709 $text =~ s@&@&amp;@g;
710 $text =~ s@<@&lt;@g;
711 $text =~ s@>@&gt;@g;
712 $text =~ s/''/"/g; # Latex-specific
713 $text =~ s/``/"/g; # Latex-specific
714 # greenstone-specific
715 $text =~ s@\[@&\#91;@g;
716 $text =~ s@\]@&\#93;@g;
717
718 # remove latex commands
719
720 # explicitly recognised commands
721 $text =~ s@\\ldots@&hellip;@g;
722
723 # maths mode
724 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
725
726 # remove all other commands with optional arguments...
727 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
728 # $text =~ s@\\noopsort{[^}]+\}@@g;
729 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
730
731 # remove latex groupings { } (but not \{ or \} )
732 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
733 $text =~ s/^\{//; # remove { if first char
734
735 # latex characters
736 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
737 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
738 # optional break "\-"
739 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
740 # the non-macro bits have quotes around them - we just remove quotes
741 $text =~ s/([^&])[\"\#]/$1/g;
742 }
743 # dashes. Convert (m|n)-dash into single dash for html.
744 $text =~ s@\-\-+@\-@g;
745
746 # quoted { } chars
747 $text =~ s@\\{@{@g;
748 $text =~ s@\\}@}@g;
749
750 # finally to protect against macro language...
751 $text =~ s@\\@\\\\@g;
752
753 return $text;
754}
755
756
757sub process_latex_math {
758 my $text = shift;
759
760 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
761 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
762 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
763 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
764 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
765
766 # put all other command names in italics
767 $text =~ s@\\([\w]+)@<i>$1</i>@g;
768
769 return $text;
770}
771
772sub set_OID {
773 my $self = shift (@_);
774 my ($doc_obj, $id, $segment_number) = @_;
775
776 if ( $self->{'key'} eq "default") {
777 $doc_obj->set_OID();
778 } else {
779 $doc_obj->set_OID($self->{'key'});
780 }
781}
782
7831;
Note: See TracBrowser for help on using the repository browser.