source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 3517

Last change on this file since 3517 was 3426, checked in by jrm21, 22 years ago

Don't add \n to the end of each metadata value.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.7 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42# * The raw ascii bibtex entry is stored as "BibTex" metadata.
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53# This plugin processes files with the suffix ".bib"
54sub get_default_process_exp {
55 return q^(?i)\.bib$^;
56}
57
58# This plugin splits the input text at blank lines
59sub get_default_split_exp {
60 return q^\n+(?=@)^;
61}
62
63
64
65# The process function reads a single bibliographic record and stores
66# it as a new document.
67
68sub process {
69 my $self = shift (@_);
70 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
71 my $outhandle = $self->{'outhandle'};
72
73 $self->{'key'} = "default";
74
75 # Check that we're dealing with a valid BibTex record
76 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
77
78 # Ignore things we can't use
79 return 0 if ($$textref =~ /^\@String/);
80
81 # Report that we're processing the file
82 print $outhandle "BibTexPlug: processing $file\n"
83 if ($self->{'verbosity'}) > 1;
84
85
86 # This hash translates BibTex field names into metadata names. The
87 # BibTex names are taken from the "Local Guide to Latex" Graeme
88 # McKinstry. Metadata names are consistent with ReferPlug.
89
90 my %field = (
91 'address', 'PublisherAddress',
92 'author', 'Creator',
93
94 'booktitle', 'Booktitle',
95 'chapter', 'Chapter',
96 'edition', 'Edition',
97 'editor', 'Editor',
98 'institution', 'Publisher',
99 'journal', 'Journal',
100 'month', 'Month',
101 'number', 'Number',
102 'pages', 'Pages',
103 'publisher', 'Publisher',
104 'school', 'Publisher',
105 'title', 'Title',
106 'volume', 'Volume',
107 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
108
109 'keywords', 'Keywords',
110 'abstract', 'Abstract',
111 'copyright', 'Copyright'
112);
113
114 # Metadata fields
115 my %metadata;
116 my ($EntryType, $EntryID, $Creator, $Keywords, $text);
117
118 my $verbosity = $self->{'verbosity'};
119 $verbosity = 0 unless $verbosity;
120
121 # Make sure the text has exactly one entry per line -
122 # append line to previous if it doesn't start with " <key> = "
123
124 my @input_lines=split('\n', $$textref);
125 my @all_lines;
126 my $entry_line=shift @input_lines;
127 foreach my $input_line (@input_lines) {
128 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
129 # this is a new key
130 push(@all_lines, $entry_line);
131 $entry_line=$input_line;
132 } else {
133 # this is a continuation of previous line
134 $entry_line .= " " . $input_line;
135 }
136
137 }
138 # add final line, removing trailing '}'
139 $entry_line =~ s/\}\s*$//;
140 push(@all_lines, $entry_line);
141 push(@all_lines, "}");
142
143
144
145 # Read and process each line in the bib file.
146 my ($entryname, $name, $value, $line);
147 foreach $line (@all_lines) {
148
149 # Add each line. Most lines consist of a field identifer and
150 # then data, and we simply store them, though we treat some
151 # of the fields a bit differently.
152
153 $line =~ s/\s+/ /g;
154 $text .= "$line\n";
155
156 print "Processing line = $line \n" if $verbosity>=4;
157
158 # The first line is special, it contains the reference type and OID
159 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) {
160 $EntryType = $1;
161 $EntryID = $2;
162 print "** $EntryType - \"$EntryID\" \n"
163 if ($verbosity >= 4);
164 $self->{'key'} = $EntryID;
165 next;
166 }
167 if ($line =~ /\@/) {
168 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
169 if ($verbosity >= 2);
170 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
171 if ($verbosity >= 2);
172 }
173
174 # otherwise, parse the metadata out of this line
175 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
176 $entryname = lc($1);
177 $value = $2;
178 $value =~ s/,?\s*$//; # remove trailing comma and space
179 if ($value =~ /^"/ && $value =~ /"$/) {
180 # remove surrounding " marks
181 $value =~ s/^"//; $value =~ s/"$//;
182 }
183 $value = &process_latex($value);
184
185 # Add this line of metadata
186 $metadata{$entryname} .= "$value";
187
188 }
189
190 # Add the Entry type as metadata
191 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
192
193 # Add the various field as metadata
194 foreach my $entryname (keys %metadata) {
195 next unless (defined $field{$entryname});
196 next unless (defined $metadata{$entryname});
197
198 $name = $field{$entryname};
199 $value = $metadata{$entryname};
200
201 if ($name =~ /^Month/) {
202 $value=expand_month($value);
203 }
204
205 # Several special operatons on metadata follow
206
207 # Add individual keywords.
208 # The full set of keywords will be added, in due course, as "Keywords".
209 # However, we also want to add them as individual "Keyword" metadata elements.
210 if ($entryname eq "keywords") {
211 my @keywordlist = split(/,/, $value);
212 foreach my $k (@keywordlist) {
213 $k = lc($k);
214 $k =~ s/\s*$//;
215 $k =~ s/^\s*//;
216 if ($k =~ /\w/) {
217 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
218 }
219 }
220 }
221
222 # Add individual authors
223 # The author metadata will be stored as one "Creator" entry, but we
224 # also want to split it into several individual "Author" fields in
225 # "Lastname, Firstnames" format so we can browse it.
226 if ($entryname eq "author") { #added also comparison with editor
227
228 # take care of "et al."...
229 $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
230 my $etal=$1;
231 $etal="" if (!defined ($etal));
232 # und here for german language...
233 # don't use brackets in pattern, else the matched bit becomes
234 # an element in the list!
235 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
236 my @formattedlist = ();
237 foreach $a (@authorlist) {
238 $a =~ s/\s*$//;
239 $a =~ s/^\s*//;
240 # Reformat and add author name
241 next if $a=~ /^\s*$/;
242
243 # names are "First von Last", "von Last, First"
244 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
245 my $first="";
246 my $vonlast="";
247 my $jr="";
248
249 if ($a =~ /,/) {
250 my @parts=split(/,\s*/, $a);
251 $first = pop @parts;
252 if (scalar(@parts) == 2) {
253 $jr = pop @parts;
254 }
255 $vonlast=shift @parts;
256 if (scalar(@parts) > 0) {
257 print $outhandle "BibTexPlug: couldn't parse name $a\n";
258 # but we continue anyway...
259 }
260 } else { # First von Last
261 my @words = split(/ /, $a);
262 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
263 $first .= " " . shift (@words);
264 }
265 $first =~ s/^\s//;
266 $vonlast = join (' ', @words); # whatever's left...
267 }
268 my $von="";
269 my $last="";
270 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
271 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
272 $von = $1;
273 if (!defined ($von)) {
274 # some non-English names do start with lowercase
275 # eg "Marie desJardins". Also we can get typos...
276 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
277 $von="";
278 if ($vonlast =~ /^[a-z]+$/) {
279 # if it's all lowercase, uppercase 1st.
280 $vonlast =~ s/^(.)/\u$1/;
281
282 }
283 }
284 $von =~ s/\s*$//;
285 $last=$vonlast;
286 } else {
287 $last=$vonlast;
288 }
289 my $wholename="$first $von $last $jr";
290 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
291 push (@formattedlist, $wholename);
292 my $fullname = "$last";
293 $fullname .= " $jr" if ($jr);
294 $fullname .= ", $first";
295 $fullname .= " $von" if ($von);
296
297 # Add each name to set of Authors
298 # force utf8 pragma so that \w matches in this scope
299 use utf8;
300 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
301 }
302
303 # Only want at most one "and" in the Creator field
304 if (scalar(@formattedlist) > 2) {
305 my $lastauthor=pop @formattedlist;
306 $value=join(', ', @formattedlist);
307 $value.=" and $lastauthor";
308 } else { # 1 or 2 authors...
309 $value=join(" and ",@formattedlist);
310 }
311 $value.=$etal; # if there was "et al."
312 }
313
314 # Books and Journals are additionally marked for display purposes
315 if ($entryname eq "booktitle") {
316 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
317 } elsif ($entryname eq "journal") {
318 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
319 }
320
321
322 # Add the various fields as metadata
323 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
324 }
325
326 # Add Date (yyyymmdd) metadata
327 if (defined ($metadata{'year'}) ) {
328 my $date=$metadata{'year'};
329 chomp $date;
330 my $month=$metadata{'month'};
331 if (defined($month)) {
332 # month is currently 3 letter code or a range...
333 $month = expand_month($month);
334 # take the first month found... might not find one!
335 $month =~ m/_textmonth(\d\d)_/;
336 $month = $1;
337 }
338 if (!defined($month)) {
339 $month="00";
340 }
341 $date .= "${month}00";
342 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
343}
344
345# # Add the text in BibTex format (all fields)
346 if ($text =~ /\w/) {
347
348 $text =~ s@&@&amp;@g;
349 $text =~ s@<@&lt;@g;
350 $text =~ s@>@&gt;@g;
351 $text =~ s@\n@<br/>\n@g;
352 $text =~ s@\\@\\\\@g;
353
354# Not really required...
355# $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);
356 $doc_obj->add_utf8_text ($cursection, $text);
357 }
358
359 return 1;
360}
361
362
363
364
365# convert email addresses and URLs into links
366sub convert_urls_into_links{
367 my ($text) = @_;
368
369 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
370 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
371
372 return $text;
373}
374
375# Clean up whitespace and convert \n charaters to <BR> or <P>
376sub clean_up_whitespaces{
377 my ($text) = @_;
378
379 $text =~ s/%%%%%/<BR> <BR>/g;
380 $text =~ s/ +/ /g;
381 $text =~ s/\s*$//;
382 $text =~ s/^\s*//;
383 $text =~ s/\n/\n<BR>/g;
384 $text =~ s/<BR>\s*<BR>/<P>/g;
385
386 return $text;
387}
388
389
390sub convert_problem_characters_without_ampersand{
391 my ($text) = @_;
392 $text =~ s/</&lt;/g;
393 $text =~ s/>/&gt;/g;
394
395 $text =~ s/\'\'/\"/g; #Latex -specific conversion
396 $text =~ s/\`\`/\"/g; #Latex -specific conversion
397
398 $text =~ s/\"/&quot;/g;
399 $text =~ s/\'/&#8217;/g;
400 $text =~ s/\`/&#8216;/g;
401
402# $text =~ s/\+/ /g;
403# $text =~ s/\(/ /g;
404# $text =~ s/\)/ /g;
405
406 $text =~ s/\\/\\\\/g;
407
408# $text =~ s/\./\\\./g;
409
410 return $text;
411}
412
413# Convert a text string into HTML.
414
415# The HTML is going to be inserted into a GML file, so we have to be
416# careful not to use symbols like ">", which occurs frequently in email
417# messages (and use &gt instead.
418
419# This function also turns URLs and email addresses into links, and
420# replaces carriage returns with <BR> tags (and multiple carriage returns
421# with <P> tags).
422
423sub text_into_html {
424 my ($text) = @_;
425
426 # Convert problem characters into HTML symbols
427 $text =~ s/&/&amp;/g;
428
429 $text = &convert_problem_characters_without_ampersand( $text );
430
431 # convert email addresses and URLs into links
432 $text = &convert_urls_into_links( $text );
433
434 $text = &clean_up_whitespaces( $text );
435
436 return $text;
437}
438
439
440sub expand_month {
441 my $text=shift;
442
443 # bibtex style files expand abbreviations for months.
444 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
445 $text =~ s/jan/_textmonth01_/g;
446 $text =~ s/feb/_textmonth02_/g;
447 $text =~ s/mar/_textmonth03_/g;
448 $text =~ s/apr/_textmonth04_/g;
449 $text =~ s/may/_textmonth05_/g;
450 $text =~ s/jun/_textmonth06_/g;
451 $text =~ s/jul/_textmonth07_/g;
452 $text =~ s/aug/_textmonth08_/g;
453 $text =~ s/sep/_textmonth09_/g;
454 $text =~ s/oct/_textmonth10_/g;
455 $text =~ s/nov/_textmonth11_/g;
456 $text =~ s/dec/_textmonth12_/g;
457
458 return $text;
459}
460
461
462# Convert accented characters, remove { }, interprete some commands....
463# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
464sub process_latex {
465 my ($text) = @_;
466
467 # note - this is really ugly, but it works. There may be a prettier way
468 # of mapping latex accented chars to utf8, but we just brute force it here.
469 # Also, this isn't complete - not every single possible accented letter
470 # is in here yet, but most of the common ones are.
471
472 my %utf8_chars =
473 (
474 # acutes
475 '\'a' => chr(0xc3).chr(0xa1),
476 '\'c' => chr(0xc4).chr(0x87),
477 '\'e' => chr(0xc3).chr(0xa9),
478 '\'i' => chr(0xc3).chr(0xad),
479 '\'l' => chr(0xc3).chr(0xba),
480 '\'n' => chr(0xc3).chr(0x84),
481 '\'o' => chr(0xc3).chr(0xb3),
482 '\'r' => chr(0xc5).chr(0x95),
483 '\'s' => chr(0xc5).chr(0x9b),
484 '\'u' => chr(0xc3).chr(0xba),
485 '\'y' => chr(0xc3).chr(0xbd),
486 '\'z' => chr(0xc5).chr(0xba),
487 # graves
488 '`a' => chr(0xc3).chr(0xa0),
489 '`A' => chr(0xc3).chr(0x80),
490 '`e' => chr(0xc3).chr(0xa8),
491 '`E' => chr(0xc3).chr(0x88),
492 '`i' => chr(0xc3).chr(0xac),
493 '`I' => chr(0xc3).chr(0x8c),
494 '`o' => chr(0xc3).chr(0xb2),
495 '`O' => chr(0xc3).chr(0x92),
496 '`u' => chr(0xc3).chr(0xb9),
497 '`U' => chr(0xc3).chr(0x99),
498 # circumflex
499 '^a' => chr(0xc3).chr(0xa2),
500 '^A' => chr(0xc3).chr(0x82),
501 '^c' => chr(0xc4).chr(0x89),
502 '^C' => chr(0xc4).chr(0x88),
503 '^e' => chr(0xc3).chr(0xaa),
504 '^E' => chr(0xc3).chr(0x8a),
505 '^g' => chr(0xc4).chr(0x9d),
506 '^G' => chr(0xc4).chr(0x9c),
507 '^h' => chr(0xc4).chr(0xa5),
508 '^H' => chr(0xc4).chr(0xa4),
509 '^i' => chr(0xc3).chr(0xae),
510 '^I' => chr(0xc3).chr(0x8e),
511 '^j' => chr(0xc4).chr(0xb5),
512 '^J' => chr(0xc4).chr(0xb4),
513 '^o' => chr(0xc3).chr(0xb4),
514 '^O' => chr(0xc3).chr(0x94),
515 '^s' => chr(0xc5).chr(0x9d),
516 '^S' => chr(0xc5).chr(0x9c),
517 '^u' => chr(0xc3).chr(0xa2),
518 '^U' => chr(0xc3).chr(0xbb),
519 '^w' => chr(0xc5).chr(0xb5),
520 '^W' => chr(0xc5).chr(0xb4),
521 '^y' => chr(0xc5).chr(0xb7),
522 '^Y' => chr(0xc5).chr(0xb6),
523
524 # diaeresis
525 '"a' => chr(0xc3).chr(0xa4),
526 '"A' => chr(0xc3).chr(0x84),
527 '"e' => chr(0xc3).chr(0xab),
528 '"E' => chr(0xc3).chr(0x8b),
529 '"\\\\i' => chr(0xc3).chr(0xaf),
530 '"\\\\I' => chr(0xc3).chr(0x8f),
531 '"o' => chr(0xc3).chr(0xb6),
532 '"O' => chr(0xc3).chr(0x96),
533 '"u' => chr(0xc3).chr(0xbc),
534 '"U' => chr(0xc3).chr(0x9c),
535 '"y' => chr(0xc3).chr(0xbf),
536 '"Y' => chr(0xc3).chr(0xb8),
537 # tilde
538 '~A' => chr(0xc3).chr(0x83),
539 '~N' => chr(0xc3).chr(0x91),
540 '~O' => chr(0xc3).chr(0x95),
541 '~a' => chr(0xc3).chr(0xa3),
542 '~n' => chr(0xc3).chr(0xb1),
543 '~o' => chr(0xc3).chr(0xb5),
544 # caron - handled specially
545 # double acute
546 # ring
547 # dot
548 '.c' => chr(0xc4).chr(0x8b),
549 '.C' => chr(0xc4).chr(0x8a),
550 '.e' => chr(0xc4).chr(0x97),
551 '.E' => chr(0xc4).chr(0x96),
552 '.g' => chr(0xc4).chr(0xa1),
553 '.G' => chr(0xc4).chr(0xa0),
554 '.I' => chr(0xc4).chr(0xb0),
555 '.z' => chr(0xc5).chr(0xbc),
556 '.Z' => chr(0xc5).chr(0xbb),
557 # macron
558 '=a' => chr(0xc4).chr(0x81),
559 '=A' => chr(0xc4).chr(0x80),
560 '=e' => chr(0xc4).chr(0x93),
561 '=E' => chr(0xc4).chr(0x92),
562 '=i' => chr(0xc4).chr(0xab),
563 '=I' => chr(0xc4).chr(0xaa),
564 '=o' => chr(0xc4).chr(0x8d),
565 '=O' => chr(0xc4).chr(0x8c),
566 '=u' => chr(0xc4).chr(0xab),
567 '=U' => chr(0xc4).chr(0xaa),
568
569 # stroke - handled specially - see below
570
571 # cedilla - handled specially
572 );
573
574# these are one letter latex commands - we make sure they're not a longer
575# command name. eg {\d} is d+stroke, so careful of \d
576 my %special_utf8_chars =
577 (
578 # breve
579 'u g' => chr(0xc4).chr(0x9f),
580 'u G' => chr(0xc4).chr(0x9e),
581 'u i' => chr(0xc4).chr(0xad),
582 'u I' => chr(0xc4).chr(0xac),
583 'u o' => chr(0xc5).chr(0x8f),
584 'u O' => chr(0xc5).chr(0x8e),
585 'u u' => chr(0xc5).chr(0xad),
586 'u U' => chr(0xc5).chr(0xac),
587 # caron
588 'v c' => chr(0xc4).chr(0x8d),
589 'v C' => chr(0xc4).chr(0x8c),
590 'v n' => chr(0xc5).chr(0x88),
591 'v N' => chr(0xc5).chr(0x87),
592 'v s' => chr(0xc5).chr(0xa1),
593 'v S' => chr(0xc5).chr(0xa5),
594 'v z' => chr(0xc5).chr(0xbe),
595 'v Z' => chr(0xc5).chr(0xbd),
596 # cedilla
597 'c c' => chr(0xc3).chr(0xa7),
598 'c C' => chr(0xc3).chr(0x87),
599 'c g' => chr(0xc4).chr(0xa3),
600 'c G' => chr(0xc4).chr(0xa2),
601 'c k' => chr(0xc4).chr(0xb7),
602 'c K' => chr(0xc4).chr(0xb6),
603 'c l' => chr(0xc4).chr(0xbc),
604 'c L' => chr(0xc4).chr(0xbb),
605 'c n' => chr(0xc5).chr(0x86),
606 'c N' => chr(0xc5).chr(0x85),
607 'c r' => chr(0xc5).chr(0x97),
608 'c R' => chr(0xc5).chr(0x96),
609 'c s' => chr(0xc5).chr(0x9f),
610 'c S' => chr(0xc5).chr(0x9e),
611 'c t' => chr(0xc5).chr(0xa3),
612 'c T' => chr(0xc5).chr(0xa2),
613 # double acute / Hungarian accent
614 'H O' => chr(0xc5).chr(0x90),
615 'H o' => chr(0xc5).chr(0x91),
616 'H U' => chr(0xc5).chr(0xb0),
617 'H u' => chr(0xc5).chr(0xb1),
618
619 # stroke
620 'd' => chr(0xc4).chr(0x91),
621 'D' => chr(0xc4).chr(0x90),
622 'h' => chr(0xc4).chr(0xa7),
623# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
624 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
625 'l' => chr(0xc5).chr(0x82),
626 'L' => chr(0xc5).chr(0x81),
627 'o' => chr(0xc3).chr(0xb8),
628 'O' => chr(0xc3).chr(0x98),
629 't' => chr(0xc5).chr(0xa7),
630 'T' => chr(0xc5).chr(0xa6),
631 # german ss/szlig/sharp s
632 'ss' => chr(0xc3).chr(0x9f),
633 );
634
635 # convert latex-style accented characters.
636
637 # remove space (if any) between \ and letter to accent (eg {\' a})
638
639 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
640
641 # remove {} around a single character (eg \'{e})
642 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
643
644 # \, is another way of doing cedilla \c
645 $text =~ s@\\,(.)@\\c $1@g;
646
647 # remove {} around a single character for special 1 letter commands -
648 # need to insert a space. Eg \v{s} -> {\v s}
649 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
650
651 # only do if the text contains a '\' character.
652 if ($text =~ m|\\|) {
653 # "normal" accents - ie non-alpha latex tag
654 while ($text =~ m@\\([`'="^~\.])([\w])@) {
655 my $tex="$1$2"; my $char="$2";
656 my $replacement=$utf8_chars{$tex};
657 if (!defined($replacement)) {
658 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
659 $replacement=$char;
660 }
661 $text =~ s/\\$tex/$replacement/g;
662 }
663
664 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
665 # only do the change if immediately followed by a space, }, {, or \
666 # one letter accents ( + ss)
667 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
668 my $tex=$1;
669 my $replacement=$special_utf8_chars{$tex};
670 if (!defined($replacement)) {
671 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
672 $replacement=$tex;
673 }
674 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
675
676 }
677
678 # one letter latex accent commands that affect following letter
679 while ($text =~ m@\\([uvcH]) ([\w])@) {
680 my $tex="$1 $2"; my $char="$2";
681 my $replacement=$special_utf8_chars{$tex};
682 if (!defined($replacement)) {
683 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
684 $replacement=$char;
685 }
686 $text =~ s/\\$tex/$replacement/g;
687 }
688 }
689
690 # escape html-sensitive characters
691 $text =~ s@&@&amp;@g;
692 $text =~ s@<@&lt;@g;
693 $text =~ s@>@&gt;@g;
694 $text =~ s/''/"/g; # Latex-specific
695 $text =~ s/``/"/g; # Latex-specific
696 # greenstone-specific
697 $text =~ s@\[@&\#91;@g;
698 $text =~ s@\]@&\#93;@g;
699
700 # remove latex commands
701
702 # explicitly recognised commands
703 $text =~ s@\\ldots@&hellip;@g;
704
705 # maths mode
706 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
707
708 # remove all other commands with optional arguments...
709 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
710 # $text =~ s@\\noopsort{[^}]+\}@@g;
711 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
712
713 # remove latex groupings { } (but not \{ or \} )
714 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
715 $text =~ s/^\{//; # remove { if first char
716
717 # latex characters
718 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
719 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
720 # optional break "\-"
721 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
722 # the non-macro bits have quotes around them - we just remove quotes
723 $text =~ s/([^&])[\"\#]/$1/g;
724 }
725 # dashes. Convert (m|n)-dash into single dash for html.
726 $text =~ s@\-\-+@\-@g;
727
728 # quoted { } chars
729 $text =~ s@\\{@{@g;
730 $text =~ s@\\}@}@g;
731
732 # finally to protect against macro language...
733 $text =~ s@\\@\\\\@g;
734
735 return $text;
736}
737
738
739sub process_latex_math {
740 my $text = shift;
741
742 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
743 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
744 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
745 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
746 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
747
748 # put all other command names in italics
749 $text =~ s@\\([\w]+)@<i>$1</i>@g;
750
751 return $text;
752}
753
754sub set_OID {
755 my $self = shift (@_);
756 my ($doc_obj, $id, $segment_number) = @_;
757
758 if ( $self->{'key'} eq "default") {
759 $doc_obj->set_OID();
760 } else {
761 $doc_obj->set_OID($self->{'key'});
762 }
763}
764
7651;
Note: See TracBrowser for help on using the repository browser.