source: gsdl/trunk/perllib/plugins/BibTexPlugin.pm@ 17026

Last change on this file since 17026 was 17026, checked in by kjdon, 16 years ago

OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis...)

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
Line 
1###########################################################################
2#
3# BibTexPlugin.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlugin reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlugin creates a document object for every reference a the file.
35# It is a subclass of SplitTextFile, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlugin;
45
46use SplitTextFile;
47use strict;
48no strict 'refs'; # allow filehandles to be variables and viceversa
49
50# BibTexPlugin is a sub-class of SplitTextFile.
51sub BEGIN {
52 @BibTexPlugin::ISA = ('SplitTextFile');
53}
54
55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasePlugin.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() },
61 { 'name' => "split_exp",
62 'desc' => "{SplitTextFile.split_exp}",
63 'type' => "regexp",
64 'deft' => &get_default_split_exp(),
65 'reqd' => "no" }
66 ];
67
68my $options = { 'name' => "BibTexPlugin",
69 'desc' => "{BibTexPlugin.desc}",
70 'abstract' => "no",
71 'inherits' => "yes",
72 'explodes' => "yes",
73 'args' => $arguments };
74
75# This plugin processes files with the suffix ".bib"
76sub get_default_process_exp {
77 return q^(?i)\.bib$^;
78}
79
80# This plugin splits the input text at blank lines
81sub get_default_split_exp {
82 return q^\n+(?=@)^;
83}
84
85sub new {
86 my ($class) = shift (@_);
87 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
88 push(@$pluginlist, $class);
89
90 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91 push(@{$hashArgOptLists->{"OptList"}},$options);
92
93 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
94
95 return bless $self, $class;
96}
97
98
99
100# The process function reads a single bibliographic record and stores
101# it as a new document.
102
103sub process {
104 my $self = shift (@_);
105 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
106 my $outhandle = $self->{'outhandle'};
107
108 my $cursection = $doc_obj->get_top_section();
109 $self->{'key'} = "default";
110
111 # Check that we're dealing with a valid BibTex record
112 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
113
114 # Ignore things we can't use
115 return 0 if ($$textref =~ /^\@String/);
116
117 # This hash translates BibTex field names into metadata names. The
118 # BibTex names are taken from the "Local Guide to Latex" Graeme
119 # McKinstry. Metadata names are consistent with ReferPlug.
120
121 # The author metadata will be stored as one "Creator" entry, but will
122 # also be split up into several individual "Author" fields.
123
124 my %field = (
125 'address', 'PublisherAddress',
126 'author', 'Creator',
127
128 'booktitle', 'Booktitle',
129 'chapter', 'Chapter',
130 'edition', 'Edition',
131 'editor', 'Editor',
132 'institution', 'Publisher',
133 'journal', 'Journal',
134 'month', 'Month',
135 'number', 'Number',
136 'pages', 'Pages',
137 'publisher', 'Publisher',
138 'school', 'Publisher',
139 'title', 'Title',
140 'volume', 'Volume',
141 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
142
143 'keywords', 'Keywords',
144 'abstract', 'Abstract',
145 'copyright', 'Copyright',
146 'note', 'Note',
147 'url', 'URL',
148 );
149
150 # Metadata fields
151 my %metadata;
152 my ($EntryType, $Creator, $Keywords, $text);
153
154 my $verbosity = $self->{'verbosity'};
155 $verbosity = 0 unless $verbosity;
156
157 # Make sure the text has exactly one entry per line -
158 # append line to previous if it doesn't start with " <key> = "
159
160 my @input_lines=split(/\r?\n/, $$textref);
161 my @all_lines;
162 my $entry_line=shift @input_lines;
163 foreach my $input_line (@input_lines) {
164 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
165 # this is a new key
166 push(@all_lines, $entry_line);
167 $entry_line=$input_line;
168 } else {
169 # this is a continuation of previous line
170 $entry_line .= " " . $input_line;
171 }
172
173 }
174 # add final line, removing trailing '}'
175 $entry_line =~ s/\}\s*$//;
176 push(@all_lines, $entry_line);
177 push(@all_lines, "}");
178
179 # Read and process each line in the bib file.
180 my ($entryname, $name, $value, $line);
181 foreach $line (@all_lines) {
182
183 # Add each line. Most lines consist of a field identifer and
184 # then data, and we simply store them, though we treat some
185 # of the fields a bit differently.
186
187 $line =~ s/\s+/ /g;
188 $text .= "$line\n";
189
190
191 print "Processing line = $line \n" if $verbosity>=4;
192
193 # The first line is special, it contains the reference type and OID
194 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
195 $EntryType = $1;
196 my $EntryID = (defined $2) ? $2 : "default";
197 print "** $EntryType - \"$EntryID\" \n"
198 if ($verbosity >= 4);
199
200 next;
201 }
202 if ($line =~ /\@/) {
203 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
204 if ($verbosity >= 2);
205 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
206 if ($verbosity >= 2);
207 }
208
209 # otherwise, parse the metadata out of this line
210 next unless ($line =~ /^\s*(\w+)\s*=\s*(.*)/);
211 $entryname = lc($1);
212 $value = $2;
213 $value =~ s/,?\s*$//; # remove trailing comma and space
214 if ($value =~ /^"/ && $value =~ /"$/) {
215 # remove surrounding " marks
216 $value =~ s/^"//; $value =~ s/"$//;
217 } elsif ($value =~ /^\{/ && $value =~ /\}$/) {
218 # remove surrounding {} marks
219 $value =~ s/^\{//; $value =~ s/\}$//;
220 }
221 # special case for year - we only want a 4 digit year, not
222 # "circa 2004" etc
223 if ($entryname=~ /^year$/i) {
224 if ($value=~ /(\d{4})/) {
225 $value=$1;
226 $metadata{$entryname} .= "$value";
227 }
228 }
229 else {
230
231 $value = &process_latex($value);
232 # Add this line of metadata
233 $metadata{$entryname} .= "$value";
234 }
235 }
236
237 # Add the Entry type as metadata
238 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
239
240 #Add the fileformat as metadata
241 $doc_obj->add_metadata($cursection, "FileFormat", "BibTex");
242
243
244 # Add the various field as metadata
245 foreach my $entryname (keys %metadata) {
246 next unless (defined $field{$entryname});
247 next unless (defined $metadata{$entryname});
248
249 $name = $field{$entryname};
250 $value = $metadata{$entryname};
251
252 if ($name =~ /^Month/) {
253 $value=expand_month($value);
254 }
255
256 # Several special operatons on metadata follow
257
258 # Add individual keywords.
259 # The full set of keywords will be added, in due course, as "Keywords".
260 # However, we also want to add them as individual "Keyword" metadata elements.
261 if ($entryname eq "keywords") {
262 my @keywordlist = split(/,/, $value);
263 foreach my $k (@keywordlist) {
264 $k = lc($k);
265 $k =~ s/\s*$//;
266 $k =~ s/^\s*//;
267 if ($k =~ /\w/) {
268 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
269 }
270 }
271 }
272
273 # Add individual authors
274 # The author metadata will be stored as one "Creator" entry, but we
275 # also want to split it into several individual "Author" fields in
276 # "Lastname, Firstnames" format so we can browse it.
277 if ($entryname eq "author") { #added also comparison with editor
278
279 # take care of "et al."...
280 my $etal='';
281 if ($value =~ s/\s+(and\s+others|et\.?\s+al\.?)\s*$//i) {
282 $etal=' <em>et. al.</em>';
283 }
284 # und here for german language...
285 # don't use brackets in pattern, else the matched bit becomes
286 # an element in the list!
287 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
288 my @formattedlist = ();
289 foreach $a (@authorlist) {
290 $a =~ s/\s*$//;
291 $a =~ s/^\s*//;
292 # Reformat and add author name
293 next if $a=~ /^\s*$/;
294
295 # names are "First von Last", "von Last, First"
296 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
297 my $first="";
298 my $vonlast="";
299 my $jr="";
300
301 if ($a =~ /,/) {
302 my @parts=split(/,\s*/, $a);
303 $first = pop @parts;
304 if (scalar(@parts) == 2) {
305 $jr = pop @parts;
306 }
307 $vonlast=shift @parts;
308 if (scalar(@parts) > 0) {
309 print $outhandle "BibTexPlugin: couldn't parse name $a\n";
310 # but we continue anyway...
311 }
312 } else { # First von Last
313 my @words = split(/ /, $a);
314 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
315 $first .= " " . shift (@words);
316 }
317 $first =~ s/^\s//;
318 $vonlast = join (' ', @words); # whatever's left...
319 }
320 my $von="";
321 my $last="";
322 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
323 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
324 $von = $1;
325 if (!defined ($von)) {
326 # some non-English names do start with lowercase
327 # eg "Marie desJardins". Also we can get typos...
328 print $outhandle "BibTexPlugin: couldn't parse surname $vonlast\n";
329 $von="";
330 if ($vonlast =~ /^[a-z]+$/) {
331 # if it's all lowercase, uppercase 1st.
332 $vonlast =~ s/^(.)/\u$1/;
333
334 }
335 }
336 $von =~ s/\s*$//;
337 $last=$vonlast;
338 } else {
339 $last=$vonlast;
340 }
341 my $wholename="$first $von $last $jr";
342 $wholename =~ s/\s+/ /g; # squeeze multiple spaces
343 $wholename =~ s/ $//;
344 push (@formattedlist, $wholename);
345 my $fullname = "$last";
346 $fullname .= " $jr" if ($jr);
347 $fullname .= ", $first";
348 $fullname .= " $von" if ($von);
349
350 # Add each name to set of Authors
351 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
352 }
353
354 # Only want at most one "and" in the Creator field
355 if (scalar(@formattedlist) > 2) {
356 my $lastauthor=pop @formattedlist;
357 $value=join(', ', @formattedlist);
358 $value.=" and $lastauthor";
359 } else { # 1 or 2 authors...
360 $value=join(" and ",@formattedlist);
361 }
362 $value.=$etal; # if there was "et al."
363 }
364
365 # Books and Journals are additionally marked for display purposes
366 if ($entryname eq "booktitle") {
367 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
368 } elsif ($entryname eq "journal") {
369 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
370 }
371
372 # Add the various fields as metadata
373 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
374
375 }
376
377 # for books and journals...
378 if (!exists $metadata{'title'}) {
379 my $name=$field{'title'}; # get Greenstone metadata name
380 my $value;
381 if (exists $metadata{'booktitle'}) {
382 $value=$metadata{'booktitle'};
383 } elsif (exists $metadata{'journal'}) {
384 $value=$metadata{'journal'};
385 }
386 if ($value) {
387 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
388 }
389 }
390
391 # Add Date (yyyymmdd) metadata
392 if (defined ($metadata{'year'}) ) {
393 my $date=$metadata{'year'};
394 chomp $date;
395 my $month=$metadata{'month'};
396 if (defined($month)) {
397 # month is currently 3 letter code or a range...
398 $month = expand_month($month);
399 # take the first month found... might not find one!
400 $month =~ m/_textmonth(\d\d)_/;
401 $month = $1;
402 }
403 if (!defined($month)) {
404 $month="00";
405 }
406 $date .= "${month}00";
407 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
408}
409
410# # Add the text in BibTex format (all fields)
411 if ($text =~ /\w/) {
412
413 $text =~ s@&@&amp;@g;
414 $text =~ s@<@&lt;@g;
415 $text =~ s@>@&gt;@g;
416 $text =~ s@\n@<br/>\n@g;
417 $text =~ s@\\@\\\\@g;
418
419 $doc_obj->add_utf8_text ($cursection, $text);
420 }
421
422 return 1;
423}
424
425
426
427
428# convert email addresses and URLs into links
429sub convert_urls_into_links{
430 my ($text) = @_;
431
432 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
433 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
434
435 return $text;
436}
437
438# Clean up whitespace and convert \n charaters to <BR> or <P>
439sub clean_up_whitespaces{
440 my ($text) = @_;
441
442 $text =~ s/%%%%%/<BR> <BR>/g;
443 $text =~ s/ +/ /g;
444 $text =~ s/\s*$//;
445 $text =~ s/^\s*//;
446 $text =~ s/\n/\n<BR>/g;
447 $text =~ s/<BR>\s*<BR>/<P>/g;
448
449 return $text;
450}
451
452
453sub convert_problem_characters_without_ampersand{
454 my ($text) = @_;
455 $text =~ s/</&lt;/g;
456 $text =~ s/>/&gt;/g;
457
458 $text =~ s/\'\'/\"/g; #Latex -specific conversion
459 $text =~ s/\`\`/\"/g; #Latex -specific conversion
460
461 $text =~ s/\"/&quot;/g;
462 $text =~ s/\'/&#8217;/g;
463 $text =~ s/\`/&#8216;/g;
464
465# $text =~ s/\+/ /g;
466# $text =~ s/\(/ /g;
467# $text =~ s/\)/ /g;
468
469 $text =~ s/\\/\\\\/g;
470
471# $text =~ s/\./\\\./g;
472
473 return $text;
474}
475
476# Convert a text string into HTML.
477
478# The HTML is going to be inserted into a GML file, so we have to be
479# careful not to use symbols like ">", which occurs frequently in email
480# messages (and use &gt instead.
481
482# This function also turns URLs and email addresses into links, and
483# replaces carriage returns with <BR> tags (and multiple carriage returns
484# with <P> tags).
485
486sub text_into_html {
487 my ($text) = @_;
488
489 # Convert problem characters into HTML symbols
490 $text =~ s/&/&amp;/g;
491
492 $text = &convert_problem_characters_without_ampersand( $text );
493
494 # convert email addresses and URLs into links
495 $text = &convert_urls_into_links( $text );
496
497 $text = &clean_up_whitespaces( $text );
498
499 return $text;
500}
501
502
503sub expand_month {
504 my $text=shift;
505
506 # bibtex style files expand abbreviations for months.
507 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
508 $text =~ s/jan/_textmonth01_/g;
509 $text =~ s/feb/_textmonth02_/g;
510 $text =~ s/mar/_textmonth03_/g;
511 $text =~ s/apr/_textmonth04_/g;
512 $text =~ s/may/_textmonth05_/g;
513 $text =~ s/jun/_textmonth06_/g;
514 $text =~ s/jul/_textmonth07_/g;
515 $text =~ s/aug/_textmonth08_/g;
516 $text =~ s/sep/_textmonth09_/g;
517 $text =~ s/oct/_textmonth10_/g;
518 $text =~ s/nov/_textmonth11_/g;
519 $text =~ s/dec/_textmonth12_/g;
520
521 return $text;
522}
523
524
525# Convert accented characters, remove { }, interprete some commands....
526# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
527sub process_latex {
528 my ($text) = @_;
529
530 # note - this is really ugly, but it works. There may be a prettier way
531 # of mapping latex accented chars to utf8, but we just brute force it here.
532 # Also, this isn't complete - not every single possible accented letter
533 # is in here yet, but most of the common ones are.
534
535 my %utf8_chars =
536 (
537 # acutes
538 '\'a' => chr(0xc3).chr(0xa1),
539 '\'c' => chr(0xc4).chr(0x87),
540 '\'e' => chr(0xc3).chr(0xa9),
541 '\'i' => chr(0xc3).chr(0xad),
542 '\'l' => chr(0xc3).chr(0xba),
543 '\'n' => chr(0xc3).chr(0x84),
544 '\'o' => chr(0xc3).chr(0xb3),
545 '\'r' => chr(0xc5).chr(0x95),
546 '\'s' => chr(0xc5).chr(0x9b),
547 '\'u' => chr(0xc3).chr(0xba),
548 '\'y' => chr(0xc3).chr(0xbd),
549 '\'z' => chr(0xc5).chr(0xba),
550 # graves
551 '`a' => chr(0xc3).chr(0xa0),
552 '`A' => chr(0xc3).chr(0x80),
553 '`e' => chr(0xc3).chr(0xa8),
554 '`E' => chr(0xc3).chr(0x88),
555 '`i' => chr(0xc3).chr(0xac),
556 '`I' => chr(0xc3).chr(0x8c),
557 '`o' => chr(0xc3).chr(0xb2),
558 '`O' => chr(0xc3).chr(0x92),
559 '`u' => chr(0xc3).chr(0xb9),
560 '`U' => chr(0xc3).chr(0x99),
561 # circumflex
562 '^a' => chr(0xc3).chr(0xa2),
563 '^A' => chr(0xc3).chr(0x82),
564 '^c' => chr(0xc4).chr(0x89),
565 '^C' => chr(0xc4).chr(0x88),
566 '^e' => chr(0xc3).chr(0xaa),
567 '^E' => chr(0xc3).chr(0x8a),
568 '^g' => chr(0xc4).chr(0x9d),
569 '^G' => chr(0xc4).chr(0x9c),
570 '^h' => chr(0xc4).chr(0xa5),
571 '^H' => chr(0xc4).chr(0xa4),
572 '^i' => chr(0xc3).chr(0xae),
573 '^I' => chr(0xc3).chr(0x8e),
574 '^j' => chr(0xc4).chr(0xb5),
575 '^J' => chr(0xc4).chr(0xb4),
576 '^o' => chr(0xc3).chr(0xb4),
577 '^O' => chr(0xc3).chr(0x94),
578 '^s' => chr(0xc5).chr(0x9d),
579 '^S' => chr(0xc5).chr(0x9c),
580 '^u' => chr(0xc3).chr(0xa2),
581 '^U' => chr(0xc3).chr(0xbb),
582 '^w' => chr(0xc5).chr(0xb5),
583 '^W' => chr(0xc5).chr(0xb4),
584 '^y' => chr(0xc5).chr(0xb7),
585 '^Y' => chr(0xc5).chr(0xb6),
586
587 # diaeresis
588 '"a' => chr(0xc3).chr(0xa4),
589 '"A' => chr(0xc3).chr(0x84),
590 '"e' => chr(0xc3).chr(0xab),
591 '"E' => chr(0xc3).chr(0x8b),
592 '"\\\\i' => chr(0xc3).chr(0xaf),
593 '"\\\\I' => chr(0xc3).chr(0x8f),
594 '"o' => chr(0xc3).chr(0xb6),
595 '"O' => chr(0xc3).chr(0x96),
596 '"u' => chr(0xc3).chr(0xbc),
597 '"U' => chr(0xc3).chr(0x9c),
598 '"y' => chr(0xc3).chr(0xbf),
599 '"Y' => chr(0xc3).chr(0xb8),
600 # tilde
601 '~A' => chr(0xc3).chr(0x83),
602 '~N' => chr(0xc3).chr(0x91),
603 '~O' => chr(0xc3).chr(0x95),
604 '~a' => chr(0xc3).chr(0xa3),
605 '~n' => chr(0xc3).chr(0xb1),
606 '~o' => chr(0xc3).chr(0xb5),
607 # caron - handled specially
608 # double acute
609 # ring
610 # dot
611 '.c' => chr(0xc4).chr(0x8b),
612 '.C' => chr(0xc4).chr(0x8a),
613 '.e' => chr(0xc4).chr(0x97),
614 '.E' => chr(0xc4).chr(0x96),
615 '.g' => chr(0xc4).chr(0xa1),
616 '.G' => chr(0xc4).chr(0xa0),
617 '.I' => chr(0xc4).chr(0xb0),
618 '.z' => chr(0xc5).chr(0xbc),
619 '.Z' => chr(0xc5).chr(0xbb),
620 # macron
621 '=a' => chr(0xc4).chr(0x81),
622 '=A' => chr(0xc4).chr(0x80),
623 '=e' => chr(0xc4).chr(0x93),
624 '=E' => chr(0xc4).chr(0x92),
625 '=i' => chr(0xc4).chr(0xab),
626 '=I' => chr(0xc4).chr(0xaa),
627 '=o' => chr(0xc4).chr(0x8d),
628 '=O' => chr(0xc4).chr(0x8c),
629 '=u' => chr(0xc4).chr(0xab),
630 '=U' => chr(0xc4).chr(0xaa),
631
632 # stroke - handled specially - see below
633
634 # cedilla - handled specially
635 );
636
637# these are one letter latex commands - we make sure they're not a longer
638# command name. eg {\d} is d+stroke, so careful of \d
639 my %special_utf8_chars =
640 (
641 # breve
642 'u g' => chr(0xc4).chr(0x9f),
643 'u G' => chr(0xc4).chr(0x9e),
644 'u i' => chr(0xc4).chr(0xad),
645 'u I' => chr(0xc4).chr(0xac),
646 'u o' => chr(0xc5).chr(0x8f),
647 'u O' => chr(0xc5).chr(0x8e),
648 'u u' => chr(0xc5).chr(0xad),
649 'u U' => chr(0xc5).chr(0xac),
650 # caron
651 'v c' => chr(0xc4).chr(0x8d),
652 'v C' => chr(0xc4).chr(0x8c),
653 'v n' => chr(0xc5).chr(0x88),
654 'v N' => chr(0xc5).chr(0x87),
655 'v s' => chr(0xc5).chr(0xa1),
656 'v S' => chr(0xc5).chr(0xa5),
657 'v z' => chr(0xc5).chr(0xbe),
658 'v Z' => chr(0xc5).chr(0xbd),
659 # cedilla
660 'c c' => chr(0xc3).chr(0xa7),
661 'c C' => chr(0xc3).chr(0x87),
662 'c g' => chr(0xc4).chr(0xa3),
663 'c G' => chr(0xc4).chr(0xa2),
664 'c k' => chr(0xc4).chr(0xb7),
665 'c K' => chr(0xc4).chr(0xb6),
666 'c l' => chr(0xc4).chr(0xbc),
667 'c L' => chr(0xc4).chr(0xbb),
668 'c n' => chr(0xc5).chr(0x86),
669 'c N' => chr(0xc5).chr(0x85),
670 'c r' => chr(0xc5).chr(0x97),
671 'c R' => chr(0xc5).chr(0x96),
672 'c s' => chr(0xc5).chr(0x9f),
673 'c S' => chr(0xc5).chr(0x9e),
674 'c t' => chr(0xc5).chr(0xa3),
675 'c T' => chr(0xc5).chr(0xa2),
676 # double acute / Hungarian accent
677 'H O' => chr(0xc5).chr(0x90),
678 'H o' => chr(0xc5).chr(0x91),
679 'H U' => chr(0xc5).chr(0xb0),
680 'H u' => chr(0xc5).chr(0xb1),
681
682 # stroke
683 'd' => chr(0xc4).chr(0x91),
684 'D' => chr(0xc4).chr(0x90),
685 'h' => chr(0xc4).chr(0xa7),
686# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
687 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
688 'l' => chr(0xc5).chr(0x82),
689 'L' => chr(0xc5).chr(0x81),
690 'o' => chr(0xc3).chr(0xb8),
691 'O' => chr(0xc3).chr(0x98),
692 't' => chr(0xc5).chr(0xa7),
693 'T' => chr(0xc5).chr(0xa6),
694 # german ss/szlig/sharp s
695 'ss' => chr(0xc3).chr(0x9f),
696 );
697
698 # convert latex-style accented characters.
699
700 # remove space (if any) between \ and letter to accent (eg {\' a})
701
702 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; #`
703
704 # remove {} around a single character (eg \'{e})
705 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; #`
706
707 # \, is another way of doing cedilla \c
708 $text =~ s@\\,(.)@\\c $1@g;
709
710 # remove {} around a single character for special 1 letter commands -
711 # need to insert a space. Eg \v{s} -> {\v s}
712 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
713
714 # only do if the text contains a '\' character.
715 if ($text =~ m|\\|) {
716 # "normal" accents - ie non-alpha latex tag
717 while ($text =~ m@\\([`'="^~\.])([\w])@) { #`
718 my $tex="$1$2"; my $char="$2";
719 my $replacement=$utf8_chars{$tex};
720 if (!defined($replacement)) {
721 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
722 $replacement=$char;
723 }
724 $text =~ s/\\\Q$tex/$replacement/g;
725 }
726
727 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
728 # only do the change if immediately followed by a space, }, {, or \
729 # one letter accents ( + ss)
730 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
731 my $tex=$1;
732 my $replacement=$special_utf8_chars{$tex};
733 if (!defined($replacement)) {
734 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
735 $replacement=$tex;
736 }
737 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
738
739 }
740
741 # one letter latex accent commands that affect following letter
742 while ($text =~ m@\\([uvcH]) ([\w])@) {
743 my $tex="$1 $2"; my $char="$2";
744 my $replacement=$special_utf8_chars{$tex};
745 if (!defined($replacement)) {
746 print STDERR "BibTexPlugin: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
747 $replacement=$char;
748 }
749 $text =~ s/\\$tex/$replacement/g;
750 }
751 }
752
753 # escape html-sensitive characters
754 $text =~ s@&@&amp;@g;
755 $text =~ s@<@&lt;@g;
756 $text =~ s@>@&gt;@g;
757 $text =~ s/''/"/g; # Latex-specific
758 $text =~ s/``/"/g; # Latex-specific
759 # greenstone-specific
760 $text =~ s@\[@&\#91;@g;
761 $text =~ s@\]@&\#93;@g;
762
763 # remove latex commands
764
765 # explicitly recognised commands
766 $text =~ s@\\ldots@&hellip;@g;
767
768 # maths mode
769 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
770
771 # remove all other commands with optional arguments...
772 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
773 # $text =~ s@\\noopsort{[^}]+\}@@g;
774 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
775
776 # remove latex groupings { } (but not \{ or \} )
777 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
778 $text =~ s/^\{//; # remove { if first char
779
780 # latex characters
781 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
782 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
783 # optional break "\-"
784 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
785 # the non-macro bits have quotes around them - we just remove quotes
786 $text =~ s/([^&])[\"\#]/$1/g;
787 }
788 # dashes. Convert (m|n)-dash into single dash for html.
789 $text =~ s@\-\-+@\-@g;
790
791 # quoted { } chars
792 $text =~ s@\\{@{@g;
793 $text =~ s@\\}@}@g;
794
795 # finally to protect against macro language...
796 $text =~ s@\\@\\\\@g;
797
798 return $text;
799}
800
801
802sub process_latex_math {
803 my $text = shift;
804
805 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
806 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
807 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
808 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
809 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
810
811 # put all other command names in italics
812 $text =~ s@\\([\w]+)@<i>$1</i>@g;
813
814 return $text;
815}
816
817sub add_OID {
818 my $self = shift (@_);
819 my ($doc_obj, $id, $segment_number) = @_;
820
821 if ( $self->{'key'} eq "default") {
822 $self->SUPER::add_OID(@_);
823# $doc_obj->set_OID("$id\_$segment_number");
824 } else {
825 $doc_obj->set_OID($self->{'key'});
826 }
827}
828
8291;
830
831
Note: See TracBrowser for help on using the repository browser.