source: trunk/gsdl/perllib/plugins/BibTexPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
1###########################################################################
2#
3# BibTexPlug.pm - a plugin for bibliography records in BibTex format
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2000 Gordon W. Paynter
10# Copyright 1999-2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# BibTexPlug reads bibliography files in BibTex format.
30#
31# by Gordon W. Paynter ([email protected]), November 2000
32# Based on ReferPlug. See ReferPlug for geneology.
33#
34# BibTexPlug creates a document object for every reference a the file.
35# It is a subclass of SplitPlug, so if there are multiple records, all
36# are read.
37#
38# Modified Dec 2001 by John McPherson:
39# * some modifications submitted by Sergey Yevtushenko
40# <[email protected]>
41# * some non-ascii char support (ie mostly Latin)
42
43
44package BibTexPlug;
45
46use SplitPlug;
47
48# BibTexPlug is a sub-class of BasPlug.
49sub BEGIN {
50 @ISA = ('SplitPlug');
51}
52
53my $arguments =
54 [ { 'name' => "process_exp",
55 'desc' => "{BasPlug.process_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_process_exp() },
59 { 'name' => "split_exp",
60 'desc' => "{SplitPlug.split_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_split_exp(),
63 'reqd' => "no" }
64 ];
65
66my $options = { 'name' => "BibTexPlug",
67 'desc' => "{BibTexPlug.desc}",
68 'abstract' => "no",
69 'inherits' => "yes",
70 'args' => $arguments };
71
72# This plugin processes files with the suffix ".bib"
73sub get_default_process_exp {
74 return q^(?i)\.bib$^;
75}
76
77# This plugin splits the input text at blank lines
78sub get_default_split_exp {
79 return q^\n+(?=@)^;
80}
81sub new {
82 my $class = shift (@_);
83 my $self = new SplitPlug ($class, @_);
84 $self->{'plugin_type'} = "BibTexPlug";
85 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
86 my $option_list = $self->{'option_list'};
87 push( @{$option_list}, $options );
88 return bless $self, $class;
89}
90
91
92
93# The process function reads a single bibliographic record and stores
94# it as a new document.
95
96sub process {
97 my $self = shift (@_);
98 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
99 my $outhandle = $self->{'outhandle'};
100
101 $self->{'key'} = "default";
102
103 # Check that we're dealing with a valid BibTex record
104 return undef unless ($$textref =~ /^@\w+\{.*\}/s);
105
106 # Ignore things we can't use
107 return 0 if ($$textref =~ /^\@String/);
108
109 # Report that we're processing the file
110 print STDERR "<Processing n='$file' p='BibTexPlug'>\n" if ($gli);
111 print $outhandle "BibTexPlug: processing $file\n"
112 if ($self->{'verbosity'}) > 1;
113
114
115 # This hash translates BibTex field names into metadata names. The
116 # BibTex names are taken from the "Local Guide to Latex" Graeme
117 # McKinstry. Metadata names are consistent with ReferPlug.
118
119 my %field = (
120 'address', 'PublisherAddress',
121 'author', 'Creator',
122
123 'booktitle', 'Booktitle',
124 'chapter', 'Chapter',
125 'edition', 'Edition',
126 'editor', 'Editor',
127 'institution', 'Publisher',
128 'journal', 'Journal',
129 'month', 'Month',
130 'number', 'Number',
131 'pages', 'Pages',
132 'publisher', 'Publisher',
133 'school', 'Publisher',
134 'title', 'Title',
135 'volume', 'Volume',
136 'year', 'Year', # Can't use "Date" as this implies DDDDMMYY!
137
138 'keywords', 'Keywords',
139 'abstract', 'Abstract',
140 'copyright', 'Copyright',
141 'note', 'Note',
142);
143
144 # Metadata fields
145 my %metadata;
146 my ($EntryType, $Creator, $Keywords, $text);
147
148 my $verbosity = $self->{'verbosity'};
149 $verbosity = 0 unless $verbosity;
150
151 # Make sure the text has exactly one entry per line -
152 # append line to previous if it doesn't start with " <key> = "
153
154 my @input_lines=split(/\r?\n/, $$textref);
155 my @all_lines;
156 my $entry_line=shift @input_lines;
157 foreach my $input_line (@input_lines) {
158 if ($input_line =~ m/^\s*\w+\s*=\s*/) {
159 # this is a new key
160 push(@all_lines, $entry_line);
161 $entry_line=$input_line;
162 } else {
163 # this is a continuation of previous line
164 $entry_line .= " " . $input_line;
165 }
166
167 }
168 # add final line, removing trailing '}'
169 $entry_line =~ s/\}\s*$//;
170 push(@all_lines, $entry_line);
171 push(@all_lines, "}");
172
173 # Read and process each line in the bib file.
174 my ($entryname, $name, $value, $line);
175 foreach $line (@all_lines) {
176
177 # Add each line. Most lines consist of a field identifer and
178 # then data, and we simply store them, though we treat some
179 # of the fields a bit differently.
180
181 $line =~ s/\s+/ /g;
182 $text .= "$line\n";
183
184
185 print "Processing line = $line \n" if $verbosity>=4;
186
187 # The first line is special, it contains the reference type and OID
188 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]*)\W*$/) {
189 $EntryType = $1;
190 my $EntryID = (defined $2) ? $2 : "default";
191 print "** $EntryType - \"$EntryID\" \n"
192 if ($verbosity >= 4);
193
194 next;
195 }
196 if ($line =~ /\@/) {
197 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n"
198 if ($verbosity >= 2);
199 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"
200 if ($verbosity >= 2);
201 }
202
203 # otherwise, parse the metadata out of this line
204 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/);
205 $entryname = lc($1);
206 $value = $2;
207 $value =~ s/,?\s*$//; # remove trailing comma and space
208 if ($value =~ /^"/ && $value =~ /"$/) {
209 # remove surrounding " marks
210 $value =~ s/^"//; $value =~ s/"$//;
211 }
212 $value = &process_latex($value);
213
214 # Add this line of metadata
215 $metadata{$entryname} .= "$value";
216
217 }
218
219 # Add the Entry type as metadata
220 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType);
221
222 # Add the various field as metadata
223 foreach my $entryname (keys %metadata) {
224 next unless (defined $field{$entryname});
225 next unless (defined $metadata{$entryname});
226
227 $name = $field{$entryname};
228 $value = $metadata{$entryname};
229
230 if ($name =~ /^Month/) {
231 $value=expand_month($value);
232 }
233
234 # Several special operatons on metadata follow
235
236 # Add individual keywords.
237 # The full set of keywords will be added, in due course, as "Keywords".
238 # However, we also want to add them as individual "Keyword" metadata elements.
239 if ($entryname eq "keywords") {
240 my @keywordlist = split(/,/, $value);
241 foreach my $k (@keywordlist) {
242 $k = lc($k);
243 $k =~ s/\s*$//;
244 $k =~ s/^\s*//;
245 if ($k =~ /\w/) {
246 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k);
247 }
248 }
249 }
250
251 # Add individual authors
252 # The author metadata will be stored as one "Creator" entry, but we
253 # also want to split it into several individual "Author" fields in
254 # "Lastname, Firstnames" format so we can browse it.
255 if ($entryname eq "author") { #added also comparison with editor
256
257 # take care of "et al."...
258 $value =~ s/(\s+et\.?\s+al\.?)\s*$//;
259 my $etal=$1;
260 $etal="" if (!defined ($etal));
261 # und here for german language...
262 # don't use brackets in pattern, else the matched bit becomes
263 # an element in the list!
264 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value);
265 my @formattedlist = ();
266 foreach $a (@authorlist) {
267 $a =~ s/\s*$//;
268 $a =~ s/^\s*//;
269 # Reformat and add author name
270 next if $a=~ /^\s*$/;
271
272 # names are "First von Last", "von Last, First"
273 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16
274 my $first="";
275 my $vonlast="";
276 my $jr="";
277
278 if ($a =~ /,/) {
279 my @parts=split(/,\s*/, $a);
280 $first = pop @parts;
281 if (scalar(@parts) == 2) {
282 $jr = pop @parts;
283 }
284 $vonlast=shift @parts;
285 if (scalar(@parts) > 0) {
286 print $outhandle "BibTexPlug: couldn't parse name $a\n";
287 # but we continue anyway...
288 }
289 } else { # First von Last
290 my @words = split(/ /, $a);
291 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) {
292 $first .= " " . shift (@words);
293 }
294 $first =~ s/^\s//;
295 $vonlast = join (' ', @words); # whatever's left...
296 }
297 my $von="";
298 my $last="";
299 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von"
300 $vonlast =~ s/^(([a-z]\w+\s+)+)//;
301 $von = $1;
302 if (!defined ($von)) {
303 # some non-English names do start with lowercase
304 # eg "Marie desJardins". Also we can get typos...
305 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n";
306 $von="";
307 if ($vonlast =~ /^[a-z]+$/) {
308 # if it's all lowercase, uppercase 1st.
309 $vonlast =~ s/^(.)/\u$1/;
310
311 }
312 }
313 $von =~ s/\s*$//;
314 $last=$vonlast;
315 } else {
316 $last=$vonlast;
317 }
318 my $wholename="$first $von $last $jr";
319 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g;
320 push (@formattedlist, $wholename);
321 my $fullname = "$last";
322 $fullname .= " $jr" if ($jr);
323 $fullname .= ", $first";
324 $fullname .= " $von" if ($von);
325
326 # Add each name to set of Authors
327 # force utf8 pragma so that \w matches in this scope
328 use utf8;
329 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname);
330 }
331
332 # Only want at most one "and" in the Creator field
333 if (scalar(@formattedlist) > 2) {
334 my $lastauthor=pop @formattedlist;
335 $value=join(', ', @formattedlist);
336 $value.=" and $lastauthor";
337 } else { # 1 or 2 authors...
338 $value=join(" and ",@formattedlist);
339 }
340 $value.=$etal; # if there was "et al."
341 }
342
343 # Books and Journals are additionally marked for display purposes
344 if ($entryname eq "booktitle") {
345 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1);
346 } elsif ($entryname eq "journal") {
347 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1);
348 }
349
350
351 # Add the various fields as metadata
352 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
353 }
354
355 # Add Date (yyyymmdd) metadata
356 if (defined ($metadata{'year'}) ) {
357 my $date=$metadata{'year'};
358 chomp $date;
359 my $month=$metadata{'month'};
360 if (defined($month)) {
361 # month is currently 3 letter code or a range...
362 $month = expand_month($month);
363 # take the first month found... might not find one!
364 $month =~ m/_textmonth(\d\d)_/;
365 $month = $1;
366 }
367 if (!defined($month)) {
368 $month="00";
369 }
370 $date .= "${month}00";
371 $doc_obj->add_utf8_metadata($cursection, "Date", $date);
372}
373
374# # Add the text in BibTex format (all fields)
375 if ($text =~ /\w/) {
376
377 $text =~ s@&@&amp;@g;
378 $text =~ s@<@&lt;@g;
379 $text =~ s@>@&gt;@g;
380 $text =~ s@\n@<br/>\n@g;
381 $text =~ s@\\@\\\\@g;
382
383 $doc_obj->add_utf8_text ($cursection, $text);
384 }
385
386 return 1;
387}
388
389
390
391
392# convert email addresses and URLs into links
393sub convert_urls_into_links{
394 my ($text) = @_;
395
396 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
397 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g;
398
399 return $text;
400}
401
402# Clean up whitespace and convert \n charaters to <BR> or <P>
403sub clean_up_whitespaces{
404 my ($text) = @_;
405
406 $text =~ s/%%%%%/<BR> <BR>/g;
407 $text =~ s/ +/ /g;
408 $text =~ s/\s*$//;
409 $text =~ s/^\s*//;
410 $text =~ s/\n/\n<BR>/g;
411 $text =~ s/<BR>\s*<BR>/<P>/g;
412
413 return $text;
414}
415
416
417sub convert_problem_characters_without_ampersand{
418 my ($text) = @_;
419 $text =~ s/</&lt;/g;
420 $text =~ s/>/&gt;/g;
421
422 $text =~ s/\'\'/\"/g; #Latex -specific conversion
423 $text =~ s/\`\`/\"/g; #Latex -specific conversion
424
425 $text =~ s/\"/&quot;/g;
426 $text =~ s/\'/&#8217;/g;
427 $text =~ s/\`/&#8216;/g;
428
429# $text =~ s/\+/ /g;
430# $text =~ s/\(/ /g;
431# $text =~ s/\)/ /g;
432
433 $text =~ s/\\/\\\\/g;
434
435# $text =~ s/\./\\\./g;
436
437 return $text;
438}
439
440# Convert a text string into HTML.
441
442# The HTML is going to be inserted into a GML file, so we have to be
443# careful not to use symbols like ">", which occurs frequently in email
444# messages (and use &gt instead.
445
446# This function also turns URLs and email addresses into links, and
447# replaces carriage returns with <BR> tags (and multiple carriage returns
448# with <P> tags).
449
450sub text_into_html {
451 my ($text) = @_;
452
453 # Convert problem characters into HTML symbols
454 $text =~ s/&/&amp;/g;
455
456 $text = &convert_problem_characters_without_ampersand( $text );
457
458 # convert email addresses and URLs into links
459 $text = &convert_urls_into_links( $text );
460
461 $text = &clean_up_whitespaces( $text );
462
463 return $text;
464}
465
466
467sub expand_month {
468 my $text=shift;
469
470 # bibtex style files expand abbreviations for months.
471 # Entries can contain more than one month (eg ' month = jun # "-" # aug, ')
472 $text =~ s/jan/_textmonth01_/g;
473 $text =~ s/feb/_textmonth02_/g;
474 $text =~ s/mar/_textmonth03_/g;
475 $text =~ s/apr/_textmonth04_/g;
476 $text =~ s/may/_textmonth05_/g;
477 $text =~ s/jun/_textmonth06_/g;
478 $text =~ s/jul/_textmonth07_/g;
479 $text =~ s/aug/_textmonth08_/g;
480 $text =~ s/sep/_textmonth09_/g;
481 $text =~ s/oct/_textmonth10_/g;
482 $text =~ s/nov/_textmonth11_/g;
483 $text =~ s/dec/_textmonth12_/g;
484
485 return $text;
486}
487
488
489# Convert accented characters, remove { }, interprete some commands....
490# Note!! This is not comprehensive! Also assumes Latin -> Unicode!
491sub process_latex {
492 my ($text) = @_;
493
494 # note - this is really ugly, but it works. There may be a prettier way
495 # of mapping latex accented chars to utf8, but we just brute force it here.
496 # Also, this isn't complete - not every single possible accented letter
497 # is in here yet, but most of the common ones are.
498
499 my %utf8_chars =
500 (
501 # acutes
502 '\'a' => chr(0xc3).chr(0xa1),
503 '\'c' => chr(0xc4).chr(0x87),
504 '\'e' => chr(0xc3).chr(0xa9),
505 '\'i' => chr(0xc3).chr(0xad),
506 '\'l' => chr(0xc3).chr(0xba),
507 '\'n' => chr(0xc3).chr(0x84),
508 '\'o' => chr(0xc3).chr(0xb3),
509 '\'r' => chr(0xc5).chr(0x95),
510 '\'s' => chr(0xc5).chr(0x9b),
511 '\'u' => chr(0xc3).chr(0xba),
512 '\'y' => chr(0xc3).chr(0xbd),
513 '\'z' => chr(0xc5).chr(0xba),
514 # graves
515 '`a' => chr(0xc3).chr(0xa0),
516 '`A' => chr(0xc3).chr(0x80),
517 '`e' => chr(0xc3).chr(0xa8),
518 '`E' => chr(0xc3).chr(0x88),
519 '`i' => chr(0xc3).chr(0xac),
520 '`I' => chr(0xc3).chr(0x8c),
521 '`o' => chr(0xc3).chr(0xb2),
522 '`O' => chr(0xc3).chr(0x92),
523 '`u' => chr(0xc3).chr(0xb9),
524 '`U' => chr(0xc3).chr(0x99),
525 # circumflex
526 '^a' => chr(0xc3).chr(0xa2),
527 '^A' => chr(0xc3).chr(0x82),
528 '^c' => chr(0xc4).chr(0x89),
529 '^C' => chr(0xc4).chr(0x88),
530 '^e' => chr(0xc3).chr(0xaa),
531 '^E' => chr(0xc3).chr(0x8a),
532 '^g' => chr(0xc4).chr(0x9d),
533 '^G' => chr(0xc4).chr(0x9c),
534 '^h' => chr(0xc4).chr(0xa5),
535 '^H' => chr(0xc4).chr(0xa4),
536 '^i' => chr(0xc3).chr(0xae),
537 '^I' => chr(0xc3).chr(0x8e),
538 '^j' => chr(0xc4).chr(0xb5),
539 '^J' => chr(0xc4).chr(0xb4),
540 '^o' => chr(0xc3).chr(0xb4),
541 '^O' => chr(0xc3).chr(0x94),
542 '^s' => chr(0xc5).chr(0x9d),
543 '^S' => chr(0xc5).chr(0x9c),
544 '^u' => chr(0xc3).chr(0xa2),
545 '^U' => chr(0xc3).chr(0xbb),
546 '^w' => chr(0xc5).chr(0xb5),
547 '^W' => chr(0xc5).chr(0xb4),
548 '^y' => chr(0xc5).chr(0xb7),
549 '^Y' => chr(0xc5).chr(0xb6),
550
551 # diaeresis
552 '"a' => chr(0xc3).chr(0xa4),
553 '"A' => chr(0xc3).chr(0x84),
554 '"e' => chr(0xc3).chr(0xab),
555 '"E' => chr(0xc3).chr(0x8b),
556 '"\\\\i' => chr(0xc3).chr(0xaf),
557 '"\\\\I' => chr(0xc3).chr(0x8f),
558 '"o' => chr(0xc3).chr(0xb6),
559 '"O' => chr(0xc3).chr(0x96),
560 '"u' => chr(0xc3).chr(0xbc),
561 '"U' => chr(0xc3).chr(0x9c),
562 '"y' => chr(0xc3).chr(0xbf),
563 '"Y' => chr(0xc3).chr(0xb8),
564 # tilde
565 '~A' => chr(0xc3).chr(0x83),
566 '~N' => chr(0xc3).chr(0x91),
567 '~O' => chr(0xc3).chr(0x95),
568 '~a' => chr(0xc3).chr(0xa3),
569 '~n' => chr(0xc3).chr(0xb1),
570 '~o' => chr(0xc3).chr(0xb5),
571 # caron - handled specially
572 # double acute
573 # ring
574 # dot
575 '.c' => chr(0xc4).chr(0x8b),
576 '.C' => chr(0xc4).chr(0x8a),
577 '.e' => chr(0xc4).chr(0x97),
578 '.E' => chr(0xc4).chr(0x96),
579 '.g' => chr(0xc4).chr(0xa1),
580 '.G' => chr(0xc4).chr(0xa0),
581 '.I' => chr(0xc4).chr(0xb0),
582 '.z' => chr(0xc5).chr(0xbc),
583 '.Z' => chr(0xc5).chr(0xbb),
584 # macron
585 '=a' => chr(0xc4).chr(0x81),
586 '=A' => chr(0xc4).chr(0x80),
587 '=e' => chr(0xc4).chr(0x93),
588 '=E' => chr(0xc4).chr(0x92),
589 '=i' => chr(0xc4).chr(0xab),
590 '=I' => chr(0xc4).chr(0xaa),
591 '=o' => chr(0xc4).chr(0x8d),
592 '=O' => chr(0xc4).chr(0x8c),
593 '=u' => chr(0xc4).chr(0xab),
594 '=U' => chr(0xc4).chr(0xaa),
595
596 # stroke - handled specially - see below
597
598 # cedilla - handled specially
599 );
600
601# these are one letter latex commands - we make sure they're not a longer
602# command name. eg {\d} is d+stroke, so careful of \d
603 my %special_utf8_chars =
604 (
605 # breve
606 'u g' => chr(0xc4).chr(0x9f),
607 'u G' => chr(0xc4).chr(0x9e),
608 'u i' => chr(0xc4).chr(0xad),
609 'u I' => chr(0xc4).chr(0xac),
610 'u o' => chr(0xc5).chr(0x8f),
611 'u O' => chr(0xc5).chr(0x8e),
612 'u u' => chr(0xc5).chr(0xad),
613 'u U' => chr(0xc5).chr(0xac),
614 # caron
615 'v c' => chr(0xc4).chr(0x8d),
616 'v C' => chr(0xc4).chr(0x8c),
617 'v n' => chr(0xc5).chr(0x88),
618 'v N' => chr(0xc5).chr(0x87),
619 'v s' => chr(0xc5).chr(0xa1),
620 'v S' => chr(0xc5).chr(0xa5),
621 'v z' => chr(0xc5).chr(0xbe),
622 'v Z' => chr(0xc5).chr(0xbd),
623 # cedilla
624 'c c' => chr(0xc3).chr(0xa7),
625 'c C' => chr(0xc3).chr(0x87),
626 'c g' => chr(0xc4).chr(0xa3),
627 'c G' => chr(0xc4).chr(0xa2),
628 'c k' => chr(0xc4).chr(0xb7),
629 'c K' => chr(0xc4).chr(0xb6),
630 'c l' => chr(0xc4).chr(0xbc),
631 'c L' => chr(0xc4).chr(0xbb),
632 'c n' => chr(0xc5).chr(0x86),
633 'c N' => chr(0xc5).chr(0x85),
634 'c r' => chr(0xc5).chr(0x97),
635 'c R' => chr(0xc5).chr(0x96),
636 'c s' => chr(0xc5).chr(0x9f),
637 'c S' => chr(0xc5).chr(0x9e),
638 'c t' => chr(0xc5).chr(0xa3),
639 'c T' => chr(0xc5).chr(0xa2),
640 # double acute / Hungarian accent
641 'H O' => chr(0xc5).chr(0x90),
642 'H o' => chr(0xc5).chr(0x91),
643 'H U' => chr(0xc5).chr(0xb0),
644 'H u' => chr(0xc5).chr(0xb1),
645
646 # stroke
647 'd' => chr(0xc4).chr(0x91),
648 'D' => chr(0xc4).chr(0x90),
649 'h' => chr(0xc4).chr(0xa7),
650# 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut
651 'i' => chr(0xc4).chr(0xb1), # dotless lowercase i
652 'l' => chr(0xc5).chr(0x82),
653 'L' => chr(0xc5).chr(0x81),
654 'o' => chr(0xc3).chr(0xb8),
655 'O' => chr(0xc3).chr(0x98),
656 't' => chr(0xc5).chr(0xa7),
657 'T' => chr(0xc5).chr(0xa6),
658 # german ss/szlig/sharp s
659 'ss' => chr(0xc3).chr(0x9f),
660 );
661
662 # convert latex-style accented characters.
663
664 # remove space (if any) between \ and letter to accent (eg {\' a})
665
666 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g;
667
668 # remove {} around a single character (eg \'{e})
669 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g;
670
671 # \, is another way of doing cedilla \c
672 $text =~ s@\\,(.)@\\c $1@g;
673
674 # remove {} around a single character for special 1 letter commands -
675 # need to insert a space. Eg \v{s} -> {\v s}
676 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g;
677
678 # only do if the text contains a '\' character.
679 if ($text =~ m|\\|) {
680 # "normal" accents - ie non-alpha latex tag
681 while ($text =~ m@\\([`'="^~\.])([\w])@) {
682 my $tex="$1$2"; my $char="$2";
683 my $replacement=$utf8_chars{$tex};
684 if (!defined($replacement)) {
685 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
686 $replacement=$char;
687 }
688 $text =~ s/\\\Q$tex/$replacement/g;
689 }
690
691 # where the following letter matters (eg "sm\o rrebr\o d", \ss{})
692 # only do the change if immediately followed by a space, }, {, or \
693 # one letter accents ( + ss)
694 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) {
695 my $tex=$1;
696 my $replacement=$special_utf8_chars{$tex};
697 if (!defined($replacement)) {
698 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
699 $replacement=$tex;
700 }
701 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g;
702
703 }
704
705 # one letter latex accent commands that affect following letter
706 while ($text =~ m@\\([uvcH]) ([\w])@) {
707 my $tex="$1 $2"; my $char="$2";
708 my $replacement=$special_utf8_chars{$tex};
709 if (!defined($replacement)) {
710 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n";
711 $replacement=$char;
712 }
713 $text =~ s/\\$tex/$replacement/g;
714 }
715 }
716
717 # escape html-sensitive characters
718 $text =~ s@&@&amp;@g;
719 $text =~ s@<@&lt;@g;
720 $text =~ s@>@&gt;@g;
721 $text =~ s/''/"/g; # Latex-specific
722 $text =~ s/``/"/g; # Latex-specific
723 # greenstone-specific
724 $text =~ s@\[@&\#91;@g;
725 $text =~ s@\]@&\#93;@g;
726
727 # remove latex commands
728
729 # explicitly recognised commands
730 $text =~ s@\\ldots@&hellip;@g;
731
732 # maths mode
733 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge;
734
735 # remove all other commands with optional arguments...
736 $text =~ s@\\\w+(\[.*?\])?\s*@@g;
737 # $text =~ s@\\noopsort{[^}]+\}@@g;
738 # $text =~ s@\\\w+{(([^}]*[^\\])*)}@$1@g; # all other commands
739
740 # remove latex groupings { } (but not \{ or \} )
741 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..."
742 $text =~ s/^\{//; # remove { if first char
743
744 # latex characters
745 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only)
746 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"
747 # optional break "\-"
748 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes
749 # the non-macro bits have quotes around them - we just remove quotes
750 $text =~ s/([^&])[\"\#]/$1/g;
751 }
752 # dashes. Convert (m|n)-dash into single dash for html.
753 $text =~ s@\-\-+@\-@g;
754
755 # quoted { } chars
756 $text =~ s@\\{@{@g;
757 $text =~ s@\\}@}@g;
758
759 # finally to protect against macro language...
760 $text =~ s@\\@\\\\@g;
761
762 return $text;
763}
764
765
766sub process_latex_math {
767 my $text = shift;
768
769 $text =~ s@\\infty@infinity@g; # or unicode 0x221E...
770 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript
771 $text =~ s@\^([^\{])@<sup>$1</sup>@g;
772 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript
773 $text =~ s@\_([^\{])@<sub>$1</sub>@g;
774
775 # put all other command names in italics
776 $text =~ s@\\([\w]+)@<i>$1</i>@g;
777
778 return $text;
779}
780
781sub set_OID {
782 my $self = shift (@_);
783 my ($doc_obj, $id, $segment_number) = @_;
784
785 if ( $self->{'key'} eq "default") {
786 $doc_obj->set_OID("$id\_$segment_number");
787 } else {
788 $doc_obj->set_OID($self->{'key'});
789 }
790}
791
7921;
793
794
Note: See TracBrowser for help on using the repository browser.