Changeset 3249 for trunk/gsdl/perllib/plugins/BibTexPlug.pm
- Timestamp:
- 2002-07-12T15:19:17+12:00 (22 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BibTexPlug.pm
r3156 r3249 132 132 } else { 133 133 # this is a continuation of previous line 134 $entry_line .= $input_line;134 $entry_line .= " " . $input_line; 135 135 } 136 136 … … 202 202 $value=expand_month($value); 203 203 } 204 # Add the various fields as metadata205 my $html_value = &text_into_html($value);206 $doc_obj->add_utf8_metadata ($cursection, $name, $html_value);207 204 208 205 # Several special operatons on metadata follow … … 218 215 $k =~ s/^\s*//; 219 216 if ($k =~ /\w/) { 220 $k = &text_into_html($k);221 217 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k); 222 218 } … … 230 226 if ($entryname eq "author") { #added also comparison with editor 231 227 228 # take care of "et al."... 229 $value =~ s/(\s+et\.?\s+al\.?)\s*$//; 230 my $etal=$1; 231 $etal="" if (!defined ($etal)); 232 232 # und here for german language... 233 233 # don't use brackets in pattern, else the matched bit becomes 234 234 # an element in the list! 235 my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value); 235 my @authorlist = split(/\s+and\s+|\s+und\s+/, $value); 236 my @formattedlist = (); 236 237 foreach $a (@authorlist) { 237 238 $a =~ s/\s*$//; … … 239 240 # Reformat and add author name 240 241 next if $a=~ /^\s*$/; 241 my @words = split(/ /, $a); 242 my $lastname = pop @words; 243 my $firstname = join(" ", @words); 244 245 my $fullname = $lastname . ", " . $firstname; 242 243 # names are "First von Last", "von Last, First" 244 # or "von Last, Jr, First". See the "BibTeXing" manual, page 16 245 my $first=""; 246 my $vonlast=""; 247 my $jr=""; 248 249 if ($a =~ /,/) { 250 my @parts=split(/,\s*/, $a); 251 $first = pop @parts; 252 if (scalar(@parts) == 2) { 253 $jr = pop @parts; 254 } 255 $vonlast=shift @parts; 256 if (scalar(@parts) > 0) { 257 print $outhandle "BibTexPlug: couldn't parse name $a\n"; 258 # but we continue anyway... 259 } 260 } else { # First von Last 261 my @words = split(/ /, $a); 262 while (scalar(@words) > 1 && $words[0] !~ /^[a-z]{2..}/) { 263 $first .= " " . shift (@words); 264 } 265 $first =~ s/^\s//; 266 $vonlast = join (' ', @words); # whatever's left... 267 } 268 my $von=""; 269 my $last=""; 270 if ($vonlast =~ m/^[a-z]/) { # lowercase implies "von" 271 $vonlast =~ s/^(([a-z]\w+\s+)+)//; 272 $von = $1; 273 if (!defined ($von)) { 274 # some non-English names do start with lowercase 275 # eg "Marie desJardins". Also we can get typos... 276 print $outhandle "BibTexPlug: couldn't parse surname $vonlast\n"; 277 $von=""; 278 if ($vonlast =~ /^[a-z]+$/) { 279 # if it's all lowercase, uppercase 1st. 280 $vonlast =~ s/^(.)/\u$1/; 281 282 } 283 } 284 $von =~ s/\s*$//; 285 $last=$vonlast; 286 } else { 287 $last=$vonlast; 288 } 289 my $wholename="$first $von $last $jr"; 290 $wholename =~ s/ $//; $wholename =~ s/\s+/ /g; 291 push (@formattedlist, $wholename); 292 my $fullname = "$last"; 293 $fullname .= " $jr" if ($jr); 294 $fullname .= ", $first"; 295 $fullname .= " $von" if ($von); 246 296 247 297 # Add each name to set of Authors 248 298 # force utf8 pragma so that \w matches in this scope 249 299 use utf8; 250 if ($fullname =~ /\w+, \w+/) { 251 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname); 252 } else { 253 } 300 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname); 254 301 } 302 303 # Only want at most one "and" in the Creator field 304 if (scalar(@formattedlist) > 2) { 305 my $lastauthor=pop @formattedlist; 306 $value=join(', ', @formattedlist); 307 $value.=" and $lastauthor"; 308 } else { # 1 or 2 authors... 309 $value=join(" and ",@formattedlist); 310 } 311 $value.=$etal; # if there was "et al." 255 312 } 256 313 … … 262 319 } 263 320 321 322 # Add the various fields as metadata 323 $doc_obj->add_utf8_metadata ($cursection, $name, $value); 264 324 } 265 325 266 # Add the text in BibTex format (all fields) 326 # Add Date (yyyymmdd) metadata 327 if (defined ($metadata{'year'}) ) { 328 my $date=$metadata{'year'}; 329 chomp $date; 330 my $month=$metadata{'month'}; 331 if (defined($month)) { 332 # month is currently 3 letter code or a range... 333 $month = expand_month($month); 334 # take the first month found... might not find one! 335 $month =~ m/_textmonth(\d\d)_/; 336 $month = $1; 337 } 338 if (!defined($month)) { 339 $month="00"; 340 } 341 $date .= "${month}00"; 342 $doc_obj->add_utf8_metadata($cursection, "Date", $date); 343 } 344 345 # # Add the text in BibTex format (all fields) 267 346 if ($text =~ /\w/) { 268 $text = &text_into_html($text); 347 348 $text =~ s@&@&@g; 349 $text =~ s@<@<@g; 350 $text =~ s@>@>@g; 351 $text =~ s@\n@<br/>\n@g; 352 $text =~ s@\\@\\\\@g; 353 354 # Not really required... 355 # $doc_obj->add_utf8_metadata($cursection, "BibTex", $text); 269 356 $doc_obj->add_utf8_text ($cursection, $text); 270 $doc_obj->add_utf8_metadata($cursection, "BibTex", $text);271 357 } 272 358 … … 310 396 $text =~ s/\`\`/\"/g; #Latex -specific conversion 311 397 312 313 398 $text =~ s/\"/"/g; 314 399 $text =~ s/\'/’/g; 315 400 $text =~ s/\`/‘/g; 316 $text =~ s/\+/ /g; 317 $text =~ s/\(/ /g; 318 $text =~ s/\)/ /g; 401 402 # $text =~ s/\+/ /g; 403 # $text =~ s/\(/ /g; 404 # $text =~ s/\)/ /g; 319 405 320 406 $text =~ s/\\/\\\\/g; 321 407 322 $text =~ s/\./\\\./g;408 # $text =~ s/\./\\\./g; 323 409 324 410 return $text; … … 457 543 '~o' => chr(0xc3).chr(0xb5), 458 544 # caron - handled specially 459 # ',s' => chr(0xc5).chr(0xa1),460 # ',S' => chr(0xc5).chr(0xa5),461 545 # double acute 462 546 # ring 463 547 # dot 464 ' \.c' => chr(0xc4).chr(0x8b),465 ' \.C' => chr(0xc4).chr(0x8a),466 ' \.e' => chr(0xc4).chr(0x97),467 ' \.E' => chr(0xc4).chr(0x96),468 ' \.g' => chr(0xc4).chr(0xa1),469 ' \.G' => chr(0xc4).chr(0xa0),470 ' \.I' => chr(0xc4).chr(0xb0),471 ' \.z' => chr(0xc5).chr(0xbc),472 ' \.Z' => chr(0xc5).chr(0xbb),548 '.c' => chr(0xc4).chr(0x8b), 549 '.C' => chr(0xc4).chr(0x8a), 550 '.e' => chr(0xc4).chr(0x97), 551 '.E' => chr(0xc4).chr(0x96), 552 '.g' => chr(0xc4).chr(0xa1), 553 '.G' => chr(0xc4).chr(0xa0), 554 '.I' => chr(0xc4).chr(0xb0), 555 '.z' => chr(0xc5).chr(0xbc), 556 '.Z' => chr(0xc5).chr(0xbb), 473 557 # macron 474 558 '=a' => chr(0xc4).chr(0x81), … … 486 570 487 571 # cedilla - handled specially 488 489 572 ); 490 573 … … 551 634 552 635 # convert latex-style accented characters. 636 553 637 # remove space (if any) between \ and letter to accent (eg {\' a}) 554 638 555 $text =~ s@(\\[`'=" ])\s(\w)@$1$2@g;639 $text =~ s@(\\[`'="^~\.])\s(\w)@$1$2@g; 556 640 557 641 # remove {} around a single character (eg \'{e}) 558 $text =~ s@(\\[`'="\.]){(\w)}@{$1$2}@g; 642 $text =~ s@(\\[`'="^~\.]){(\w)}@{$1$2}@g; 643 644 # \, is another way of doing cedilla \c 645 $text =~ s@\\,(.)@\\c $1@g; 559 646 560 647 # remove {} around a single character for special 1 letter commands - 561 648 # need to insert a space. Eg \v{s} -> {\v s} 562 649 $text =~ s@(\\[uvcH]){(\w)}@{$1 $2}@g; 563 # this is slow (go through whole hash for each substitution!) so 650 564 651 # only do if the text contains a '\' character. 565 652 if ($text =~ m|\\|) { 566 for $latex_code (keys %utf8_chars) { 567 $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g; 568 } 569 570 # where the following letter matters (eg "sm\o rrebr\o d", \ss{}) 571 # only do the change if immediately followed by a space, }, {, or \ 572 for $latex_code (keys %special_utf8_chars) { 573 $text =~ s/\\${latex_code}([\\\s{}])/$special_utf8_chars{$latex_code}$1/g; 574 } 653 # "normal" accents - ie non-alpha latex tag 654 while ($text =~ m@\\([`'="^~\.])([\w])@) { 655 my $tex="$1$2"; my $char="$2"; 656 my $replacement=$utf8_chars{$tex}; 657 if (!defined($replacement)) { 658 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n"; 659 $replacement=$char; 660 } 661 $text =~ s/\\$tex/$replacement/g; 662 } 663 664 # where the following letter matters (eg "sm\o rrebr\o d", \ss{}) 665 # only do the change if immediately followed by a space, }, {, or \ 666 # one letter accents ( + ss) 667 while ($text =~ m@\\([DdhiLlOoTt]|ss)[{}\s\"\\]@) { 668 my $tex=$1; 669 my $replacement=$special_utf8_chars{$tex}; 670 if (!defined($replacement)) { 671 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n"; 672 $replacement=$tex; 673 } 674 $text =~ s/\\$tex([{}\s\"\\])/$replacement$1/g; 675 676 } 677 678 # one letter latex accent commands that affect following letter 679 while ($text =~ m@\\([uvcH]) ([\w])@) { 680 my $tex="$1 $2"; my $char="$2"; 681 my $replacement=$special_utf8_chars{$tex}; 682 if (!defined($replacement)) { 683 print STDERR "BibTexPlug: Warning: unknown latex accent \"$tex\" in \"$text\"\n"; 684 $replacement=$char; 685 } 686 $text =~ s/\\$tex/$replacement/g; 687 } 575 688 } 689 690 # escape html-sensitive characters 691 $text =~ s@&@&@g; 692 $text =~ s@<@<@g; 693 $text =~ s@>@>@g; 694 $text =~ s/''/"/g; # Latex-specific 695 $text =~ s/``/"/g; # Latex-specific 696 # greenstone-specific 697 $text =~ s@\[@&\#91;@g; 698 $text =~ s@\]@&\#93;@g; 699 576 700 # remove latex commands 577 # commands with optional arguments... 701 702 # explicitly recognised commands 703 $text =~ s@\\ldots@…@g; 704 705 # maths mode 706 $text =~ s@\$(.*?)\$@&process_latex_math($1)@ge; 707 708 # remove all other commands with optional arguments... 578 709 $text =~ s@\\\w+(\[.*?\])?\s*@@g; 579 710 # $text =~ s@\\noopsort{[^}]+\}@@g; … … 581 712 582 713 # remove latex groupings { } (but not \{ or \} ) 583 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} 714 while ($text =~ s/([^\\])[\{\}]/$1/g) {;} # needed for "...}{..." 584 715 $text =~ s/^\{//; # remove { if first char 585 586 # maths mode $...$ - this is not interpreted in any way at the moment...587 $text =~ s@\$(.*)\$@$1@g;588 716 589 717 # latex characters 590 718 # spaces - nobr space (~), opt break (\-), append ("#" - bibtex only) 591 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~"719 $text =~ s/([^\\])~+/$1 /g; # non-breaking space "~" 592 720 # optional break "\-" 593 if ($text =~ m/ \#/) { # concat macros (bibtex)721 if ($text =~ m/[^&]\#/) { # concat macros (bibtex) but not HTML codes 594 722 # the non-macro bits have quotes around them - we just remove quotes 595 $text =~ s/ [\"\#]//g;723 $text =~ s/([^&])[\"\#]/$1/g; 596 724 } 725 # dashes. Convert (m|n)-dash into single dash for html. 726 $text =~ s@\-\-+@\-@g; 727 597 728 # quoted { } chars 598 729 $text =~ s@\\{@{@g; 599 730 $text =~ s@\\}@}@g; 600 731 732 # finally to protect against macro language... 733 $text =~ s@\\@\\\\@g; 734 601 735 return $text; 602 736 } 603 737 738 739 sub process_latex_math { 740 my $text = shift; 741 742 $text =~ s@\\infty@infinity@g; # or unicode 0x221E... 743 $text =~ s@\^{(.*?)}@<sup>$1</sup>@g; # superscript 744 $text =~ s@\^([^\{])@<sup>$1</sup>@g; 745 $text =~ s@\_{(.*?)}@<sub>$1</sub>@g; # subscript 746 $text =~ s@\_([^\{])@<sub>$1</sub>@g; 747 748 # put all other command names in italics 749 $text =~ s@\\([\w]+)@<i>$1</i>@g; 750 751 return $text; 752 } 604 753 605 754 sub set_OID {
Note:
See TracChangeset
for help on using the changeset viewer.