Changeset 2901
- Timestamp:
- 2002-01-14T17:38:47+13:00 (22 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BibTexPlug.pm
r2484 r2901 8 8 # 9 9 # Copyright 2000 Gordon W. Paynter 10 # Copyright 1999-200 0New Zealand Digital Library Project10 # Copyright 1999-2001 New Zealand Digital Library Project 11 11 # 12 12 # This program is free software; you can redistribute it and/or modify … … 35 35 # It is a subclass of SplitPlug, so if there are multiple records, all 36 36 # are read. 37 37 # 38 # Modified Dec 2001 by John McPherson: 39 # * some modifications submitted by Sergey Yevtushenko 40 # <[email protected]> 41 # * some non-ascii char support (ie mostly Latin) 42 # * The raw ascii bibtex entry is stored as "BibTex" metadata. 38 43 39 44 package BibTexPlug; 40 45 41 46 use SplitPlug; 42 43 47 44 48 # BibTexPlug is a sub-class of BasPlug. … … 56 60 return q^\n+(?=@)^; 57 61 } 62 63 58 64 59 65 # The process function reads a single bibliographic record and stores … … 80 86 # This hash translates BibTex field names into metadata names. The 81 87 # BibTex names are taken from the "Local Guide to Latex" Graeme 82 # McKinstry. Metadata names are consist abnt with ReferPlug.88 # McKinstry. Metadata names are consistent with ReferPlug. 83 89 84 90 my %field = ( 85 91 'address', 'PublisherAddress', 86 92 'author', 'Creator', 93 87 94 'booktitle', 'Booktitle', 88 95 'chapter', 'Chapter', 89 96 'edition', 'Edition', 90 'editor', 'Editor', 97 'editor', 'Editor', 91 98 'institution', 'Publisher', 92 99 'journal', 'Journal', … … 102 109 'keywords', 'Keywords', 103 110 'abstract', 'Abstract', 104 'copyright', 'Copyright'); 111 'copyright', 'Copyright' 112 ); 105 113 106 114 # Metadata fields … … 108 116 my ($EntryType, $EntryID, $Creator, $Keywords, $text); 109 117 118 my $verbosity = $self->{'verbosity'}; 119 $verbosity = 0 unless $verbosity; 120 121 my $lines=$$textref; 122 110 123 # Make sure the text has exactly one entry per line 111 my $lines = $$textref; 112 $lines =~ s/,\s*\n/=====/g; 124 125 $lines =~ s/^\s*(\@[^,]+,)\s*\n/$1=====/; #splitting key in entry 126 $lines =~ s/([\"\}]\s*,)\s*\n/$1=====/g; #splitting by comma, followed by \n (assuming end of lines are " or }) 127 $lines =~ s/(\d+\s*\,)\s*\n/$1=====/g; #for the case, when we have number entry without closing " 128 $lines =~ s/\n\s*\n/%%%%%/g; #this was simply added in order to allow to process newline inside quoted strings, 129 #that continues for several lines 113 130 $lines =~ s/\s+/ /g; 114 131 $lines =~ s/\s*=====\s*/\n/g; 115 my @lines = split(/\n+/, $lines); 116 132 133 my @all_lines = split(/\n+/, $lines); 134 117 135 # Read and process each line in the bib file. 118 my ($ id, $name, $value, $line);119 foreach $line (@ lines) {120 136 my ($entryname, $name, $value, $line); 137 foreach $line (@all_lines) { 138 121 139 # Add each line. Most lines consist of a field identifer and 122 140 # then data, and we simply store them, though we treat some … … 126 144 $text .= "$line\n"; 127 145 128 146 print "Processing line = $line \n" if $verbosity>=4; 147 129 148 # The first line is special, it contains the reference type and OID 130 149 if ($line =~ /\@(\w+)\W*\{\W*([\*\.\w\d:-]+)\W*$/) { … … 137 156 } 138 157 if ($line =~ /\@/) { 139 print "bibtexplug: suspect line in bibtex file: $line\n"158 print $outhandle "bibtexplug: suspect line in bibtex file: $line\n" 140 159 if ($verbosity >= 2); 141 print "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n"160 print $outhandle "bibtexplug: if that's the start of a new bibtex record ammend regexp in bibtexplug::process()\n" 142 161 if ($verbosity >= 2); 143 162 } … … 145 164 # otherwise, parse the metadata out of this line 146 165 next unless ($line =~ /^\s*(\w+)\s+=\s+(.*)/); 147 $ id= lc($1);166 $entryname = lc($1); 148 167 $value = $2; 149 168 # tidy up, removing " at start and end 169 $value =~ s/^"//; 170 $value =~ s/(",)\s*$//; 171 $value = &process_latex($value); 172 150 173 # Add this line of metadata 151 $metadata{$id} .= "$value\n"; 174 $metadata{$entryname} .= "$value\n"; 175 152 176 } 153 177 154 178 # Add the Entry type as metadata 155 $doc_obj->add_ metadata ($cursection, "EntryType", $EntryType);179 $doc_obj->add_utf8_metadata ($cursection, "EntryType", $EntryType); 156 180 157 181 # Add the various field as metadata 158 foreach my $id (keys %metadata) { 182 foreach my $entryname (keys %metadata) { 183 next unless (defined $field{$entryname}); 184 next unless (defined $metadata{$entryname}); 159 185 160 next unless (defined $field{$id}); 161 next unless (defined $metadata{$id}); 162 163 $name = $field{$id}; 164 $value = $metadata{$id}; 165 166 # Get rid of silly Latex stuff 167 if ($value =~ /\"(.*)\"/) { 168 $value = $1; 169 } 170 if ($value =~ /\{(.*)\}/) { 171 $value = $1; 172 } 173 174 # Add the various field as metadata 175 $value = &text_into_html($value); 176 $doc_obj->add_metadata ($cursection, $name, $value); 186 $name = $field{$entryname}; 187 $value = $metadata{$entryname}; 188 189 # Add the various fields as metadata 190 my $html_value = &text_into_html($value); 191 $doc_obj->add_utf8_metadata ($cursection, $name, $html_value); 177 192 178 193 # Several special operatons on metadata follow … … 181 196 # The full set of keywords will be added, in due course, as "Keywords". 182 197 # However, we also want to add them as individual "Keyword" metadata elements. 183 if ($ ideq "keywords") {198 if ($entryname eq "keywords") { 184 199 my @keywordlist = split(/,/, $value); 185 200 foreach my $k (@keywordlist) { 186 $k = lc($k); 201 $k = lc($k); 187 202 $k =~ s/\s*$//; 188 203 $k =~ s/^\s*//; 189 204 if ($k =~ /\w/) { 190 205 $k = &text_into_html($k); 191 $doc_obj->add_ metadata ($cursection, "Keyword", $k);206 $doc_obj->add_utf8_metadata ($cursection, "Keyword", $k); 192 207 } 193 208 } … … 198 213 # also want to split it into several individual "Author" fields in 199 214 # "Lastename, Firstnames" format so we can browse it. 200 if ($id eq "author") { 201 202 my @authorlist = split(/(,|and)/, $value); 215 if ($entryname eq "author") { #added also comparison with editor 216 217 # und here for german language... 218 # don't use brackets in pattern, else the matched bit becomes 219 # an element in the list! 220 my @authorlist = split(/,|\s+and\s+|\s+und\s+/, $value); 203 221 foreach $a (@authorlist) { 204 222 $a =~ s/\s*$//; 205 223 $a =~ s/^\s*//; 206 207 224 # Reformat and add author name 225 next if $a=~ /^\s*$/; 208 226 my @words = split(/ /, $a); 209 227 my $lastname = pop @words; 210 228 my $firstname = join(" ", @words); 229 211 230 my $fullname = $lastname . ", " . $firstname; 212 231 213 232 # Add each name to set of Authors 233 # force utf8 pragma so that \w matches in this scope 234 use utf8; 214 235 if ($fullname =~ /\w+, \w+/) { 215 $fullname = &text_into_html($fullname); 216 $doc_obj->add_metadata ($cursection, "Author", $fullname); 236 $doc_obj->add_utf8_metadata ($cursection, "Author", $fullname); 217 237 } 218 238 } … … 220 240 221 241 # Books and Journals are additionally marked for display purposes 222 if ($ ideq "booktitle") {223 $doc_obj->add_ metadata($cursection, "BookConfOnly", 1);224 } elsif ($ ideq "journal") {225 $doc_obj->add_ metadata($cursection, "JournalsOnly", 1);242 if ($entryname eq "booktitle") { 243 $doc_obj->add_utf8_metadata($cursection, "BookConfOnly", 1); 244 } elsif ($entryname eq "journal") { 245 $doc_obj->add_utf8_metadata($cursection, "JournalsOnly", 1); 226 246 } 227 247 … … 231 251 if ($text =~ /\w/) { 232 252 $text = &text_into_html($text); 233 $doc_obj->add_text ($cursection, $text); 253 $doc_obj->add_utf8_text ($cursection, $text); 254 $doc_obj->add_utf8_metadata($cursection, "BibTex", $text); 234 255 } 235 256 … … 238 259 239 260 261 262 263 # convert email addresses and URLs into links 264 sub convert_urls_into_links{ 265 my ($text) = @_; 266 267 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 268 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g; 269 270 return $text; 271 } 272 273 # Clean up whitespace and convert \n charaters to <BR> or <P> 274 sub clean_up_whitespaces{ 275 my ($text) = @_; 276 277 $text =~ s/%%%%%/<BR> <BR>/g; 278 $text =~ s/ +/ /g; 279 $text =~ s/\s*$//; 280 $text =~ s/^\s*//; 281 $text =~ s/\n/\n<BR>/g; 282 $text =~ s/<BR>\s*<BR>/<P>/g; 283 284 return $text; 285 } 286 287 288 sub convert_problem_characters_without_ampersand{ 289 my ($text) = @_; 290 $text =~ s/</</g; 291 $text =~ s/>/>/g; 292 293 $text =~ s/\'\'/\"/g; #Latex -specific conversion 294 $text =~ s/\`\`/\"/g; #Latex -specific conversion 295 296 297 $text =~ s/\"/"/g; 298 $text =~ s/\'/’/g; 299 $text =~ s/\`/‘/g; 300 $text =~ s/\+/ /g; 301 $text =~ s/\(/ /g; 302 $text =~ s/\)/ /g; 303 304 $text =~ s/\\/\\\\/g; 305 306 $text =~ s/\./\\\./g; 307 308 return $text; 309 } 310 240 311 # Convert a text string into HTML. 241 312 242 313 # The HTML is going to be inserted into a GML file, so we have to be 243 # careful not to use symbols like ">", which oc urs frequently in email314 # careful not to use symbols like ">", which occurs frequently in email 244 315 # messages (and use > instead. 245 316 … … 248 319 # with <P> tags). 249 320 250 251 321 sub text_into_html { 252 322 my ($text) = @_; 253 323 254 255 # Convert problem charaters into HTML symbols 324 # Convert problem characters into HTML symbols 256 325 $text =~ s/&/&/g; 257 $text =~ s/</</g; 258 $text =~ s/>/>/g; 259 $text =~ s/\"/"/g; 260 $text =~ s/\'/ /g; 261 $text =~ s/\+/ /g; 262 $text =~ s/\(/ /g; 263 $text =~ s/\)/ /g; 326 327 $text = &convert_problem_characters_without_ampersand( $text ); 264 328 265 329 # convert email addresses and URLs into links 266 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 267 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g; 268 269 # Clean up whitespace and convert \n charaters to <BR> or <P> 270 $text =~ s/ +/ /g; 271 $text =~ s/\s*$//; 272 $text =~ s/^\s*//; 273 $text =~ s/\n/\n<BR>/g; 274 $text =~ s/<BR>\s*<BR>/<P>/g; 330 $text = &convert_urls_into_links( $text ); 331 332 $text = &clean_up_whitespaces( $text ); 275 333 276 334 return $text; 277 335 } 336 337 338 339 340 # Convert accented characters, remove { }, interprete some commands.... 341 # Note!! This is not comprehensive! Also assumes Latin -> Unicode! 342 sub process_latex { 343 my ($text) = @_; 344 345 # note - this is really ugly, but it works. There may be a prettier way 346 # of mapping latex accented chars to utf8, but we just brute force it here. 347 # Also, this isn't complete - not every single possible accented letter 348 # is in here yet, but most of the common ones are. 349 350 my %utf8_chars = 351 ( 352 # acutes 353 '\'a' => chr(0xc3).chr(0xa1), 354 '\'c' => chr(0xc4).chr(0x87), 355 '\'e' => chr(0xc3).chr(0xa9), 356 '\'i' => chr(0xc3).chr(0xad), 357 '\'l' => chr(0xc3).chr(0xba), 358 '\'n' => chr(0xc3).chr(0x84), 359 '\'o' => chr(0xc3).chr(0xb3), 360 '\'r' => chr(0xc5).chr(0x95), 361 '\'s' => chr(0xc5).chr(0x9b), 362 '\'u' => chr(0xc3).chr(0xba), 363 '\'y' => chr(0xc3).chr(0xbd), 364 '\'z' => chr(0xc5).chr(0xba), 365 # graves 366 '`a' => chr(0xc3).chr(0xa0), 367 '`A' => chr(0xc3).chr(0x80), 368 '`e' => chr(0xc3).chr(0xa8), 369 '`E' => chr(0xc3).chr(0x88), 370 '`i' => chr(0xc3).chr(0xac), 371 '`I' => chr(0xc3).chr(0x8c), 372 '`o' => chr(0xc3).chr(0xb2), 373 '`O' => chr(0xc3).chr(0x92), 374 '`u' => chr(0xc3).chr(0xb9), 375 '`U' => chr(0xc3).chr(0x99), 376 # circumflex 377 '^a' => chr(0xc3).chr(0xa2), 378 '^A' => chr(0xc3).chr(0x82), 379 '^c' => chr(0xc4).chr(0x89), 380 '^C' => chr(0xc4).chr(0x88), 381 '^e' => chr(0xc3).chr(0xaa), 382 '^E' => chr(0xc3).chr(0x8a), 383 '^g' => chr(0xc4).chr(0x9d), 384 '^G' => chr(0xc4).chr(0x9c), 385 '^h' => chr(0xc4).chr(0xa5), 386 '^H' => chr(0xc4).chr(0xa4), 387 '^i' => chr(0xc3).chr(0xae), 388 '^I' => chr(0xc3).chr(0x8e), 389 '^j' => chr(0xc4).chr(0xb5), 390 '^J' => chr(0xc4).chr(0xb4), 391 '^o' => chr(0xc3).chr(0xb4), 392 '^O' => chr(0xc3).chr(0x94), 393 '^s' => chr(0xc5).chr(0x9d), 394 '^S' => chr(0xc5).chr(0x9c), 395 '^u' => chr(0xc3).chr(0xa2), 396 '^U' => chr(0xc3).chr(0xbb), 397 '^w' => chr(0xc5).chr(0xb5), 398 '^W' => chr(0xc5).chr(0xb4), 399 '^y' => chr(0xc5).chr(0xb7), 400 '^Y' => chr(0xc5).chr(0xb6), 401 402 # diaeresis 403 '"a' => chr(0xc3).chr(0xa4), 404 '"A' => chr(0xc3).chr(0x84), 405 '"e' => chr(0xc3).chr(0xab), 406 '"E' => chr(0xc3).chr(0x8b), 407 '"\\\\i' => chr(0xc3).chr(0xaf), 408 '"\\\\I' => chr(0xc3).chr(0x8f), 409 '"o' => chr(0xc3).chr(0xb6), 410 '"O' => chr(0xc3).chr(0x96), 411 '"u' => chr(0xc3).chr(0xbc), 412 '"U' => chr(0xc3).chr(0x9c), 413 '"y' => chr(0xc3).chr(0xbf), 414 '"Y' => chr(0xc3).chr(0xb8), 415 # tilde 416 # caron - handled specially 417 # ',s' => chr(0xc5).chr(0xa1), 418 # ',S' => chr(0xc5).chr(0xa5), 419 # breve 420 # double acute 421 # ring 422 # dot 423 # macron 424 '=a' => chr(0xc4).chr(0x81), 425 '=A' => chr(0xc4).chr(0x80), 426 '=e' => chr(0xc4).chr(0x93), 427 '=E' => chr(0xc4).chr(0x92), 428 '=i' => chr(0xc4).chr(0xab), 429 '=I' => chr(0xc4).chr(0xaa), 430 '=o' => chr(0xc4).chr(0x8d), 431 '=O' => chr(0xc4).chr(0x8c), 432 '=u' => chr(0xc4).chr(0xab), 433 '=U' => chr(0xc4).chr(0xaa), 434 435 # stroke - handled specially - see below 436 437 # cedilla - handled specially 438 439 ); 440 441 # these are one letter latex commands - we make sure they're not a longer 442 # command name. eg {\d} is d+stroke, so careful of \d 443 my %special_utf8_chars = 444 ( 445 # caron 446 'v n' => chr(0xc5).chr(0x88), 447 'v N' => chr(0xc5).chr(0x87), 448 'v s' => chr(0xc5).chr(0xa1), 449 'v S' => chr(0xc5).chr(0xa5), 450 # cedilla 451 'c c' => chr(0xc3).chr(0xa7), 452 'c C' => chr(0xc3).chr(0x87), 453 'c g' => chr(0xc4).chr(0xa3), 454 'c G' => chr(0xc4).chr(0xa2), 455 'c k' => chr(0xc4).chr(0xb7), 456 'c K' => chr(0xc4).chr(0xb6), 457 'c l' => chr(0xc4).chr(0xbc), 458 'c L' => chr(0xc4).chr(0xbb), 459 'c n' => chr(0xc5).chr(0x86), 460 'c N' => chr(0xc5).chr(0x85), 461 'c r' => chr(0xc5).chr(0x97), 462 'c R' => chr(0xc5).chr(0x96), 463 'c s' => chr(0xc5).chr(0x9f), 464 'c S' => chr(0xc5).chr(0x9e), 465 'c t' => chr(0xc5).chr(0xa3), 466 'c T' => chr(0xc5).chr(0xa2), 467 # double acute / Hungarian accent 468 'H O' => chr(0xc5).chr(0x90), 469 'H o' => chr(0xc5).chr(0x91), 470 'H U' => chr(0xc5).chr(0xb0), 471 'H u' => chr(0xc5).chr(0xb1), 472 473 # stroke 474 'd' => chr(0xc4).chr(0x91), 475 'D' => chr(0xc4).chr(0x90), 476 'h' => chr(0xc4).chr(0xa7), 477 # 'H' => chr(0xc4).chr(0xa6), # !! this normally(!!?) means Hung. umlaut 478 'l' => chr(0xc5).chr(0x82), 479 'L' => chr(0xc5).chr(0x81), 480 'o' => chr(0xc3).chr(0xb8), 481 'O' => chr(0xc3).chr(0x98), 482 't' => chr(0xc5).chr(0xa7), 483 'T' => chr(0xc5).chr(0xa6), 484 # german ss/szlig/sharp s 485 'ss' => chr(0xc3).chr(0x9f), 486 ); 487 488 # convert latex-style accented characters. 489 # remove space (if any) between \ and letter to accent (eg {\' a}) 490 491 $text =~ s@(\\[`'="])\s(\w)@$1$2@g; 492 493 # remove {} around a single character (eg \'{e}) 494 $text =~ s@(\\[`'="]){(\w)}@$1$2@; 495 496 # remove {} around a single character for special 1 letter commands - 497 # need to insert a space. Eg \v{s} -> {\v s} 498 $text =~ s@(\\[vcH]){(\w)}@{$1 $2}@; 499 500 # this is slow (go through whole hash for each substitution!) so 501 # only do if the text contains a '\' character. 502 if ($text =~ m|\\|) { 503 for $latex_code (keys %utf8_chars) { 504 $text =~ s/\\$latex_code/$utf8_chars{$latex_code}/g; 505 } 506 507 # where the following letter matters (eg "sm\o rrebr\o d", \ss{}) 508 # only do the change if immediately followed by a space, }, {, or \ 509 for $latex_code (keys %special_utf8_chars) { 510 $text =~ s/\\${latex_code}([\\\s\{\}])/$special_utf8_chars{$latex_code}$1/g; 511 } 512 } 513 514 # remove latex groupings { } (but not \{ or \} ) 515 # note - need it like this for first char match - eg {xx}{yy} 516 while ($text =~ s@([^\\]){([^}]*?[^\\])}@$1$2@g) {} 517 518 # remove latex commands 519 $text =~ s@\\\w+{(.*)}@$1@g; 520 521 # maths mode $...$ - this is not interpreted in any way at the moment... 522 $text =~ s@\$(.*)\$@$1@g; 523 524 # quoted { } chars 525 $text =~ s@\\{@{@g; 526 $text =~ s@\\}@}@g; 527 528 return $text; 529 } 530 278 531 279 532 sub set_OID {
Note:
See TracChangeset
for help on using the changeset viewer.