Changeset 16557
- Timestamp:
- 2008-07-25T18:31:50+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/BasePlugin.pm
r16520 r16557 35 35 use encodings; 36 36 use unicode; 37 use textcat; 37 38 use doc; 38 39 eval "require diagnostics"; # some perl distros (eg mac) don't have this … … 53 54 { 'name' => "unicode", 54 55 'desc' => "{BasePlugin.encoding.unicode}" } ]; 56 55 57 56 58 my $e = $encodings::encodings; … … 66 68 our $encoding_plus_auto_list = 67 69 [ { 'name' => "auto", 68 'desc' => "{BasePlugin.filename_encoding.auto}" } ]; 70 'desc' => "{BasePlugin.filename_encoding.auto}" }, 71 { 'name' => "auto-language-analysis", 72 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat 73 { 'name' => "auto-filesystem-encoding", 74 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale 75 { 'name' => "auto-fl", 76 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat 77 { 'name' => "auto-lf", 78 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale 69 79 70 80 push(@{$encoding_plus_auto_list},@{$encoding_list}); … … 376 386 my $filemeta = $file; 377 387 378 my $filename_encoding = $self->{'filename_encoding'}; 379 if ($filename_encoding eq "auto") { 380 # we check the locale first 381 if (!defined $self->{'filesystem_encoding'}) { 382 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 383 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'}; 384 } 385 if ($self->{'filesystem_encoding'} ne "undefined") { 386 $filename_encoding = $self->{'filesystem_encoding'}; 387 } else { 388 # try the encoding of the document, if available 389 if (defined $file_encoding) { 390 $filename_encoding = $file_encoding; 391 } else { 392 # use utf8 393 $filename_encoding = "utf8"; 394 } 395 } 388 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting 389 390 ## print STDERR "**** User chose filename encoding setting: $filename_encoding\n"; 391 392 # Whenever filename-encoding is set to any of the auto settings, we 393 # check if the filename is already in UTF8. If it is, then we're done. 394 if($filename_encoding =~ m/auto/) { 395 if(&unicode::check_is_utf8($filemeta)) 396 { 397 ## print STDERR "**** It is already UTF8\n"; 398 $filename_encoding = "utf8"; 399 return $filemeta; 400 } 401 } 402 403 # Auto setting, but filename is not utf8 404 if ($filename_encoding eq "auto") 405 { 406 # try textcat 407 $filename_encoding = $self->textcat_encoding($filemeta); 396 408 397 } 398 399 if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) { 400 $filemeta = unicode::unicode2utf8( 401 unicode::convert2unicode($filename_encoding, \$filemeta) 409 # check the locale next 410 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 411 412 413 # now try the encoding of the document, if available 414 if ($filename_encoding eq "undefined" && defined $file_encoding) { 415 $filename_encoding = $file_encoding; 416 } 417 418 } 419 420 elsif ($filename_encoding eq "auto-language-analysis") 421 { 422 $filename_encoding = $self->textcat_encoding($filemeta); 423 424 # now try the encoding of the document, if available 425 if ($filename_encoding eq "undefined" && defined $file_encoding) { 426 $filename_encoding = $file_encoding; 427 } 428 } 429 430 elsif ($filename_encoding eq "auto-filesystem-encoding") 431 { 432 # try locale 433 $filename_encoding = $self->locale_encoding(); 434 } 435 436 elsif ($filename_encoding eq "auto-fl") 437 { 438 # filesystem-encoding (locale) then language-analysis (textcat) 439 $filename_encoding = $self->locale_encoding(); 440 441 # try textcat 442 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined"; 443 444 # else assume filename encoding is encoding of file content, if that's available 445 if ($filename_encoding eq "undefined" && defined $file_encoding) { 446 $filename_encoding = $file_encoding; 447 } 448 } 449 450 elsif ($filename_encoding eq "auto-lf") 451 { 452 # language-analysis (textcat) then filesystem-encoding (locale) 453 $filename_encoding = $self->textcat_encoding($filemeta); 454 455 # guess filename encoding from encoding of file content, if available 456 if ($filename_encoding eq "undefined" && defined $file_encoding) { 457 $filename_encoding = $file_encoding; 458 } 459 460 # try locale 461 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 462 } 463 464 ## print STDERR "**** filename_encoding selected: $filename_encoding \n"; 465 466 # if still undefined, use utf8 as fallback 467 if ($filename_encoding eq "undefined") { 468 $filename_encoding = "utf8"; 469 } 470 471 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when 472 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to 473 # be always utf8 (in which case the filename's encoding is also set as utf8 even 474 # though the filename need not be if it originates from another system)--in such 475 # cases attempt to make the filename utf8 to match. 476 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) { 477 ## print STDERR "**** BEFORE utf8 conversion: $filemeta\n"; 478 &unicode::ensure_utf8(\$filemeta); 479 ## print STDERR "**** AFTER utf8 conversion: $filemeta\n"; 480 } 481 482 483 # convert non-unicode encodings to utf8 484 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) { 485 $filemeta = &unicode::unicode2utf8( 486 &unicode::convert2unicode($filename_encoding, \$filemeta) 402 487 ); 403 488 } 489 490 print "*** filename encoding found: $filename_encoding\n"; 491 print "*** utf8 encoded filename: $filemeta\n"; 404 492 405 493 return $filemeta; … … 424 512 } 425 513 426 514 sub locale_encoding { 515 my $self = shift(@_); 516 517 if (!defined $self->{'filesystem_encoding'}) { 518 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 519 $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'}; 520 } 521 522 print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 523 return $self->{'filesystem_encoding'}; # can be the string "undefined" 524 } 525 526 sub textcat_encoding { 527 my $self = shift(@_); 528 my ($filemeta) = @_; 529 530 # analyse filenames without extensions and digits (and trimmed of surrounding 531 # whitespace), so that irrelevant chars don't confuse textcat 532 my $strictfilemeta = $filemeta; 533 $strictfilemeta =~ s/\.[^\.]+$//g; 534 $strictfilemeta =~ s/\d//g; 535 $strictfilemeta =~ s/^\s*//g; 536 $strictfilemeta =~ s/\s*$//g; 537 538 ## print STDERR "**** strict filename is |$strictfilemeta|\n"; 539 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta); 540 if(!defined $filename_encoding) { 541 $filename_encoding = "undefined"; 542 } 543 544 ## print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n"; 545 return $filename_encoding; # can be the string "undefined" 546 } 547 548 # performs textcat 549 sub encoding_from_language_analysis { 550 my $self = shift(@_); 551 my ($text) = @_; 552 553 my $outhandle = $self->{'outhandle'}; 554 my $best_encoding = undef; 555 556 # get the language/encoding of the file using textcat 557 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'}); 558 #my $results = $self->{'textcat'}->classify(\$text); 559 my $results = $self->{'textcat'}->classify_cached(\$text); 560 561 562 if (scalar @$results < 0) { 563 print STDERR "**** Textcat returned 0 results\n"; 564 return undef; 565 } 566 567 print STDERR "**** TEXTCAT RESULTS for $text: "; 568 print STDERR join(",", @$results); 569 print STDERR "\n"; 570 571 # We have some results, we choose the first 572 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 573 574 $best_encoding = $encoding; 575 if (!defined $best_encoding) { 576 ## print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n"; 577 return undef; 578 } 579 580 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 581 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) 582 ## print STDERR "*** Filename turns out to be UTF8\n"; 583 $best_encoding = 'utf8'; 584 } 585 586 587 # check for equivalents where textcat doesn't have some encodings... 588 # eg MS versions of standard encodings 589 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) { 590 ## print STDERR "**** best_encoding is ISO_8859: $best_encoding\n"; 591 592 my $iso = $1; # which variant of the iso standard? 593 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 594 if ($text =~ /[\x80-\x9f]/) { 595 ## print STDERR "**** best_encoding is some windows value: $best_encoding\n"; 596 # Western Europe 597 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 598 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe 599 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic 600 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic 601 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek 602 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 603 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 604 ## print STDERR "**** best_encoding windows value: $best_encoding\n"; 605 } 606 } 607 608 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ && 609 !defined $encodings::encodings->{$best_encoding}) 610 { 611 if ($self->{'verbosity'}) { 612 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef"); 613 } 614 ## print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n"; 615 $best_encoding = undef; 616 } 617 ## print STDERR "**** language: $language\n" if defined $language; 618 ## print STDERR "**** encoding: $best_encoding\n" if defined $encoding; 619 620 return $best_encoding; 621 } 622 623 # uses locale 427 624 sub get_filesystem_encoding { 428 625
Note:
See TracChangeset
for help on using the changeset viewer.