Changeset 16767
- Timestamp: 2008-08-13T16:25:49+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/BasePlugin.pm
r16698 r16767 411 411 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting 412 412 413 ## print STDERR "**** User chose filename encoding setting: $filename_encoding\n";414 415 413 # Whenever filename-encoding is set to any of the auto settings, we 416 414 # check if the filename is already in UTF8. If it is, then we're done. … … 418 416 if(&unicode::check_is_utf8($filemeta)) 419 417 { 420 ## print STDERR "**** It is already UTF8\n";421 418 $filename_encoding = "utf8"; 422 419 return $filemeta; … … 484 481 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 485 482 } 486 487 ## print STDERR "**** filename_encoding selected: $filename_encoding \n";488 483 489 484 # if still undefined, use utf8 as fallback … … 492 487 } 493 488 489 print STDERR "**** UTF8 encoding the filename $filemeta "; 490 494 491 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when 495 492 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to … … 498 495 # cases attempt to make the filename utf8 to match. 
499 496 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) { 500 ## print STDERR "**** BEFORE utf8 conversion: $filemeta\n";501 497 &unicode::ensure_utf8(\$filemeta); 502 ## print STDERR "**** AFTER utf8 conversion: $filemeta\n"; 503 } 504 498 } 505 499 506 500 # convert non-unicode encodings to utf8 … … 511 505 } 512 506 513 print "*** filename encoding found: $filename_encoding\n"; 514 print "*** utf8 encoded filename: $filemeta\n"; 515 507 print STDERR " from encoding $filename_encoding -> $filemeta\n"; 516 508 return $filemeta; 517 509 } … … 528 520 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 529 521 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding); 530 522 531 523 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 532 524 … … 543 535 } 544 536 545 print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";537 print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 546 538 return $self->{'filesystem_encoding'}; # can be the string "undefined" 547 539 } … … 559 551 $strictfilemeta =~ s/\s*$//g; 560 552 561 ## print STDERR "**** strict filename is |$strictfilemeta|\n";562 553 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta); 563 554 if(!defined $filename_encoding) { … … 565 556 } 566 557 567 ## print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . 
"\n";568 558 return $filename_encoding; # can be the string "undefined" 569 559 } … … 580 570 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'}); 581 571 #my $results = $self->{'textcat'}->classify(\$text); 582 my $results = $self->{'textcat'}->classify_cached (\$text);572 my $results = $self->{'textcat'}->classify_cached_filename(\$text); 583 573 584 574 585 575 if (scalar @$results < 0) { 586 print STDERR "**** Textcat returned 0 results\n";587 576 return undef; 588 577 } 589 578 590 print STDERR "**** TEXTCAT RESULTS for $text: ";591 print STDERR join(",", @$results);592 print STDERR "\n";579 ## print STDERR "**** TEXTCAT RESULTS for $text: "; 580 ## print STDERR join(",", @$results); 581 ## print STDERR "\n"; 593 582 594 583 # We have some results, we choose the first … … 597 586 $best_encoding = $encoding; 598 587 if (!defined $best_encoding) { 599 ## print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";600 588 return undef; 601 589 } … … 603 591 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 604 592 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) 605 ## print STDERR "*** Filename turns out to be UTF8\n";606 593 $best_encoding = 'utf8'; 607 594 } … … 611 598 # eg MS versions of standard encodings 612 599 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) { 613 ## print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";614 615 600 my $iso = $1; # which variant of the iso standard? 
616 601 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 617 602 if ($text =~ /[\x80-\x9f]/) { 618 ## print STDERR "**** best_encoding is some windows value: $best_encoding\n";619 603 # Western Europe 620 604 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } … … 625 609 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 626 610 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 627 ## print STDERR "**** best_encoding windows value: $best_encoding\n";628 611 } 629 612 } … … 635 618 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef"); 636 619 } 637 ## print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n";638 620 $best_encoding = undef; 639 621 } 640 ## print STDERR "**** language: $language\n" if defined $language;641 ## print STDERR "**** encoding: $best_encoding\n" if defined $encoding;642 622 643 623 return $best_encoding; … … 706 686 my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding); 707 687 $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta); 708 709 688 } 710 689 … … 724 703 my $top_section = $doc_obj->get_top_section(); 725 704 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id); 726 ## print STDERR "**** oid = $oid\n";727 705 $doc_obj->set_OID($oid); 728 706 }
Note: See TracChangeset for help on using the changeset viewer.