| 373 | | # get the language/encoding |
|---|
| 374 | | $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); |
|---|
| 375 | | # my $results = $self->{'textcat'}->classify(\$text); |
|---|
| 376 | | my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename); |
|---|
| 377 | | |
|---|
| 378 | | # if textcat returns 3 or less possibilities we'll use the |
|---|
| 379 | | # first one in the list - otherwise use the defaults |
|---|
| 380 | | if (scalar @$results > 3) { |
|---|
| 381 | | if ($unicode_format) { # in case the first had a BOM |
|---|
| 382 | | $best_encoding=$unicode_format; |
|---|
| 383 | | } else { |
|---|
| 384 | | my %guessed_encodings = (); |
|---|
| 385 | | foreach my $result (@$results) { |
|---|
| 386 | | $result =~ /([^\-]+)$/; |
|---|
| 387 | | my $enc=$1; |
|---|
| 388 | | if (!defined($guessed_encodings{$enc})) { |
|---|
| 389 | | $guessed_encodings{$enc}=0; |
|---|
| | 374 | # don't need to do textcat if we know the encoding now AND don't need to extract language |
|---|
| | 375 | if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file |
|---|
| | 376 | $best_language = $self->{'default_language'}; |
|---|
| | 377 | } |
|---|
| | 378 | |
|---|
| | 379 | else { # need to use textcat to get either the language, or get both language and encoding |
|---|
| | 380 | $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); |
|---|
| | 381 | |
|---|
| | 382 | if($found_html_encoding) { # know encoding, find language by limiting search to known encoding |
|---|
| | 383 | my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); |
|---|
| | 384 | |
|---|
| | 385 | my $language; |
|---|
| | 386 | ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0); |
|---|
| | 387 | |
|---|
| | 388 | if (!defined $language || scalar @$results > 3) { |
|---|
| | 389 | # if there were too many results even when restricting results by encoding, |
|---|
| | 390 | # or if there were no results, use default language with the known encoding |
|---|
| | 391 | $best_language = $self->use_default_language($filename); |
|---|
| | 392 | } |
|---|
| | 393 | else { # fewer than 3 results means textcat is more certain, use the first result |
|---|
| | 394 | $best_language = $language; |
|---|
| | 395 | } |
|---|
| | 396 | } |
|---|
| | 397 | else { # don't know encoding or language yet, therefore we use textcat |
|---|
| | 398 | my $results = $self->{'textcat'}->classify_contents(\$text, $filename); |
|---|
| | 399 | |
|---|
| | 400 | # if textcat returns 3 or less possibilities we'll use the first one in the list |
|---|
| | 401 | if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding |
|---|
| | 402 | my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/; |
|---|
| | 403 | |
|---|
| | 404 | $language = $self->use_default_language($filename) unless defined $language; |
|---|
| | 405 | $encoding = $self->use_default_encoding($filename) unless defined $encoding; |
|---|
| | 406 | |
|---|
| | 407 | $best_language = $language; |
|---|
| | 408 | $best_encoding = $encoding; |
|---|
| | 409 | } |
|---|
| | 410 | else { # if (scalar @$results > 3) { |
|---|
| | 411 | if ($unicode_format) { # in case the first had a BOM |
|---|
| | 412 | $best_encoding=$unicode_format; |
|---|
| 391 | | $guessed_encodings{$enc}++; |
|---|
| 392 | | } |
|---|
| 393 | | |
|---|
| 394 | | $guessed_encodings{""}=-1; # for default best_encoding of "" |
|---|
| 395 | | foreach my $enc (keys %guessed_encodings) { |
|---|
| 396 | | if ($guessed_encodings{$enc} > |
|---|
| 397 | | $guessed_encodings{$best_encoding}){ |
|---|
| 398 | | $best_encoding=$enc; |
|---|
| | 414 | else { |
|---|
| | 415 | # Find the most frequent encoding in the textcat results returned |
|---|
| | 416 | # Returns "" if there's no encoding more frequent than another |
|---|
| | 417 | $best_encoding = $self->{'textcat'}->most_frequent_encoding($results); |
|---|
| | 418 | } |
|---|
| | 419 | |
|---|
| | 420 | if ($best_encoding eq "") { # encoding still not set, use defaults |
|---|
| | 421 | $best_language = $self->use_default_language($filename); |
|---|
| | 422 | $best_encoding = $self->use_default_encoding($filename); |
|---|
| | 423 | } |
|---|
| | 424 | elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language |
|---|
| | 425 | $best_language = $self->use_default_language($filename); |
|---|
| | 426 | } |
|---|
| | 427 | else { # textcat again using the most frequent encoding or the $unicode_format set above |
|---|
| | 428 | $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); |
|---|
| | 429 | my $language; |
|---|
| | 430 | ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0); |
|---|
| | 431 | if (!defined $language || scalar @$results > 3) { |
|---|
| | 432 | # if no result or too many results, use default language for the encoding previously found |
|---|
| | 433 | $best_language = $self->use_default_language($filename); |
|---|
| | 434 | } |
|---|
| | 435 | else { # fewer than 3 results, use the language of the first result |
|---|
| | 436 | $best_language = $language; |
|---|
| | 437 | } |
|---|
| 402 | | |
|---|
| 403 | | if ($self->{'input_encoding'} ne 'auto') { |
|---|
| 404 | | if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) { |
|---|
| 405 | | gsprintf($outhandle, |
|---|
| 406 | | "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", |
|---|
| 407 | | $filename, $self->{'default_language'}); |
|---|
| 408 | | } |
|---|
| 409 | | $best_language = $self->{'default_language'}; |
|---|
| 410 | | if (!$found_html_encoding) { |
|---|
| 411 | | $best_encoding = $self->{'input_encoding'}; |
|---|
| 412 | | } |
|---|
| 413 | | |
|---|
| 414 | | } else { |
|---|
| 415 | | if ($self->{'verbosity'}>2) { |
|---|
| 416 | | gsprintf($outhandle, |
|---|
| 417 | | "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", |
|---|
| 418 | | $filename, $self->{'default_language'}); |
|---|
| 419 | | } |
|---|
| 420 | | $best_language = $self->{'default_language'}; |
|---|
| 421 | | } |
|---|
| 422 | | } else { # <= 3 suggestions |
|---|
| 423 | | my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; |
|---|
| 424 | | if (!defined $language) { |
|---|
| 425 | | if ($self->{'verbosity'}>2) { |
|---|
| 426 | | gsprintf($outhandle, |
|---|
| 427 | | "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", |
|---|
| 428 | | $filename, $self->{'default_language'}); |
|---|
| 429 | | } |
|---|
| 430 | | $language = $self->{'default_language'}; |
|---|
| 431 | | } |
|---|
| 432 | | if (!defined $encoding) { |
|---|
| 433 | | if ($self->{'verbosity'}>2) { |
|---|
| 434 | | gsprintf($outhandle, |
|---|
| 435 | | "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", |
|---|
| 436 | | $filename, $self->{'default_encoding'}); |
|---|
| 437 | | } |
|---|
| 438 | | $encoding = $self->{'default_encoding'}; |
|---|
| 439 | | } |
|---|
| 440 | | $best_language = $language; |
|---|
| 441 | | if (! $best_encoding ) { # may already be set... eg from html meta tag |
|---|
| 442 | | $best_encoding = $encoding; |
|---|
| 443 | | } |
|---|
| 444 | | } |
|---|
| | 441 | } |
|---|
| | 442 | |
|---|
| | 443 | if($best_encoding eq "" || $best_language eq "") { |
|---|
| | 444 | print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n"; |
|---|
| | 445 | $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq ""; |
|---|
| | 446 | $best_language = $self->use_default_language($filename) if $best_language eq ""; |
|---|
| | 447 | } |
|---|
| | 448 | # print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n"; |
|---|
| | 449 | |
|---|
| | 487 | sub use_default_language { |
|---|
| | 488 | my $self = shift (@_); |
|---|
| | 489 | my ($filename) = @_; |
|---|
| | 490 | |
|---|
| | 491 | if ($self->{'verbosity'}>2) { |
|---|
| | 492 | gsprintf($self->{'outhandle'}, |
|---|
| | 493 | "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", |
|---|
| | 494 | $filename, $self->{'default_language'}); |
|---|
| | 495 | } |
|---|
| | 496 | return $self->{'default_language'}; |
|---|
| | 497 | } |
|---|
| | 498 | |
|---|
| | 499 | sub use_default_encoding { |
|---|
| | 500 | my $self = shift (@_); |
|---|
| | 501 | my ($filename) = @_; |
|---|
| | 502 | |
|---|
| | 503 | if ($self->{'verbosity'}>2) { |
|---|
| | 504 | gsprintf($self->{'outhandle'}, |
|---|
| | 505 | "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", |
|---|
| | 506 | $filename, $self->{'default_encoding'}); |
|---|
| | 507 | } |
|---|
| | 508 | return $self->{'default_encoding'}; |
|---|
| | 509 | } |
|---|