Changeset 17213 for gsdl/trunk/perllib/plugins/ReadTextFile.pm
- Timestamp:
- 2008-09-08T13:41:13+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/ReadTextFile.pm
r17099 r17213 296 296 } 297 297 298 298 299 # Uses textcat to work out the encoding and language of the text in 299 300 # $filename. All html tags are removed before processing. … … 307 308 my $best_encoding = ""; 308 309 310 309 311 # read in file 310 312 if (!open (FILE, $filename)) { … … 356 358 elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) { 357 359 $best_encoding = $1; 358 # print STDERR "**** meta tag found, encoding is: $best_encoding\n";359 360 } 360 361 if ($best_encoding) { # we extracted an encoding … … 371 372 } 372 373 373 # get the language/encoding 374 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 375 # my $results = $self->{'textcat'}->classify(\$text); 376 my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename); 377 378 # if textcat returns 3 or less possibilities we'll use the 379 # first one in the list - otherwise use the defaults 380 if (scalar @$results > 3) { 381 if ($unicode_format) { # in case the first had a BOM 382 $best_encoding=$unicode_format; 383 } else { 384 my %guessed_encodings = (); 385 foreach my $result (@$results) { 386 $result =~ /([^\-]+)$/; 387 my $enc=$1; 388 if (!defined($guessed_encodings{$enc})) { 389 $guessed_encodings{$enc}=0; 374 # don't need to do textcat if we know the encoding now AND don't need to extract language 375 if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file 376 $best_language = $self->{'default_language'}; 377 } 378 379 else { # need to use textcat to get either the language, or get both language and encoding 380 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 381 382 if($found_html_encoding) { # know encoding, find language by limiting search to known encoding 383 my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 384 385 my $language; 386 ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0); 387 388 if (!defined $language || scalar @$results > 3) { 389 # if there were too many results even when restricting results by encoding, 390 # or if there were no results, use default language with the known encoding 391 $best_language = $self->use_default_language($filename); 392 } 393 else { # fewer than 3 results means textcat is more certain, use the first result 394 $best_language = $language; 395 } 396 } 397 else { # don't know encoding or language yet, therefore we use textcat 398 my $results = $self->{'textcat'}->classify_contents(\$text, $filename); 399 400 # if textcat returns 3 or less possibilities we'll use the first one in the list 401 if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding 402 my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/; 403 404 $language = $self->use_default_language($filename) unless defined $language; 405 $encoding = $self->use_default_encoding($filename) unless defined $encoding; 406 407 $best_language = $language; 408 $best_encoding = $encoding; 409 } 410 else { # if (scalar @$results > 3) { 411 if ($unicode_format) { # in case the first had a BOM 412 $best_encoding=$unicode_format; 390 413 } 391 $guessed_encodings{$enc}++; 392 } 393 394 $guessed_encodings{""}=-1; # for default best_encoding of "" 395 foreach my $enc (keys %guessed_encodings) { 396 if ($guessed_encodings{$enc} > 397 $guessed_encodings{$best_encoding}){ 398 $best_encoding=$enc; 414 else { 415 # Find the most frequent encoding in the textcat results returned 416 # Returns "" if there's no encoding more frequent than another 417 $best_encoding = $self->{'textcat'}->most_frequent_encoding($results); 418 } 419 420 if ($best_encoding eq "") { # encoding still not set, use defaults 421 $best_language = $self->use_default_language($filename); 422 $best_encoding = $self->use_default_encoding($filename); 423 } 424 elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language 425 $best_language = $self->use_default_language($filename); 426 } 427 else { # textcat again using the most frequent encoding or the $unicode_format set above 428 $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 429 my $language; 430 ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0); 431 if (!defined $language || scalar @$results > 3) { 432 # if no result or too many results, use default language for the encoding previously found 433 $best_language = $self->use_default_language($filename); 434 } 435 else { # fewer than 3 results, use the language of the first result 436 $best_language = $language; 437 } 399 438 } 400 439 } 401 440 } 402 403 if ($self->{'input_encoding'} ne 'auto') { 404 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) { 405 gsprintf($outhandle, 406 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 407 $filename, $self->{'default_language'}); 408 } 409 $best_language = $self->{'default_language'}; 410 if (!$found_html_encoding) { 411 $best_encoding = $self->{'input_encoding'}; 412 } 413 414 } else { 415 if ($self->{'verbosity'}>2) { 416 gsprintf($outhandle, 417 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 418 $filename, $self->{'default_language'}); 419 } 420 $best_language = $self->{'default_language'}; 421 } 422 } else { # <= 3 suggestions 423 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 424 if (!defined $language) { 425 if ($self->{'verbosity'}>2) { 426 gsprintf($outhandle, 427 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 428 $filename, $self->{'default_language'}); 429 } 430 $language = $self->{'default_language'}; 431 } 432 if (!defined $encoding) { 433 if ($self->{'verbosity'}>2) { 434 gsprintf($outhandle, 435 "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 436 $filename, $self->{'default_encoding'}); 437 } 438 $encoding = $self->{'default_encoding'}; 439 } 440 $best_language = $language; 441 if (! $best_encoding ) { # may already be set... eg from html meta tag 442 $best_encoding = $encoding; 443 } 444 } 441 } 442 443 if($best_encoding eq "" || $best_language eq "") { 444 print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n"; 445 $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq ""; 446 $best_language = $self->use_default_language($filename) if $best_language eq ""; 447 } 448 # print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n"; 449 445 450 446 451 if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) { … … 480 485 481 486 487 sub use_default_language { 488 my $self = shift (@_); 489 my ($filename) = @_; 490 491 if ($self->{'verbosity'}>2) { 492 gsprintf($self->{'outhandle'}, 493 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 494 $filename, $self->{'default_language'}); 495 } 496 return $self->{'default_language'}; 497 } 498 499 sub use_default_encoding { 500 my $self = shift (@_); 501 my ($filename) = @_; 502 503 if ($self->{'verbosity'}>2) { 504 gsprintf($self->{'outhandle'}, 505 "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 506 $filename, $self->{'default_encoding'}); 507 } 508 return $self->{'default_encoding'}; 509 } 482 510 483 511 # Overridden by exploding plugins (eg. ISISPlug)
Note:
See TracChangeset
for help on using the changeset viewer.