Changeset 16724
- Timestamp:
- 2008-08-12T14:54:31+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/ReadTextFile.pm
r16699 r16724
(columns: old line number, new line number; "-" marks a removed line, "+" an added line, unmarked rows are unmodified)
 139  139    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
 140  140    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
 141       - $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
      141  + $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
 142  142    $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
 143  143
 …    …
 225  225
 226  226
      227  + sub read_file_no_decoding {
      228  + my $self = shift (@_);
      229  + my ($filename, $textref) = @_;
      230  +
      231  + if (!-r $filename)
      232  + {
      233  + my $outhandle = $self->{'outhandle'};
      234  + gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
      235  + # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
      236  + return;
      237  + }
      238  + $$textref = "";
      239  + if (!open (FILE, $filename)) {
      240  + gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
      241  + die "\n";
      242  + }
      243  +
      244  + my $reader = new multiread();
      245  + $reader->set_handle ('ReadTextFile::FILE');
      246  + $reader->read_file_no_decoding ($textref);
      247  +
      248  + $self->{'reader'} = $reader;
      249  +
      250  + close FILE;
      251  + }
      252  +
      253  +
      254  + sub decode_text {
      255  + my $self = shift (@_);
      256  + my ($raw_text, $encoding, $language, $textref) = @_;
      257  +
      258  + my $reader = $self->{'reader'};
      259  + if (!defined $reader) {
      260  + gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
      261  + }
      262  + else {
      263  + $reader->set_encoding($encoding);
      264  + $reader->decode_text($raw_text,$textref);
      265  + }
      266  + }
      267  +
      268  +
 227  269    sub textcat_get_language_encoding {
 228  270    my $self = shift (@_);
 229  271    my ($filename) = @_;
 230  272
 231       -
 232  273    my ($language, $encoding, $extracted_encoding);
 233  274    if ($self->{'input_encoding'} eq "auto") {
 …    …
 235  276    ($language, $encoding) = $self->get_language_encoding ($filename);
 236  277    } elsif ($self->{'extract_language'}) {
 237  278    # use textcat to get language metadata
 238  279    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
 239  280    $encoding = $self->{'input_encoding'};
 …    …
 241  282    # to english in iso-8859-1 (except for some punctuation). We don't have
 242  283    # a language model for en_utf8, so textcat always says iso-8859-1!
 243       - if ($extracted_encoding ne $encoding && $language ne "en"
 244       - && $self->{'verbosity'}) {
      284  + if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
 245  285    my $plugin_name = ref ($self);
 246  286    my $outhandle = $self->{'outhandle'};
 …    …
 249  289    } else {
 250  290    $language = $self->{'default_language'};
 251       - $encoding = $self->{'input_encoding'};
 252       - }
      291  + $encoding = $self->{'input_encoding'};
      292  + }
      293  +
      294  + # print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n";
 253  295
 254  296    return ($language, $encoding);
 …    …
 305  347    if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
 306  348    $best_encoding = $1;
 307       - } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
      349  + }
      350  + # check the meta http-equiv charset tag unless it is commented out
      351  + elsif (($text !~ /<!--[^<>]?<meta http-equiv/i) && ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)\"/i)) {
 308  352    $best_encoding = $1;
      353  + # print STDERR "**** meta tag found, encoding is: $best_encoding\n";
 309  354    }
 310  355    if ($best_encoding) { # we extracted an encoding
 …    …
 323  368    # get the language/encoding
 324  369    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
 325       - my $results = $self->{'textcat'}->classify(\$text);
      370  + # my $results = $self->{'textcat'}->classify(\$text);
      371  + my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename);
 326  372
 327  373    # if textcat returns 3 or less possibilities we'll use the
Note:
See TracChangeset
for help on using the changeset viewer.