Changeset 2811
- Timestamp:
- 2001-10-31T19:41:49+13:00 (22 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/mkcol.pl
r2760 r2811 164 164 # load default plugins if none were on command line 165 165 if (!scalar(@plugin)) { 166 @plugin = (ZIPPlug,GMLPlug,TEXTPlug,HTMLPlug,EMAILPlug, 166 @plugin = (ZIPPlug,GAPlug,TEXTPlug,HTMLPlug,EMAILPlug, 167 167 PDFPlug,RTFPlug,WordPlug,PSPlug,ArcPlug,RecPlug); 168 168 } -
trunk/gsdl/perllib/plugins/BasPlug.pm
r2796 r2811 272 272 return undef; 273 273 } 274 my $plugin_name = ref ($self);275 274 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 276 275 277 my ($language, $encoding); 278 if ($self->{'input_encoding'} eq "auto") { 279 # use textcat to automatically work out the input encoding and language 280 ($language, $encoding) = $self->get_language_encoding ($filename); 281 282 } elsif ($self->{'extract_language'}) { 283 # use textcat to get language metadata 284 ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 285 $encoding = $self->{'input_encoding'}; 286 287 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) { 288 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but "; 289 print $outhandle "appears to be encoded as $extracted_encoding.\n"; 290 } 291 292 } else { 293 $language = $self->{'default_language'}; 294 $encoding = $self->{'input_encoding'}; 295 } 276 # Do encoding stuff 277 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 296 278 297 279 # create a new document … … 308 290 309 291 if (!length ($text)) { 292 my $plugin_name = ref ($self); 310 293 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 311 294 … … 384 367 385 368 close FILE; 369 } 370 371 sub textcat_get_language_encoding { 372 my $self = shift (@_); 373 my ($filename) = @_; 374 375 my ($language, $encoding, $extracted_encoding); 376 if ($self->{'input_encoding'} eq "auto") { 377 # use textcat to automatically work out the input encoding and language 378 ($language, $encoding) = $self->get_language_encoding ($filename); 379 } elsif ($self->{'extract_language'}) { 380 # use textcat to get language metadata 381 ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 382 $encoding = $self->{'input_encoding'}; 383 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) { 384 my $plugin_name = ref ($self); 385 my 
$outhandle = $self->{'outhandle'}; 386 print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but "; 387 print $outhandle "appears to be encoded as $extracted_encoding.\n"; 388 } 389 } else { 390 $language = $self->{'default_language'}; 391 $encoding = $self->{'input_encoding'}; 392 } 393 return ($language, $encoding); 386 394 } 387 395 -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r2799 r2811 243 243 return undef; 244 244 } 245 my $plugin_name = ref ($self);246 245 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 247 246 … … 257 256 258 257 # Do encoding stuff 259 my ($language, $encoding); 260 261 # WordPlug's wvWare will always produce html files encoded as utf-8 262 if ($plugin_name eq "WordPlug") { 263 $self->{'input_encoding'} = "utf8"; 264 $self->{'extract_language'} = 1; 265 } 266 267 if ($self->{'input_encoding'} eq "auto") { 268 # use textcat to automatically work out the input encoding and language 269 ($language, $encoding) = $self->get_language_encoding ($conv_filename); 270 } elsif ($self->{'extract_language'}) { 271 # use textcat to get language metadata 272 273 my ($extracted_encoding); 274 ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename); 275 $encoding = $self->{'input_encoding'}; 276 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) { 277 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but "; 278 print $outhandle "appears to be encoded as $extracted_encoding.\n"; 279 } 280 } else { 281 $language = $self->{'default_language'}; 282 $encoding = $self->{'input_encoding'}; 283 } 258 my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename); 284 259 285 260 &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text); 286 261 if (!length ($text)) { 262 my $plugin_name = ref ($self); 287 263 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 288 264 return 0; -
trunk/gsdl/perllib/plugins/WordPlug.pm
r2785 r2811 32 32 } 33 33 34 sub new { 35 my $class = shift (@_); 36 37 my $self = new ConvertToPlug ($class, @_); 38 39 # wvWare will always produce html files encoded as utf-8 40 if ($self->{'input_encoding'} eq "auto") { 41 $self->{'input_encoding'} = "utf8"; 42 $self->{'extract_language'} = 1; 43 } 44 45 return bless $self, $class; 46 } 47 34 48 sub get_default_process_exp { 35 49 my $self = shift (@_); … … 37 51 return q^(?i)\.doc$^; 38 52 } 39 40 41 53 42 54 # do plugin specific processing of doc_obj for HTML type
Note:
See TracChangeset
for help on using the changeset viewer.