Changeset 2811


Ignore:
Timestamp:
2001-10-31T19:41:49+13:00 (22 years ago)
Author:
sjboddie
Message:

*** empty log message ***

Location:
trunk/gsdl
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/mkcol.pl

    r2760 r2811  
    164164    # load default plugins if none were on command line   
    165165    if (!scalar(@plugin)) {
    166     @plugin = (ZIPPlug,GMLPlug,TEXTPlug,HTMLPlug,EMAILPlug,
     166    @plugin = (ZIPPlug,GAPlug,TEXTPlug,HTMLPlug,EMAILPlug,
    167167           PDFPlug,RTFPlug,WordPlug,PSPlug,ArcPlug,RecPlug);
    168168    }
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r2796 r2811  
    272272    return undef;
    273273    }
    274     my $plugin_name = ref ($self);
    275274    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    276275
    277     my ($language, $encoding);
    278     if ($self->{'input_encoding'} eq "auto") {
    279     # use textcat to automatically work out the input encoding and language
    280     ($language, $encoding) = $self->get_language_encoding ($filename);
    281 
    282     } elsif ($self->{'extract_language'}) {
    283     # use textcat to get language metadata
    284     ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
    285     $encoding = $self->{'input_encoding'};
    286 
    287     if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
    288         print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
    289         print $outhandle "appears to be encoded as $extracted_encoding.\n";
    290     }
    291 
    292     } else {
    293     $language = $self->{'default_language'};
    294     $encoding = $self->{'input_encoding'};
    295     }
     276    # Do encoding stuff
     277    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    296278
    297279    # create a new document
     
    308290
    309291    if (!length ($text)) {
     292    my $plugin_name = ref ($self);
    310293    print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    311294
     
    384367
    385368    close FILE;
     369}
     370
     371sub textcat_get_language_encoding {
     372    my $self = shift (@_);
     373    my ($filename) = @_;
     374
     375    my ($language, $encoding, $extracted_encoding);
     376    if ($self->{'input_encoding'} eq "auto") {
     377        # use textcat to automatically work out the input encoding and language
     378        ($language, $encoding) = $self->get_language_encoding ($filename);
     379    } elsif ($self->{'extract_language'}) {
     380        # use textcat to get language metadata
     381        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
     382        $encoding = $self->{'input_encoding'};
     383        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
     384        my $plugin_name = ref ($self);
     385        my $outhandle = $self->{'outhandle'};
     386            print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
     387            print $outhandle "appears to be encoded as $extracted_encoding.\n";
     388        }
     389    } else {
     390        $language = $self->{'default_language'};
     391        $encoding = $self->{'input_encoding'};
     392    }
     393    return ($language, $encoding);
    386394}
    387395
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r2799 r2811  
    243243        return undef;
    244244    }
    245     my $plugin_name = ref ($self);
    246245    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    247246
     
    257256
    258257    # Do encoding stuff
    259     my ($language, $encoding);
    260 
    261     # WordPlug's wvWare will always produce html files encoded as utf-8
    262     if ($plugin_name eq "WordPlug") {
    263     $self->{'input_encoding'} = "utf8";
    264     $self->{'extract_language'} = 1;
    265     }
    266 
    267     if ($self->{'input_encoding'} eq "auto") {
    268         # use textcat to automatically work out the input encoding and language
    269         ($language, $encoding) = $self->get_language_encoding ($conv_filename);
    270     } elsif ($self->{'extract_language'}) {
    271         # use textcat to get language metadata
    272 
    273     my ($extracted_encoding);
    274         ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
    275         $encoding = $self->{'input_encoding'};
    276         if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
    277             print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
    278             print $outhandle "appears to be encoded as $extracted_encoding.\n";
    279         }
    280     } else {
    281         $language = $self->{'default_language'};
    282         $encoding = $self->{'input_encoding'};
    283     }
     258    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);
    284259
    285260    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);
    286261    if (!length ($text)) {
     262    my $plugin_name = ref ($self);
    287263        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    288264        return 0;
  • trunk/gsdl/perllib/plugins/WordPlug.pm

    r2785 r2811  
    3232}
    3333
     34sub new {
     35    my $class = shift (@_);
     36
     37    my $self = new ConvertToPlug ($class, @_);
     38
     39    # wvWare will always produce html files encoded as utf-8
     40    if ($self->{'input_encoding'} eq "auto") {
     41    $self->{'input_encoding'} = "utf8";
     42    $self->{'extract_language'} = 1;
     43    }
     44
     45    return bless $self, $class;
     46}
     47
    3448sub get_default_process_exp {
    3549    my $self = shift (@_);
     
    3751    return q^(?i)\.doc$^;
    3852}
    39    
    40 
    4153
    4254# do plugin specific processing of doc_obj for HTML type
Note: See TracChangeset for help on using the changeset viewer.