Changeset 2027


Ignore:
Timestamp:
2001-02-20T15:57:37+13:00 (23 years ago)
Author:
jrm21
Message:

read() is now completely independent of BasPlug::read(), because the latter
runs textcat over the raw file to determine the language and encoding. Our
read() runs textcat only after the file has been converted by gsConvert.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r1974 r2027  
    215215
    216216# Override BasPlug read
    217 
     217# We don't want to get language encoding stuff until after we've converted
     218# our file to either TEXT or HTML.
    218219sub read {
    219220    my $self = shift (@_);
    220    
    221     my $ret_val = BasPlug::read($self,@_);
    222  
     221    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     222#    if ($self->is_recursive()) {
     223#        die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
     224#    }
     225
     226    my $outhandle = $self->{'outhandle'};
     227
     228    my $filename = &util::filename_cat($base_dir, $file);
     229    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     230    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     231        return undef;
     232    }
     233    my $plugin_name = ref ($self);
     234    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     235
     236
     237    # read in file ($text will be in utf8)
     238    my $text = "";
     239#    $self->read_file ($filename, $encoding, \$text);
     240### was read_file
     241#    my $self = shift (@_);
     242#    my ($src_filename, $encoding, $textref) = @_;
     243
     244    my $output_ext = $self->{'convert_to_ext'};
     245    my $conv_filename = $self->tmp_area_convert_file($output_ext,$filename);
     246# change following to return undef?
     247    if ("$conv_filename" eq "") {return "";} # allows continue on errors
     248    $self->{'conv_filename'} = $conv_filename;
     249
     250### was read_file
     251
     252
     253
     254
     255# Do encoding stuff
     256    my ($language, $encoding);
     257    if ($self->{'input_encoding'} eq "auto") {
     258        # use textcat to automatically work out the input encoding and language
     259        ($language, $encoding) = $self->get_language_encoding ($conv_filename);
     260    } elsif ($self->{'extract_language'}) {
     261        # use textcat to get language metadata
     262
     263        my ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
     264        $encoding = $self->{'input_encoding'};
     265        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
     266            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
     267            print $outhandle "appears to be encoded as $extracted_encoding.\n";
     268        }
     269    } else {
     270        $language = $self->{'default_language'};
     271        $encoding = $self->{'input_encoding'};
     272    }
     273
     274    BasPlug::read_file($self,$conv_filename, $encoding, \$text);
     275    if (!length ($text)) {
     276        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
     277        return 0;
     278    }
     279
     280    # create a new document
     281    my $doc_obj = new doc ($conv_filename, "indexed_doc");
     282
     283    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language",
     284                $language);
     285    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding",
     286                $encoding);
     287
     288
     289    # include any metadata passed in from previous plugins
     290    # note that this metadata is associated with the top level section
     291    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
     292    # do plugin specific processing of doc_obj
     293    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
     294    # do any automatic metadata extraction
     295    $self->auto_extract_metadata ($doc_obj);
     296    # add an OID
     297    $doc_obj->set_OID();
     298    # process the document
     299    $processor->process($doc_obj);
    223300    $self->cleanup_tmp_area();
    224    
    225     return $ret_val;
     301
     302
     303    return 1;
    226304}
    227305
Note: See TracChangeset for help on using the changeset viewer.