greenstone.org greenstone wiki greenstone trac planet greenstone

Changeset 15963

Show
Ignore:
Timestamp:
2008-06-12 12:16:20 (5 months ago)
Author:
kjdon
Message:

Commented out the textcat stuff in post_process. We need to think about whether we need it or not: does the conversion always produce UTF-8?

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • gsdl/trunk/perllib/plugins/PDFPlugin.pm

    r15904 r15963  
    212212    my $outhandle=$self->{'outhandle'}; 
    213213 
    214     my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename); 
     214    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?) 
     215    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename); 
    215216 
    216217    # read in file ($text will be in utf8) 
    217218    my $text = ""; 
    218     $self->read_file ($conv_filename, $encoding, $language, \$text); 
     219    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway? 
     220    $self->read_file ($conv_filename, "utf8", "", \$text); 
    219221 
    220222    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc 
     
    302304    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 
    303305 
    304     my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj); 
     306    my $result = $self->process_type($base_dir,$file,$doc_obj); 
    305307 
    306308    # fix up the extracted date metadata to be in Greenstone date format,