Changeset 12834


Ignore:
Timestamp:
2006-09-22T16:03:36+12:00 (18 years ago)
Author:
kjdon
Message:

These convertto plugins were all setting extract_language=1 for their secondary plugins. We don't want this - only pass it to the secondary plugin if the user has asked for it. textcat can be very slow, so we don't want to run it unless we have to.

Location:
trunk/gsdl/perllib/plugins
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ExcelPlug.pm

    r12169 r12834  
    7474    #$self->{'extract_language'} = 1;
    7575    push(@$html_options, "-input_encoding", "utf8");
    76     push(@$html_options,"-extract_language");
     76    push(@$html_options,"-extract_language") if $self->{'extract_language'};
    7777    $self = bless $self, $class;
    7878   
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r12169 r12834  
    149149   
    150150    if ($self->{'input_encoding'} eq "auto") {
    151     # pdftohtml will always produce html files encoded as utf-8
    152     # => restrict primary PDFPlug and secondary HTML plugin to use
    153     # utf8 and extract language.
    154151    $self->{'input_encoding'} = "utf8";
    155     $self->{'extract_language'} = 1;
    156 
    157     push(@$html_options,"-extract_language");
    158     }
     152    }
     153
    159154    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    160155    push(@$html_options,"-input_encoding", "utf8");
    161 
     156    push(@$html_options,"-extract_language") if $self->{'extract_language'};
    162157    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    163158    # to extract these metadata fields from the HEAD META fields
  • trunk/gsdl/perllib/plugins/PPTPlug.pm

    r12169 r12834  
    112112    if ($self->{'input_encoding'} eq "auto") {
    113113    $self->{'input_encoding'} = "utf8";
    114     $self->{'extract_language'} = 1;
    115114    if (defined $secondary_plugin_options->{'HTMLPlug'}){
    116115        push(@$html_options,"-input_encoding", "utf8");
    117         push(@$html_options,"-extract_language");
     116        push(@$html_options,"-extract_language") if $self->{'extract_language'};
    118117
    119118        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
     
    123122    if (defined $secondary_plugin_options->{'PagedImgPlug'}){
    124123        push(@$pageimg_options,"-input_encoding", "utf8");
    125         push(@$pageimg_options,"-extract_language");
     124        push(@$pageimg_options,"-extract_language") if $self->{'extract_language'};
    126125    }
    127126    }
  • trunk/gsdl/perllib/plugins/PSPlug.pm

    r12169 r12834  
    124124    #$self->{'extract_language'} = 1;
    125125    push(@$text_options, "-input_encoding", "utf8");
    126     push(@$text_options,"-extract_language");
     126    push(@$text_options,"-extract_language") if $self->{'extract_language'};
    127127    push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    128128
  • trunk/gsdl/perllib/plugins/RTFPlug.pm

    r12169 r12834  
    8282    #$self->{'extract_language'} = 1;
    8383    push(@$text_options, "-input_encoding", "utf8");
    84     push(@$text_options,"-extract_language");
    85     if ($self->{'description_tags'} == 1) {
    86     push(@$html_options, "-description_tags");
    87     }
    88    
     84    push(@$text_options,"-extract_language") if $self->{'extract_language'};
     85    push(@$html_options, "-description_tags") if $self->{'description_tags'};
     86    push(@$html_options,"-extract_language") if $self->{'extract_language'};
    8987
    9088    $self = bless $self, $class;
  • trunk/gsdl/perllib/plugins/WordPlug.pm

    r12169 r12834  
    115115    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};
    116116
     117    # we always save as utf-8
     118    if ($self->{'input_encoding'} eq "auto") {
     119    $self->{'input_encoding'} = "utf8";
     120    }
     121
    117122    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    118123    if (defined $self->{'windows_scripting'}) {
     
    120125        $secondary_plugin_options->{'StructuredHTMLPlug'} = [];
    121126        my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlug'};
    122         if ($self->{'input_encoding'} eq "auto") {
    123         $self->{'input_encoding'} = "utf8";
    124         $self->{'extract_language'} = 1;
    125         #push(@$structhtml_options,"-input_encoding", "utf8");
    126         push(@$structhtml_options,"-extract_language");
    127         }
    128                
     127       
    129128        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    130129        # to extract these metadata fields from the HEAD META fields
     
    132131        push (@$structhtml_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    133132        push (@$structhtml_options, "-description_tags") if $self->{'windows_scripting'};
     133        push(@$structhtml_options,"-extract_language") if $self->{'extract_language'};
    134134        push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
    135135        push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
     
    153153    # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlug knows this
    154154    push(@$html_options,"-input_encoding", "utf8");
    155 
    156     if ($self->{'input_encoding'} eq "auto") {
    157     $self->{'input_encoding'} = "utf8";
    158     $self->{'extract_language'} = 1;
    159     push(@$html_options,"-extract_language");
    160     }
    161     if ($self->{'description_tags'} == 1) {
    162     push(@$html_options, "-description_tags");
    163     }
     155    push(@$html_options,"-extract_language") if $self->{'extract_language'};
     156    push(@$html_options, "-description_tags") if $self->{'description_tags'};
     157
    164158    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    165159    # to extract these metadata fields from the HEAD META fields
Note: See TracChangeset for help on using the changeset viewer.