Ignore:
Timestamp:
2010-08-10T14:31:53+12:00 (14 years ago)
Author:
kjdon
Message:

Code tidy-up. Rearranged how the ConvertBinaryFile plugins set up their secondary plugins: each plugin now sets up options only for the one secondary plugin it is actually using. All subclass-specific code has been moved out of ConvertBinaryFile::new into the appropriate plugin file.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm

    r20790 r22597  
    4444
    4545my $convert_to_list =
    46     [ { 'name' => "auto",
    47     'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    48       { 'name' => "html",
    49     'desc' => "{ConvertBinaryFile.convert_to.html}" },
    50       { 'name' => "text",
    51     'desc' => "{ConvertBinaryFile.convert_to.text}" },
     46    [
     47#      {    'name' => "auto",
     48#   'desc' => "{ConvertBinaryFile.convert_to.auto}" },
     49#      {    'name' => "html",
     50#   'desc' => "{ConvertBinaryFile.convert_to.html}" },
     51#      {    'name' => "text",
     52#   'desc' => "{ConvertBinaryFile.convert_to.text}" },
    5253      { 'name' => "pagedimg",
    5354    'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"},
     
    124125    $self->{'metadata_value'} = undef;
    125126
    126     $self->{'convert_to'} = "PagedImage";
     127    # do we only allow one option??
     128    $self->{'convert_to'} = "pagedimg";
     129    $self->{'convert_to_plugin'} = "PagedImagePlugin";
     130    $self->{'convert_to_ext'} = "jpg";
     131   
     132    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    127133    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    128134
    129     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
    130     $secondary_plugin_options->{'PagedImagePlugin'} = [];
    131     }
    132     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    133     push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    134     push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
    135     push(@$pagedimg_options, "-file_rename_method", "none");
    136     push(@$pagedimg_options, "-processing_tmp_files");
     135    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     136    $secondary_plugin_options->{$secondary_plugin_name} = [];
     137    }
     138    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     139
     140    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     141    push(@$specific_options, "-create_thumbnail", "true", "-create_screenview", "true");
     142    push(@$specific_options, "-file_rename_method", "none");
     143    push(@$specific_options, "-processing_tmp_files");
     144
     145#    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
     146
     147#    if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
     148#   $secondary_plugin_options->{'PagedImagePlugin'} = [];
     149#    }
     150#    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
     151#    push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     152#    push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true");
     153#    push(@$pagedimg_options, "-file_rename_method", "none");
     154#    push(@$pagedimg_options, "-processing_tmp_files");
    137155    $self = bless $self, $class;
    138 
    139 # ***** no longer needed!
    140 #    # This needs to be done after blss, to $self passed to XML::Parser
    141 #    # can correctly resolve the right call-back methods during XML parsing
    142 
    143 
    144156    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    145157    return $self;
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r22504 r22597  
    9595    my ($class,$input_args,$hashArgOptLists) = @_;
    9696
    97     my @convert_to_list = split(",",$self->{'convert_to'});
     97    my @convert_to_list = split(",",$self->{'convert_to_plugin'});
    9898    my $secondary_plugins = {};
    9999    # find the plugin
     
    101101    foreach my $convert_to (@convert_to_list) {
    102102    # load in "convert_to" plugin package
    103     my $plugin_class = $convert_to."Plugin";
     103    my $plugin_class = $convert_to;
    104104    my $plugin_package = $plugin_class.".pm";
    105105
     
    143143
    144144    my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists);
    145    
    146     if ($self->{'info_only'}) {
    147     # don't worry about any options etc
    148     return bless $self, $class;
    149     }
    150 
    151     my $convert_to_type = $self->{'convert_to'};
    152     if (!defined $convert_to_type || $convert_to_type eq "") {
    153     $convert_to_type = "auto";
    154     }
    155     my $windows_scripting = $self->{'windows_scripting'};
    156     $windows_scripting = 0 unless defined $windows_scripting;
    157     if ($classPluginName eq "PDFPlugin") {
    158     if ($convert_to_type eq "text" &&
    159         $ENV{'GSDLOS'} =~ /^windows$/i) {
    160         print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
    161         $convert_to_type = "html";
    162     }
    163     } elsif ($classPluginName eq "WordPlugin") {
    164     if (($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) || defined $self->{'openoffice_scripting'}) {
    165         # we use structured HTML, not normal html
    166         $convert_to_type = "structuredhtml";
    167     }
    168     } elsif ($classPluginName eq "PowerPointPlugin") {
    169     if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") {
    170         # we use paged img
    171         $convert_to_type = "pagedimg_jpg";
    172     }
    173     } elsif ($classPluginName eq "PostScriptPlugin") {
    174     if ($convert_to_type eq "auto") {
    175         # we use text
    176         $convert_to_type = "text";
    177     }
    178     }
    179    
    180     if ($convert_to_type eq "auto") {
    181     # choose html for now - should choose a format based on doc type
    182     $convert_to_type = "html";
    183     }
    184    
    185     if ($convert_to_type eq "html") {
    186     $self->{'convert_to'} = "HTML";
     145   
     146    return bless $self, $class;
     147}
     148
     149# should be called by subclasses after checking and setting
     150# $self->{'convert_to'}
     151sub set_standard_convert_settings {
     152    my $self =shift (@_);
     153   
     154    my $convert_to = $self->{'convert_to'};
     155    if ($convert_to eq "auto") {
     156    $convert_to = "html";
     157    $self->{'convert_to'} = "html";
     158    }
     159
     160    if ($convert_to eq "html") {
     161    $self->{'convert_to_plugin'} = "HTMLPlugin";
    187162    $self->{'convert_to_ext'} = "html";
    188     } elsif ($convert_to_type eq "text") {
    189     $self->{'convert_to'} = "Text";
     163    } elsif ($convert_to eq "text") {
     164    $self->{'convert_to_plugin'} = "TextPlugin";
    190165    $self->{'convert_to_ext'} = "txt";
    191     } elsif ($convert_to_type eq "structuredhtml") {
    192     $self->{'convert_to'} = "StructuredHTML";
     166    } elsif ($convert_to eq "structuredhtml") {
     167    $self->{'convert_to_plugin'} = "StructuredHTMLPlugin";
    193168    $self->{'convert_to_ext'} = "html";
    194     } elsif ($convert_to_type =~ /^pagedimg/) {
    195     $self->{'convert_to'} = "PagedImage";
    196     my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i;
     169    } elsif ($convert_to =~ /^pagedimg/) {
     170    $self->{'convert_to_plugin'} = "PagedImagePlugin";
     171    my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i;
    197172    $convert_to_ext = 'jpg' unless defined $convert_to_ext;
    198173    $self->{'convert_to_ext'} = $convert_to_ext;
    199174    }
    200    
    201     return bless $self, $class;
    202 }
    203 
    204 
     175
     176}
    205177sub init {
    206178    my $self = shift (@_);
     
    316288    # Execute the conversion command and get the type of the result,
    317289    # making sure the converter gives us the appropriate output type
    318     my $output_type="";
    319     if ($convert_to =~ m/PagedImage/i) {
    320     $output_type = lc($convert_to)."_".lc($convert_to_ext);
    321     } else {
    322     $output_type = lc($convert_to);
    323     }
     290    my $output_type=$self->{'convert_to'};
     291#    if ($convert_to =~ m/PagedImage/i) {
     292#   $output_type = lc($convert_to)."_".lc($convert_to_ext);
     293#    } else {
     294#   $output_type = lc($convert_to);
     295#    }
    324296
    325297    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
     
    331303    }
    332304    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    333 
     305    print STDERR "calling cmd $cmd\n";
    334306    $output_type = `$cmd`;
    335307
  • main/trunk/greenstone2/perllib/plugins/ExcelPlugin.pm

    r22515 r22597  
    3333no strict 'subs';
    3434use gsprintf 'gsprintf';
    35 
    36 #sub BEGIN {
    37 #    @ExcelPlugin::ISA = ('ConvertBinaryFile');
    38 #}
    3935
    4036# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile
     
    9490    push(@$pluginlist, $class);
    9591   
    96     #my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
    9792    if ($openoffice_ext_installed) {
    9893    print STDERR "ExcelPlugin: OpenOffice Extension to Greenstone detected\n";
     
    126121    }
    127122
    128     my $outhandle = $self->{'outhandle'};
    129 
    130123    $self->{'filename_extension'} = "xls";
    131124    $self->{'file_type'} = "Excel";
    132125
     126    my $outhandle = $self->{'outhandle'};
     127
     128    # check convert_to
     129    if ($self->{'convert_to'} eq "auto") {
     130    $self->{'convert_to'} = "html";
     131    }
     132
    133133    $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'};
    134134
    135     # other options for HTML if using open office???
     135    # set convert_to_plugin and convert_to_ext
     136    $self->ConvertBinaryFile::set_standard_convert_settings();
     137    print STDERR "final convert-to $self->{'convert_to'}\n";
     138
     139    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    136140    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    137     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    138     $secondary_plugin_options->{'HTMLPlugin'} = [];
     141
     142    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     143    $secondary_plugin_options->{$secondary_plugin_name} = [];
    139144    }
    140     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    141     $secondary_plugin_options->{'TextPlugin'} = [];
     145    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     146
     147    push(@$specific_options,"-extract_language") if $self->{'extract_language'};
     148    push(@$specific_options, "-file_rename_method", "none");
     149
     150    if ($secondary_plugin_name eq "HTMLPlugin") {
     151    push(@$specific_options, "-processing_tmp_files");
    142152    }
    143     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    144     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    145 
    146     # xslhtml doesn't output utf8, let Greenstone work out the encoding
    147     #push(@$html_options, "-input_encoding", "utf8");
    148     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    149     push(@$html_options, "-file_rename_method", "none");
    150     push(@$html_options, "-processing_tmp_files");
    151 
    152     #push(@$text_options, "-input_encoding", "utf8");
    153     push(@$text_options,"-extract_language") if $self->{'extract_language'};
    154     push(@$text_options, "-file_rename_method", "none");
    155153
    156154    $self = bless $self, $class;
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r21800 r22597  
    119119    push(@{$hashArgOptLists->{"OptList"}},$options);
    120120
    121     my @arg_array = @$inputargs;
    122121    my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
    123122   
     
    138137    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};
    139138
     139    # check convert_to
     140    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
     141    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
     142    $self->{'convert_to'} = "html";
     143    }
     144    elsif ($self->{'convert_to'} eq "auto") {
     145    # choose html ?? is this the best option
     146    $self->{'convert_to'} = "html";
     147    }
     148    # set convert_to_plugin and convert_to_ext
     149    $self->ConvertBinaryFile::set_standard_convert_settings();
     150
     151    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    140152    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    141153
    142     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    143     $secondary_plugin_options->{'HTMLPlugin'} = [];
    144     }
    145     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    146     $secondary_plugin_options->{'TextPlugin'} = [];
    147     }
    148     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
    149     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
    150         $secondary_plugin_options->{'PagedImagePlugin'} = [];
    151         my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    152         push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    153         push(@$pagedimg_options, "-screenviewsize", "1000");
    154         push(@$pagedimg_options, "-enable_cache");
    155     }
    156     }
    157     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    158     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    159     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    160 
    161 #    if ($self->{'input_encoding'} eq "auto") {
    162 #   $self->{'input_encoding'} = "utf8";
    163 #    }
    164 
    165     # if pdftohtml is always producing utf8, then htmlplug always needs this option
    166     push(@$html_options,"-input_encoding", "utf8");
    167     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    168 
    169     push(@$html_options, "-processing_tmp_files");
    170     push(@$pagedimg_options, "-processing_tmp_files");
    171 
    172     # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    173     # to extract these metadata fields from the HEAD META fields
    174     my $required_metadata;
    175     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
    176     push(@$html_options,"-metadata_fields",$self->{'metadata_fields'});
    177     } else {
    178     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    179     }
    180     if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
    181     push(@$html_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
    182     }
    183        
    184     if ($self->{'use_sections'} || $self->{'description_tags'}) {
    185     $self->{'description_tags'} = 1;
    186     push(@$html_options,"-description_tags");
    187     }
     154    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     155    $secondary_plugin_options->{$secondary_plugin_name} = [];
     156    }
     157    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
    188158
    189159    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    190160    # "1", which is often the page number at the top of the page. Bad Luck
    191161    # if your document title actually starts with "1 " - is there a better way?
    192     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    193     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    194 
     162    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    195163    my $associate_tail_re = $self->{'associate_tail_re'};
    196164    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
    197     push(@$html_options, "-associate_tail_re", $associate_tail_re);
    198     push(@$text_options, "-associate_tail_re", $associate_tail_re);
    199     push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options;
    200     }
    201 
    202     push(@$html_options, "-file_rename_method", "none");
    203     push(@$text_options, "-file_rename_method", "none");
    204     push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options;
     165    push(@$specific_options, "-associate_tail_re", $associate_tail_re);
     166    }
     167    push(@$specific_options, "-file_rename_method", "none");
     168   
     169    if ($secondary_plugin_name eq "HTMLPlugin") {
     170    # pdftohtml always produces utf8
     171    push(@$specific_options, "-input_encoding", "utf8");
     172    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
     173    push(@$specific_options, "-processing_tmp_files");
     174    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
     175    # to extract these metadata fields from the HEAD META fields
     176    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
     177        push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
     178    } else {
     179        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
     180    }
     181    if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
     182        push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
     183    }
     184    if ($self->{'use_sections'} || $self->{'description_tags'}) {
     185        $self->{'description_tags'} = 1;
     186        push(@$specific_options, "-description_tags");
     187    }
     188    }
     189    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
     190    push(@$specific_options, "-screenviewsize", "1000");
     191    push(@$specific_options, "-enable_cache");
     192    push(@$specific_options, "-processing_tmp_files");
     193    }
    205194
    206195    $self = bless $self, $class;
  • main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm

    r20790 r22597  
    8888    push(@$pluginlist, $class);
    8989
    90     #push(@$inputargs,"-convert_to");
    91     #push(@$inputargs,"text");
    9290    push(@$inputargs,"-title_sub");
    9391    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
     
    106104    $self->{'file_type'} = "PS";
    107105
     106    if ($self->{'convert_to'} eq "auto") {
     107    $self->{'convert_to'} = "text";
     108    }
     109
     110    # set convert_to_plugin and convert_to_ext
     111    $self->ConvertBinaryFile::set_standard_convert_settings();
     112    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    108113    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    109114
    110     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    111     $secondary_plugin_options->{'TextPlugin'} = [];
    112     }
    113 
    114     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    115     $secondary_plugin_options->{'HTMLPlugin'} = [];
    116     }
    117 
    118     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    119     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    120 
    121     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
    122     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
    123         $secondary_plugin_options->{'PagedImagePlugin'} = [];
    124         my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    125         push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    126         push(@$pagedimg_options, "-file_rename_method", "none");
    127         push(@$pagedimg_options, "-processing_tmp_files");
    128     }
    129     }
     115    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     116    $secondary_plugin_options->{$secondary_plugin_name} = [];
     117    }
     118    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     119
    130120    # following title_sub removes "Page 1" added by ps2ascii, and a leading
    131121    # "1", which is often the page number at the top of the page. Bad Luck
    132122    # if your document title actually starts with "1 " - is there a better way?
    133     #$self->{'input_encoding'} = "utf8";
    134     #$self->{'extract_language'} = 1;
    135     push(@$text_options, "-input_encoding", "utf8");
    136     push(@$text_options,"-extract_language") if $self->{'extract_language'};
    137     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    138 
    139     push(@$text_options, "-file_rename_method", "none");
    140     push(@$html_options, "-file_rename_method", "none");
    141    
    142     # tell the secondary plugins that they are processing tmp files
    143     push(@$html_options, "-processing_tmp_files");
     123    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     124    push(@$specific_options, "-file_rename_method", "none");
     125   
     126    if ($secondary_plugin_name eq "TextPlugin") {
     127    push(@$specific_options, "-input_encoding", "utf8");
     128    push(@$specific_options,"-extract_language") if $self->{'extract_language'};
     129    } elsif ($secondary_plugin_name eq "PagedImagePlugin") {
     130    push(@$specific_options, "-processing_tmp_files");
     131    }
    144132
    145133    $self = bless $self, $class;
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r22515 r22597  
    3434no strict 'subs';
    3535use gsprintf 'gsprintf';
    36 
    37 #sub BEGIN {
    38 #    @PowerPointPlugin::ISA = ('ConvertBinaryFile');
    39 #}
    4036
    4137# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile
     
    157153    }
    158154
    159     my $outhandle = $self->{'outhandle'};
    160 
    161155    $self->{'filename_extension'} = "ppt";
    162156    $self->{'file_type'} = "PPT";
     157
     158    if ($self->{'convert_to'} eq "auto") {
     159    if ($self->{'windows_scripting'}) {
     160        $self->{'convert_to'} = "pagedimg_jpg";
     161    }
     162    else {
     163        $self->{'convert_to'} = "html";
     164    }
     165    }
     166
     167   my $outhandle = $self->{'outhandle'};
    163168
    164169    # can't have windows_scripting and openoffice_scripting at the same time
     
    172177    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'};
    173178    $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'};
     179
     180    # set convert_to_plugin and convert_to_ext
     181    $self->ConvertBinaryFile::set_standard_convert_settings();
     182
     183    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    174184    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    175185
    176     if ($self->{'windows_scripting'} && ($self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i)) {
    177     $secondary_plugin_options->{'PagedImagePlugin'} = [];
    178     } else {
    179     $secondary_plugin_options->{'HTMLPlugin'} = [];
    180     $secondary_plugin_options->{'TextPlugin'} = [];
    181     }
    182     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    183     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    184     my $pageimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    185  
    186     if (defined $html_options){
    187     # ppthtml doesn't output utf-8 necessarily - let Greenstone determine the encoding
    188     #push(@$html_options,"-input_encoding", "utf8");
    189     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    190     push(@$html_options,"-file_rename_method", "none");
    191    
    192     push(@$html_options, "-processing_tmp_files");
    193    
    194     # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    195     # to extract these metadata fields from the HEAD META fields
    196     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    197     }
    198     if (defined $text_options){
    199     #push(@$text_options,"-input_encoding", "utf8");
    200     push(@$text_options,"-extract_language") if $self->{'extract_language'};
    201     push(@$text_options,"-file_rename_method", "none");
    202     }
    203     if (defined $pageimg_options){
     186    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     187    $secondary_plugin_options->{$secondary_plugin_name} = [];
     188    }
     189    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     190
     191    push(@$specific_options, "-file_rename_method", "none");
     192    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
     193
     194    if ($secondary_plugin_name eq "HTMLPlugin") {
     195    push(@$specific_options, "-processing_tmp_files");
     196    push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
     197    }
     198    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
     199    push(@$specific_options, "-processing_tmp_files");
    204200    #is this true??
    205     push(@$pageimg_options,"-input_encoding", "utf8");
    206     push(@$pageimg_options,"-extract_language") if $self->{'extract_language'};
    207     push(@$pageimg_options,"-file_rename_method", "none");
    208     push(@$pageimg_options, "-processing_tmp_files");
     201    push(@$specific_options,"-input_encoding", "utf8");
    209202    }
    210203
  • main/trunk/greenstone2/perllib/plugins/RTFPlugin.pm

    r20790 r22597  
    3535}
    3636
     37# currently only converts to HTML
     38my $convert_to_list =
     39    [ { 'name' => "html",
     40    'desc' => "{ConvertBinaryFile.convert_to.html}" } ];
     41
    3742my $arguments =
    38     [ { 'name' => "process_exp",
     43    [ { 'name' => "convert_to",
     44    'desc' => "{ConvertBinaryFile.convert_to}",
     45    'type' => "enum",
     46    'reqd' => "yes",
     47    'list' => $convert_to_list,
     48    'deft' => "html" },
     49      { 'name' => "process_exp",
    3950    'desc' => "{BasePlugin.process_exp}",
    4051    'type' => "regexp",
     
    7182    $self->{'file_type'} = "RTF";
    7283
     84    # set convert_to_plugin and convert_to_ext
     85    $self->ConvertBinaryFile::set_standard_convert_settings();
     86    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    7387    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    74     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    75     $secondary_plugin_options->{'TextPlugin'} = [];
     88
     89    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     90    $secondary_plugin_options->{$secondary_plugin_name} = [];
    7691    }
    77     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    78     $secondary_plugin_options->{'HTMLPlugin'} = [];
     92    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     93   
     94    push(@$specific_options, "-file_rename_method", "none");
     95    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
     96    if ($secondary_plugin_name eq "TextPlugin") {
     97    push(@$specific_options, "-input_encoding", "utf8");
    7998    }
    80     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    81     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    82    
    83     #$self->{'input_encoding'} = "utf8";
    84     #$self->{'extract_language'} = 1;
    85     push(@$text_options, "-input_encoding", "utf8");
    86     push(@$text_options,"-extract_language") if $self->{'extract_language'};
    87     push(@$html_options, "-description_tags") if $self->{'description_tags'};
    88     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    89 
    90     push(@$html_options, "-file_rename_method", "none");
    91     push(@$text_options, "-file_rename_method", "none");
    92 
    93     # tell the secondary plugins that they are processing tmp files
    94     push(@$html_options, "-processing_tmp_files");
     99    elsif ($secondary_plugin_name eq "HTMLPlugin") {
     100    push(@$specific_options, "-description_tags") if $self->{'description_tags'};
     101    push(@$specific_options, "-processing_tmp_files");
     102    }
    95103
    96104    $self = bless $self, $class;
  • main/trunk/greenstone2/perllib/plugins/WordPlugin.pm

    r22514 r22597  
    4141    eval("require OpenOfficeConverter");
    4242    if ($@) {
    43     # Useful debugging statement if there is a syntax error in OpenOfficeConverter 
     43    # Useful debugging statement if there is a syntax error in OpenOfficeConverter:
    4444    #print STDERR "$@\n";
    4545    @WordPlugin::ISA = ('ConvertBinaryFile');
     
    174174    }
    175175
    176     my $outhandle = $self->{'outhandle'};
    177176    $self->{'filename_extension'} = "doc";
    178177    $self->{'file_type'} = "Word";
     178
     179    my $outhandle = $self->{'outhandle'};
    179180
    180181    if ($self->{'windows_scripting'}) {
     
    193194    }
    194195
    195     # we always save as utf-8
    196 #    if ($self->{'input_encoding'} eq "auto") {
    197 #   $self->{'input_encoding'} = "utf8";
    198 #    }
    199 
     196    # check convert_to
     197    if ($self->{'convert_to'} eq "auto") {
     198    $self->{'convert_to'} = "html";
     199    }
     200    # windows or open office scripting, outputs structuredHTML
     201    if (defined $self->{'office_scripting'}) {
     202    $self->{'convert_to'} = "structuredhtml";
     203    }
     204
     205    # set convert_to_plugin and convert_to_ext
     206    $self->ConvertBinaryFile::set_standard_convert_settings();
     207 
     208    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    200209    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    201     if (defined $self->{'office_scripting'}) {
    202     if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){
    203         $secondary_plugin_options->{'StructuredHTMLPlugin'} = [];
    204         my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};
    205        
    206         # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    207         # to extract these metadata fields from the HEAD META fields
    208         push (@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
    209         push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    210         push (@$structhtml_options, "-description_tags") if $self->{'office_scripting'};
    211         push (@$structhtml_options, "-extract_language") if $self->{'extract_language'};
    212         push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'};
    213         push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
    214         push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
    215         push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
    216         push (@$structhtml_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'};
    217         push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
    218         push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
    219         push (@$structhtml_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
    220     }
    221     }
    222     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    223     $secondary_plugin_options->{'HTMLPlugin'} = [];
    224     }
    225     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    226     $secondary_plugin_options->{'TextPlugin'} = [];
    227     }
    228 
    229     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    230     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    231     my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};   
    232     # tell the secondary plugins that they are processing tmp files
    233     push(@$html_options, "-processing_tmp_files");
    234     push(@$structhtml_options, "-processing_tmp_files");
    235    
    236     # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this
    237     push(@$html_options,"-input_encoding", "utf8");
    238     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    239     push(@$html_options, "-description_tags") if $self->{'description_tags'};
    240 
    241     # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
    242     # to extract these metadata fields from the HEAD META fields
    243     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    244     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     210
     211    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     212    $secondary_plugin_options->{$secondary_plugin_name} = [];
     213    }
     214    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
     215
     216    # following title_sub removes "Page 1" and a leading
     217    # "1", which is often the page number at the top of the page. Bad Luck
     218    # if your document title actually starts with "1 " - is there a better way?
     219    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    245 220
    246 221    my $associate_tail_re = $self->{'associate_tail_re'};
    247 222    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
    248     push(@$html_options, "-associate_tail_re", $associate_tail_re);
    249     push(@$text_options, "-associate_tail_re", $associate_tail_re);
    250     push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options;
    251     }
    252 
    253     push(@$html_options, "-file_rename_method", "none");
    254     push(@$text_options, "-file_rename_method", "none");
    255     push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options;
     223    push(@$specific_options, "-associate_tail_re", $associate_tail_re);
     224    }
     225    push(@$specific_options, "-file_rename_method", "none");
     226
     227    if ($secondary_plugin_name eq "StructuredHTMLPlugin") {
     228    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
     229    # to extract these metadata fields from the HEAD META fields
     230    push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>");
     231    push (@$specific_options, "-description_tags") if $self->{'office_scripting'};
     232    push (@$specific_options, "-extract_language") if $self->{'extract_language'};
     233    push (@$specific_options, "-delete_toc") if $self->{'delete_toc'};
     234    push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'};
     235    push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'};
     236    push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'};
     237    push (@$specific_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'};
     238    push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'};
     239    push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'};
     240    push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'};
     241    push(@$specific_options, "-processing_tmp_files");
     242   
     243    }
     244   
     245    elsif ($secondary_plugin_name eq "HTMLPlugin") {
     246    push(@$specific_options, "-processing_tmp_files");
     247    push(@$specific_options,"-input_encoding", "utf8");
     248    push(@$specific_options,"-extract_language") if $self->{'extract_language'};
     249    push(@$specific_options, "-description_tags") if $self->{'description_tags'};
     250    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)
     251    # to extract these metadata fields from the HEAD META fields
     252    push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
     253    }
    256 254
    257 255    $self = bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.