Changeset 22597

Show
Ignore:
Timestamp:
10.08.2010 14:31:53 (9 years ago)
Author:
kjdon
Message:

Code tidy-up. Rearranged how ConvertBinaryFile plugins set up their secondary plugins: they now only set up the options for the one they are using. All subclass-specific code has been moved out of ConvertBinaryFile::new into the appropriate plugin file.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm

    r20790 r22597  
    4444 
    4545my $convert_to_list = 
    46     [ { 'name' => "auto", 
    47     'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 
    48       { 'name' => "html", 
    49     'desc' => "{ConvertBinaryFile.convert_to.html}" }, 
    50       { 'name' => "text", 
    51     'desc' => "{ConvertBinaryFile.convert_to.text}" }, 
     46    [  
     47#      {    'name' => "auto", 
     48#   'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 
     49#      {    'name' => "html", 
     50#   'desc' => "{ConvertBinaryFile.convert_to.html}" }, 
     51#      {    'name' => "text", 
     52#   'desc' => "{ConvertBinaryFile.convert_to.text}" }, 
    5253      { 'name' => "pagedimg", 
    5354    'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"}, 
     
    124125    $self->{'metadata_value'} = undef; 
    125126 
    126     $self->{'convert_to'} = "PagedImage"; 
     127    # do we only allow one option?? 
     128    $self->{'convert_to'} = "pagedimg"; 
     129    $self->{'convert_to_plugin'} = "PagedImagePlugin"; 
     130    $self->{'convert_to_ext'} = "jpg"; 
     131     
     132    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    127133    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    128134 
    129     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 
    130     $secondary_plugin_options->{'PagedImagePlugin'} = []; 
    131     } 
    132     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};  
    133     push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    134     push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true"); 
    135     push(@$pagedimg_options, "-file_rename_method", "none"); 
    136     push(@$pagedimg_options, "-processing_tmp_files"); 
     135    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     136    $secondary_plugin_options->{$secondary_plugin_name} = []; 
     137    } 
     138    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     139 
     140    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
     141    push(@$specific_options, "-create_thumbnail", "true", "-create_screenview", "true"); 
     142    push(@$specific_options, "-file_rename_method", "none"); 
     143    push(@$specific_options, "-processing_tmp_files"); 
     144 
     145#    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
     146 
     147#    if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 
     148#   $secondary_plugin_options->{'PagedImagePlugin'} = []; 
     149#    } 
     150#    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};  
     151#    push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
     152#    push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true"); 
     153#    push(@$pagedimg_options, "-file_rename_method", "none"); 
     154#    push(@$pagedimg_options, "-processing_tmp_files"); 
    137155    $self = bless $self, $class; 
    138  
    139 # ***** no longer needed! 
    140 #    # This needs to be done after blss, to $self passed to XML::Parser 
    141 #    # can correctly resolve the right call-back methods during XML parsing 
    142  
    143  
    144156    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists); 
    145157    return $self; 
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r22504 r22597  
    9595    my ($class,$input_args,$hashArgOptLists) = @_; 
    9696 
    97     my @convert_to_list = split(",",$self->{'convert_to'}); 
     97    my @convert_to_list = split(",",$self->{'convert_to_plugin'}); 
    9898    my $secondary_plugins = {}; 
    9999    # find the plugin 
     
    101101    foreach my $convert_to (@convert_to_list) { 
    102102    # load in "convert_to" plugin package 
    103     my $plugin_class = $convert_to."Plugin"; 
     103    my $plugin_class = $convert_to; 
    104104    my $plugin_package = $plugin_class.".pm"; 
    105105 
     
    143143 
    144144    my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists); 
    145      
    146     if ($self->{'info_only'}) { 
    147     # don't worry about any options etc 
    148     return bless $self, $class; 
    149     } 
    150  
    151     my $convert_to_type = $self->{'convert_to'}; 
    152     if (!defined $convert_to_type || $convert_to_type eq "") { 
    153     $convert_to_type = "auto"; 
    154     } 
    155     my $windows_scripting = $self->{'windows_scripting'}; 
    156     $windows_scripting = 0 unless defined $windows_scripting; 
    157     if ($classPluginName eq "PDFPlugin") { 
    158     if ($convert_to_type eq "text" &&  
    159         $ENV{'GSDLOS'} =~ /^windows$/i) { 
    160         print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
    161         $convert_to_type = "html"; 
    162     } 
    163     } elsif ($classPluginName eq "WordPlugin") { 
    164     if (($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type =~ /^(html|auto)$/) || defined $self->{'openoffice_scripting'}) { 
    165         # we use structured HTML, not normal html 
    166         $convert_to_type = "structuredhtml"; 
    167     }  
    168     } elsif ($classPluginName eq "PowerPointPlugin") { 
    169     if ($windows_scripting && $ENV{'GSDLOS'} =~ /^windows$/i && $convert_to_type eq "auto") { 
    170         # we use paged img 
    171         $convert_to_type = "pagedimg_jpg"; 
    172     }  
    173     } elsif ($classPluginName eq "PostScriptPlugin") { 
    174     if ($convert_to_type eq "auto") { 
    175         # we use text 
    176         $convert_to_type = "text"; 
    177     } 
    178     } 
    179      
    180     if ($convert_to_type eq "auto") { 
    181     # choose html for now - should choose a format based on doc type 
    182     $convert_to_type = "html"; 
    183     } 
    184      
    185     if ($convert_to_type eq "html") { 
    186     $self->{'convert_to'} = "HTML"; 
     145    
     146    return bless $self, $class; 
     147} 
     148 
     149# should be called by subclasses after checking and setting  
     150# $self->{'convert_to'} 
     151sub set_standard_convert_settings { 
     152    my $self =shift (@_); 
     153     
     154    my $convert_to = $self->{'convert_to'}; 
     155    if ($convert_to eq "auto") { 
     156    $convert_to = "html"; 
     157    $self->{'convert_to'} = "html"; 
     158    } 
     159 
     160    if ($convert_to eq "html") { 
     161    $self->{'convert_to_plugin'} = "HTMLPlugin"; 
    187162    $self->{'convert_to_ext'} = "html"; 
    188     } elsif ($convert_to_type eq "text") { 
    189     $self->{'convert_to'} = "Text"; 
     163    } elsif ($convert_to eq "text") { 
     164    $self->{'convert_to_plugin'} = "TextPlugin"; 
    190165    $self->{'convert_to_ext'} = "txt"; 
    191     } elsif ($convert_to_type eq "structuredhtml") { 
    192     $self->{'convert_to'} = "StructuredHTML"; 
     166    } elsif ($convert_to eq "structuredhtml") { 
     167    $self->{'convert_to_plugin'} = "StructuredHTMLPlugin"; 
    193168    $self->{'convert_to_ext'} = "html"; 
    194     } elsif ($convert_to_type =~ /^pagedimg/) { 
    195     $self->{'convert_to'} = "PagedImage"; 
    196     my ($convert_to_ext) = $convert_to_type =~ /pagedimg\_(jpg|gif|png)/i; 
     169    } elsif ($convert_to =~ /^pagedimg/) { 
     170    $self->{'convert_to_plugin'} = "PagedImagePlugin"; 
     171    my ($convert_to_ext) = $convert_to =~ /pagedimg\_(jpg|gif|png)/i; 
    197172    $convert_to_ext = 'jpg' unless defined $convert_to_ext; 
    198173    $self->{'convert_to_ext'} = $convert_to_ext; 
    199174    } 
    200      
    201     return bless $self, $class; 
    202 } 
    203  
    204  
     175 
     176} 
    205177sub init { 
    206178    my $self = shift (@_); 
     
    316288    # Execute the conversion command and get the type of the result, 
    317289    # making sure the converter gives us the appropriate output type 
    318     my $output_type=""; 
    319     if ($convert_to =~ m/PagedImage/i) { 
    320     $output_type = lc($convert_to)."_".lc($convert_to_ext); 
    321     } else { 
    322     $output_type = lc($convert_to); 
    323     } 
     290    my $output_type=$self->{'convert_to'}; 
     291#    if ($convert_to =~ m/PagedImage/i) { 
     292#   $output_type = lc($convert_to)."_".lc($convert_to_ext); 
     293#    } else { 
     294#   $output_type = lc($convert_to); 
     295#    } 
    324296 
    325297    my $cmd = "perl -S gsConvert.pl -verbose $verbosity "; 
     
    331303    } 
    332304    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\""; 
    333  
     305    print STDERR "calling cmd $cmd\n"; 
    334306    $output_type = `$cmd`; 
    335307 
  • main/trunk/greenstone2/perllib/plugins/ExcelPlugin.pm

    r22515 r22597  
    3333no strict 'subs'; 
    3434use gsprintf 'gsprintf'; 
    35  
    36 #sub BEGIN { 
    37 #    @ExcelPlugin::ISA = ('ConvertBinaryFile'); 
    38 #} 
    3935 
    4036# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile 
     
    9490    push(@$pluginlist, $class); 
    9591     
    96     #my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 
    9792    if ($openoffice_ext_installed) { 
    9893    print STDERR "ExcelPlugin: OpenOffice Extension to Greenstone detected\n"; 
     
    126121    } 
    127122 
    128     my $outhandle = $self->{'outhandle'}; 
    129  
    130123    $self->{'filename_extension'} = "xls"; 
    131124    $self->{'file_type'} = "Excel"; 
    132125 
     126    my $outhandle = $self->{'outhandle'}; 
     127 
     128    # check convert_to 
     129    if ($self->{'convert_to'} eq "auto") { 
     130    $self->{'convert_to'} = "html"; 
     131    } 
     132 
    133133    $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'}; 
    134134 
    135     # other options for HTML if using open office??? 
     135    # set convert_to_plugin and convert_to_ext 
     136    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     137    print STDERR "final convert-to $self->{'convert_to'}\n"; 
     138 
     139    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    136140    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    137     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 
    138     $secondary_plugin_options->{'HTMLPlugin'} = []; 
     141 
     142    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     143    $secondary_plugin_options->{$secondary_plugin_name} = []; 
    139144    } 
    140     if (!defined $secondary_plugin_options->{'TextPlugin'}) { 
    141     $secondary_plugin_options->{'TextPlugin'} = []; 
     145    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     146 
     147    push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 
     148    push(@$specific_options, "-file_rename_method", "none"); 
     149 
     150    if ($secondary_plugin_name eq "HTMLPlugin") { 
     151    push(@$specific_options, "-processing_tmp_files"); 
    142152    } 
    143     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    144     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    145  
    146     # xslhtml doesn't output utf8, let Greenstone work out the encoding 
    147     #push(@$html_options, "-input_encoding", "utf8"); 
    148     push(@$html_options,"-extract_language") if $self->{'extract_language'}; 
    149     push(@$html_options, "-file_rename_method", "none"); 
    150     push(@$html_options, "-processing_tmp_files"); 
    151  
    152     #push(@$text_options, "-input_encoding", "utf8"); 
    153     push(@$text_options,"-extract_language") if $self->{'extract_language'}; 
    154     push(@$text_options, "-file_rename_method", "none"); 
    155153 
    156154    $self = bless $self, $class; 
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r21800 r22597  
    119119    push(@{$hashArgOptLists->{"OptList"}},$options); 
    120120 
    121     my @arg_array = @$inputargs; 
    122121    my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 
    123122     
     
    138137    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"}; 
    139138 
     139    # check convert_to 
     140    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
     141    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
     142    $self->{'convert_to'} = "html"; 
     143    } 
     144    elsif ($self->{'convert_to'} eq "auto") { 
     145    # choose html ?? is this the best option 
     146    $self->{'convert_to'} = "html"; 
     147    } 
     148    # set convert_to_plugin and convert_to_ext 
     149    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     150 
     151    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    140152    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    141153 
    142     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 
    143     $secondary_plugin_options->{'HTMLPlugin'} = []; 
    144     } 
    145     if (!defined $secondary_plugin_options->{'TextPlugin'}) { 
    146     $secondary_plugin_options->{'TextPlugin'} = []; 
    147     } 
    148     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) { 
    149     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 
    150         $secondary_plugin_options->{'PagedImagePlugin'} = []; 
    151         my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};  
    152         push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    153         push(@$pagedimg_options, "-screenviewsize", "1000"); 
    154         push(@$pagedimg_options, "-enable_cache"); 
    155     } 
    156     } 
    157     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    158     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    159     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 
    160  
    161 #    if ($self->{'input_encoding'} eq "auto") { 
    162 #   $self->{'input_encoding'} = "utf8"; 
    163 #    } 
    164  
    165     # if pdftohtml is always producing utf8, then htmlplug always needs this option 
    166     push(@$html_options,"-input_encoding", "utf8"); 
    167     push(@$html_options,"-extract_language") if $self->{'extract_language'}; 
    168  
    169     push(@$html_options, "-processing_tmp_files"); 
    170     push(@$pagedimg_options, "-processing_tmp_files"); 
    171  
    172     # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)  
    173     # to extract these metadata fields from the HEAD META fields 
    174     my $required_metadata; 
    175     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) { 
    176     push(@$html_options,"-metadata_fields",$self->{'metadata_fields'}); 
    177     } else { 
    178     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
    179     } 
    180     if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) { 
    181     push(@$html_options,"-metadata_field_separator",$self->{'metadata_field_separator'}); 
    182     } 
    183          
    184     if ($self->{'use_sections'} || $self->{'description_tags'}) { 
    185     $self->{'description_tags'} = 1; 
    186     push(@$html_options,"-description_tags"); 
    187     } 
     154    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     155    $secondary_plugin_options->{$secondary_plugin_name} = []; 
     156    } 
     157    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
    188158 
    189159    # following title_sub removes "Page 1" added by pdftohtml, and a leading 
    190160    # "1", which is often the page number at the top of the page. Bad Luck 
    191161    # if your document title actually starts with "1 " - is there a better way? 
    192     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    193     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    194  
     162    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    195163    my $associate_tail_re = $self->{'associate_tail_re'}; 
    196164    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 
    197     push(@$html_options, "-associate_tail_re", $associate_tail_re); 
    198     push(@$text_options, "-associate_tail_re", $associate_tail_re); 
    199     push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options; 
    200     } 
    201  
    202     push(@$html_options, "-file_rename_method", "none"); 
    203     push(@$text_options, "-file_rename_method", "none"); 
    204     push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options; 
     165    push(@$specific_options, "-associate_tail_re", $associate_tail_re); 
     166    } 
     167    push(@$specific_options, "-file_rename_method", "none"); 
     168     
     169    if ($secondary_plugin_name eq "HTMLPlugin") { 
     170    # pdftohtml always produces utf8 
     171    push(@$specific_options, "-input_encoding", "utf8"); 
     172    push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 
     173    push(@$specific_options, "-processing_tmp_files"); 
     174    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)  
     175    # to extract these metadata fields from the HEAD META fields 
     176    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) { 
     177        push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'}); 
     178    } else { 
     179        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
     180    } 
     181    if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) { 
     182        push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'}); 
     183    } 
     184    if ($self->{'use_sections'} || $self->{'description_tags'}) { 
     185        $self->{'description_tags'} = 1; 
     186        push(@$specific_options, "-description_tags"); 
     187    } 
     188    } 
     189    elsif ($secondary_plugin_name eq "PagedImagePlugin") { 
     190    push(@$specific_options, "-screenviewsize", "1000"); 
     191    push(@$specific_options, "-enable_cache"); 
     192    push(@$specific_options, "-processing_tmp_files"); 
     193    } 
    205194 
    206195    $self = bless $self, $class; 
  • main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm

    r20790 r22597  
    8888    push(@$pluginlist, $class); 
    8989 
    90     #push(@$inputargs,"-convert_to"); 
    91     #push(@$inputargs,"text"); 
    9290    push(@$inputargs,"-title_sub"); 
    9391    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); 
     
    106104    $self->{'file_type'} = "PS"; 
    107105 
     106    if ($self->{'convert_to'} eq "auto") { 
     107    $self->{'convert_to'} = "text"; 
     108    } 
     109 
     110    # set convert_to_plugin and convert_to_ext 
     111    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     112    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    108113    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    109114 
    110     if (!defined $secondary_plugin_options->{'TextPlugin'}) { 
    111     $secondary_plugin_options->{'TextPlugin'} = []; 
    112     } 
    113  
    114     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 
    115     $secondary_plugin_options->{'HTMLPlugin'} = []; 
    116     } 
    117  
    118     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    119     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    120  
    121     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) { 
    122     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ 
    123         $secondary_plugin_options->{'PagedImagePlugin'} = []; 
    124         my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};  
    125         push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    126         push(@$pagedimg_options, "-file_rename_method", "none"); 
    127         push(@$pagedimg_options, "-processing_tmp_files"); 
    128     } 
    129     } 
     115    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     116    $secondary_plugin_options->{$secondary_plugin_name} = []; 
     117    } 
     118    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     119 
    130120    # following title_sub removes "Page 1" added by ps2ascii, and a leading 
    131121    # "1", which is often the page number at the top of the page. Bad Luck 
    132122    # if your document title actually starts with "1 " - is there a better way? 
    133     #$self->{'input_encoding'} = "utf8"; 
    134     #$self->{'extract_language'} = 1; 
    135     push(@$text_options, "-input_encoding", "utf8"); 
    136     push(@$text_options,"-extract_language") if $self->{'extract_language'}; 
    137     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    138  
    139     push(@$text_options, "-file_rename_method", "none"); 
    140     push(@$html_options, "-file_rename_method", "none"); 
    141      
    142     # tell the secondary plugins that they are processing tmp files 
    143     push(@$html_options, "-processing_tmp_files"); 
     123    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
     124    push(@$specific_options, "-file_rename_method", "none"); 
     125     
     126    if ($secondary_plugin_name eq "TextPlugin") { 
     127    push(@$specific_options, "-input_encoding", "utf8"); 
     128    push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 
     129    } elsif ($secondary_plugin_name eq "PagedImagePlugin") { 
     130    push(@$specific_options, "-processing_tmp_files"); 
     131    } 
    144132 
    145133    $self = bless $self, $class; 
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r22515 r22597  
    3434no strict 'subs'; 
    3535use gsprintf 'gsprintf'; 
    36  
    37 #sub BEGIN { 
    38 #    @PowerPointPlugin::ISA = ('ConvertBinaryFile'); 
    39 #} 
    4036 
    4137# @ISA dynamically configured to be either OpenOfficeConverter or ConvertBinaryFile 
     
    157153    } 
    158154 
    159     my $outhandle = $self->{'outhandle'}; 
    160  
    161155    $self->{'filename_extension'} = "ppt"; 
    162156    $self->{'file_type'} = "PPT"; 
     157 
     158    if ($self->{'convert_to'} eq "auto") { 
     159    if ($self->{'windows_scripting'}) { 
     160        $self->{'convert_to'} = "pagedimg_jpg"; 
     161    } 
     162    else { 
     163        $self->{'convert_to'} = "html"; 
     164    } 
     165    } 
     166 
     167   my $outhandle = $self->{'outhandle'}; 
    163168 
    164169    # can't have windows_scripting and openoffice_scripting at the same time 
     
    172177    $self->{'convert_options'} = "-windows_scripting" if $self->{'windows_scripting'}; 
    173178    $self->{'convert_options'} = "-openoffice_scripting" if $self->{'openoffice_scripting'}; 
     179 
     180    # set convert_to_plugin and convert_to_ext 
     181    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     182 
     183    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    174184    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    175185 
    176     if ($self->{'windows_scripting'} && ($self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i)) { 
    177     $secondary_plugin_options->{'PagedImagePlugin'} = []; 
    178     } else { 
    179     $secondary_plugin_options->{'HTMLPlugin'} = []; 
    180     $secondary_plugin_options->{'TextPlugin'} = []; 
    181     } 
    182     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    183     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    184     my $pageimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; 
    185   
    186     if (defined $html_options){ 
    187     # ppthtml doesn't output utf-8 necessarily - let Greenstone determine the encoding 
    188     #push(@$html_options,"-input_encoding", "utf8"); 
    189     push(@$html_options,"-extract_language") if $self->{'extract_language'}; 
    190     push(@$html_options,"-file_rename_method", "none"); 
    191      
    192     push(@$html_options, "-processing_tmp_files"); 
    193      
    194     # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj)  
    195     # to extract these metadata fields from the HEAD META fields 
    196     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
    197     }  
    198     if (defined $text_options){ 
    199     #push(@$text_options,"-input_encoding", "utf8"); 
    200     push(@$text_options,"-extract_language") if $self->{'extract_language'}; 
    201     push(@$text_options,"-file_rename_method", "none"); 
    202     } 
    203     if (defined $pageimg_options){ 
     186    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     187    $secondary_plugin_options->{$secondary_plugin_name} = []; 
     188    } 
     189    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     190 
     191    push(@$specific_options, "-file_rename_method", "none"); 
     192    push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 
     193 
     194    if ($secondary_plugin_name eq "HTMLPlugin") { 
     195    push(@$specific_options, "-processing_tmp_files"); 
     196    push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
     197    } 
     198    elsif ($secondary_plugin_name eq "PagedImagePlugin") { 
     199    push(@$specific_options, "-processing_tmp_files"); 
    204200    #is this true?? 
    205     push(@$pageimg_options,"-input_encoding", "utf8"); 
    206     push(@$pageimg_options,"-extract_language") if $self->{'extract_language'};  
    207     push(@$pageimg_options,"-file_rename_method", "none"); 
    208     push(@$pageimg_options, "-processing_tmp_files"); 
     201    push(@$specific_options,"-input_encoding", "utf8"); 
    209202    } 
    210203 
  • main/trunk/greenstone2/perllib/plugins/RTFPlugin.pm

    r20790 r22597  
    3535} 
    3636 
     37# currently only converts to HTML 
     38my $convert_to_list = 
     39    [ { 'name' => "html", 
     40    'desc' => "{ConvertBinaryFile.convert_to.html}" } ]; 
     41 
    3742my $arguments = 
    38     [ { 'name' => "process_exp", 
     43    [ { 'name' => "convert_to", 
     44    'desc' => "{ConvertBinaryFile.convert_to}", 
     45    'type' => "enum", 
     46    'reqd' => "yes", 
     47    'list' => $convert_to_list,  
     48    'deft' => "html" }, 
     49      { 'name' => "process_exp", 
    3950    'desc' => "{BasePlugin.process_exp}", 
    4051    'type' => "regexp", 
     
    7182    $self->{'file_type'} = "RTF"; 
    7283 
     84    # set convert_to_plugin and convert_to_ext 
     85    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     86    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    7387    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    74     if (!defined $secondary_plugin_options->{'TextPlugin'}) { 
    75     $secondary_plugin_options->{'TextPlugin'} = []; 
     88 
     89    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     90    $secondary_plugin_options->{$secondary_plugin_name} = []; 
    7691    } 
    77     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 
    78     $secondary_plugin_options->{'HTMLPlugin'} = []; 
     92    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     93     
     94    push(@$specific_options, "-file_rename_method", "none"); 
     95    push(@$specific_options, "-extract_language") if $self->{'extract_language'}; 
     96    if ($secondary_plugin_name eq "TextPlugin") { 
     97    push(@$specific_options, "-input_encoding", "utf8"); 
    7998    } 
    80     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    81     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    82      
    83     #$self->{'input_encoding'} = "utf8"; 
    84     #$self->{'extract_language'} = 1; 
    85     push(@$text_options, "-input_encoding", "utf8"); 
    86     push(@$text_options,"-extract_language") if $self->{'extract_language'}; 
    87     push(@$html_options, "-description_tags") if $self->{'description_tags'}; 
    88     push(@$html_options,"-extract_language") if $self->{'extract_language'}; 
    89  
    90     push(@$html_options, "-file_rename_method", "none"); 
    91     push(@$text_options, "-file_rename_method", "none"); 
    92  
    93     # tell the secondary plugins that they are processing tmp files 
    94     push(@$html_options, "-processing_tmp_files"); 
     99    elsif ($secondary_plugin_name eq "HTMLPlugin") { 
     100    push(@$specific_options, "-description_tags") if $self->{'description_tags'}; 
     101    push(@$specific_options, "-processing_tmp_files"); 
     102    } 
    95103 
    96104    $self = bless $self, $class; 
  • main/trunk/greenstone2/perllib/plugins/WordPlugin.pm

    r22514 r22597  
    4141    eval("require OpenOfficeConverter"); 
    4242    if ($@) { 
    43     # Useful debugging statement if there is a syntax error in OpenOfficeConverter   
     43    # Useful debugging statement if there is a syntax error in OpenOfficeConverter: 
    4444    #print STDERR "$@\n"; 
    4545    @WordPlugin::ISA = ('ConvertBinaryFile'); 
     
    174174    } 
    175175 
    176     my $outhandle = $self->{'outhandle'}; 
    177176    $self->{'filename_extension'} = "doc"; 
    178177    $self->{'file_type'} = "Word"; 
     178 
     179    my $outhandle = $self->{'outhandle'}; 
    179180 
    180181    if ($self->{'windows_scripting'}) { 
     
    193194    } 
    194195 
    195     # we always save as utf-8 
    196 #    if ($self->{'input_encoding'} eq "auto") { 
    197 #   $self->{'input_encoding'} = "utf8"; 
    198 #    } 
    199  
     196    # check convert_to 
     197    if ($self->{'convert_to'} eq "auto") { 
     198    $self->{'convert_to'} = "html"; 
     199    } 
     200    # windows or open office scripting, outputs structuredHTML 
     201    if (defined $self->{'office_scripting'}) { 
     202    $self->{'convert_to'} = "structuredhtml"; 
     203    }  
     204 
     205    # set convert_to_plugin and convert_to_ext 
     206    $self->ConvertBinaryFile::set_standard_convert_settings(); 
     207  
     208    my $secondary_plugin_name = $self->{'convert_to_plugin'}; 
    200209    my $secondary_plugin_options = $self->{'secondary_plugin_options'}; 
    201     if (defined $self->{'office_scripting'}) {  
    202     if (!defined $secondary_plugin_options->{'StructuredHTMLPlugin'}){ 
    203         $secondary_plugin_options->{'StructuredHTMLPlugin'} = []; 
    204         my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'}; 
    205          
    206         # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 
    207         # to extract these metadata fields from the HEAD META fields 
    208         push (@$structhtml_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
    209         push (@$structhtml_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    210         push (@$structhtml_options, "-description_tags") if $self->{'office_scripting'};  
    211         push (@$structhtml_options, "-extract_language") if $self->{'extract_language'}; 
    212         push (@$structhtml_options, "-delete_toc") if $self->{'delete_toc'}; 
    213         push (@$structhtml_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'}; 
    214         push (@$structhtml_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 
    215         push (@$structhtml_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 
    216         push (@$structhtml_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 
    217         push (@$structhtml_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 
    218         push (@$structhtml_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 
    219         push (@$structhtml_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 
    220     } 
    221     } 
    222     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) { 
    223     $secondary_plugin_options->{'HTMLPlugin'} = []; 
    224     } 
    225     if (!defined $secondary_plugin_options->{'TextPlugin'}) { 
    226     $secondary_plugin_options->{'TextPlugin'} = []; 
    227     } 
    228  
    229     my $html_options = $secondary_plugin_options->{'HTMLPlugin'}; 
    230     my $text_options = $secondary_plugin_options->{'TextPlugin'}; 
    231     my $structhtml_options = $secondary_plugin_options->{'StructuredHTMLPlugin'};     
    232     # tell the secondary plugins that they are processing tmp files 
    233     push(@$html_options, "-processing_tmp_files"); 
    234     push(@$structhtml_options, "-processing_tmp_files"); 
    235      
    236     # wvWare will always produce html files encoded as utf-8, so make sure the secondary HTMLPlugin knows this 
    237     push(@$html_options,"-input_encoding", "utf8"); 
    238     push(@$html_options,"-extract_language") if $self->{'extract_language'}; 
    239     push(@$html_options, "-description_tags") if $self->{'description_tags'}; 
    240  
    241     # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 
    242     # to extract these metadata fields from the HEAD META fields 
    243     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
    244     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
     210 
     211    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) { 
     212    $secondary_plugin_options->{$secondary_plugin_name} = []; 
     213    } 
     214    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 
     215 
     216    # The following title_sub strips a leading "Page N" and/or a leading
     217    # "1 ", which is often the page number at the top of the page. Bad luck
     218    # if your document title actually starts with "1 " - is there a better way?
     219    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    245220 
    246221    my $associate_tail_re = $self->{'associate_tail_re'}; 
    247222    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 
    248     push(@$html_options, "-associate_tail_re", $associate_tail_re); 
    249     push(@$text_options, "-associate_tail_re", $associate_tail_re); 
    250     push(@$structhtml_options, "-associate_tail_re", $associate_tail_re) if defined $structhtml_options; 
    251     } 
    252  
    253     push(@$html_options, "-file_rename_method", "none"); 
    254     push(@$text_options, "-file_rename_method", "none"); 
    255     push(@$structhtml_options, "-file_rename_method", "none") if defined $structhtml_options; 
     223    push(@$specific_options, "-associate_tail_re", $associate_tail_re); 
     224    } 
     225    push(@$specific_options, "-file_rename_method", "none"); 
     226 
     227    if ($secondary_plugin_name eq "StructuredHTMLPlugin") { 
     228    # Instruct the secondary plugin, here StructuredHTMLPlugin (when eventually
     229    # accessed through read_into_doc_obj), to extract these metadata fields from the HEAD META fields
     230    push (@$specific_options, "-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
     231    push (@$specific_options, "-description_tags") if $self->{'office_scripting'};  
     232    push (@$specific_options, "-extract_language") if $self->{'extract_language'}; 
     233    push (@$specific_options, "-delete_toc") if $self->{'delete_toc'}; 
     234    push (@$specific_options, "-toc_header", $self->{'toc_header'}) if $self->{'toc_header'}; 
     235    push (@$specific_options, "-title_header", $self->{'title_header'}) if $self->{'title_header'}; 
     236    push (@$specific_options, "-level1_header", $self->{'level1_header'}) if $self->{'level1_header'}; 
     237    push (@$specific_options, "-level2_header", $self->{'level2_header'})if $self->{'level2_header'}; 
     238    push (@$specific_options, "-level3_header", $self->{'level3_header'}) if $self->{'level3_header'}; 
     239    push (@$specific_options, "-metadata_fields", $self->{'metadata_fields'}) if $self->{'metadata_fields'}; 
     240    push (@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'}) if $self->{'metadata_field_separator'}; 
     241    push(@$specific_options, "-processing_tmp_files"); 
     242     
     243    } 
     244     
     245    elsif ($secondary_plugin_name eq "HTMLPlugin") { 
     246    push(@$specific_options, "-processing_tmp_files"); 
     247    push(@$specific_options,"-input_encoding", "utf8"); 
     248    push(@$specific_options,"-extract_language") if $self->{'extract_language'}; 
     249    push(@$specific_options, "-description_tags") if $self->{'description_tags'}; 
     250    # Instruct HTMLPlugin (when eventually accessed through read_into_doc_obj) 
     251    # to extract these metadata fields from the HEAD META fields 
     252    push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>"); 
     253    } 
    256254 
    257255    $self = bless $self, $class;