Ignore:
Timestamp:
2010-08-10T14:31:53+12:00 (14 years ago)
Author:
kjdon
Message:

Code tidy-up. Rearranged how ConvertBinaryFile plugins set up their secondary plugins: each plugin now sets up options only for the secondary plugin it is actually using. All subclass-specific code has been moved out of ConvertBinaryFile's new() into the appropriate plugin file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r21800 r22597  
    119119    push(@{$hashArgOptLists->{"OptList"}},$options);
    120120
    121     my @arg_array = @$inputargs;
    122121    my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
    123122   
     
    138137    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};
    139138
     139    # check convert_to
     140    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
     141    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
     142    $self->{'convert_to'} = "html";
     143    }
     144    elsif ($self->{'convert_to'} eq "auto") {
     145    # choose html ?? is this the best option
     146    $self->{'convert_to'} = "html";
     147    }
     148    # set convert_to_plugin and convert_to_ext
     149    $self->ConvertBinaryFile::set_standard_convert_settings();
     150
     151    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    140152    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    141153
    142     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
    143     $secondary_plugin_options->{'HTMLPlugin'} = [];
    144     }
    145     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
    146     $secondary_plugin_options->{'TextPlugin'} = [];
    147     }
    148     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
    149     if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
    150         $secondary_plugin_options->{'PagedImagePlugin'} = [];
    151         my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    152         push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    153         push(@$pagedimg_options, "-screenviewsize", "1000");
    154         push(@$pagedimg_options, "-enable_cache");
    155     }
    156     }
    157     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    158     my $text_options = $secondary_plugin_options->{'TextPlugin'};
    159     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
    160 
    161 #    if ($self->{'input_encoding'} eq "auto") {
    162 #   $self->{'input_encoding'} = "utf8";
    163 #    }
    164 
    165     # if pdftohtml is always producing utf8, then htmlplug always needs this option
    166     push(@$html_options,"-input_encoding", "utf8");
    167     push(@$html_options,"-extract_language") if $self->{'extract_language'};
    168 
    169     push(@$html_options, "-processing_tmp_files");
    170     push(@$pagedimg_options, "-processing_tmp_files");
    171 
    172     # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    173     # to extract these metadata fields from the HEAD META fields
    174     my $required_metadata;
    175     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
    176     push(@$html_options,"-metadata_fields",$self->{'metadata_fields'});
    177     } else {
    178     push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    179     }
    180     if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
    181     push(@$html_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
    182     }
    183        
    184     if ($self->{'use_sections'} || $self->{'description_tags'}) {
    185     $self->{'description_tags'} = 1;
    186     push(@$html_options,"-description_tags");
    187     }
     154    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
     155    $secondary_plugin_options->{$secondary_plugin_name} = [];
     156    }
     157    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
    188158
    189159    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    190160    # "1", which is often the page number at the top of the page. Bad Luck
    191161    # if your document title actually starts with "1 " - is there a better way?
    192     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    193     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    194 
     162    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    195163    my $associate_tail_re = $self->{'associate_tail_re'};
    196164    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
    197     push(@$html_options, "-associate_tail_re", $associate_tail_re);
    198     push(@$text_options, "-associate_tail_re", $associate_tail_re);
    199     push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options;
    200     }
    201 
    202     push(@$html_options, "-file_rename_method", "none");
    203     push(@$text_options, "-file_rename_method", "none");
    204     push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options;
     165    push(@$specific_options, "-associate_tail_re", $associate_tail_re);
     166    }
     167    push(@$specific_options, "-file_rename_method", "none");
     168   
     169    if ($secondary_plugin_name eq "HTMLPlugin") {
     170    # pdftohtml always produces utf8
     171    push(@$specific_options, "-input_encoding", "utf8");
     172    push(@$specific_options, "-extract_language") if $self->{'extract_language'};
     173    push(@$specific_options, "-processing_tmp_files");
     174    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
     175    # to extract these metadata fields from the HEAD META fields
     176    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
     177        push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
     178    } else {
     179        push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
     180    }
     181    if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
     182        push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
     183    }
     184    if ($self->{'use_sections'} || $self->{'description_tags'}) {
     185        $self->{'description_tags'} = 1;
     186        push(@$specific_options, "-description_tags");
     187    }
     188    }
     189    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
     190    push(@$specific_options, "-screenviewsize", "1000");
     191    push(@$specific_options, "-enable_cache");
     192    push(@$specific_options, "-processing_tmp_files");
     193    }
    205194
    206195    $self = bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.