Changeset 22597 for main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
- Timestamp:
- 2010-08-10T14:31:53+12:00 (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r21800 r22597
119 119       push(@{$hashArgOptLists->{"OptList"}},$options);
120 120
121     -     my @arg_array = @$inputargs;
122 121       my $self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
123 122
  …   …
138 137       $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};
139 138
    139 +     # check convert_to
    140 +     if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    141 +         print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
    142 +         $self->{'convert_to'} = "html";
    143 +     }
    144 +     elsif ($self->{'convert_to'} eq "auto") {
    145 +         # choose html ?? is this the best option
    146 +         $self->{'convert_to'} = "html";
    147 +     }
    148 +     # set convert_to_plugin and convert_to_ext
    149 +     $self->ConvertBinaryFile::set_standard_convert_settings();
    150 +
    151 +     my $secondary_plugin_name = $self->{'convert_to_plugin'};
140 152       my $secondary_plugin_options = $self->{'secondary_plugin_options'};
141 153
142     -     if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
143     -         $secondary_plugin_options->{'HTMLPlugin'} = [];
144     -     }
145     -     if (!defined $secondary_plugin_options->{'TextPlugin'}) {
146     -         $secondary_plugin_options->{'TextPlugin'} = [];
147     -     }
148     -     if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
149     -         if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){
150     -             $secondary_plugin_options->{'PagedImagePlugin'} = [];
151     -             my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
152     -             push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
153     -             push(@$pagedimg_options, "-screenviewsize", "1000");
154     -             push(@$pagedimg_options, "-enable_cache");
155     -         }
156     -     }
157     -     my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
158     -     my $text_options = $secondary_plugin_options->{'TextPlugin'};
159     -     my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
160     -
161     -     # if ($self->{'input_encoding'} eq "auto") {
162     -     # $self->{'input_encoding'} = "utf8";
163     -     # }
164     -
165     -     # if pdftohtml is always producing utf8, then htmlplug always needs this option
166     -     push(@$html_options,"-input_encoding", "utf8");
167     -     push(@$html_options,"-extract_language") if $self->{'extract_language'};
168     -
169     -     push(@$html_options, "-processing_tmp_files");
170     -     push(@$pagedimg_options, "-processing_tmp_files");
171     -
172     -     # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
173     -     # to extract these metadata fields from the HEAD META fields
174     -     my $required_metadata;
175     -     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
176     -         push(@$html_options,"-metadata_fields",$self->{'metadata_fields'});
177     -     } else {
178     -         push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
179     -     }
180     -     if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
181     -         push(@$html_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
182     -     }
183     -
184     -     if ($self->{'use_sections'} || $self->{'description_tags'}) {
185     -         $self->{'description_tags'} = 1;
186     -         push(@$html_options,"-description_tags");
187     -     }
    154 +     if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
    155 +         $secondary_plugin_options->{$secondary_plugin_name} = [];
    156 +     }
    157 +     my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
188 158
189 159       # following title_sub removes "Page 1" added by pdftohtml, and a leading
190 160       # "1", which is often the page number at the top of the page. Bad Luck
191 161       # if your document title actually starts with "1 " - is there a better way?
192     -     push(@$html_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
193     -     push(@$text_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
194     -
    162 +     push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
195 163       my $associate_tail_re = $self->{'associate_tail_re'};
196 164       if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
197     -         push(@$html_options, "-associate_tail_re", $associate_tail_re);
198     -         push(@$text_options, "-associate_tail_re", $associate_tail_re);
199     -         push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options;
200     -     }
201     -
202     -     push(@$html_options, "-file_rename_method", "none");
203     -     push(@$text_options, "-file_rename_method", "none");
204     -     push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options;
    165 +         push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    166 +     }
    167 +     push(@$specific_options, "-file_rename_method", "none");
    168 +
    169 +     if ($secondary_plugin_name eq "HTMLPlugin") {
    170 +         # pdftohtml always produces utf8
    171 +         push(@$specific_options, "-input_encoding", "utf8");
    172 +         push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    173 +         push(@$specific_options, "-processing_tmp_files");
    174 +         # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    175 +         # to extract these metadata fields from the HEAD META fields
    176 +         if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
    177 +             push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
    178 +         } else {
    179 +             push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
    180 +         }
    181 +         if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
    182 +             push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
    183 +         }
    184 +         if ($self->{'use_sections'} || $self->{'description_tags'}) {
    185 +             $self->{'description_tags'} = 1;
    186 +             push(@$specific_options, "-description_tags");
    187 +         }
    188 +     }
    189 +     elsif ($secondary_plugin_name eq "PagedImagePlugin") {
    190 +         push(@$specific_options, "-screenviewsize", "1000");
    191 +         push(@$specific_options, "-enable_cache");
    192 +         push(@$specific_options, "-processing_tmp_files");
    193 +     }
205 194
206 195       $self = bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.