Changeset 32287
- Timestamp: 2018-07-18T20:30:14+12:00 (6 years ago)
- Location: main/trunk/greenstone2
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/bin/script/gsConvert.pl
r32284 r32287 322 322 $output_type =~ s/.*\-(.*)/$1/i; 323 323 324 print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";324 #print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n"; 325 325 326 326 # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools -
main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm
r32286 r32287 35 35 use Mojo::DOM; # for HTML parsing 36 36 37 #use AutoLoadConverters;38 37 use PDFBoxConverter; 39 38 use ConvertBinaryFile; 40 39 41 #@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');42 40 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile'); 43 41 … … 123 121 push(@$pluginlist, $class); 124 122 125 # push(@$inputargs,"-title_sub");126 # push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');127 128 123 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 129 124 push(@{$hashArgOptLists->{"OptList"}},$options); 130 125 131 # my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);132 126 my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists); 133 127 my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 134 # my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);135 128 my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); 136 129 … … 167 160 } 168 161 } 169 170 # if pdfbox_conversion is not on, check convert_to to make sure that xpdftools can171 # support the selected output format, or fallback on a sensible default172 # Not all available conversion output options are possible with xpdftools, as some are173 # only handled by pdfbox. If a format is unavailable with xpdftools, default to pretty_html174 # if (!$self->{"pdfbox_conversion"}) {175 # my $convert_to = $self->{'convert_to'};176 # my $fallback_convert_to = $convert_to;177 # if($convert_to =~ /^html$/) {178 # $fallback_convert_to = "pretty_html";179 # }180 # elsif ($self->{'convert_to'} =~ /^pagedimg/) {181 # $fallback_convert_to = "paged_pretty_html";182 # }183 # elsif ($self->{'convert_to'} =~ /^paged_text$/) {184 # # print STDERR "@@@ Conversion to " . 
$self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";185 # $fallback_convert_to = "text";186 # }187 188 # if($convert_to =~ /^(html|pagedimg|paged_text)/) {189 # &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to));190 # $self->{'convert_to'} = $fallback_convert_to;191 # }192 # }193 162 194 163 # set convert_to_plugin and convert_to_ext … … 203 172 my $specific_options = $secondary_plugin_options->{$secondary_plugin_name}; 204 173 205 # following title_sub removes "Page 1" added by pdftohtml, and a leading206 # "1", which is often the page number at the top of the page. Bad Luck207 # if your document title actually starts with "1 " - is there a better way?208 # push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');209 174 my $associate_tail_re = $self->{'associate_tail_re'}; 210 175 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { … … 267 232 # ConvertBinaryFile init 268 233 $self->SUPER::init(@_); 269 # $self->AutoLoadConverters::init(@_);270 234 $self->PDFBoxConverter::init(@_); 271 235 … … 275 239 my $self = shift (@_); 276 240 277 # $self->AutoLoadConverters::begin(@_);278 241 $self->PDFBoxConverter::begin(@_); 279 242 $self->SUPER::begin(@_); … … 285 248 286 249 $self->PDFBoxConverter::deinit(@_); 287 # $self->AutoLoadConverters::deinit(@_);288 250 $self->SUPER::deinit(@_); 289 251 … … 318 280 } 319 281 320 # for all other output formats, use pdfbox: 321 #return $self->AutoLoadConverters::tmp_area_convert_file(@_); 322 # Here, we now do what AutoLoadConverters::tmp_area_convert_file(@_) does: 282 # for all other output formats, use pdfbox: 283 284 # Here, we now do directly what AutoLoadConverters::tmp_area_convert_file(@_) 285 # does with PDFBoxConverter: 323 286 my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext); 324 287 if (defined $result && $result != 0) { … … 382 345 
$self->xpdftohtml_convert_post_process($conv_filename); 383 346 } 384 else { # use PDFPlugin's usual post processing347 else { # use original PDFPlugin's usual post processing 385 348 $self->default_convert_post_process($conv_filename); 386 349 } … … 689 652 690 653 my $title = $sections[0]; 691 $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...654 $title =~ s/^\"?\w+\"?>//; # specific for old pdftohtml... 692 655 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space 693 656 $title =~ s/<[^>]*>/ /g; … … 696 659 $title =~ s/\s+$//; 697 660 $title =~ s/\s+/ /gs; 698 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 699 $title =~ s/^\s+//s; # in case title_sub introduced any... 661 $title =~ s/^\s+//s; # in case title_sub (of old PDFPlugin's old pdftohtml) introduced any... Generally still useful to remove spaces at the start? 700 662 $title = substr ($title, 0, 100); 701 663 $title =~ s/\s\S*$/.../; -
main/trunk/greenstone2/perllib/strings.properties
r32283 r32287 1195 1195 PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set. 1196 1196 1197 PDFv2Plugin.zoom:The factor by which to zoom the PDF for (paged_)pretty_html output. Can be fractional. 1198 1199 PDFv2Plugin.win_pdftotext_info:PDFv2Plugin uses Xpdf Tools to support pdf to text conversion, including on Windows. 1200 1201 PDFv2Plugin.conversion_needs_pdfbox:*** Conversion to %s not supported with Xpdf Tools, defaulting to %s.\nTurn on pdfbox_conversion if you wish to enable output to the selected format. 1197 PDFv2Plugin.dpi:The resolution in DPI of background images generated for pagedimg(txt) and (paged_)pretty_html output settings. 1202 1198 1203 1199 PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html
Note: See TracChangeset for help on using the changeset viewer.