Changeset 32286 for main/trunk/greenstone2
- Timestamp: 2018-07-18T20:15:24+12:00 (6 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm
r32285 r32286 1 1 ########################################################################### 2 2 # 3 # PDFv2Plugin.pm -- pdf plugin that uses xpdftools or, if switched on, 4 # pdfbox, to process PDFs. 3 # PDFv2Plugin.pm -- pdf plugin that uses xpdftools and pdfbox to process PDFs. 4 # It only works out of the box for GS3 since it assumes the pdfbox extension 5 # is installed. 5 6 # A component of the Greenstone digital library software 6 7 # from the New Zealand Digital Library Project at the … … 34 35 use Mojo::DOM; # for HTML parsing 35 36 36 use AutoLoadConverters; 37 #use AutoLoadConverters; 38 use PDFBoxConverter; 37 39 use ConvertBinaryFile; 38 40 39 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile'); 41 #@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile'); 42 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile'); 40 43 41 44 42 45 my $convert_to_list = 43 [ { 'name' => "auto", 46 [ { 'name' => "auto", # pretty_html using xpdftools' pdftohtml 44 47 'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 45 { 'name' => "text", # xpdftools 48 { 'name' => "text", # xpdftools' pdftotext 46 49 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 47 { 'name' => "paged_text", # xpdftools50 { 'name' => "paged_text", # pdfbox 48 51 'desc' => "{ConvertBinaryFile.convert_to.paged_text}" }, 49 52 50 { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs 53 { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs? 
51 54 'desc' => "{PDFPlugin.convert_to.html}" }, 52 55 { 'name' => "pretty_html", # xpdftools … … 55 58 'desc' => "{PDFPlugin.convert_to.paged_pretty_html}"}, 56 59 57 # pdfbox60 # pdfbox for all pagedimg(txt) output formats: 58 61 { 'name' => "pagedimg_jpg", 59 62 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, … … 120 123 push(@$pluginlist, $class); 121 124 122 push(@$inputargs,"-title_sub");123 push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');125 # push(@$inputargs,"-title_sub"); 126 # push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); 124 127 125 128 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 126 129 push(@{$hashArgOptLists->{"OptList"}},$options); 127 130 128 my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1); 131 # my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1); 132 my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists); 129 133 my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 130 my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self); 134 # my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self); 135 my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); 131 136 132 137 if ($self->{'info_only'}) { … … 149 154 $self->{'convert_options'} .= " -pdf_dpi $dpi"; 150 155 151 # PDFv2Plugin now supports PDF to txt conversion on Windows too: 152 # using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 153 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 154 &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.win_pdftotext_info}\n"); 155 } 156 elsif ($self->{'convert_to'} eq "auto") { 156 # The old pdftohtml tool used by PDFPlugin didn't do PDF to txt conversion on Windows 157 # But PDFv2Plugin now supports PDF to txt conversion on Windows too using XPDFTools' pdftotext 158 159 if 
($self->{'convert_to'} eq "auto") { 157 160 # choose pretty_html is the best default option when using xpdftools 158 161 $self->{'convert_to'} = "pretty_html"; … … 169 172 # Not all available conversion output options are possible with xpdftools, as some are 170 173 # only handled by pdfbox. If a format is unavailable with xpdftools, default to pretty_html 171 if (!$self->{"pdfbox_conversion"}) {172 my $convert_to = $self->{'convert_to'};173 my $fallback_convert_to = $convert_to;174 if($convert_to =~ /^html$/) {175 $fallback_convert_to = "pretty_html";176 }177 elsif ($self->{'convert_to'} =~ /^pagedimg/) {178 $fallback_convert_to = "paged_pretty_html";179 }180 elsif ($self->{'convert_to'} =~ /^paged_text$/) {181 # print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";182 $fallback_convert_to = "text";183 }184 185 if($convert_to =~ /^(html|pagedimg|paged_text)/) {186 &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to));187 $self->{'convert_to'} = $fallback_convert_to;188 }189 }174 # if (!$self->{"pdfbox_conversion"}) { 175 # my $convert_to = $self->{'convert_to'}; 176 # my $fallback_convert_to = $convert_to; 177 # if($convert_to =~ /^html$/) { 178 # $fallback_convert_to = "pretty_html"; 179 # } 180 # elsif ($self->{'convert_to'} =~ /^pagedimg/) { 181 # $fallback_convert_to = "paged_pretty_html"; 182 # } 183 # elsif ($self->{'convert_to'} =~ /^paged_text$/) { 184 # # print STDERR "@@@ Conversion to " . 
$self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n"; 185 # $fallback_convert_to = "text"; 186 # } 187 188 # if($convert_to =~ /^(html|pagedimg|paged_text)/) { 189 # &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to)); 190 # $self->{'convert_to'} = $fallback_convert_to; 191 # } 192 # } 190 193 191 194 # set convert_to_plugin and convert_to_ext … … 203 206 # "1", which is often the page number at the top of the page. Bad Luck 204 207 # if your document title actually starts with "1 " - is there a better way? 205 push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');208 # push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 206 209 my $associate_tail_re = $self->{'associate_tail_re'}; 207 210 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { … … 264 267 # ConvertBinaryFile init 265 268 $self->SUPER::init(@_); 266 $self->AutoLoadConverters::init(@_); 269 # $self->AutoLoadConverters::init(@_); 270 $self->PDFBoxConverter::init(@_); 267 271 268 272 } … … 271 275 my $self = shift (@_); 272 276 273 $self->AutoLoadConverters::begin(@_); 277 # $self->AutoLoadConverters::begin(@_); 278 $self->PDFBoxConverter::begin(@_); 274 279 $self->SUPER::begin(@_); 275 280 … … 278 283 sub deinit { 279 284 my $self = shift (@_); 280 281 $self->AutoLoadConverters::deinit(@_); 285 286 $self->PDFBoxConverter::deinit(@_); 287 # $self->AutoLoadConverters::deinit(@_); 282 288 $self->SUPER::deinit(@_); 283 289 … … 302 308 } 303 309 304 310 305 311 sub tmp_area_convert_file { 306 312 307 313 my $self = shift (@_); 314 my ($output_ext, $input_filename, $textref) = @_; 308 315 309 if($self->{'convert_to'} =~ m/pretty_html$/) { # if outputting paged_pretty_html or pretty_html: 310 # only xpdftools can output pretty_html regardless of whether pdfbox_conversion is switched on 311 print STDERR "@@@@ PDFBox_conversion is switched on, but pretty_html variants are generated by 
xpdftools.\n"; 316 if($self->{'convert_to'} eq "text" || $self->{'convert_to'} =~ m/pretty_html$/) { # use xpdftools 312 317 return $self->ConvertBinaryFile::tmp_area_convert_file(@_); 313 318 } 314 # else, output format uses pdfbox: 315 return $self->AutoLoadConverters::tmp_area_convert_file(@_); 316 319 320 # for all other output formats, use pdfbox: 321 #return $self->AutoLoadConverters::tmp_area_convert_file(@_); 322 # Here, we now do what AutoLoadConverters::tmp_area_convert_file(@_) does: 323 my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext); 324 if (defined $result && $result != 0) { 325 return $new_filename; 326 } 327 my $outhandle=$self->{'outhandle'}; 328 print $outhandle "PDFBoxConverter had a conversion error\n"; 329 print $outhandle "$@\n"; 330 if (defined $result_str) { 331 print $outhandle "$result_str\n"; 332 } 333 return ""; 317 334 } 318 335
Note: See TracChangeset for help on using the changeset viewer.