Changeset 32286


Ignore:
Timestamp:
2018-07-18T20:15:24+12:00 (3 years ago)
Author:
ak19
Message:

PDFv2Plugin will now only work out of the box for GS3: PDFBoxConverter is no longer loaded via AutoLoadConverters, but is instantiated directly. Not yet tidied up.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32285 r32286  
    11###########################################################################
    22#
    3 # PDFv2Plugin.pm -- pdf plugin that uses xpdftools or, if switched on,
    4 # pdfbox, to process PDFs.
     3# PDFv2Plugin.pm -- pdf plugin that uses xpdftools and pdfbox to process PDFs.
     4# It only works out of the box for GS3 since it assumes the pdfbox extension
     5# is installed.
    56# A component of the Greenstone digital library software
    67# from the New Zealand Digital Library Project at the
     
    3435use Mojo::DOM; # for HTML parsing
    3536
    36 use AutoLoadConverters;
     37#use AutoLoadConverters;
     38use PDFBoxConverter;
    3739use ConvertBinaryFile;
    3840
    39 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
     41#@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
     42@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile');
    4043
    4144
    4245my $convert_to_list =
    43     [ { 'name' => "auto",
     46    [ { 'name' => "auto", # pretty_html using xpdftools' pdftohtml
    4447    'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    45       { 'name' => "text", # xpdftools
     48      { 'name' => "text", # xpdftools' pdftotext
    4649    'desc' => "{ConvertBinaryFile.convert_to.text}" },
    47       { 'name' => "paged_text", # xpdftools
     50      { 'name' => "paged_text", # pdfbox
    4851    'desc' => "{ConvertBinaryFile.convert_to.paged_text}" },
    4952     
    50       { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs
     53      { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs?
    5154    'desc' => "{PDFPlugin.convert_to.html}" },
    5255      { 'name' => "pretty_html", # xpdftools
     
    5558    'desc' => "{PDFPlugin.convert_to.paged_pretty_html}"},
    5659
    57       #pdfbox
     60      # pdfbox for all pagedimg(txt) output formats:
    5861      { 'name' => "pagedimg_jpg",
    5962    'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
     
    120123    push(@$pluginlist, $class);
    121124
    122     push(@$inputargs,"-title_sub");
    123     push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
     125#    push(@$inputargs,"-title_sub");
     126#    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');
    124127
    125128    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    126129    push(@{$hashArgOptLists->{"OptList"}},$options);
    127130
    128     my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
     131    #    my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
     132    my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists);
    129133    my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
    130     my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);
     134    #    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);
     135    my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self);
    131136   
    132137    if ($self->{'info_only'}) {
     
    149154    $self->{'convert_options'} .= " -pdf_dpi $dpi";
    150155
    151     # PDFv2Plugin now supports PDF to txt conversion on Windows too:
    152     # using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    153     if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    154     &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.win_pdftotext_info}\n");
    155     }
    156     elsif ($self->{'convert_to'} eq "auto") {
     156    # The old pdftohtml tool used by PDFPlugin didn't do PDF to txt conversion on Windows,
     157    # but PDFv2Plugin now supports PDF to txt conversion on Windows too, using XPDFTools' pdftotext.
     158
     159    if ($self->{'convert_to'} eq "auto") {
    157160    # choose pretty_html as the best default option when using xpdftools
    158161    $self->{'convert_to'} = "pretty_html";
     
    169172    # Not all available conversion output options are possible with xpdftools, as some are
    170173    # only handled by pdfbox. If a format is unavailable with xpdftools, default to pretty_html
    171     if (!$self->{"pdfbox_conversion"}) {
    172     my $convert_to = $self->{'convert_to'};
    173     my $fallback_convert_to = $convert_to;
    174     if($convert_to =~ /^html$/) {
    175         $fallback_convert_to = "pretty_html";
    176     }
    177     elsif ($self->{'convert_to'} =~ /^pagedimg/) {
    178         $fallback_convert_to = "paged_pretty_html";
    179     }
    180     elsif ($self->{'convert_to'} =~ /^paged_text$/) {
    181         #   print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";
    182         $fallback_convert_to = "text";
    183     }
    184 
    185     if($convert_to =~ /^(html|pagedimg|paged_text)/) {
    186         &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to));
    187         $self->{'convert_to'} = $fallback_convert_to;
    188     }
    189     }
     174    # if (!$self->{"pdfbox_conversion"}) {
     175    #   my $convert_to = $self->{'convert_to'};
     176    #   my $fallback_convert_to = $convert_to;
     177    #   if($convert_to =~ /^html$/) {
     178    #       $fallback_convert_to = "pretty_html";
     179    #   }
     180    #   elsif ($self->{'convert_to'} =~ /^pagedimg/) {
     181    #       $fallback_convert_to = "paged_pretty_html";
     182    #   }
     183    #   elsif ($self->{'convert_to'} =~ /^paged_text$/) {
     184    #       #   print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n";
     185    #       $fallback_convert_to = "text";
     186    #   }
     187
     188    #   if($convert_to =~ /^(html|pagedimg|paged_text)/) {
     189    #       &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to));
     190    #       $self->{'convert_to'} = $fallback_convert_to;
     191    #   }
     192    # }
    190193   
    191194    # set convert_to_plugin and convert_to_ext
     
    203206    # "1", which is often the page number at the top of the page. Bad Luck
    204207    # if your document title actually starts with "1 " - is there a better way?
    205     push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
     208#    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    206209    my $associate_tail_re = $self->{'associate_tail_re'};
    207210    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
     
    264267    # ConvertBinaryFile init
    265268    $self->SUPER::init(@_);
    266     $self->AutoLoadConverters::init(@_);
     269#   $self->AutoLoadConverters::init(@_);
     270    $self->PDFBoxConverter::init(@_);
    267271
    268272}
     
    271275    my $self = shift (@_);
    272276
    273     $self->AutoLoadConverters::begin(@_);
     277#   $self->AutoLoadConverters::begin(@_);
     278    $self->PDFBoxConverter::begin(@_);
    274279    $self->SUPER::begin(@_);
    275280
     
    278283sub deinit {
    279284    my $self = shift (@_);
    280    
    281     $self->AutoLoadConverters::deinit(@_);
     285
     286    $self->PDFBoxConverter::deinit(@_);
     287#   $self->AutoLoadConverters::deinit(@_);
    282288    $self->SUPER::deinit(@_);
    283289
     
    302308}
    303309 
    304  
     310
    305311sub tmp_area_convert_file {
    306312
    307313    my $self = shift (@_);
     314    my ($output_ext, $input_filename, $textref) = @_;
    308315   
    309     if($self->{'convert_to'} =~ m/pretty_html$/) { # if outputting paged_pretty_html or pretty_html:
    310     # only xpdftools can output pretty_html regardless of whether pdfbox_conversion is switched on
    311     print STDERR "@@@@ PDFBox_conversion is switched on, but pretty_html variants are generated by xpdftools.\n";
     316    if($self->{'convert_to'} eq "text" || $self->{'convert_to'} =~ m/pretty_html$/) { # use xpdftools
    312317    return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    313318    }
    314     # else, output format uses pdfbox:
    315     return $self->AutoLoadConverters::tmp_area_convert_file(@_);
    316 
     319   
     320    # for all other output formats, use pdfbox:   
     321    #return $self->AutoLoadConverters::tmp_area_convert_file(@_);
     322    # Here, we now do what AutoLoadConverters::tmp_area_convert_file(@_) does:
     323    my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext);
     324    if (defined $result && $result != 0) {
     325    return $new_filename;
     326    }
     327    my $outhandle=$self->{'outhandle'};
     328    print $outhandle "PDFBoxConverter had a conversion error\n";
     329    print $outhandle "$@\n";
     330    if (defined $result_str) {
     331    print $outhandle "$result_str\n";
     332    }
     333    return "";
    317334}
    318335
Note: See TracChangeset for help on using the changeset viewer.