Show
Ignore:
Timestamp:
18.07.2018 20:15:24 (15 months ago)
Author:
ak19
Message:

PDFv2Plugin will only work out of the box for GS3 now: PDFBoxConverter is no longer loaded via AutoLoadConverters, but directly instantiated. Not yet tidied up.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32285 r32286  
    11########################################################################### 
    22# 
    3 # PDFv2Plugin.pm -- pdf plugin that uses xpdftools or, if switched on, 
    4 # pdfbox, to process PDFs. 
     3# PDFv2Plugin.pm -- pdf plugin that uses xpdftools and pdfbox to process PDFs. 
     4# It only works out of the box for GS3 since it assumes the pdfbox extension 
     5# is installed. 
    56# A component of the Greenstone digital library software 
    67# from the New Zealand Digital Library Project at the  
     
    3435use Mojo::DOM; # for HTML parsing 
    3536 
    36 use AutoLoadConverters; 
     37#use AutoLoadConverters; 
     38use PDFBoxConverter; 
    3739use ConvertBinaryFile; 
    3840 
    39 @PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile'); 
     41#@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile'); 
     42@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile'); 
    4043 
    4144 
    4245my $convert_to_list = 
    43     [ { 'name' => "auto", 
     46    [ { 'name' => "auto", # pretty_html using xpdftools' pdftohtml 
    4447    'desc' => "{ConvertBinaryFile.convert_to.auto}" }, 
    45       { 'name' => "text", # xpdftools 
     48      { 'name' => "text", # xpdftools' pdftotext 
    4649    'desc' => "{ConvertBinaryFile.convert_to.text}" }, 
    47       { 'name' => "paged_text", # xpdftools 
     50      { 'name' => "paged_text", # pdfbox 
    4851    'desc' => "{ConvertBinaryFile.convert_to.paged_text}" }, 
    4952       
    50       { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs 
     53      { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs? 
    5154    'desc' => "{PDFPlugin.convert_to.html}" }, 
    5255      { 'name' => "pretty_html", # xpdftools 
     
    5558    'desc' => "{PDFPlugin.convert_to.paged_pretty_html}"}, 
    5659 
    57       #pdfbox 
     60      # pdfbox for all pagedimg(txt) output formats: 
    5861      { 'name' => "pagedimg_jpg", 
    5962    'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, 
     
    120123    push(@$pluginlist, $class); 
    121124 
    122     push(@$inputargs,"-title_sub"); 
    123     push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); 
     125#    push(@$inputargs,"-title_sub"); 
     126#    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); 
    124127 
    125128    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
    126129    push(@{$hashArgOptLists->{"OptList"}},$options); 
    127130 
    128     my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1); 
     131    #    my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1); 
     132    my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists); 
    129133    my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 
    130     my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self); 
     134    #    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self); 
     135    my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); 
    131136     
    132137    if ($self->{'info_only'}) { 
     
    149154    $self->{'convert_options'} .= " -pdf_dpi $dpi"; 
    150155 
    151     # PDFv2Plugin now supports PDF to txt conversion on Windows too: 
    152     # using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 
    153     if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    154     &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.win_pdftotext_info}\n"); 
    155     } 
    156     elsif ($self->{'convert_to'} eq "auto") { 
     156    # The old pdftohtml tool used by PDFPlugin didn't do PDF to txt conversion on Windows 
     157    # But PDFv2Plugin now supports PDF to txt conversion on Windows too using XPDFTools' pdftotext 
     158 
     159    if ($self->{'convert_to'} eq "auto") { 
    157160    # choose pretty_html is the best default option when using xpdftools 
    158161    $self->{'convert_to'} = "pretty_html"; 
     
    169172    # Not all available conversion output options are possible with xpdftools, as some are 
    170173    # only handled by pdfbox. If a format is unavailable with xpdftools, default to pretty_html 
    171     if (!$self->{"pdfbox_conversion"}) { 
    172     my $convert_to = $self->{'convert_to'}; 
    173     my $fallback_convert_to = $convert_to; 
    174     if($convert_to =~ /^html$/) { 
    175         $fallback_convert_to = "pretty_html"; 
    176     } 
    177     elsif ($self->{'convert_to'} =~ /^pagedimg/) { 
    178         $fallback_convert_to = "paged_pretty_html"; 
    179     } 
    180     elsif ($self->{'convert_to'} =~ /^paged_text$/) { 
    181         #   print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n"; 
    182         $fallback_convert_to = "text"; 
    183     } 
    184  
    185     if($convert_to =~ /^(html|pagedimg|paged_text)/) { 
    186         &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to)); 
    187         $self->{'convert_to'} = $fallback_convert_to; 
    188     } 
    189     } 
     174    # if (!$self->{"pdfbox_conversion"}) { 
     175    #   my $convert_to = $self->{'convert_to'}; 
     176    #   my $fallback_convert_to = $convert_to; 
     177    #   if($convert_to =~ /^html$/) { 
     178    #       $fallback_convert_to = "pretty_html"; 
     179    #   } 
     180    #   elsif ($self->{'convert_to'} =~ /^pagedimg/) { 
     181    #       $fallback_convert_to = "paged_pretty_html"; 
     182    #   } 
     183    #   elsif ($self->{'convert_to'} =~ /^paged_text$/) { 
     184    #       #   print STDERR "@@@ Conversion to " . $self->{'convert_to'} , " with Xpdf Tools is not yet implemented.\n"; 
     185    #       $fallback_convert_to = "text"; 
     186    #   } 
     187 
     188    #   if($convert_to =~ /^(html|pagedimg|paged_text)/) { 
     189    #       &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($self->{'convert_to'}, $fallback_convert_to)); 
     190    #       $self->{'convert_to'} = $fallback_convert_to; 
     191    #   } 
     192    # } 
    190193     
    191194    # set convert_to_plugin and convert_to_ext 
     
    203206    # "1", which is often the page number at the top of the page. Bad Luck 
    204207    # if your document title actually starts with "1 " - is there a better way? 
    205     push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
     208#    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 
    206209    my $associate_tail_re = $self->{'associate_tail_re'}; 
    207210    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) { 
     
    264267    # ConvertBinaryFile init 
    265268    $self->SUPER::init(@_); 
    266     $self->AutoLoadConverters::init(@_); 
     269#   $self->AutoLoadConverters::init(@_); 
     270    $self->PDFBoxConverter::init(@_); 
    267271 
    268272} 
     
    271275    my $self = shift (@_); 
    272276 
    273     $self->AutoLoadConverters::begin(@_); 
     277#   $self->AutoLoadConverters::begin(@_); 
     278    $self->PDFBoxConverter::begin(@_); 
    274279    $self->SUPER::begin(@_); 
    275280 
     
    278283sub deinit { 
    279284    my $self = shift (@_); 
    280      
    281     $self->AutoLoadConverters::deinit(@_); 
     285 
     286    $self->PDFBoxConverter::deinit(@_); 
     287#   $self->AutoLoadConverters::deinit(@_); 
    282288    $self->SUPER::deinit(@_); 
    283289 
     
    302308} 
    303309   
    304    
     310 
    305311sub tmp_area_convert_file { 
    306312 
    307313    my $self = shift (@_); 
     314    my ($output_ext, $input_filename, $textref) = @_; 
    308315     
    309     if($self->{'convert_to'} =~ m/pretty_html$/) { # if outputting paged_pretty_html or pretty_html: 
    310     # only xpdftools can output pretty_html regardless of whether pdfbox_conversion is switched on 
    311     print STDERR "@@@@ PDFBox_conversion is switched on, but pretty_html variants are generated by xpdftools.\n"; 
     316    if($self->{'convert_to'} eq "text" || $self->{'convert_to'} =~ m/pretty_html$/) { # use xpdftools 
    312317    return $self->ConvertBinaryFile::tmp_area_convert_file(@_); 
    313318    } 
    314     # else, output format uses pdfbox: 
    315     return $self->AutoLoadConverters::tmp_area_convert_file(@_); 
    316  
     319     
     320    # for all other output formats, use pdfbox:     
     321    #return $self->AutoLoadConverters::tmp_area_convert_file(@_); 
     322    # Here, we now do what AutoLoadConverters::tmp_area_convert_file(@_) does: 
     323    my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext); 
     324    if (defined $result && $result != 0) { 
     325    return $new_filename; 
     326    } 
     327    my $outhandle=$self->{'outhandle'}; 
     328    print $outhandle "PDFBoxConverter had a conversion error\n"; 
     329    print $outhandle "$@\n"; 
     330    if (defined $result_str) { 
     331    print $outhandle "$result_str\n"; 
     332    } 
     333    return ""; 
    317334} 
    318335