Ignore:
Timestamp:
2018-06-15T18:10:25+12:00 (6 years ago)
Author:
ak19
Message:

Fixes an issue where PDFBox-to-text conversion produced output that was actually HTML wrapped in pre tags rather than plain text. It is unclear whether this issue was introduced by the recent commit upgrading PDFBox from version 1.8.2 to 2.0.9, or whether it already existed. With this fix, text conversion with PDFBox produces actual plain text, while HTML conversion still produces the old, basic HTML, which does not preserve any images from the source PDF.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

    r32197 r32198  
    127127    my $pbajar = &FileUtils::filenameConcatenate($gextpb_home,"lib","java","pdfbox-app.jar");
    128128    my $java = &util::get_java_command();
    129     my $launch_cmd = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";
    130    
    131     $self->{'pdfbox_launch_cmd'} = $launch_cmd;
     129    $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
     130    $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";
    132131    #$self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.0.9 cmd for converting each PDF page to an image (gif, jpg, png)
    133132    # Now: use this cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png)
     
    244243   
    245244    } else { # html or text
    246     $convert_cmd = $self->{'pdfbox_launch_cmd'};
    247     $convert_cmd .= " -html" if ($target_file_type eq "html");
     245   
     246    if ($target_file_type eq "html") {
     247        $convert_cmd = $self->{'pdfbox_html_launch_cmd'};
     248        $convert_cmd .= " -html" if ($target_file_type eq "html");
     249    } else {
     250        $convert_cmd = $self->{'pdfbox_txt_launch_cmd'};
     251    }
    248252    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    249253    }
Note: See TracChangeset for help on using the changeset viewer.