Ignore:
Timestamp:
2018-07-17T22:13:17+12:00 (6 years ago)
Author:
ak19
Message:

This was meant to be part of commit 32278, where I forgot to commit the updated PDFBoxConverter.pm. The commit message for 32278 was: Our custom pdf-box class PDFToImagesAndText.java now takes two additional flags, textOnly and imagesOnly, which can be used to support paged_text and the original pagedimg_ output formats, besides pagedimgtxt_

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

    r32273 r32282  
    129129    $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
    130130    $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText";
    131     #$self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png)
    132     # Now: use this cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png)
    133     # AND its extracted text. An item file is still generated, but this time referring to txtfiles too, not just the images. Result: searchable paged output.
     131#   $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png)
     132
     133    # We use this next cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png)
     134    # AND its extracted text. Or just each page's extracted text. An item file is still generated,
     135    # but this time referring to txtfiles too, not just the images. Result: searchable paged output.
    134136    # Our new custom class PDFBoxToImagesAndText.java lives in the new build folder, so add that to the classpath for the launch cmd
    135137    my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build");
    136138    my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);
    137     $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
     139    $self->{'pdfbox_imgtxt_launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
    138140    }
    139141    else {       
     
    179181   
    180182    my $img_output_mode = 0;
    181 
     183   
     184    my $convert_to = $self->{'convert_to'};
     185    my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0;
     186   
    182187    # the following line is necessary to avoid 'uninitialised variable' error
    183188    # messages concerning the converted_to member variable when PDFPlugin's
     
    187192    if ($target_file_type eq "html") {
    188193    $self->{'converted_to'} = "HTML";
    189     } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
     194    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
     195    # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage
    190196    $self->{'converted_to'} = $target_file_type;   
    191197    $img_output_mode = 1;
     
    208214    # append the output filetype suffix only for non-image output formats, since for
    209215    # images we can be outputting multiple image files per single PDF input file
    210     my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";
     216    my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type";
    211217
    212218    $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
     
    218224    # for image files, remove the suffix, since we can have many output image files
    219225    # per input PDF (one img for each page of the PDF, for example)
    220     if($img_output_mode) {
     226    if($img_output_mode || $paged_txt_output_mode) {
    221227        $target_file_path =~ s/\.[^.]*$//g;
    222228        if(!&FileUtils::directoryExists($target_file_path)) {       
     
    229235        # item file generated in it can be deleted in one go on clean_up
    230236    }
    231 
     237   
    232238    push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    233239    }
     
    240246    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");
    241247
    242     if($img_output_mode) { # converting to images
     248    if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text
    243249    my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);
    244250   
    245     $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
    246     $convert_cmd .= " -imageType $target_file_type";
     251    #$convert_cmd = $paged_txt_output_mode ? $self->{'pdfbox_imgtxt_launch_cmd'} : $self->{'pdfbox_img_launch_cmd'};
     252    $convert_cmd = $self->{'pdfbox_imgtxt_launch_cmd'};
     253    $convert_cmd .= " -textOnly" unless($img_output_mode); # if paged txt only and no images
     254    $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too
     255    $convert_cmd .= " -imageType $target_file_type" if($img_output_mode);
    247256    $convert_cmd .= " -outputPrefix \"$output_prefix\"";
    248257    $convert_cmd .= " \"$source_file_full_path\"";
    249258   
    250     } else { # html or text
     259    } else { # single stream of text or html
    251260   
    252261    if ($target_file_type eq "html") {
     
    270279    = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
    271280
    272     if($img_output_mode) {
     281    if($img_output_mode || $paged_txt_output_mode) {
    273282    # now the images have been generated, generate the "$target_file_path/tailname.item"
    274283    # item file for them, which is also the target_file_path that needs to be returned
Note: See TracChangeset for help on using the changeset viewer.