Show
Ignore:
Timestamp:
17.07.2018 22:13:17 (13 months ago)
Author:
ak19
Message:

This was meant to be part of commit 32278, where I forgot to commit the updated PDFBoxConverter.pm. The commit message for 32278 was: Our custom pdf-box class PDFBoxToImagesAndText.java now takes two additional flags, textOnly and imagesOnly, which can be used to support paged_text and the original pagedimg_ output formats, besides pagedimgtxt_.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

    r32273 r32282  
    129129    $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText"; 
    130130    $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText"; 
    131     #$self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png) 
    132     # Now: use this cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png) 
    133     # AND its extracted text. An item file is still generated, but this time referring to txtfiles too, not just the images. Result: searchable paged output. 
     131#   $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png) 
     132 
     133    # We use this next cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png) 
     134    # AND its extracted text. Or just each page's extracted text. An item file is still generated, 
     135    # but this time referring to txtfiles too, not just the images. Result: searchable paged output. 
    134136    # Our new custom class PDFBoxToImagesAndText.java lives in the new build folder, so add that to the classpath for the launch cmd 
    135137    my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build"); 
    136138    my $classpath = &util::pathname_cat($pbajar,$pdfbox_build);  
    137     $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText"; 
     139    $self->{'pdfbox_imgtxt_launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText"; 
    138140    } 
    139141    else {        
     
    179181     
    180182    my $img_output_mode = 0; 
    181  
     183     
     184    my $convert_to = $self->{'convert_to'}; 
     185    my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0; 
     186     
    182187    # the following line is necessary to avoid 'uninitialised variable' error 
    183188    # messages concerning the converted_to member variable when PDFPlugin's  
     
    187192    if ($target_file_type eq "html") { 
    188193    $self->{'converted_to'} = "HTML"; 
    189     } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") { 
     194    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {  
     195    # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage 
    190196    $self->{'converted_to'} = $target_file_type;     
    191197    $img_output_mode = 1; 
     
    208214    # append the output filetype suffix only for non-image output formats, since for 
    209215    # images we can be outputting multiple image files per single PDF input file 
    210     my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type"; 
     216    my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type"; 
    211217 
    212218    $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file); 
     
    218224    # for image files, remove the suffix, since we can have many output image files 
    219225    # per input PDF (one img for each page of the PDF, for example) 
    220     if($img_output_mode) { 
     226    if($img_output_mode || $paged_txt_output_mode) { 
    221227        $target_file_path =~ s/\.[^.]*$//g; 
    222228        if(!&FileUtils::directoryExists($target_file_path)) {        
     
    229235        # item file generated in it can be deleted in one go on clean_up 
    230236    } 
    231  
     237     
    232238    push(@{$self->{'pbtmp_file_paths'}}, $target_file_path); 
    233239    } 
     
    240246    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$"); 
    241247 
    242     if($img_output_mode) { # converting to images 
     248    if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text 
    243249    my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname); 
    244250     
    245     $convert_cmd = $self->{'pdfbox_img_launch_cmd'}; 
    246     $convert_cmd .= " -imageType $target_file_type"; 
     251    #$convert_cmd = $paged_txt_output_mode ? $self->{'pdfbox_imgtxt_launch_cmd'} : $self->{'pdfbox_img_launch_cmd'}; 
     252    $convert_cmd = $self->{'pdfbox_imgtxt_launch_cmd'}; 
     253    $convert_cmd .= " -textOnly" unless($img_output_mode); # if paged txt only and no images 
     254    $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too 
     255    $convert_cmd .= " -imageType $target_file_type" if($img_output_mode); 
    247256    $convert_cmd .= " -outputPrefix \"$output_prefix\""; 
    248257    $convert_cmd .= " \"$source_file_full_path\""; 
    249258     
    250     } else { # html or text 
     259    } else { # single stream of text or html 
    251260     
    252261    if ($target_file_type eq "html") { 
     
    270279    = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info); 
    271280 
    272     if($img_output_mode) { 
     281    if($img_output_mode || $paged_txt_output_mode) { 
    273282    # now the images have been generated, generate the "$target_file_path/tailname.item"  
    274283    # item file for them, which is also the target_file_path that needs to be returned