Ignore:
Timestamp:
2012-07-19T18:51:55+12:00 (12 years ago)
Author:
ak19
Message:

First set of commits that will allow PDFBoxConverter to convert the PDF pages to images when the -pagedimg_IMGTYPE flag is set. There is still something that prevents the build from completing, but as the original ExtractText functionality of the PDFBoxConverter is perfectly intact, these changes for the image conversion can be committed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

    r25513 r25995  
    127127   
    128128    $self->{'pdfbox_launch_cmd'} = $launch_cmd;
     129    $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.PDFToImage"; # cmd for converting pages to images (gif, jpg, png)
    129130    }
    130131    else {       
     
    162163    # check the filename
    163164    return 0 if ( !-f $source_file_full_path);
     165
     166    my $img_output_mode = 0;
    164167
    165168    # the following line is necessary to avoid 'uninitialised variable' error
     
    170173    if ($target_file_type eq "html") {
    171174    $self->{'converted_to'} = "HTML";
     175    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
     176    $self->{'converted_to'} = $target_file_type;   
     177    $img_output_mode = 1;
    172178    } else {
    173179    $self->{'converted_to'} = "text";
     
    185191    my $file_root = $self->{'cached_file_root'};
    186192    #$file_root .= "_$convert_id" if ($convert_id ne "");
    187     my $target_file = "$file_root.$target_file_type";
     193
     194    # append the output filetype suffix only for non-image output formats, since for
     195    # images we can be outputting multiple image files per single PDF input file
     196    my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";
     197
    188198    $target_file_path = &util::filename_cat($cache_dir,$target_file);
    189199    }
     
    191201    # this is in gsdl/tmp. get a tmp filename in collection instead???
    192202    $target_file_path = &util::get_tmp_filename($target_file_type);
     203
     204    # for image files, remove the suffix, since we can have many output image files
     205    # per input PDF (one img for each page of the PDF, for example)
     206    if($img_output_mode) {
     207        $target_file_path =~ s/\.[^.]*$//g;
     208        if(!&util::dir_exists($target_file_path)) {     
     209        mkdir($target_file_path);
     210        }
     211       
     212        # once the item file for the imgs has been created, need to adjust target_file_path
     213
     214        # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
     215        # item file generated in it can be deleted in one go on clean_up
     216    }
     217
    193218    push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    194219    }
    195220
    196221    # Generate and run the convert command
    197     my $convert_cmd = $self->{'pdfbox_launch_cmd'};
    198     $convert_cmd .= " -html" if ($target_file_type eq "html");
    199     $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
     222    my $convert_cmd = "";
     223
     224    # want the filename without extension, because any images
     225    # are to be generated with the same filename as the PDF
     226    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");
     227
     228    if($img_output_mode) { # converting to images
     229    my $output_prefix = &util::filename_cat($target_file_path, $tailname);
     230   
     231    $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
     232    $convert_cmd .= " -imageType $target_file_type";
     233    $convert_cmd .= " -outputPrefix $output_prefix";
     234    $convert_cmd .= " \"$source_file_full_path\"";
     235   
     236    } else { # html or text
     237    $convert_cmd = $self->{'pdfbox_launch_cmd'};
     238    $convert_cmd .= " -html" if ($target_file_type eq "html");
     239    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
     240    }
    200241
    201242    if ($verbosity>2) {
     
    209250    my ($regenerated,$result,$had_error)
    210251    = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
     252
     253    if($img_output_mode) {
     254    # now the images have been generated, generate the "$target_file_path/tailname.item"
     255    # item file for them, which is also the target_file_path that needs to be returned
     256    $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
     257    #print STDERR "**** item file: $target_file_path\n";
     258    }
     259   
    211260    if ($had_error) {
    212261    return (0, $result,$target_file_path);
     
    231280
    232281    foreach my $pbtmp_file_path (@{$self->{'pbtmp_file_paths'}}) {
    233     if (-e $pbtmp_file_path) {
     282    if (-d $pbtmp_file_path) {
     283        #print STDERR "@@@@@@ cleanup called on $pbtmp_file_path\n";
     284        &util::rm_r($pbtmp_file_path);
     285    }
     286    elsif (-e $pbtmp_file_path) {
    234287        &util::rm($pbtmp_file_path);
    235288    }
Note: See TracChangeset for help on using the changeset viewer.