Context Navigation

← Previous Change
Next Change →

Changeset 25995 for gs2-extensions/pdf-box

Timestamp:

2012-07-19T18:51:55+12:00 (12 years ago)

Author:

ak19

Message:

First set of changes that will allow PDFBoxConverter to convert PDF pages to images when the -pagedimg_IMGTYPE flag is set. Something still prevents the build from completing, but because the original ExtractText functionality of PDFBoxConverter is perfectly intact, these image-conversion changes can be committed.

File:

1 edited

gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

-              r25513
+              r25995
     $self->{'pdfbox_launch_cmd'} = $launch_cmd;
+    $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.PDFToImage"; # cmd for converting pages to images (gif, jpg, png)
+    }
     else {
 …
     # check the filename
     return 0 if ( !-f $source_file_full_path);
+    my $img_output_mode = 0;
     # the following line is necessary to avoid 'uninitialised variable' error
 …
     if ($target_file_type eq "html") {
     $self->{'converted_to'} = "HTML";
+    } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") {
+    $self->{'converted_to'} = $target_file_type;
+    $img_output_mode = 1;
     } else {
     $self->{'converted_to'} = "text";
 …
     my $file_root = $self->{'cached_file_root'};
     #$file_root .= "_$convert_id" if ($convert_id ne "");
+    my $target_file = "$file_root.$target_file_type";
+    # append the output filetype suffix only for non-image output formats, since for
+    # images we can be outputting multiple image files per single PDF input file
+    my $target_file = $img_output_mode ? "$file_root" : "$file_root.$target_file_type";
     $target_file_path = &util::filename_cat($cache_dir,$target_file);
+    }
 …
     # this is in gsdl/tmp. get a tmp filename in collection instead???
     $target_file_path = &util::get_tmp_filename($target_file_type);
+    # for image files, remove the suffix, since we can have many output image files
+    # per input PDF (one img for each page of the PDF, for example)
+    if($img_output_mode) {
+        $target_file_path =~ s/\.[^.]*$//g;
+        if(!&util::dir_exists($target_file_path)) {
+        mkdir($target_file_path);
+        }
+        # once the item file for the imgs has been created, need to adjust target_file_path
+        # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
+        # item file generated in it can be deleted in one go on clean_up
+    }
     push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
+    }
     # Generate and run the convert command
+    my $convert_cmd = $self->{'pdfbox_launch_cmd'};
+    $convert_cmd .= " -html" if ($target_file_type eq "html");
+    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
+    my $convert_cmd = "";
+    # want the filename without extension, because any images
+    # are to be generated with the same filename as the PDF
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");
+    if($img_output_mode) { # converting to images
+    my $output_prefix = &util::filename_cat($target_file_path, $tailname);
+    $convert_cmd = $self->{'pdfbox_img_launch_cmd'};
+    $convert_cmd .= " -imageType $target_file_type";
+    $convert_cmd .= " -outputPrefix $output_prefix";
+    $convert_cmd .= " \"$source_file_full_path\"";
+    } else { # html or text
+    $convert_cmd = $self->{'pdfbox_launch_cmd'};
+    $convert_cmd .= " -html" if ($target_file_type eq "html");
+    $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
+    }
     if ($verbosity>2) {
 …
     my ($regenerated,$result,$had_error)
     = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
+    if($img_output_mode) {
+    # now the images have been generated, generate the "$target_file_path/tailname.item"
+    # item file for them, which is also the target_file_path that needs to be returned
+    $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
+    #print STDERR "**** item file: $target_file_path\n";
+    }
     if ($had_error) {
     return (0, $result,$target_file_path);
 …
     foreach my $pbtmp_file_path (@{$self->{'pbtmp_file_paths'}}) {
+    if (-e $pbtmp_file_path) {
+    if (-d $pbtmp_file_path) {
+        #print STDERR "@@@@@@ cleanup called on $pbtmp_file_path\n";
+        &util::rm_r($pbtmp_file_path);
+    }
+    elsif (-e $pbtmp_file_path) {
         &util::rm($pbtmp_file_path);
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 25995 for gs2-extensions/pdf-box

Legend:

gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

Download in other formats: