- Timestamp: 2018-07-17T22:13:17+12:00 (6 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm
r32273 r32282 129 129 $self->{'pdfbox_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText"; 130 130 $self->{'pdfbox_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"<br />\" org.apache.pdfbox.tools.ExtractText"; 131 #$self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png) 132 # Now: use this cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png) 133 # AND its extracted text. An item file is still generated, but this time referring to txtfiles too, not just the images. Result: searchable paged output. 131 # $self->{'pdfbox_img_launch_cmd'} = "java -cp \"$pbajar\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (gif, jpg, png) 132 133 # We use this next cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png) 134 # AND its extracted text. Or just each page's extracted text. An item file is still generated, 135 # but this time referring to txtfiles too, not just the images. Result: searchable paged output. 134 136 # Our new custom class PDFBoxToImagesAndText.java lives in the new build folder, so add that to the classpath for the launch cmd 135 137 my $pdfbox_build = &FileUtils::filenameConcatenate($gextpb_home,"build"); 136 138 my $classpath = &util::pathname_cat($pbajar,$pdfbox_build); 137 $self->{'pdfbox_img _launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";139 $self->{'pdfbox_imgtxt_launch_cmd'} = "java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText"; 138 140 } 139 141 else { … … 179 181 180 182 my $img_output_mode = 0; 181 183 184 my $convert_to = $self->{'convert_to'}; 185 my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 
1 : 0; 186 182 187 # the following line is necessary to avoid 'uninitialised variable' error 183 188 # messages concerning the converted_to member variable when PDFPlugin's … … 187 192 if ($target_file_type eq "html") { 188 193 $self->{'converted_to'} = "HTML"; 189 } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") { 194 } elsif ($target_file_type eq "jpg" || $target_file_type eq "gif" || $target_file_type eq "png") { 195 # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage 190 196 $self->{'converted_to'} = $target_file_type; 191 197 $img_output_mode = 1; … … 208 214 # append the output filetype suffix only for non-image output formats, since for 209 215 # images we can be outputting multiple image files per single PDF input file 210 my $target_file = $img_output_mode? "$file_root" : "$file_root.$target_file_type";216 my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type"; 211 217 212 218 $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file); … … 218 224 # for image files, remove the suffix, since we can have many output image files 219 225 # per input PDF (one img for each page of the PDF, for example) 220 if($img_output_mode ) {226 if($img_output_mode || $paged_txt_output_mode) { 221 227 $target_file_path =~ s/\.[^.]*$//g; 222 228 if(!&FileUtils::directoryExists($target_file_path)) { … … 229 235 # item file generated in it can be deleted in one go on clean_up 230 236 } 231 237 232 238 push(@{$self->{'pbtmp_file_paths'}}, $target_file_path); 233 239 } … … 240 246 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$"); 241 247 242 if($img_output_mode ) { # converting to images248 if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text 243 249 my $output_prefix = 
&FileUtils::filenameConcatenate($target_file_path, $tailname); 244 250 245 $convert_cmd = $self->{'pdfbox_img_launch_cmd'}; 246 $convert_cmd .= " -imageType $target_file_type"; 251 #$convert_cmd = $paged_txt_output_mode ? $self->{'pdfbox_imgtxt_launch_cmd'} : $self->{'pdfbox_img_launch_cmd'}; 252 $convert_cmd = $self->{'pdfbox_imgtxt_launch_cmd'}; 253 $convert_cmd .= " -textOnly" unless($img_output_mode); # if paged txt only and no images 254 $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too 255 $convert_cmd .= " -imageType $target_file_type" if($img_output_mode); 247 256 $convert_cmd .= " -outputPrefix \"$output_prefix\""; 248 257 $convert_cmd .= " \"$source_file_full_path\""; 249 258 250 } else { # html or text259 } else { # single stream of text or html 251 260 252 261 if ($target_file_type eq "html") { … … 270 279 = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info); 271 280 272 if($img_output_mode ) {281 if($img_output_mode || $paged_txt_output_mode) { 273 282 # now the images have been generated, generate the "$target_file_path/tailname.item" 274 283 # item file for them, which is also the target_file_path that needs to be returned
Note: See TracChangeset for help on using the changeset viewer.