# OCRImageConverter
#
# Media converter that runs the Tesseract command-line OCR tool over an image
# file and produces a plain-text (.txt) file.  The Tesseract binary is located
# through the GEXTOCR environment variable; whether it can actually be run is
# probed once, at compile time, in the BEGIN block below.
package OCRImageConverter;

use BaseMediaConverter;

# basename() is called fully qualified in convert(); load the (core) module
# explicitly rather than relying on another module having loaded it.
use File::Basename ();

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use gsprintf 'gsprintf';

# these two variables mustn't be initialised here or they will get stuck
# at those values.
our $ocrimage_conversion_available;
our $no_ocrimage_conversion_reason;

# Build the path of the tesseract executable below the given GEXTOCR home.
# Defined above BEGIN so that it is already compiled when BEGIN executes.
sub _tesseract_executable {
    my ($gextocr_home) = @_;

    return &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");
}

BEGIN {
    @OCRImageConverter::ISA = ('BaseMediaConverter');

    # Check that Tesseract is installed (under $GEXTOCR) and can be run.
    $ocrimage_conversion_available = 1;

    if (!defined $ENV{'GEXTOCR'}) {
        $ocrimage_conversion_available = 0;
        $no_ocrimage_conversion_reason = "gextocrnotinstalled";
    }
    else {
        my $tesseract = _tesseract_executable($ENV{'GEXTOCR'});

        if (!-e $tesseract) {
            &gsprintf(STDERR, "**** Failed to find $tesseract\n");
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexttesseractnotinstalled";
        }
        else {
            # Run "tesseract -v" with all output discarded; only the exit
            # status matters.
            my $cmd = "\"$tesseract\" -v ";

            if ($ENV{'GSDLOS'} =~ /^windows/i) {
                $cmd .= " >nul 2>&1";
            }
            else {
                # The order matters: ">/dev/null 2>&1" silences both streams,
                # whereas "2>&1 >/dev/null" still lets stderr reach the screen.
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message = "**** Testing for Tesseract\n";
                $error_message .= "Failed to run: $cmd\n";
                $error_message .= "Error variable: |$!| and status: $status\n";

                &gsprintf(STDERR, "OCRImageConverter: $error_message");

                $ocrimage_conversion_available = 0;
                $no_ocrimage_conversion_reason = "couldnotruntesseract";
            }
        }
    }
}

my $arguments = [ ];

my $options = { 'name'     => "OCRImageConverter",
                'desc'     => "{OCRImageConverter.desc}",
                'abstract' => "yes",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#
# Registers this class and its (empty) argument/option lists with the caller's
# lists, delegates construction to BaseMediaConverter, and then records either
# the Tesseract launch command ('ocrimage_launch_cmd') or the reason why OCR
# conversion is unavailable ('no_ocrimage_conversion_reason').
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($ocrimage_conversion_available) {
        my $tesseract = _tesseract_executable($ENV{'GEXTOCR'});

        my $lang = "eng"; # TODO
        my $launch_cmd = "\"$tesseract\" -l $lang ";

        $self->{'ocrimage_launch_cmd'} = $launch_cmd;
    }
    else {
        $self->{'no_ocrimage_conversion_reason'} = $no_ocrimage_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "OCRImageConverter: {OCRImageConverter.noconversionavailable} ({OCRImageConverter.$no_ocrimage_conversion_reason})\n");
    }

    $self->{'ocrimage_conversion_available'} = $ocrimage_conversion_available;

    return bless $self, $class;
}

# Reset the list of temporary files created by convert().  The verbosity and
# handle arguments are accepted but not used here.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->{'ocrtmp_file_paths'} = [];
}

# Remove any temporary files created by convert().
sub deinit {
    my $self = shift(@_);

    $self->clean_up_temporary_files();
}

# Run OCR over one image file.
#
# Parameter: $source_file_full_path - full path of the image to process.
#
# Returns 0 when conversion is unavailable, the source is not a plain file,
# or a required GIF -> TIFF pre-conversion produced no file.  Otherwise
# returns the list (success_flag, result, target_file_path), where
# success_flag is 0 if autorun_general_cmd() reported an error and 1 if not,
# and result is whatever autorun_general_cmd() returned.
#
# NOTE(review): file paths are interpolated into shell command strings inside
# double quotes only; a path containing '"' or shell metacharacters would
# break the command.  Confirm that callers only pass trusted paths.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path) = @_;

    # Cheap checks first, so nothing is spawned or allocated needlessly.
    return 0 unless $ocrimage_conversion_available;
    return 0 if (!defined $source_file_full_path || !-f $source_file_full_path);

    my $outhandle = $self->{'outhandle'} || \*STDERR;
    my $verbosity = $self->{'verbosity'} || 0;

    # Name used in progress messages: the file the caller gave us, not any
    # temporary file created below.
    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    if ($source_file_full_path =~ /\.gif\z/i) {
        # GIF input is converted to TIFF before OCR (presumably because
        # Tesseract cannot read GIF directly - TODO confirm).
        my $converted_file = &util::get_tmp_filename("tif");
        push(@{$self->{'ocrtmp_file_paths'}}, $converted_file);

        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl convert \"$source_file_full_path\" \"$converted_file\"";
        print $outhandle "OCRImageConverter: Magick command: $imagick_cmd\n" if ($verbosity > 2);

        my $magick_status = system($imagick_cmd);
        if ($magick_status != 0) {
            print $outhandle "OCRImageConverter: image conversion of $source_file_no_path exited with status $magick_status\n";
        }

        # Without a converted file there is nothing to run OCR on.
        return 0 if (!-f $converted_file);

        print $outhandle "OCRImageConverter: Converted file: $converted_file\n" if ($verbosity > 2);
        $source_file_full_path = $converted_file;
    }

    # TODO cache: for GIF input the cache is keyed on the temporary TIFF.
    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);

        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        # The ".txt" suffix is required: the output base handed to Tesseract
        # below is this path minus ".txt", and the file Tesseract writes must
        # be the same one passed to autorun_general_cmd().
        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, "$file_root.txt");
    }
    else {
        $target_file_path = &util::get_tmp_filename("txt");
        push(@{$self->{'ocrtmp_file_paths'}}, $target_file_path);
    }

    # Tesseract takes an output *base* name and appends ".txt" itself.
    (my $output_base = $target_file_path) =~ s/\.txt\z//i;

    my $convert_cmd = $self->{'ocrimage_launch_cmd'};
    $convert_cmd .= " \"$source_file_full_path\" \"$output_base\"";

    print $outhandle "OCRImageConverter convert command: $convert_cmd\n" if ($verbosity > 2);

    my $print_info = { 'message_prefix' => "OCR Conversion",
                       'message'        => "Converting $source_file_no_path." };

    my ($regenerated, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);

    return (($had_error ? 0 : 1), $result, $target_file_path);
}

# Wrapper around convert().  The extra arguments are passed through unchanged;
# convert() as defined in this file only reads the source file path.
sub convert_without_result {
    my $self = shift(@_);

    my $source_file_path = shift(@_);
    my $target_file_type = shift(@_);
    my $convert_options  = shift(@_) || "";
    my $convert_id       = shift(@_) || "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}

# Delete every temporary file or directory recorded by convert(), then reset
# the list.
sub clean_up_temporary_files {
    my $self = shift(@_);

    foreach my $ocrtmp_file_path (@{ $self->{'ocrtmp_file_paths'} || [] }) {
        if (-d $ocrtmp_file_path) {
            &FileUtils::removeFilesRecursive($ocrtmp_file_path);
        }
        elsif (-e $ocrtmp_file_path) {
            &FileUtils::removeFiles($ocrtmp_file_path);
        }
    }

    $self->{'ocrtmp_file_paths'} = [];
}

1;