package OCRImageConverter;

# OCRImageConverter
#
# Media converter that runs the Tesseract OCR command-line tool over an image
# and, from the hOCR output, writes a JSON file of [text, [bbox numbers]]
# entries next to it. GIF sources are first converted to another image format
# (and optionally cleaned with unpaper) before being handed to Tesseract.
#
# Tesseract is located through the GEXTOCR environment variable; availability
# is probed once, at compile time, in the BEGIN block below.

use BaseMediaConverter;

use strict;
use warnings;
no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

# convert() calls File::Basename::basename fully qualified; load the module
# here rather than relying on some other module having loaded it already.
use File::Basename ();

use gsprintf 'gsprintf';

# These two variables mustn't be initialised here or they will get stuck
# at those values (the BEGIN block below assigns them at compile time).
our $ocrimage_conversion_available;
our $no_ocrimage_conversion_reason;

BEGIN {
    @OCRImageConverter::ISA = ('BaseMediaConverter');

    # Check that Tesseract is installed and can actually be run
    $ocrimage_conversion_available = 1;

    if (!defined $ENV{'GEXTOCR'}) {
        $ocrimage_conversion_available = 0;
        $no_ocrimage_conversion_reason = "gextocrnotinstalled";
    }
    else {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");

        if (!-e $tesseract) {
            &gsprintf(STDERR, "**** Failed to find $tesseract\n");
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexttesseractnotinstalled";
        }
        else {
            # Run "tesseract -v" with all output discarded; only the exit
            # status matters here.
            my $cmd = "\"$tesseract\" -v ";

            if ($ENV{'GSDLOS'} =~ /^windows/i) {
                $cmd .= " >nul 2>&1";
            }
            else {
                # The order matters on Linux: ">/dev/null 2>&1" silences both
                # streams, whereas "2>&1 >/dev/null" still prints to screen.
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message = "**** Testing for Tesseract\n";
                $error_message .= "Failed to run: $cmd\n";
                $error_message .= "Error variable: |$!| and status: $status\n";

                &gsprintf(STDERR, "OCRImageConverter: $error_message");

                $ocrimage_conversion_available = 0;
                $no_ocrimage_conversion_reason = "couldnotruntesseract";
            }
        }

        # Extra Perl library directory shipped with the extension; it is added
        # to @INC so that the "use HTML::TokeParser" below can find the module.
        my $cpan = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "cpan", "lib64", "perl5");
        &gsprintf(STDERR, "OCRImageConverter: CPAN directory $cpan\n");

        if (!-d $cpan) {
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexthtmltokenotinstalled";
        }
        else {
            push(@INC, $cpan);
        }
    }
}

# Must come after the BEGIN block above, which may extend @INC.
use HTML::TokeParser;

my $layout_list = [
    { 'name' => "none",
      'desc' => "{OCRImageConverter.unpaper_layout.none}" },
    { 'name' => "single",
      'desc' => "{OCRImageConverter.unpaper_layout.single}" },
    { 'name' => "double",
      'desc' => "{OCRImageConverter.unpaper_layout.double}" }
];

my $arguments = [
    { 'name' => 'use_unpaper',
      'desc' => "{OCRImageConverter.use_unpaper}",
      'type' => "flag",
      'deft' => 0,
      'reqd' => "no" },
    { 'name' => 'unpaper_layout',
      'desc' => "{OCRImageConverter.unpaper_layout}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $layout_list,
      'deft' => "none" }
];

# Only offered when the GEXTDIVA environment variable is defined (see new()).
my $opt_diva_args = [
    { 'name' => 'diva_support',
      'desc' => "{OCRImageConverter.diva_support}",
      'type' => "flag",
      'deft' => "yes",
      'reqd' => "no" }
];

# True once $opt_diva_args has been appended to $arguments, so that creating
# several converters does not register the same argument more than once.
my $diva_args_registered = 0;

my $options = {
    'name'     => "OCRImageConverter",
    'desc'     => "{OCRImageConverter.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments
};


# Constructor.
#
# Arguments: ($pluginlist, $inputargs, $hashArgOptLists, $auxilary), all passed
# straight through to the BaseMediaConverter constructor after this class has
# appended its own name, argument list and option block.
#
# Returns the object blessed into $class. When OCR is available the object
# carries 'ocrimage_launch_cmd'; otherwise it carries
# 'no_ocrimage_conversion_reason' and a message is printed to its outhandle.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;

    push(@$pluginlist, $class);

    # $arguments is shared file-level state: append the Diva arguments once
    # only, otherwise every further call would add another copy.
    if (defined $ENV{'GEXTDIVA'} && !$diva_args_registered) {
        push(@$arguments, @$opt_diva_args);
        $diva_args_registered = 1;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($ocrimage_conversion_available) {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");

        my $lang = "eng"; # TODO
        my $launch_cmd = "\"$tesseract\" -l $lang ";

        $self->{'ocrimage_launch_cmd'} = $launch_cmd;
    }
    else {
        $self->{'no_ocrimage_conversion_reason'} = $no_ocrimage_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "OCRImageConverter: {OCRImageConverter.noconversionavailable} ({OCRImageConverter.$no_ocrimage_conversion_reason})\n");
    }

    $self->{'ocrimage_conversion_available'} = $ocrimage_conversion_available;

    return bless $self, $class;
}


# Resets the list of temporary files that this converter has created.
# The ($verbosity, $outhandle, $failhandle) arguments are accepted but unused.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Must be an array reference: assigning () to a scalar slot stores undef.
    $self->{'ocrtmp_file_paths'} = [];
}


# Removes any temporary files recorded by convert().
sub deinit {
    my $self = shift(@_);

    $self->clean_up_temporary_files();
}


# Runs OCR over one image file.
#
# Argument: the full path of the source image. Any further arguments (such as
# those forwarded by convert_without_result) are ignored.
#
# Returns plain 0 when OCR is unavailable or the file to process does not
# exist. Otherwise returns the list ($ok, $result, $target_file_path), where
# $ok is 0 if the first Tesseract run reported an error and 1 if not, and
# $result is whatever autorun_general_cmd returned for that run.
#
# Dies if the hOCR file cannot be opened afterwards.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path) = @_;

    # Check availability first so that no image conversion is launched (and no
    # temporary file is created) when the OCR step cannot run anyway.
    return 0 unless $ocrimage_conversion_available;

    # A GIF needs to be converted to another format first
    if ($source_file_full_path =~ /\.gif\z/i) {
        $source_file_full_path = $self->_ocr_convert_gif($source_file_full_path);
    }

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    return 0 if (!-f $source_file_full_path);

    my $target_file_path;
    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};
        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, $file_root);
    }
    else {
        $target_file_path = &util::get_tmp_filename("txt");
        push(@{$self->{'ocrtmp_file_paths'}}, $target_file_path);
    }

    # Tesseract is given the output name without its last four characters,
    # i.e. without the ".txt" suffix in the temporary-file case.
    # NOTE(review): in the cached case no extension is appended above, so this
    # strips four characters of the cached file root - confirm that is intended.
    my $target_base = substr($target_file_path, 0, length($target_file_path) - 4);

    my $convert_cmd = $self->{'ocrimage_launch_cmd'};
    $convert_cmd .= " \"$source_file_full_path\" \"$target_base\"";

    print STDERR "OCRImageConverter convert command: $convert_cmd\n";

    my $print_info = {
        'message_prefix' => "OCR Conversion",
        'message'        => "Converting $source_file_no_path."
    };

    # First run: the plain conversion whose outcome is reported to the caller.
    my ($regenerated, $result, $had_error)
        = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);

    # Second run: same command with the "hocr" argument; its outcome is not
    # inspected here.
    $self->autorun_general_cmd($convert_cmd . " hocr", $source_file_full_path, $target_file_path, $print_info);

    # HOCR -> JSON
    # NOTE(review): the .hocr and .hocr.json files are not recorded in
    # 'ocrtmp_file_paths', so clean_up_temporary_files() does not remove them;
    # confirm whether a later stage still needs them.
    my $hocr = $target_base . ".hocr";
    my $json = _ocr_hocr_to_json($hocr);

    my $json_path = $hocr . '.json';
    if (open(my $oh, '>', $json_path)) {
        print $oh $json;
        close($oh) || print STDERR "OCRImageConverter: failed to close $json_path: $!\n";
    }
    else {
        # Best effort, as before: report the problem and carry on.
        print STDERR "OCRImageConverter: failed to write $json_path: $!\n";
    }

    # Diva.js
    #if (defined $EXT{'GEXTDIVA'}) {
    #    if ($self->{'diva_support'}) {
    #        my $webroot = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, '..', 'packages', 'tomcat', 'webapps', 'iipsrv', 'collect', '');
    #        my $processpy = &FileUtils::filenameConcatenate($ENV{'GEXTDIVA_INSTALLED'}, 'cmdline', 'python');
    #        my $DS = &util::get_dirsep();
    #        my $convert = `which convert`;
    #
    #        $print_info = {
    #            'message_prefix' => "Diva.js processing",
    #            'message' => "Now processing images for use with Diva.js..."
    #        };
    #
    #        my $process_cmd = "python \"$processpy\" \"\" \"$webroot${DS}img\" \"$webroot${DS}data\" -t tiff -i \"$convert\"";
    #        $self->autorun_general_cmd($process_cmd, "", $print_info);
    #    }
    #}

    if ($had_error) {
        return (0, $result, $target_file_path);
    }

    return (1, $result, $target_file_path);
}


# Converts a GIF into a temporary "tif" file (or "pnm" when 'use_unpaper' is
# set, in which case unpaper is then run over it in place) and returns the
# path of the converted file. The temporary file is recorded for clean-up.
sub _ocr_convert_gif {
    my $self = shift(@_);
    my ($gif_path) = @_;

    my $desired_extension = $self->{'use_unpaper'} ? "pnm" : "tif";

    my $converted_file = &util::get_tmp_filename($desired_extension);
    push(@{$self->{'ocrtmp_file_paths'}}, $converted_file);

    my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl convert \"$gif_path\" \"$converted_file\"";
    system($imagick_cmd);
    print STDERR "*** Magick command: $imagick_cmd\n";

    if ($self->{'use_unpaper'}) {
        my $unpaper_cmd = "unpaper \"$converted_file\" \"$converted_file\" --overwrite --layout " . $self->{'unpaper_layout'};
        system($unpaper_cmd);
        print STDERR "*** unpaper command: $unpaper_cmd\n";
    }

    print STDERR "*** Converted file: $converted_file\n";

    return $converted_file;
}


# Reads the hOCR file at $hocr_path and returns a JSON array, as a string, of
# ["text",[n0,n1,n2,n3]] entries. An entry is produced for each sequence of
# <span> start tag, then text, then any end tag; the numbers are the first
# four integers found in that span's title attribute. Spans whose title holds
# fewer than four integers are skipped. Returns "[]" when nothing matches.
#
# Dies if the file cannot be opened.
sub _ocr_hocr_to_json {
    my ($hocr_path) = @_;

    open(my $fh, '<', $hocr_path) || die "Can't open HOCR file $hocr_path: $!";

    my $parser = HTML::TokeParser->new($fh);

    my @entries = ();
    my $state = "idle";
    my $bbox = "";
    my $text = "";

    while (my $token = $parser->get_token()) {
        my $token_type = $token->[0];

        if ($state eq "idle" && $token_type eq "S" && $token->[1] eq "span") {
            $state = "inSpan";
            $bbox = defined $token->[2]{'title'} ? $token->[2]{'title'} : "";
        }
        elsif ($state eq "inSpan" && $token_type eq "T") {
            $text = $token->[1];
            chomp $text;
            $state = "gotText";
        }
        elsif ($state eq "gotText" && $token_type eq "E") {
            $state = "idle";
            #next unless $text =~ /\w/;

            my @numbers = $bbox =~ /(\d+)/g;
            # Without four numbers the entry could not be written as valid JSON
            next if (scalar(@numbers) < 4);

            push(@entries, "[\"" . _ocr_json_escape($text) . "\",[" . join(",", @numbers[0 .. 3]) . "]]");
        }
    }

    close($fh);

    # Joining a list (rather than trimming a trailing comma) keeps the result
    # valid when there are no entries at all.
    return "[" . join(",", @entries) . "]";
}


# Returns $text escaped for use inside a double-quoted JSON string: double
# quotes, backslashes and control characters are escaped. An undefined value
# is treated as the empty string.
sub _ocr_json_escape {
    my ($text) = @_;

    return "" unless defined $text;

    $text =~ s/(["\\])/\\$1/g;
    $text =~ s/\n/\\n/g;
    $text =~ s/\r/\\r/g;
    $text =~ s/\t/\\t/g;
    $text =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;

    return $text;
}


# Debugging aid: prints a fixed message to STDERR.
sub test {
    my $self = shift(@_);

    print STDERR "**** test working\n";
}


# Calls convert() with the given source path. The remaining arguments (target
# file type, convert options, convert id and a "without_result" marker) are
# passed along as well, although convert() only reads the source path.
sub convert_without_result {
    my $self = shift(@_);

    my $source_file_path = shift(@_);
    my $target_file_type = shift(@_);
    my $convert_options  = shift(@_) || "";
    my $convert_id       = shift(@_) || "";

    return $self->convert($source_file_path, $target_file_type,
                          $convert_options, $convert_id, "without_result");
}


# Deletes every file or directory recorded in 'ocrtmp_file_paths' that still
# exists, then resets the list to empty.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate the list never having been set up (init() not called yet)
    my $tmp_paths = $self->{'ocrtmp_file_paths'} || [];

    foreach my $ocrtmp_file_path (@$tmp_paths) {
        if (-d $ocrtmp_file_path) {
            &FileUtils::removeFilesRecursive($ocrtmp_file_path);
        }
        elsif (-e $ocrtmp_file_path) {
            &FileUtils::removeFiles($ocrtmp_file_path);
        }
    }

    $self->{'ocrtmp_file_paths'} = [];
}


1;