Changeset 30201
- Timestamp: 2015-09-04T17:29:04+12:00 (8 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/ocr/trunk/perllib/plugins/OCRImageConverter.pm
r30110 r30201 4 4 5 5 use strict; 6 use warnings; 6 7 no strict 'refs'; # allow filehandles to be variables and viceversa 7 8 no strict 'subs'; # allow barewords (eg STDERR) as function arguments … … 17 18 @OCRImageConverter::ISA = ('BaseMediaConverter'); 18 19 19 # Check that PDFBoxis installed and available on the path20 # Check that Tesseract is installed and available on the path 20 21 $ocrimage_conversion_available = 1; 21 22 … … 55 56 } 56 57 } 57 } 58 59 } 60 61 my $arguments = [ ]; 58 59 my $cpan = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "cpan", "lib64", "perl5"); 60 &gsprintf(STDERR, "OCRImageConverter: CPAN directory $cpan\n"); 61 if (! -d $cpan) { 62 $ocrimage_conversion_available = 0; 63 $no_ocrimage_conversion_reason = "gexthtmltokenotinstalled"; 64 } else { 65 push(@INC, $cpan); 66 } 67 } 68 } 69 70 use HTML::TokeParser; 71 72 my $layout_list = [ 73 { 74 'name' => "none", 75 'desc' => "{OCRImageConverter.unpaper_layout.none}" 76 }, 77 { 78 'name' => "single", 79 'desc' => "{OCRImageConverter.unpaper_layout.single}" 80 }, 81 { 82 'name' => "double", 83 'desc' => "{OCRImageConverter.unpaper_layout.double}" 84 } 85 ]; 86 87 my $arguments = [ 88 { 89 'name' => 'use_unpaper', 90 'desc' => "{OCRImageConverter.use_unpaper}", 91 'type' => "flag", 92 'deft' => 0, 93 'reqd' => "no" 94 }, 95 { 96 'name' => 'unpaper_layout', 97 'desc' => "{OCRImageConverter.unpaper_layout}", 98 'type' => "enum", 99 'reqd' => "yes", 100 'list' => $layout_list, 101 'deft' => "none" 102 } 103 ]; 104 105 my $opt_diva_args = [ 106 { 107 'name' => 'diva_support', 108 'desc' => "{OCRImageConverter.diva_support}", 109 'type' => "flag", 110 'deft' => "yes", 111 'reqd' => "no" 112 } 113 ]; 62 114 63 115 my $options = { … … 74 126 push(@$pluginlist, $class); 75 127 128 if (defined $ENV{'GEXTDIVA'}) { 129 push(@$arguments, @$opt_diva_args); 130 } 131 76 132 push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); 77 133 push(@{$hashArgOptLists->{"OptList"}}, 
$options); 78 79 134 80 135 my $self = new BaseMediaConverter($pluginlist, $inputargs, $hashArgOptLists, $auxilary); … … 121 176 my ($source_file_full_path) = @_; 122 177 123 print STDERR "***** WE ARE WORKING *****\n";124 125 178 # TODO cache 126 179 if (lc(substr($source_file_full_path, length($source_file_full_path) - 4)) eq ".gif") { 180 my $desired_extension = "tif"; 181 if ($self->{'use_unpaper'}) { 182 $desired_extension = "pnm"; 183 } 184 127 185 # need to convert to another format first 128 186 #my $target_source = substr($source_file_full_path, length($source_file_full_path) - 4) . ".tif"; 129 my $converted_file = &util::get_tmp_filename( "tif");187 my $converted_file = &util::get_tmp_filename($desired_extension); 130 188 push(@{$self->{'ocrtmp_file_paths'}}, $converted_file); 131 189 … … 134 192 135 193 print STDERR "*** Magick command: $imagick_cmd\n"; 194 195 if ($self->{'use_unpaper'}) { 196 my $unpaper_cmd = "unpaper \"$converted_file\" \"$converted_file\" --overwrite --layout " . $self->{'unpaper_layout'}; 197 system($unpaper_cmd); 198 print STDERR "*** unpaper command: $unpaper_cmd\n"; 199 } 136 200 137 201 $source_file_full_path = $converted_file; … … 175 239 176 240 my ($regenerated, $result, $had_error) = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info); 241 $self->autorun_general_cmd($convert_cmd . " hocr", $source_file_full_path, $target_file_path, $print_info); 242 243 # HOCR 244 my $hocr = substr($target_file_path, 0, length($target_file_path) - 4) . 
".hocr"; 245 246 my $json = "["; 247 248 open(my $fh, $hocr) || die "Can't open HOCR file $hocr!"; 249 #open(my $fh, $hocr); 250 my $parser = HTML::TokeParser->new($fh); 251 252 my $state = "idle"; 253 my $bbox = ""; 254 my $text = ""; 255 256 while (my $token = $parser->get_token()) { 257 if ($state eq "idle" && $token->[0] eq "S" && $token->[1] eq "span") { 258 $state = "inSpan"; 259 $bbox = $token->[2]{'title'}; 260 } 261 262 if ($state eq "inSpan" && $token->[0] eq "T") { 263 # $state = "gotText"; 264 $text = $token->[1]; 265 $state = "gotText"; 266 chomp $text; 267 } 268 269 if ($state eq "gotText" && $token->[0] eq "E") { 270 $state = "idle"; 271 #next unless $text =~ /\w/; 272 273 my @numbers = $bbox =~ /(\d+)/g; 274 $json .= "[\"$text\",[${numbers[0]},${numbers[1]},${numbers[2]},${numbers[3]}]],"; 275 } 276 } 277 278 close($fh); 279 280 $json = substr($json, 0, length($json) - 1); # remove trailing comma 281 $json .= "]"; 282 283 open(my $oh, '>', $hocr . '.json'); 284 print $oh "$json"; 285 close($oh); 286 287 # Diva.js 288 #if (defined $EXT{'GEXTDIVA'}) { 289 # if ($self->{'diva_support'}) { 290 # my $webroot = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, '..', 'packages', 'tomcat', 'webapps', 'iipsrv', 'collect', '<HASH>'); 291 # my $processpy = &FileUtils::filenameConcatenate($ENV{'GEXTDIVA_INSTALLED'}, 'cmdline', 'python'); 292 # my $DS = &util::get_dirsep(); 293 # my $convert = `which convert`; 294 # 295 # $print_info = { 296 # 'message_prefix' => "Diva.js processing", 297 # 'message' => "Now processing images for use with Diva.js..." 
298 # }; 299 # 300 # my $process_cmd = "python \"$processpy\" \"<IN>\" \"$webroot${DS}img\" \"$webroot${DS}data\" -t tiff -i \"$convert\""; 301 # $self->autorun_general_cmd($process_cmd, "<IN>", $print_info); 302 # } 303 #} 177 304 178 305 if ($had_error) { … … 181 308 return (1, $result, $target_file_path); 182 309 } 310 } 311 312 sub test { 313 my $self = shift(@_); 314 print STDERR "**** test working\n"; 183 315 } 184 316 … … 210 342 } 211 343 212 1; 344 1;
Note: See TracChangeset for help on using the changeset viewer.