source: gs2-extensions/ocr/trunk/perllib/plugins/OCRImageConverter.pm@ 30201

Last change on this file since 30201 was 30201, checked in by davidb, 9 years ago

Zach's hard work improving OCR in Greenstone

File size: 10.9 KB
Line 
1package OCRImageConverter;
2
3use BaseMediaConverter;
4
5use strict;
6use warnings;
7no strict 'refs'; # allow filehandles to be variables and viceversa
8no strict 'subs'; # allow barewords (eg STDERR) as function arguments
9
10use gsprintf 'gsprintf';
11
12# these two variables mustn't be initialised here or they will get stuck
13# at those values.
14our $ocrimage_conversion_available;
15our $no_ocrimage_conversion_reason;
16
# Compile-time probe for the OCR extension.
#
# Sets up inheritance from BaseMediaConverter and records, in two package
# variables, whether OCR conversion can be used:
#   $ocrimage_conversion_available - 1 when every check passed, otherwise 0
#   $no_ocrimage_conversion_reason - short reason code for the last failed check
#
# Checks performed when $ENV{'GEXTOCR'} is set:
#   1. the tesseract binary exists under <GEXTOCR>/installed/cmdline/bin
#   2. "tesseract -v" runs with exit status 0
#   3. the extension's private CPAN directory exists; if so it is appended to
#      @INC (this happens at compile time, so later `use` statements see it)
BEGIN {
    @OCRImageConverter::ISA = ('BaseMediaConverter');

    # Check that Tesseract is installed and available on the path
    $ocrimage_conversion_available = 1;

    if (!defined $ENV{'GEXTOCR'}) {
        $ocrimage_conversion_available = 0;
        $no_ocrimage_conversion_reason = "gextocrnotinstalled";
    } else {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");

        if (!-e $tesseract) {
            &gsprintf(STDERR, "**** Failed to find $tesseract\n");
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexttesseractnotinstalled";
        } else {
            my $cmd = "\"$tesseract\" -v ";

            # GSDLOS may be unset (e.g. when the Greenstone setup script has
            # not been sourced); treat that as "not Windows" rather than
            # matching against undef, which warns under `use warnings`.
            my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";

            if ($gsdlos =~ /^windows/i) {
                # Either redirection order works on Windows (%ERRORLEVEL% is 0)
                $cmd .= " >nul 2>&1";
            } else {
                # The order matters here: ">/dev/null 2>&1" silences both
                # streams, whereas "2>&1 >/dev/null" still prints to the screen.
                $cmd .= " >/dev/null 2>&1";
            }

            my $status = system($cmd);

            if ($status != 0) {
                my $error_message = "**** Testing for Tesseract\n";
                $error_message .= "Failed to run: $cmd\n";
                $error_message .= "Error variable: |$!| and status: $status\n";

                &gsprintf(STDERR, "OCRImageConverter: $error_message");

                $ocrimage_conversion_available = 0;
                $no_ocrimage_conversion_reason = "couldnotruntesseract";
            }
        }

        # Private CPAN install that is expected to provide HTML::TokeParser.
        # Note that a missing directory replaces any reason recorded above.
        my $cpan = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "cpan", "lib64", "perl5");
        &gsprintf(STDERR, "OCRImageConverter: CPAN directory $cpan\n");
        if (! -d $cpan) {
            $ocrimage_conversion_available = 0;
            $no_ocrimage_conversion_reason = "gexthtmltokenotinstalled";
        } else {
            push(@INC, $cpan);
        }
    }
}
69
70use HTML::TokeParser;
71
# Page layouts selectable for the unpaper pre-processing step. Each entry's
# description is a dictionary key derived from the layout name.
my $layout_list = [
    map { +{ 'name' => $_, 'desc' => "{OCRImageConverter.unpaper_layout.$_}" } }
        qw(none single double)
];

# Arguments this converter always contributes to the plugin argument list.
my $arguments = [
    {   'name' => 'use_unpaper',
        'desc' => '{OCRImageConverter.use_unpaper}',
        'type' => 'flag',
        'deft' => 0,
        'reqd' => 'no' },
    {   'name' => 'unpaper_layout',
        'desc' => '{OCRImageConverter.unpaper_layout}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $layout_list,
        'deft' => 'none' },
];

# Extra argument that is only offered when the Diva extension is present.
my $opt_diva_args = [
    {   'name' => 'diva_support',
        'desc' => '{OCRImageConverter.diva_support}',
        'type' => 'flag',
        'deft' => 'yes',
        'reqd' => 'no' },
];

# Option record describing this (abstract) converter.
my $options = {
    'name'     => 'OCRImageConverter',
    'desc'     => '{OCRImageConverter.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
122
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to BaseMediaConverter->new
#   $hashArgOptLists - hash ref; this converter's arguments are appended to
#                      its "ArgList" entry and its option record to "OptList"
#   $auxilary        - passed through to BaseMediaConverter->new
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
# Unless the object is 'info_only', it also carries:
#   'ocrimage_conversion_available' - copy of the compile-time probe result
#   'ocrimage_launch_cmd'           - tesseract command prefix (when available)
#   'no_ocrimage_conversion_reason' - reason code (when not available)
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists, $auxilary) = @_;
    push(@$pluginlist, $class);

    if (defined $ENV{'GEXTDIVA'}) {
        # $arguments is shared by every instance, so only add the optional
        # Diva argument the first time; otherwise each construction would
        # append another copy of it.
        my $already_added = grep { $_->{'name'} eq 'diva_support' } @$arguments;
        push(@$arguments, @$opt_diva_args) unless $already_added;
    }

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax instead of "new BaseMediaConverter(...)": the indirect
    # object form is ambiguous to the parser inside a package that defines
    # its own new().
    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, $auxilary);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    if ($ocrimage_conversion_available) {
        my $gextocr_home = $ENV{'GEXTOCR'};
        my $tesseract = &FileUtils::filenameConcatenate($gextocr_home, "installed", "cmdline", "bin", "tesseract");
        my $lang = "eng"; # TODO
        my $launch_cmd = "\"$tesseract\" -l $lang ";

        $self->{'ocrimage_launch_cmd'} = $launch_cmd;
    } else {
        $self->{'no_ocrimage_conversion_reason'} = $no_ocrimage_conversion_reason;

        my $outhandle = $self->{'outhandle'};
        &gsprintf($outhandle, "OCRImageConverter: {OCRImageConverter.noconversionavailable} ({OCRImageConverter.$no_ocrimage_conversion_reason})\n");
    }

    $self->{'ocrimage_conversion_available'} = $ocrimage_conversion_available;

    return bless $self, $class;
}
160
# Per-run initialisation: starts with an empty list of temporary files.
#
# Parameters:
#   $verbosity, $outhandle, $failhandle - accepted but not used here
#
# Returns nothing.
sub init {
    my $self = shift(@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # Must be an array reference: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'ocrtmp_file_paths'} = [];

    return;
}
167
# Teardown hook: hands over to clean_up_temporary_files() so that every
# temporary file registered by this converter is removed.
sub deinit {
    my ($converter) = @_;

    return $converter->clean_up_temporary_files();
}
173
# Escape a string so it can sit between double quotes in a JSON document.
# Backslash, double quote and ASCII control characters are escaped; an
# undefined value becomes the empty string.
sub _ocrimage_json_escape {
    my ($text) = @_;
    return "" unless defined $text;

    $text =~ s/([\\"])/\\$1/g;
    $text =~ s/\n/\\n/g;
    $text =~ s/\r/\\r/g;
    $text =~ s/\t/\\t/g;
    $text =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;

    return $text;
}

# Read the hOCR file at $hocr and return a JSON array string of the form
#   [["text",[n0,n1,n2,n3]], ...]
# where the four numbers are the first four integers found in the 'title'
# attribute of the <span> that opened the entry. Dies if the file cannot be
# opened.
sub _ocrimage_hocr_to_json {
    my ($hocr) = @_;

    open(my $fh, '<', $hocr) || die "Can't open HOCR file $hocr: $!";
    my $parser = HTML::TokeParser->new($fh);

    # Small state machine: idle -> inSpan (start of a span seen)
    # -> gotText (first text token seen) -> idle (next end tag seen).
    my $state = "idle";
    my $bbox = "";
    my $text = "";
    my @entries = ();

    while (my $token = $parser->get_token()) {
        my $type = $token->[0];

        if ($state eq "idle" && $type eq "S" && $token->[1] eq "span") {
            $state = "inSpan";
            $bbox = $token->[2]{'title'};
            $bbox = "" unless defined $bbox;
        }
        elsif ($state eq "inSpan" && $type eq "T") {
            $text = $token->[1];
            $state = "gotText";
            chomp $text;
        }
        elsif ($state eq "gotText" && $type eq "E") {
            $state = "idle";

            my @numbers = $bbox =~ /(\d+)/g;

            # Without four coordinates the entry cannot be written as valid
            # JSON, so leave it out.
            next if scalar(@numbers) < 4;

            push(@entries, '["' . _ocrimage_json_escape($text) . '",[' . join(',', @numbers[0 .. 3]) . ']]');
        }
    }

    close($fh);

    # join() also yields a well-formed "[]" when nothing was recognised.
    return "[" . join(",", @entries) . "]";
}

# Run OCR on an image and write plain-text, hOCR and JSON output.
#
# Parameters:
#   $source_file_full_path - path of the image to process
#
# Steps:
#   1. A ".gif" source is first converted through gs-magick.pl to a temporary
#      file (.tif, or .pnm followed by an unpaper pass when 'use_unpaper' is
#      set); OCR then runs on that temporary file.
#   2. The tesseract command stored in 'ocrimage_launch_cmd' is run twice via
#      $self->autorun_general_cmd (not defined in this file), once as is and
#      once with " hocr" appended.
#   3. The resulting "<base>.hocr" file is parsed and "<base>.hocr.json" is
#      written next to it.
#
# Returns:
#   0                                  when OCR is unavailable or the source
#                                      is not a regular file
#   (0, $result, $target_file_path)    when the first OCR run reported an error
#   (1, $result, $target_file_path)    otherwise
#
# Dies if the hOCR file cannot be opened.
#
# NOTE(review): file paths are interpolated into shell command strings inside
# double quotes; a path containing a double quote would break the command.
sub convert {
    my $self = shift(@_);
    my ($source_file_full_path) = @_;

    # Nothing below is useful without tesseract, so bail out before doing
    # any image conversion or creating temporary files.
    return 0 unless $ocrimage_conversion_available;

    # TODO cache
    if ($source_file_full_path =~ /\.gif\z/i) {
        # need to convert to another format first
        my $desired_extension = $self->{'use_unpaper'} ? "pnm" : "tif";

        my $converted_file = &util::get_tmp_filename($desired_extension);
        push(@{$self->{'ocrtmp_file_paths'}}, $converted_file);

        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl convert \"$source_file_full_path\" \"$converted_file\"";
        system($imagick_cmd);

        print STDERR "*** Magick command: $imagick_cmd\n";

        if ($self->{'use_unpaper'}) {
            my $unpaper_cmd = "unpaper \"$converted_file\" \"$converted_file\" --overwrite --layout " . $self->{'unpaper_layout'};
            system($unpaper_cmd);
            print STDERR "*** unpaper command: $unpaper_cmd\n";
        }

        $source_file_full_path = $converted_file;
        print STDERR "*** Converted file: $converted_file\n";
    }

    # Also catches a GIF conversion that failed to produce its output file.
    return 0 if (!-f $source_file_full_path);

    my $source_file_no_path = &File::Basename::basename($source_file_full_path);

    my $target_file_path;

    if ($self->{'enable_cache'}) {
        $self->init_cache_for_file($source_file_full_path);
        my $cache_dir = $self->{'cached_dir'};
        my $file_root = $self->{'cached_file_root'};

        $target_file_path = &FileUtils::filenameConcatenate($cache_dir, $file_root);
    } else {
        $target_file_path = &util::get_tmp_filename("txt");
        push(@{$self->{'ocrtmp_file_paths'}}, $target_file_path);
    }

    # The output base is the target path minus its last four characters.
    # NOTE(review): this presumably strips a ".txt" suffix; in the cached
    # branch the target is built from 'cached_file_root' and it is not
    # visible here whether that carries a four-character extension - confirm.
    my $output_base = substr($target_file_path, 0, length($target_file_path) - 4);

    my $convert_cmd = $self->{'ocrimage_launch_cmd'};
    $convert_cmd .= " \"$source_file_full_path\" \"" . $output_base . "\"";

    print STDERR "OCRImageConverter convert command: $convert_cmd\n";

    my $print_info = {
        'message_prefix' => "OCR Conversion",
        'message' => "Converting $source_file_no_path."
    };

    # First run: plain text. Second run: same command in hOCR mode.
    my (undef, $result, $had_error) = $self->autorun_general_cmd($convert_cmd, $source_file_full_path, $target_file_path, $print_info);
    $self->autorun_general_cmd($convert_cmd . " hocr", $source_file_full_path, $target_file_path, $print_info);

    # HOCR
    my $hocr = $output_base . ".hocr";
    my $json = _ocrimage_hocr_to_json($hocr);

    # Writing the JSON side file is best effort: report a failure but still
    # return the OCR result.
    my $json_file = $hocr . '.json';
    if (open(my $oh, '>', $json_file)) {
        print $oh "$json";
        close($oh);
    } else {
        print STDERR "OCRImageConverter: could not write $json_file: $!\n";
    }

    # Diva.js
    #if (defined $EXT{'GEXTDIVA'}) {
    #    if ($self->{'diva_support'}) {
    #        my $webroot = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, '..', 'packages', 'tomcat', 'webapps', 'iipsrv', 'collect', '<HASH>');
    #        my $processpy = &FileUtils::filenameConcatenate($ENV{'GEXTDIVA_INSTALLED'}, 'cmdline', 'python');
    #        my $DS = &util::get_dirsep();
    #        my $convert = `which convert`;
    #
    #        $print_info = {
    #            'message_prefix' => "Diva.js processing",
    #            'message' => "Now processing images for use with Diva.js..."
    #        };
    #
    #        my $process_cmd = "python \"$processpy\" \"<IN>\" \"$webroot${DS}img\" \"$webroot${DS}data\" -t tiff -i \"$convert\"";
    #        $self->autorun_general_cmd($process_cmd, "<IN>", $print_info);
    #    }
    #}

    if ($had_error) {
        return (0, $result, $target_file_path);
    } else {
        return (1, $result, $target_file_path);
    }
}
311
# Debugging aid: writes a fixed marker line to STDERR and returns the
# result of that print.
sub test {
    my ($self) = @_;

    return print(STDERR "**** test working\n");
}
316
# Thin wrapper around convert(): forwards the source path, target type,
# conversion options and conversion id, followed by the literal marker
# "without_result". Options and id fall back to "" when missing or false.
# Returns whatever convert() returns.
sub convert_without_result {
    my ($self, $source_file_path, $target_file_type, $convert_options, $convert_id) = @_;

    $convert_options = "" unless $convert_options;
    $convert_id      = "" unless $convert_id;

    return $self->convert(
        $source_file_path, $target_file_type,
        $convert_options,  $convert_id,
        "without_result"
    );
}
328
# Remove every temporary path recorded in 'ocrtmp_file_paths' and reset the
# list. Directories are removed recursively, other existing paths as files;
# paths that no longer exist are skipped.
#
# Returns nothing.
sub clean_up_temporary_files {
    my $self = shift(@_);

    # Tolerate a list that was never initialised.
    foreach my $ocrtmp_file_path (@{ $self->{'ocrtmp_file_paths'} || [] }) {
        if (-d $ocrtmp_file_path) {
            &FileUtils::removeFilesRecursive($ocrtmp_file_path);
        } elsif (-e $ocrtmp_file_path) {
            &FileUtils::removeFiles($ocrtmp_file_path);
        }
    }

    # Must be an array reference: assigning the empty list "()" to a hash
    # element stores undef rather than an empty array.
    $self->{'ocrtmp_file_paths'} = [];

    return;
}
343
3441;
Note: See TracBrowser for help on using the repository browser.