source: gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm@ 37046

Last change on this file since 37046 was 37046, checked in by davidb, 16 months ago

Code extended to now generate Open Annotation (JSON format) of the OCR'd blocks of text; some refinement of the existing Google Vision perl code

File size: 20.2 KB
Line 
1######################################################################
2#
3# GoogleVisionAPIConverter.pm -- helper plugin that allows other plugins
4# (such as ImagePlugin and PagedImagePlugin) to extend their
5# processing capability through sub-classing inheritance (such as
6# GoogleVisionImagePlugin and GoogleVisionPagedImagePlugin) to
7# expand the image processing capabilities at ingest time to
8# include the Google Vision API allowing for: metadata labelling
9# of objects within a scene; and OCR text recognition.
10#
11# A component of the Greenstone digital library software
12# from the New Zealand Digital Library Project at the
13# University of Waikato, New Zealand.
14#
15# Copyright (C) 1999 New Zealand Digital Library Project
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package GoogleVisionAPIConverter;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37no strict 'subs';
38
39use gsprintf;
40use FileUtils;
41
42##use ImagePlugin;
43use BaseMediaConverter;
44
45use utf8;
46use JSON; # qw( from_json, encode_json );
47
# Runs at compile time: declare BaseMediaConverter as the parent class of
# this converter.
sub BEGIN {
    @GoogleVisionAPIConverter::ISA = qw(BaseMediaConverter);
}
51
# Plugin arguments contributed by this converter: the name of the Google
# service-account credentials file, followed by one on/off flag per Google
# Vision processing type.
#
# NOTE(review): the 'desc' key of the first entry is spelled
# "google_applicatio_credentials" (missing 'n'). It is left untouched here
# because it presumably has to match the key used in the strings resource
# bundle -- confirm before correcting it.
my $arguments = [
    {
        'name' => "google_application_credentials",
        'desc' => "{GoogleVisionAPIConverter.google_applicatio_credentials}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "google-sa-credentials-key.json"
    },

    # The three vision-type switches only differ by name, so generate them.
    (map {
        +{
            'name' => $_,
            'desc' => "{GoogleVisionAPIConverter." . $_ . "}",
            'type' => "flag",
            'reqd' => "no"
        }
    } ("enable_image_labelling", "enable_image_ocr", "enable_document_ocr"))
];

# Option block describing this converter; 'args' refers to the list above.
my $options = {
    'name'     => "GoogleVisionAPIConverter",
    'desc'     => "{GoogleVisionAPIConverter.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments
};
78
# Constructor.
#
# Registers this class in the plugin list, appends this converter's
# argument and option definitions to the shared lists, then lets
# BaseMediaConverter build the object, which is re-blessed into $class.
#
#   $pluginlist      - array ref of plugin class names (appended to)
#   $inputargs       - passed through unchanged to BaseMediaConverter
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless($self, $class);
}
91
# Start-of-import hook.
#
# The two OCR modes are mutually exclusive: when both flags are set, a
# usage message is written to STDERR and the process exits with status 2.
# Otherwise the call is forwarded, with its original arguments, to the
# parent class's begin().
sub begin {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    my $both_ocr_modes = ($self->{'enable_image_ocr'} && $self->{'enable_document_ocr'});

    if ($both_ocr_modes) {
        my @usage_lines = (
            "Please use the following command syntax for vision types: (--enable_image_ocr | --enable_document_ocr) [--enable_image_labelling]\n",
            "\t\t --enable_image_ocr : optical character recognition for text within images\n",
            "\t\t --enable_document_ocr : optical character recognition for text within documents\n",
            "\t\t --enable_image_labelling : annotation labeling for objects within images\n",
        );

        foreach my $usage_line (@usage_lines) {
            print STDERR $usage_line;
        }

        exit(2);
    }

    return $self->SUPER::begin(@_);
}
106
# Inspect one line of output from the vision command.
#
# Returns the pair ($had_error, $generate_dot). An error is never reported;
# a progress dot is requested for every line the pattern below accepts.
sub vision_monitor_line {
    my ($line) = @_;

    my $generate_dot = ($line =~ m/^.*$/) ? 1 : 0;

    return (0, $generate_dot);
}
120
# Run every enabled Google Vision processing type over one image file.
#
#   $filename    - full path of the image to process
#   $file        - short name of the image (used in progress messages)
#   $doc_obj     - document object that receives text/metadata
#   $opt_section - optional section; defaults to the document's top section
#
# For each enabled type a "vision.py" command line is built whose JSON
# output goes into the cache directory set up for $filename, and the
# command is handed to run_vision(). For document OCR the output filename
# and section are also recorded in $self->{'gv-dococr-json-filename-recs'}.
#
# Always returns the string "json".
sub run_gv_convert {
    my $self = shift(@_);
    my ($filename, $file, $doc_obj, $opt_section) = @_;

    my $section = $opt_section;
    if (!defined $section) {
        $section = $doc_obj->get_top_section();
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "----- GoogleVisionAPIConveter run_gv_convert -----\n";

    # Target ocr / labelling type(s), in fixed processing order
    my @vision_types = grep { $self->{$_} }
        ("enable_image_labelling", "enable_image_ocr", "enable_document_ocr");

    # Nothing enabled => nothing to do
    return "json" if (scalar(@vision_types) == 0);

    $self->init_cache_for_file($filename);
    my $cached_image_dir = $self->{'cached_dir'};

    my $credentials_filename
        = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", $self->{'google_application_credentials'});

    foreach my $vision_type (@vision_types) {

        my $ofile     = $vision_type . "-google-vision-output.json";
        my $ofilename = &FileUtils::filenameConcatenate($cached_image_dir, $ofile);

        my $vision_cmd = join(" ",
                              "vision.py",
                              "--" . $vision_type,
                              "--credentials",
                              "\"" . $credentials_filename . "\"",
                              "\"" . $filename . "\"",
                              "\"" . $ofilename . "\"");

        $self->run_vision($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section);

        next if ($vision_type ne "enable_document_ocr");

        # Remembered so the Open Annotation list can be generated later on
        push(@{ $self->{'gv-dococr-json-filename-recs'} },
             { 'filename' => $ofilename, 'section' => $section });
    }

    return "json";
}
169
# Convert the 'boundingBox' of a Google Vision OCR block into an
# axis-aligned rectangle.
#
#   $gv_block - hash ref for one OCR block; its bounding polygon is read
#               from $gv_block->{'boundingBox'}->{'vertices'}
#
# Returns a hash ref with the keys 'x_org', 'y_org' (top-left corner) and
# 'x_dim', 'y_dim' (inclusive width/height, i.e. max - min + 1), or undef
# when the block has no vertices.
#
# The 'x' and/or 'y' entry of a vertex is sometimes absent. The JSON
# encoding used by the Vision API leaves out fields that hold their default
# value, so an absent coordinate stands for 0 (a box touching the left or
# top edge of the image). Such coordinates are therefore counted as 0
# rather than being ignored, which previously pushed the origin of those
# boxes over to the opposite edge.
sub gv_ocr_bounding_box_rect
{
    my $self = shift (@_);
    my ($gv_block) = @_;

    my $bbox_rect = undef;

    my $gv_boundingBox = (defined $gv_block)       ? $gv_block->{'boundingBox'}     : undef;
    my $gv_vertices    = (defined $gv_boundingBox) ? $gv_boundingBox->{'vertices'}  : undef;

    if (defined $gv_vertices && ref($gv_vertices) eq "ARRAY" && scalar(@$gv_vertices) > 0) {

        my $min_x = undef;
        my $min_y = undef;
        my $max_x = undef;
        my $max_y = undef;

        foreach my $gv_vertex (@$gv_vertices) {
            # Absent coordinate == 0 (see header comment)
            my $x = (defined $gv_vertex->{'x'}) ? $gv_vertex->{'x'} : 0;
            my $y = (defined $gv_vertex->{'y'}) ? $gv_vertex->{'y'} : 0;

            $min_x = $x if (!defined $min_x || ($x < $min_x));
            $max_x = $x if (!defined $max_x || ($x > $max_x));

            $min_y = $y if (!defined $min_y || ($y < $min_y));
            $max_y = $y if (!defined $max_y || ($y > $max_y));
        }

        $bbox_rect = {
            "x_org" => $min_x,
            "y_org" => $min_y,
            "x_dim" => $max_x - $min_x + 1,
            "y_dim" => $max_y - $min_y + 1
        };
    }

    return $bbox_rect;
}
220
# Run one Google Vision command (through the caching layer) and fold its
# JSON output into the document.
#
#   $file        - short name of the image (used in the progress message)
#   $filename    - full path of the image
#   $ofile       - name of the JSON output file (no directory)
#   $ofilename   - full path of the JSON output file
#   $vision_cmd  - the command line to run
#   $vision_type - "enable_image_ocr", "enable_document_ocr" or
#                  "enable_image_labelling"
#   $doc_obj     - document object that receives text/metadata
#   $section     - section of the document to attach everything to
#
# Effects on $doc_obj:
#   * the JSON file is attached as an associated file
#   * OCR types: the full OCR text is appended to the section, the text is
#     also stored per detected language as 'z_<lang>' metadata, and
#     GV...OCRJSON / HasGoogleVision...OCRJSON metadata is set
#   * labelling: description/score/topicality/mid metadata is added for
#     every label, plus GVImageLabellingJSON /
#     HasGoogleVisionImageLabellingJSON
#
# Dies if the JSON output file cannot be opened.
sub run_vision
{
    my $self = shift (@_);
    my ($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj,$section) = @_;

    my $print_info = {
        'message_prefix' => "GoogleVisionAPI",
        'message' => "Sending $file to GoogleVisionAPI using vision.py"
    };

    # NOTE(review): the three values returned here are not inspected; a
    # command that failed to produce its output file is only noticed when
    # the file is opened further down.
    my ($vision_regenerated,$vision_result,$vision_error)
        = $self->run_cached_general_cmd($vision_cmd,$filename,$ofilename,$print_info);

    # Need to work a bit harder in setting up the associated JSON file
    #  => strip off 'enable_' in favour of 'gv_'
    #  => add in section number as part of the file name to avoid clashes

    my $section_file_suffix = (defined $section) ? $section : "";
    $section_file_suffix =~ s/\./_/g;

    my $assoc_ofile = $ofile;
    $assoc_ofile =~ s/^enable_/gv_/;
    $assoc_ofile =~ s/-google-vision//;
    $assoc_ofile =~ s/\.(.*?)$/$section_file_suffix.$1/;

    $doc_obj->associate_file($ofilename,$assoc_ofile,"application/json",$section);

    my $json_text = do { # read in json file
        open(my $json_fh, "<:encoding(UTF-8)", $ofilename)
            or die("Can't open \"$ofilename\": $!\n");
        local $/;
        <$json_fh>
    };

    my $decoded_json = JSON::from_json($json_text);

    if ($vision_type eq "enable_document_ocr" || $vision_type eq "enable_image_ocr") {

        # Full OCR content. It is absent when no text was found in the
        # image, in which case there is nothing to append to the section.
        my $ocr_text = $decoded_json->{'textAnnotations'}[0]{'description'};
        if (defined $ocr_text) {
            $doc_obj->add_utf8_text($section, $ocr_text);
        }

        # Regroup the recognised words by detected language
        my $blocks = $decoded_json->{'fullTextAnnotation'}{'pages'}[0]{'blocks'};
        my %text_and_language;

        foreach my $block (@{$blocks}) {
            foreach my $paragraph (@{$block->{'paragraphs'}}) {
                foreach my $word (@{$paragraph->{'words'}}) {
                    my $detected_language = $word->{'property'}{'detectedLanguages'}[0]{'languageCode'} || "no_lang";

                    my $word_text = "";
                    foreach my $letter (@{$word->{'symbols'}}) {
                        $word_text .= $letter->{'text'};
                    }

                    $text_and_language{$detected_language} .= $word_text . " ";
                }
            }
        }

        # Sorted so that the metadata is written in a reproducible order
        foreach my $language (sort keys %text_and_language) {
            $doc_obj->add_utf8_metadata($section, "z_" . $language, $text_and_language{$language});
        }

        my $assoc_json_metaname = "HasGoogleVision";

        if ($vision_type eq "enable_document_ocr") {
            $assoc_json_metaname .= "DocumentOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVDocumentOCRJSON",$assoc_ofile);
        }
        else {
            # $vision_type eq "enable_image_ocr"
            $assoc_json_metaname .= "ImageOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVImageOCRJSON",$assoc_ofile);
        }

        $doc_obj->add_utf8_metadata($section, $assoc_json_metaname, 1);
    }
    elsif ($vision_type eq "enable_image_labelling") {
        my $label_annotations = $decoded_json->{'labelAnnotations'};

        foreach my $label (@{$label_annotations}) {
            my $description = $label->{'description'};

            # write to metadata : 'description'='Book' 'score'='0.9' 'topicality'='0.9' 'mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "description", $description);
            $doc_obj->add_utf8_metadata($section, "score", $label->{'score'});
            $doc_obj->add_utf8_metadata($section, "topicality", $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, "mid", $label->{'mid'});

            # write to metadata : 'descriptions'='Book' 'Book_score'='0.9' 'Book_topicality'='0.9' 'Book_mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "descriptions", $description);
            $doc_obj->add_utf8_metadata($section, $description . "_score", $label->{'score'});
            $doc_obj->add_utf8_metadata($section, $description . "_topicality", $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, $description . "_mid", $label->{'mid'});
        }

        $doc_obj->add_utf8_metadata($section, "HasGoogleVisionImageLabellingJSON", 1);
        $doc_obj->add_utf8_metadata($section, "GVImageLabellingJSON",$assoc_ofile);
    }
}
325
# Begin a new, empty Open Annotation list for the given document.
#
# The list is a hash with '@context', '@id' (derived from the document's
# OID), '@type' and an empty 'resources' array. It is stored in
# $self->{'openannotation-list'} (replacing whatever was there) and is
# also the return value.
sub start_openannotation_list
{
    my $self = shift(@_);
    my ($doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();

    # #### **** make '@id' a full URL to be unique? or greenstone3:site:collect:OID ??
    # (cf. "https://iiif.harvardartmuseums.org/manifests/object/299843/list/47174896")
    my %annotation_list = (
        '@context'  => "http://www.shared-canvas.org/ns/context.json",
        '@id'       => $doc_oid . "/openannotation-list.json",
        '@type'     => "sc:AnnotationList",
        'resources' => [],
    );

    $self->{'openannotation-list'} = \%annotation_list;

    return $self->{'openannotation-list'};
}
343
344
345
# Convert Google Vision OCR blocks into Open Annotation resources and
# append them to the 'resources' array of $self->{'openannotation-list'}.
#
#   $gv_blocks - array ref of OCR blocks (each with 'boundingBox' and
#                'paragraphs' -> 'words' -> 'symbols' -> 'text')
#   $doc_obj   - document object; only its OID is used
#   $section   - section the blocks belong to; "" (or undef) means the
#                document is a single image, which is addressed as canvas 1
#
# One annotation is produced per block: its region is the block's bounding
# rectangle (as an "xywh=" fragment selector) and its body is the block's
# text, one <p> element per paragraph. Because the body is declared as
# text/html, the characters & < > in the OCR'd text are escaped. A block
# without a bounding box cannot be anchored to the image and is skipped
# (it still consumes its block number, so the ids of the other blocks do
# not depend on it).
sub convert_gvocr_to_openannotation_resource
{
    my $self = shift (@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    $section = "" if (!defined $section);

    my $OID = $doc_obj->get_OID();
    my $OID_with_section = ($section ne "") ? "${OID}_$section" : $OID;
    $section = 1 if ($section eq ""); # occurs when the document is a single image


    # Example Open Annotation resource (for single annotation):
    # {
    #   "@context": "http://iiif.io/api/presentation/2/context.json",
    #   "@id": "https://iiif.harvardartmuseums.org/annotations/9641482",
    #   "@type": "oa:Annotation",
    #   "motivation": [
    #     "oa:commenting"
    #   ],
    #   "on": {
    #     "@type": "oa:SpecificResource",
    #     "full": "https://iiif.harvardartmuseums.org/manifests/object/299843/canvas/canvas-47174896",
    #     "selector": {
    #       "@type": "oa:FragmentSelector",
    #       "value": "xywh=622,591,642,940"
    #     },
    #     "within": {
    #       "@id": "https://iiif.harvardartmuseums.org/manifests/object/299843",
    #       "@type": "sc:Manifest"
    #     }
    #   },
    #   "resource": [
    #     {
    #       "@type": "dctypes:Text",
    #       "chars": "<p>age: 35-52<br/>gender: Female(66.337677%)<br/>...</p><p>Generated by AWS Rekognition</p>",
    #       "format": "text/html"
    #     }
    #   ]
    # },

    my $self_openannotation_resources = $self->{'openannotation-list'}->{'resources'};

    my $block_i = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_i++;

        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        next if (!defined $bbox_rect); # no region => nothing to annotate

        # An undefined component can only come from coordinates that were
        # absent in the JSON; treat it as 0 rather than emitting "xywh=,.."
        my ($bb_x_org, $bb_y_org, $bb_x_dim, $bb_y_dim)
            = map { (defined $bbox_rect->{$_}) ? $bbox_rect->{$_} : 0 } ('x_org', 'y_org', 'x_dim', 'y_dim');

        my $openannotation_resource = {
            "\@context" => "http://iiif.io/api/presentation/2/context.json",
            # "\@id" => "https://iiif.harvardartmuseums.org/annotations/9641482",
            "\@id" => "${OID_with_section}/annotation/gv-block-$block_i", # #### **** make full URL to be unique? or greenstone3:site:collect:OID ??
            "\@type" => "oa:Annotation",
            "motivation" => [ "oa:commenting" ]
        };

        my $openannotation_on = {
            "\@type" => "oa:SpecificResource",
            # "full" => "https://iiif.harvardartmuseums.org/manifests/object/299843/canvas/canvas-47174896",
            "full" => "${OID}/canvas/$section", # doc id + /canvas + page-i/sect # #### **** make full URL to be unique? or greenstone3:site:collect:OID ??
            "selector" => {
                "\@type" => "oa:FragmentSelector",
                "value" => "xywh=${bb_x_org},${bb_y_org},${bb_x_dim},${bb_y_dim}"
            },
            "within" => {
                #"\@id" => "https://iiif.harvardartmuseums.org/manifests/object/299843",
                "\@id" => "${OID_with_section}/manifest", # #### **** make full URL to be unique? or greenstone3:site:collect:OID... ??
                "\@type" => "sc:Manifest"
            }
        };
        $openannotation_resource->{'on'} = $openannotation_on;


        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my @para_words = ();

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = "";

                foreach my $letter (@{$word->{'symbols'}}) {
                    $word_text .= $letter->{'text'} if (defined $letter->{'text'});
                }

                push(@para_words, $word_text);
            }

            my $para_text = join(" ", @para_words);

            # The text ends up inside HTML markup, so escape the
            # characters that are significant there ('&' first!)
            $para_text =~ s/&/&amp;/g;
            $para_text =~ s/</&lt;/g;
            $para_text =~ s/>/&gt;/g;

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $openannotation_inner_resource = [{
            "\@type" => "dctypes:Text",
            "chars" => "$block_text_html",
            "format" => "text/html"
        }];

        $openannotation_resource->{'resource'} = $openannotation_inner_resource;

        push(@$self_openannotation_resources,$openannotation_resource);
    }

    return;
}
455
456
# Load one Google Vision document-OCR JSON file and append the OCR blocks
# of its first page to the Open Annotation list being built.
#
#   $gv_dococr_json_filename - path of the JSON file to read (UTF-8)
#   $doc_obj, $section       - passed on unchanged to
#                              convert_gvocr_to_openannotation_resource()
#
# Dies if the file cannot be opened.
sub convert_and_append_openannotation_resources
{
    my $self = shift(@_);
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole JSON file
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $gv_ocr = JSON::from_json($json_text);

    # Only the blocks of the first page are used
    my $first_page = $gv_ocr->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks  = $first_page->{'blocks'};

    return $self->convert_gvocr_to_openannotation_resource($gv_blocks, $doc_obj, $section);
}
477
478
479
# Write the Open Annotation list built so far to disk as JSON and discard
# the in-memory copy.
#
#   $doc_obj        - not used here
#   $json_ofilename - path of the JSON file to (over)write
#
# Returns 1 on success and 0 (after printing a message to STDERR) when the
# file could not be opened, written or closed. In either case
# $self->{'openannotation-list'} is reset to undef.
sub end_openannotation_list
{
    my $self = shift (@_);
    my ($doc_obj,$json_ofilename) = @_;

    my $ret_status = 1;

    # JSON::encode_json() already returns UTF-8 encoded octets, so the file
    # is written in raw mode. Pushing those octets through a ':utf8' layer
    # would encode every non-ASCII character a second time.
    my $json_fh;
    if (!open($json_fh, ">:raw", $json_ofilename)) {
        print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        my $openannotation_list = $self->{'openannotation-list'};
        my $openannotation_list_json_octets = JSON::encode_json($openannotation_list);

        # Buffered write errors may only show up when the file is closed
        my $write_ok = print {$json_fh} $openannotation_list_json_octets;
        my $close_ok = close($json_fh);

        if (!$write_ok || !$close_ok) {
            print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'openannotation-list'} = undef;

    return $ret_status;
}
506
# Make sure an up-to-date 'openannotation-list.json' exists in the cache
# for this document and attach it to the document's top section.
#
#   $doc_obj                      - document object
#   $gv_dococr_json_filename_recs - array ref of { 'filename', 'section' }
#                                   records, one per document-OCR JSON
#                                   file; must hold at least one record
#
# The list is (re)generated when the cached file does not exist yet, or
# when any of the OCR JSON files has been modified more recently than it.
#
# Returns 1 if the cached file is in place and has been associated with
# the document, 0 if writing it failed.
sub openannotation_list_associate_json
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    # Guaranteed to have at least one value in gv_dococr_json_filename_recs
    #
    # Legacy code used to have a '\d+' just before the '.json' reflecting page/section number
    # Keep this in the regular expression, just in case,
    #
    # NOTE(review): as '(.+)' is greedy, trailing digits always end up in
    # the captured root and the optional '\d+' never matches anything --
    # confirm whether legacy digits were meant to be stripped.
    my $first_gv_json_filename = $gv_dococr_json_filename_recs->[0]->{'filename'};
    my ($gv_dococr_filename_root) = ($first_gv_json_filename =~ m/^(.+)(?:\d+)?\.json$/);

    # Unexpected name (no '.json' suffix): fall back to the name as it is
    $gv_dococr_filename_root = $first_gv_json_filename if (!defined $gv_dococr_filename_root);

    # sleight of hand so new directory spot in cache_dir picked out is where we want it!
    $gv_dococr_filename_root .= "/";

    $self->init_cache_for_file($gv_dococr_filename_root);
    my $cached_dir = $self->{'cached_dir'};

    my $assoc_openannotation_json_ofile = "openannotation-list.json";
    my $cached_openannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir,$assoc_openannotation_json_ofile);

    my $needs_json_regen = 0;

    if (!-f $cached_openannotation_json_ofilename) {
        $needs_json_regen = 1;
    }
    else {
        # -M is the *age* of a file (in days): the smaller the value, the
        # more recently the file was modified. The cached list is stale as
        # soon as one of its source files is younger than it.
        my $cached_age = -M $cached_openannotation_json_ofilename;

        for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
            my $gv_json_age = -M $gv_json_filename_rec->{'filename'};

            if (defined $gv_json_age && $gv_json_age < $cached_age) {
                $needs_json_regen = 1;
                last;
            }
        }
    }

    my $saved_ok = 1;

    if ($needs_json_regen) {

        print $outhandle " OpenAnnotation-List: Generating $cached_openannotation_json_ofilename\n";

        $self->start_openannotation_list($doc_obj);

        for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
            my $gv_json_filename = $gv_json_filename_rec->{'filename'};
            my $section = $gv_json_filename_rec->{'section'};
            $self->convert_and_append_openannotation_resources($gv_json_filename, $doc_obj,$section);
        }

        $saved_ok = $self->end_openannotation_list($doc_obj,$cached_openannotation_json_ofilename);
    }
    else {
        # Only reported when the cached copy really is being reused
        print $outhandle " OpenAnnotation-List: Cached file $cached_openannotation_json_ofilename already exists\n";
    }

    if ($saved_ok) {
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->associate_file($cached_openannotation_json_ofilename,$assoc_openannotation_json_ofile,"application/json",$top_section);
    }

    return $saved_ok;
}
574
# Generate and attach the Open Annotation list for a document, but only if
# document-OCR JSON files were recorded for it (in
# $self->{'gv-dococr-json-filename-recs'}).
#
#   $doc_obj - document object the list is attached to
#
# Returns 1 when there was nothing to do, otherwise whatever
# openannotation_list_associate_json() returns.
sub opt_run_gen_openannotation
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};

    # The rec list does not exist at all when document OCR never ran.
    # Test for that explicitly instead of dereferencing an undefined
    # value, which is only tolerated while 'strict refs' is switched off.
    my $num_gv_dococr_json_filename_recs
        = (defined $gv_dococr_json_filename_recs) ? scalar(@$gv_dococr_json_filename_recs) : 0;

    my $ret_val_ok = 1;

    if ($num_gv_dococr_json_filename_recs > 0) {
        $ret_val_ok = $self->openannotation_list_associate_json($doc_obj,$gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
591
5921;
593
Note: See TracBrowser for help on using the repository browser.