source: gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm@ 37046

Last change on this file since 37046 was 37046, checked in by davidb, 9 months ago

Code extended to now generate Open Annotation (JSON format) of the OCR'd blocks of text; some refinement of the existing Google Vision perl code

File size: 20.2 KB
Line 
1######################################################################
2#
3# GoogleVisionAPIConverter.pm -- helper plugin that allows other plugins
4# (such as ImagePlugin and PagedImagePlugin) to extend their
5# processing capability through sub-classing inheritance (such as
6# GoogleVisionImagePlugin and GoogleVisionPagedImagePlugin) to
7# expand the image processing capabilities at ingest time to
8# include the Google Vision API allowing for: metadata labelling
9# of objects within a scene; and OCR text recognition.
10#
11# A component of the Greenstone digital library software
12# from the New Zealand Digital Library Project at the
13# University of Waikato, New Zealand.
14#
15# Copyright (C) 1999 New Zealand Digital Library Project
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package GoogleVisionAPIConverter;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37no strict 'subs';
38
39use gsprintf;
40use FileUtils;
41
42##use ImagePlugin;
43use BaseMediaConverter;
44
45use utf8;
46use JSON; # qw( from_json, encode_json );
47
# Set up the inheritance chain at compile time: this converter extends
# BaseMediaConverter.
BEGIN {
    @GoogleVisionAPIConverter::ISA = qw(BaseMediaConverter);
}
51
# Command-line/plugin arguments contributed by this converter.  The list is
# appended to the caller-supplied "ArgList" in new().
my $arguments = [
    {
        'name' => "google_application_credentials",
        # NOTE(review): the key below reads "applicatio" (no trailing "n").
        # It is a lookup key, so it is left untouched here -- confirm it
        # matches the entry in the strings resource before changing it.
        'desc' => "{GoogleVisionAPIConverter.google_applicatio_credentials}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "google-sa-credentials-key.json",
    },
    {
        'name' => "enable_image_labelling",
        'desc' => "{GoogleVisionAPIConverter.enable_image_labelling}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "enable_image_ocr",
        'desc' => "{GoogleVisionAPIConverter.enable_image_ocr}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "enable_document_ocr",
        'desc' => "{GoogleVisionAPIConverter.enable_document_ocr}",
        'type' => "flag",
        'reqd' => "no",
    },
];
72
# Option record describing this converter; appended to the caller-supplied
# "OptList" in new().  It carries the argument list declared above.
my $options = {
    'name'     => "GoogleVisionAPIConverter",
    'desc'     => "{GoogleVisionAPIConverter.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
78
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BaseMediaConverter
#   $hashArgOptLists - hash ref; this converter's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call.  The indirect-object form
    # ("new BaseMediaConverter(...)") is parse-ambiguous, all the more so in
    # a package that defines its own new().
    # The trailing 1 is forwarded as-is; its meaning is defined by
    # BaseMediaConverter.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
91
# Validate the OCR flags, then hand over to the parent class.
#
# enable_image_ocr and enable_document_ocr are mutually exclusive: when both
# are set a usage message is written to STDERR and the process exits with
# status 2.  Otherwise the call is forwarded to SUPER::begin with the same
# arguments and its result is returned.
sub begin {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    my $ocr_modes_clash = ($self->{'enable_image_ocr'} && $self->{'enable_document_ocr'});

    if (!$ocr_modes_clash) {
        return $self->SUPER::begin(@_);
    }

    my @usage_lines = (
        "Please use the following command syntax for vision types: (--enable_image_ocr | --enable_document_ocr) [--enable_image_labelling]\n",
        "\t\t --enable_image_ocr : optical character recognition for text within images\n",
        "\t\t --enable_document_ocr : optical character recognition for text within documents\n",
        "\t\t --enable_image_labelling : annotation labeling for objects within images\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit(2);
}
106
# Classify one line of output from the vision command.
#
# Returns the pair ($had_error, $generate_dot).  $had_error is always 0;
# $generate_dot is 1 whenever the line matches /^.*$/ (i.e. it contains no
# embedded newline other than an optional trailing one), otherwise 0.
sub vision_monitor_line {
    my ($line) = @_;

    my $had_error    = 0;
    my $generate_dot = ($line =~ m/^.*$/) ? 1 : 0;

    return ($had_error, $generate_dot);
}
120
# Run every enabled Google Vision analysis over one image file.
#
# Parameters:
#   $filename    - full path of the image; used as the cache key and passed
#                  to vision.py
#   $file        - file name used in progress messages
#   $doc_obj     - document object that receives the results
#   $opt_section - optional section of $doc_obj to populate; defaults to the
#                  document's top section
#
# For each enabled type (labelling, image OCR, document OCR -- in that order)
# a vision.py command is built and handed to run_vision().  Document-OCR
# outputs are additionally remembered in
# $self->{'gv-dococr-json-filename-recs'} as { filename, section } records.
#
# Always returns the string "json".
sub run_gv_convert {
    my $self = shift (@_);
    my ($filename, $file, $doc_obj, $opt_section) = @_;

    my $section = (defined $opt_section) ? $opt_section : $doc_obj->get_top_section();

    my $outhandle = $self->{'outhandle'};
    print $outhandle "----- GoogleVisionAPIConverter run_gv_convert -----\n";

    # Target ocr / labelling type(s), kept in a fixed processing order
    my @vision_types = grep { $self->{$_} }
                       qw(enable_image_labelling enable_image_ocr enable_document_ocr);

    if (scalar(@vision_types) != 0) {

        $self->init_cache_for_file($filename);
        my $cached_image_dir = $self->{'cached_dir'};

        my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
        my $credentials_filename
            = &FileUtils::filenameConcatenate($collect_dir, "etc", $self->{'google_application_credentials'});

        for my $vision_type (@vision_types) {

            my $ofile     = "${vision_type}-google-vision-output.json";
            my $ofilename = &FileUtils::filenameConcatenate($cached_image_dir, $ofile);

            # NOTE(review): the paths are only wrapped in double quotes; a
            # file name containing a double quote (or other shell
            # metacharacters) would break this command line.
            my $vision_cmd = "vision.py --$vision_type --credentials \"$credentials_filename\" \"$filename\" \"$ofilename\"";

            $self->run_vision($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section);

            if ($vision_type eq "enable_document_ocr") {
                my $gv_dococr_rec = { 'filename' => $ofilename, 'section' => $section };

                push(@{$self->{'gv-dococr-json-filename-recs'}}, $gv_dococr_rec);
            }
        }
    }

    return "json";
}
169
# Compute the axis-aligned rectangle enclosing a Google Vision block's
# bounding polygon.
#
# Parameters:
#   $gv_block - hash ref holding 'boundingBox' => { 'vertices' => [ {x,y}, ... ] }
#
# Returns a hash ref { x_org, y_org, x_dim, y_dim } (dimensions are inclusive,
# hence the +1), or undef when the block carries no vertices.
sub gv_ocr_bounding_box_rect
{
    my $self = shift (@_);
    my ($gv_block) = @_;

    my $gv_boundingBox = (ref($gv_block) eq 'HASH') ? $gv_block->{'boundingBox'} : undef;
    my $gv_vertices    = (ref($gv_boundingBox) eq 'HASH') ? $gv_boundingBox->{'vertices'} : undef;

    # No polygon at all => no rectangle
    if (ref($gv_vertices) ne 'ARRAY' || scalar(@$gv_vertices) == 0) {
        return undef;
    }

    my $min_x = undef;
    my $min_y = undef;
    my $max_x = undef;
    my $max_y = undef;

    foreach my $gv_vertex (@$gv_vertices) {
        # The 'x' or 'y' field is sometimes absent from a vertex.  The JSON
        # form of the Vision response leaves out zero-valued numeric fields,
        # so an absent coordinate means 0 -- it must take part in the
        # min/max, otherwise a box touching the left/top edge is mis-sized.
        my $x = (defined $gv_vertex->{'x'}) ? $gv_vertex->{'x'} : 0;
        my $y = (defined $gv_vertex->{'y'}) ? $gv_vertex->{'y'} : 0;

        $min_x = $x if (!defined $min_x || ($x < $min_x));
        $max_x = $x if (!defined $max_x || ($x > $max_x));

        $min_y = $y if (!defined $min_y || ($y < $min_y));
        $max_y = $y if (!defined $max_y || ($y > $max_y));
    }

    my $bbox_rect = {
        "x_org" => $min_x,
        "y_org" => $min_y,
        "x_dim" => $max_x - $min_x + 1,
        "y_dim" => $max_y - $min_y + 1
    };

    return $bbox_rect;
}
220
# Run one Google Vision command (through the cached-command machinery),
# attach its JSON output to the document, and convert the JSON content into
# text and metadata on $section.
#
# Parameters:
#   $file        - file name used in the progress message
#   $filename    - input image path given to run_cached_general_cmd
#   $ofile       - output JSON file name (no directory part)
#   $ofilename   - full path of the output JSON file
#   $vision_cmd  - complete vision.py command line
#   $vision_type - "enable_image_labelling", "enable_image_ocr" or
#                  "enable_document_ocr"
#   $doc_obj     - document object receiving text, metadata and the file
#   $section     - section of $doc_obj to populate
#
# Dies when the JSON output file cannot be opened.
sub run_vision
{
    my $self = shift (@_);
    my ($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section) = @_;

    my $print_info = {
        'message_prefix' => "GoogleVisionAPI",
        'message'        => "Sending $file to GoogleVisionAPI using vision.py"
    };

    # The three results are not inspected here; the list assignment keeps
    # the call in list context exactly as before.
    my ($vision_regenerated, $vision_result, $vision_error)
        = $self->run_cached_general_cmd($vision_cmd, $filename, $ofilename, $print_info);

    # Need to work a bit harder in setting up the associated JSON file
    #   => strip off 'enable_' in favour of 'gv_'
    #   => add in section number as part of the file name to avoid clashes
    my $section_file_suffix = $section;
    $section_file_suffix =~ s/\./_/g;

    my $assoc_ofile = $ofile;
    $assoc_ofile =~ s/^enable_/gv_/;
    $assoc_ofile =~ s/-google-vision//;
    $assoc_ofile =~ s/\.(.*?)$/$section_file_suffix.$1/;

    $doc_obj->associate_file($ofilename, $assoc_ofile, "application/json", $section);

    # Slurp the JSON file as decoded characters (from_json expects characters)
    my $json_text = do {
        open(my $json_fh, "<:encoding(UTF-8)", $ofilename)
            or die("Can't open \"$ofilename\": $!\n");
        local $/;
        <$json_fh>
    };

    my $decoded_json = JSON::from_json($json_text);

    if ($vision_type eq "enable_document_ocr" || $vision_type eq "enable_image_ocr") {
        # Full OCR content; absent when no text was recognised in the image
        my $ocr_text = $decoded_json->{'textAnnotations'}[0]{'description'};
        if (defined $ocr_text) {
            $doc_obj->add_utf8_text($section, $ocr_text); # append text to section
        }

        # Collect the recognised words grouped by detected language
        my $blocks = $decoded_json->{'fullTextAnnotation'}{'pages'}[0]{'blocks'};
        my %text_and_language;

        foreach my $block (@{$blocks}) {
            foreach my $paragraph (@{$block->{'paragraphs'}}) {
                foreach my $word (@{$paragraph->{'words'}}) {
                    my $detected_language
                        = $word->{'property'}{'detectedLanguages'}[0]{'languageCode'} || "no_lang";

                    my $word_text = "";
                    foreach my $letter (@{$word->{'symbols'}}) {
                        $word_text .= $letter->{'text'} if (defined $letter->{'text'});
                    }
                    $text_and_language{$detected_language} .= $word_text . " ";
                }
            }
        }

        # Sorted so the metadata is emitted in a reproducible order
        foreach my $language (sort keys %text_and_language) {
            $doc_obj->add_utf8_metadata($section, "z_" . $language, $text_and_language{$language});
        }

        my $assoc_json_metaname = "HasGoogleVision";

        if ($vision_type eq "enable_document_ocr") {
            $assoc_json_metaname .= "DocumentOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVDocumentOCRJSON", $assoc_ofile);
        }
        else {
            # $vision_type eq "enable_image_ocr"
            $assoc_json_metaname .= "ImageOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVImageOCRJSON", $assoc_ofile);
        }

        $doc_obj->add_utf8_metadata($section, $assoc_json_metaname, 1);
    }
    elsif ($vision_type eq "enable_image_labelling") {
        my $labels = $decoded_json->{'labelAnnotations'};

        foreach my $label (@{$labels}) {
            # write to metadata : 'description'='Book' 'score'='0.9' 'topicality'='0.9' 'mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "description", $label->{'description'});
            $doc_obj->add_utf8_metadata($section, "score",       $label->{'score'});
            $doc_obj->add_utf8_metadata($section, "topicality",  $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, "mid",         $label->{'mid'});

            # write to metadata : 'descriptions'='Book' 'Book_score'='0.9' 'Book_topicality'='0.9' 'Book_mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "descriptions", $label->{'description'});
            $doc_obj->add_utf8_metadata($section, $label->{'description'} . "_score",      $label->{'score'});
            $doc_obj->add_utf8_metadata($section, $label->{'description'} . "_topicality", $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, $label->{'description'} . "_mid",        $label->{'mid'});
        }

        $doc_obj->add_utf8_metadata($section, "HasGoogleVisionImageLabellingJSON", 1);
        $doc_obj->add_utf8_metadata($section, "GVImageLabellingJSON", $assoc_ofile);
    }
}
325
# Begin a fresh, empty Open Annotation list for $doc_obj and keep it in
# $self->{'openannotation-list'}; annotation resources are appended to its
# 'resources' array afterwards.
sub start_openannotation_list
{
    my ($self, $doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();

    # #### **** make '@id' a full URL to be unique? or greenstone3:site:collect:OID ??
    # (a real-world example of such an id:
    #  "https://iiif.harvardartmuseums.org/manifests/object/299843/list/47174896")
    $self->{'openannotation-list'} = {
        "\@context"  => "http://www.shared-canvas.org/ns/context.json",
        "\@id"       => "${doc_oid}/openannotation-list.json",
        "\@type"     => "sc:AnnotationList",
        "resources"  => []
    };
}
343
344
345
# Convert Google Vision OCR blocks into Open Annotation resources and append
# them to the 'resources' array of $self->{'openannotation-list'}.
#
# Parameters:
#   $gv_blocks - array ref of Google Vision 'blocks'
#   $doc_obj   - document object (supplies the OID used in the generated ids)
#   $section   - section identifier; "" (or undef) for a single-image document
#
# Each block yields one "oa:Annotation" whose selector is the block's bounding
# rectangle and whose body is the block text as HTML, one <p> per paragraph.
#
# Shape of one generated resource (abridged):
#   {
#     "@context":   "http://iiif.io/api/presentation/2/context.json",
#     "@id":        "<OID[_section]>/annotation/gv-block-<n>",
#     "@type":      "oa:Annotation",
#     "motivation": [ "oa:commenting" ],
#     "on": {
#       "@type":    "oa:SpecificResource",
#       "full":     "<OID>/canvas/<section>",
#       "selector": { "@type": "oa:FragmentSelector", "value": "xywh=622,591,642,940" },
#       "within":   { "@id": "<OID[_section]>/manifest", "@type": "sc:Manifest" }
#     },
#     "resource": [ { "@type": "dctypes:Text", "chars": "<p>...</p>", "format": "text/html" } ]
#   }
sub convert_gvocr_to_openannotation_resource
{
    my $self = shift (@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    # Treat a missing section like the empty one
    $section = "" if (!defined $section);

    my $OID = $doc_obj->get_OID();
    my $OID_with_section = ($section ne "") ? "${OID}_$section" : $OID;
    $section = 1 if ($section eq ""); # occurs when the document is a single image

    my $self_openannotation_resources = $self->{'openannotation-list'}->{'resources'};

    my $block_i = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_i++;

        # #### **** make the ids below full URLs to be unique? or greenstone3:site:collect:OID ??
        my $openannotation_resource = {
            "\@context"  => "http://iiif.io/api/presentation/2/context.json",
            "\@id"       => "${OID_with_section}/annotation/gv-block-$block_i",
            "\@type"     => "oa:Annotation",
            "motivation" => [ "oa:commenting" ]
        };

        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        my $bb_x_org = $bbox_rect->{'x_org'};
        my $bb_y_org = $bbox_rect->{'y_org'};
        my $bb_x_dim = $bbox_rect->{'x_dim'};
        my $bb_y_dim = $bbox_rect->{'y_dim'};

        my $openannotation_on = {
            "\@type"   => "oa:SpecificResource",
            "full"     => "${OID}/canvas/$section", # doc id + /canvas + page-i/sect
            "selector" => {
                "\@type" => "oa:FragmentSelector",
                "value"  => "xywh=${bb_x_org},${bb_y_org},${bb_x_dim},${bb_y_dim}"
            },
            "within"   => {
                "\@id"   => "${OID_with_section}/manifest",
                "\@type" => "sc:Manifest"
            }
        };
        $openannotation_resource->{'on'} = $openannotation_on;

        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my $para_text = "";

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = "";

                foreach my $letter (@{$word->{'symbols'}}) {
                    $word_text .= $letter->{'text'} if (defined $letter->{'text'});
                }

                $para_text .= " " if $para_text ne "";
                $para_text .= $word_text;
            }

            # The body is declared as text/html, so the recognised text must
            # not be able to introduce markup of its own ('&' goes first).
            $para_text =~ s/&/&amp;/g;
            $para_text =~ s/</&lt;/g;
            $para_text =~ s/>/&gt;/g;

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $openannotation_inner_resource = [{
            "\@type"  => "dctypes:Text",
            "chars"   => "$block_text_html",
            "format"  => "text/html"
        }];

        $openannotation_resource->{'resource'} = $openannotation_inner_resource;

        push(@$self_openannotation_resources, $openannotation_resource);
    }
}
455
456
# Load one Google Vision document-OCR JSON file and append its blocks (first
# page only) to the current Open Annotation list.
#
# Parameters:
#   $gv_dococr_json_filename - path of the Google Vision JSON output
#   $doc_obj                 - document object, forwarded to the converter
#   $section                 - section identifier, forwarded to the converter
#
# Dies when the file cannot be opened.
sub convert_and_append_openannotation_resources
{
    my ($self, $gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole file as decoded characters
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $gv_response = JSON::from_json($json_text);
    my $first_page_blocks = $gv_response->{'fullTextAnnotation'}->{'pages'}->[0]->{'blocks'};

    $self->convert_gvocr_to_openannotation_resource($first_page_blocks, $doc_obj, $section);
}
477
478
479
# Serialise the accumulated Open Annotation list to $json_ofilename and
# reset $self->{'openannotation-list'}.
#
# Parameters:
#   $doc_obj        - document object (not used here)
#   $json_ofilename - path of the JSON file to write
#
# Returns 1 on success, 0 when the file could not be opened, written or
# closed (an error is printed to STDERR in that case).
sub end_openannotation_list
{
    my $self = shift (@_);
    my ($doc_obj, $json_ofilename) = @_;

    my $ret_status = 1;

    my $json_fh;
    if (!open($json_fh, ">", $json_ofilename)) {
        print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        # encode_json() already returns UTF-8 encoded bytes, so they must be
        # written through a raw handle; a ':utf8' layer would encode them a
        # second time and corrupt every non-ASCII character.
        binmode($json_fh);

        my $openannotation_list = $self->{'openannotation-list'};
        my $openannotation_list_json_bytes = JSON::encode_json($openannotation_list);

        my $wrote_ok  = print {$json_fh} $openannotation_list_json_bytes;
        my $closed_ok = close($json_fh); # buffered write errors surface here

        if (!$wrote_ok || !$closed_ok) {
            print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'openannotation-list'} = undef;

    return $ret_status;
}
506
# Build (or reuse from the cache) the document-level Open Annotation list
# and associate it with the top section of $doc_obj.
#
# Parameters:
#   $doc_obj                      - document object to associate the file with
#   $gv_dococr_json_filename_recs - array ref of { filename, section } records,
#                                   one per Google Vision document-OCR output;
#                                   guaranteed by the caller to be non-empty
#
# The list is regenerated when no cached copy exists or when any source JSON
# file is newer than the cached copy.
#
# Returns 1 when the list is available and was associated, 0 when saving a
# regenerated list failed.
sub openannotation_list_associate_json
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    # Legacy code used to have a '\d+' just before the '.json' reflecting
    # page/section number.  Keep this in the regular expression, just in
    # case; the leading group is non-greedy so that such digits really are
    # left out of the root.
    my $first_gv_json_filename = $gv_dococr_json_filename_recs->[0]->{'filename'};
    my ($gv_dococr_filename_root) = ($first_gv_json_filename =~ m/^(.+?)(?:\d+)?\.json$/);
    $gv_dococr_filename_root = $first_gv_json_filename if (!defined $gv_dococr_filename_root);

    # sleight of hand so new directory spot in cache_dir picked out is where we want it!
    $gv_dococr_filename_root .= "/";

    $self->init_cache_for_file($gv_dococr_filename_root);
    my $cached_dir = $self->{'cached_dir'};

    my $assoc_openannotation_json_ofile = "openannotation-list.json";
    my $cached_openannotation_json_ofilename
        = &FileUtils::filenameConcatenate($cached_dir, $assoc_openannotation_json_ofile);

    my $needs_json_regen = 0;

    if (!-f $cached_openannotation_json_ofilename) {
        $needs_json_regen = 1;
    }
    else {
        # -M is the file's age in days, so a SMALLER value means a NEWER
        # file.  The cached list is stale when any source is newer than it.
        my $cached_age = -M $cached_openannotation_json_ofilename;

        for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
            my $gv_json_filename = $gv_json_filename_rec->{'filename'};
            my $gv_json_age = -M $gv_json_filename;

            if ($gv_json_age < $cached_age) {
                $needs_json_regen = 1;
                last;
            }
        }
    }

    my $saved_ok = 1;

    if ($needs_json_regen) {

        print $outhandle " OpenAnnotation-List: Generating $cached_openannotation_json_ofilename\n";

        $self->start_openannotation_list($doc_obj);

        for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
            my $gv_json_filename = $gv_json_filename_rec->{'filename'};
            my $section = $gv_json_filename_rec->{'section'};
            $self->convert_and_append_openannotation_resources($gv_json_filename, $doc_obj, $section);
        }

        $saved_ok = $self->end_openannotation_list($doc_obj, $cached_openannotation_json_ofilename);
    }
    else {
        # Only claim the cached copy was reused when it really was
        print $outhandle " OpenAnnotation-List: Cached file $cached_openannotation_json_ofilename already exists\n";
    }

    if ($saved_ok) {
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->associate_file($cached_openannotation_json_ofilename, $assoc_openannotation_json_ofile, "application/json", $top_section);
    }

    return $saved_ok;
}
574
# Generate and associate the Open Annotation list, but only when at least
# one Google Vision document-OCR output has been recorded in
# $self->{'gv-dococr-json-filename-recs'}.
#
# Parameters:
#   $doc_obj - document object to associate the list with
#
# Returns 1 when there was nothing to do, otherwise the status returned by
# openannotation_list_associate_json().
sub opt_run_gen_openannotation
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # The record list only comes into existence once a document-OCR result
    # has been pushed onto it, so it can legitimately be undefined here.
    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};

    my $ret_val_ok = 1;

    if (defined $gv_dococr_json_filename_recs && scalar(@$gv_dococr_json_filename_recs) > 0) {
        $ret_val_ok = $self->openannotation_list_associate_json($doc_obj, $gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
591
5921;
593
Note: See TracBrowser for help on using the repository browser.