source: gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm@ 37065

Last change on this file since 37065 was 37065, checked in by davidb, 16 months ago

An array of changes resulting from testing with the Mirador3 viewer in combination with the SimpleAnnotationServer: most of the changes relate to the fact that SAS works with IIIF v2, and Mirador3 works with v3

File size: 30.2 KB
Line 
1######################################################################
2#
3# GoogleVisionAPIConverter.pm -- helper plugin that allows other plugins
4# (such as ImagePlugin and PagedImagePlugin) to extend their
5# processing capability through sub-classing inheritance (such as
6# GoogleVisionImagePlugin and GoogleVisionPagedImagePlugin) to
7# expand the image processing capabilities at ingest time to
8# include the Google Vision API allowing for: metadata labelling
9# of objects within a scene; and OCR text recognition.
10#
11# A component of the Greenstone digital library software
12# from the New Zealand Digital Library Project at the
13# University of Waikato, New Zealand.
14#
15# Copyright (C) 1999 New Zealand Digital Library Project
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package GoogleVisionAPIConverter;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37no strict 'subs';
38
39use gsprintf;
40use FileUtils;
41
42##use ImagePlugin;
43use BaseMediaConverter;
44
45use utf8;
46use JSON; # qw( from_json, encode_json );
47
# Set up the inheritance chain at compile time: this converter is a
# subclass of BaseMediaConverter.
BEGIN {
    @GoogleVisionAPIConverter::ISA = qw(BaseMediaConverter);
}
51
# Plugin argument table.  The first entry names the service-account
# credentials file (resolved relative to the collection's 'etc' directory
# by run_gv_convert).  The remaining entries are the three on/off feature
# flags, which all share the same shape and whose description keys are:
#   {GoogleVisionAPIConverter.enable_image_labelling}
#   {GoogleVisionAPIConverter.enable_image_ocr}
#   {GoogleVisionAPIConverter.enable_document_ocr}
my $arguments = [
    {
        'name' => 'google_application_credentials',
        'desc' => '{GoogleVisionAPIConverter.google_applicatio_credentials}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => 'google-sa-credentials-key.json',
    },
    (
        map {
            +{
                'name' => $_,
                'desc' => "{GoogleVisionAPIConverter.$_}",
                'type' => 'flag',
                'reqd' => 'no',
            }
        } qw(enable_image_labelling enable_image_ocr enable_document_ocr)
    ),
];

# Plugin option record handed to the argument-parsing machinery by new().
my $options = {
    'name'     => 'GoogleVisionAPIConverter',
    'desc'     => '{GoogleVisionAPIConverter.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
78
# Constructor.
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed straight through to BaseMediaConverter
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument table and option record
# Returns the BaseMediaConverter object re-blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless($self, $class);
}
91
# Start-of-run hook.  Image OCR and document OCR are mutually exclusive:
# when both flags are set, a usage message is written to STDERR and the
# process exits with status 2.  Otherwise control is passed on to the
# parent class's begin() with the same arguments.
sub begin {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    my $both_ocr_modes_requested
        = ($self->{'enable_image_ocr'} && $self->{'enable_document_ocr'});

    if ($both_ocr_modes_requested) {
        my @usage_lines = (
            "Please use the following command syntax for vision types: (--enable_image_ocr | --enable_document_ocr) [--enable_image_labelling]\n",
            "\t\t --enable_image_ocr : optical character recognition for text within images\n",
            "\t\t --enable_document_ocr : optical character recognition for text within documents\n",
            "\t\t --enable_image_labelling : annotation labeling for objects within images\n",
        );

        foreach my $usage_line (@usage_lines) {
            print STDERR $usage_line;
        }
        exit(2);
    }

    $self->SUPER::begin(@_);
}
106
# Classify one line of output from the vision command.
# Returns the pair ($had_error, $generate_dot):
#   $had_error    - always 0; no error detection is performed here
#   $generate_dot - 1 when the line matches /^.*$/ (i.e. it contains no
#                   newline other than an optional trailing one), else 0
sub vision_monitor_line {
    my ($line) = @_;

    my $had_error    = 0;
    my $generate_dot = ($line =~ m/^.*$/) ? 1 : 0;

    return ($had_error, $generate_dot);
}
120
# Run every enabled Google Vision operation over one image file.
#   $filename    - full path of the image to process
#   $file        - file name used in progress messages
#   $doc_obj     - document object that receives text/metadata/associated files
#   $opt_section - optional section; defaults to the document's top section
# For each enabled vision type a vision.py command line is built and handed
# to run_vision().  Document-OCR outputs are additionally remembered in
# $self->{'gv-dococr-json-filename-recs'} (as {filename, section} records).
# Always returns the string "json".
sub run_gv_convert {
    my $self = shift(@_);
    my ($filename, $file, $doc_obj, $opt_section) = @_;

    my $section = $opt_section;
    if (!defined $section) {
        $section = $doc_obj->get_top_section();
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "----- GoogleVisionAPIConveter run_gv_convert -----\n";

    # Target ocr / labelling type(s), in the fixed order they are processed
    my @vision_types = grep { $self->{$_} }
        qw(enable_image_labelling enable_image_ocr enable_document_ocr);

    if (scalar(@vision_types) == 0) {
        return "json";
    }

    $self->init_cache_for_file($filename);
    my $cached_image_dir = $self->{'cached_dir'};

    my $credentials_filename = FileUtils::filenameConcatenate(
        $ENV{'GSDLCOLLECTDIR'}, "etc", $self->{'google_application_credentials'});

    foreach my $vision_type (@vision_types) {
        my $ofile     = "${vision_type}-google-vision-output.json";
        my $ofilename = FileUtils::filenameConcatenate($cached_image_dir, $ofile);

        my $vision_cmd
            = qq{vision.py --$vision_type --credentials "$credentials_filename" "$filename" "$ofilename"};

        $self->run_vision($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section);

        next unless ($vision_type eq "enable_document_ocr");

        # Remember document-OCR output for later annotation-list generation
        push(@{ $self->{'gv-dococr-json-filename-recs'} },
             { 'filename' => $ofilename, 'section' => $section });
    }

    return "json";
}
169
# Compute the axis-aligned bounding rectangle of a Google Vision block.
#   $gv_block - hash ref with a 'boundingBox' => { 'vertices' => [ {x,y}, ... ] }
# Returns a hash ref { x_org, y_org, x_dim, y_dim } where the origin is the
# minimum x/y over all vertices and each dimension is (max - min + 1), or
# undef when the block has no bounding box or no vertices.
#
# A vertex may lack its 'x' and/or 'y' entry.  The Vision JSON output omits
# zero-valued coordinates, so a missing coordinate is taken to be 0 rather
# than skipped; skipping it would give boxes touching the left/top edge of
# the image the wrong origin and size.
sub gv_ocr_bounding_box_rect
{
    my $self = shift (@_);
    my ($gv_block) = @_;

    my $gv_boundingBox = (ref($gv_block) eq 'HASH') ? $gv_block->{'boundingBox'} : undef;
    my $gv_vertices    = (ref($gv_boundingBox) eq 'HASH') ? $gv_boundingBox->{'vertices'} : undef;

    if (ref($gv_vertices) ne 'ARRAY' || scalar(@$gv_vertices) == 0) {
        return undef;
    }

    my ($min_x, $min_y, $max_x, $max_y);

    foreach my $vertex (@$gv_vertices) {
        my $x = (defined $vertex->{'x'}) ? $vertex->{'x'} : 0;
        my $y = (defined $vertex->{'y'}) ? $vertex->{'y'} : 0;

        $min_x = $x if (!defined $min_x || ($x < $min_x));
        $max_x = $x if (!defined $max_x || ($x > $max_x));

        $min_y = $y if (!defined $min_y || ($y < $min_y));
        $max_y = $y if (!defined $max_y || ($y > $max_y));
    }

    my $bbox_rect = {
        "x_org" => $min_x,
        "y_org" => $min_y,
        "x_dim" => $max_x - $min_x + 1,
        "y_dim" => $max_y - $min_y + 1
    };

    return $bbox_rect;
}
220
# Run one vision command (through the cached-command mechanism) and fold
# its JSON output into the document.
#   $file, $filename - display name / full path of the source image
#   $ofile, $ofilename - name / full path of the JSON output file
#   $vision_cmd  - command line to execute
#   $vision_type - "enable_image_labelling", "enable_image_ocr" or
#                  "enable_document_ocr"
#   $doc_obj, $section - document object and section to update
#
# Steps:
#   * the JSON file is associated with the section under a name where the
#     'enable_' prefix becomes 'gv_', '-google-vision' is dropped, and the
#     section number (dots turned into underscores) is inserted before the
#     extension to avoid clashes between sections
#   * OCR types: the full OCR text is appended to the section, the text of
#     each detected language is stored as 'z_<lang>' metadata ('z_no_lang'
#     when no language was detected), and GV*OCRJSON / HasGoogleVision*OCRJSON
#     metadata is set
#   * labelling: each label's description/score/topicality/mid is stored
#     both under generic names and under '<description>_<field>' names
# Dies if the JSON output file cannot be opened.
sub run_vision
{
    my $self = shift(@_);
    my ($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section) = @_;

    my %print_info = (
        'message_prefix' => "GoogleVisionAPI",
        'message'        => "Sending $file to GoogleVisionAPI using vision.py"
    );

    my ($vision_regenerated, $vision_result, $vision_error)
        = $self->run_cached_general_cmd($vision_cmd, $filename, $ofilename, \%print_info);

    # Work out the name the JSON file is associated under
    (my $section_file_suffix = $section) =~ s/\./_/g;

    my $assoc_ofile = $ofile;
    for ($assoc_ofile) {
        s/^enable_/gv_/;
        s/-google-vision//;
        s/\.(.*?)$/$section_file_suffix.$1/;
    }

    $doc_obj->associate_file($ofilename, $assoc_ofile, "application/json", $section);

    # Slurp and decode the JSON output
    open(my $json_fh, "<:encoding(UTF-8)", $ofilename)
        or die("Can't open \"$ofilename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    if ($vision_type eq "enable_document_ocr" || $vision_type eq "enable_image_ocr") {
        # The first text annotation holds the full OCR content
        my $full_ocr_text = $decoded_json->{'textAnnotations'}[0]{'description'};
        $doc_obj->add_utf8_text($section, $full_ocr_text);

        # Gather the words of the first page, grouped by detected language
        my %text_by_language;
        my $page_blocks = $decoded_json->{'fullTextAnnotation'}{'pages'}[0]{'blocks'};

        foreach my $block (@{$page_blocks}) {
            foreach my $paragraph (@{ $block->{'paragraphs'} }) {
                foreach my $word (@{ $paragraph->{'words'} }) {
                    my $language
                        = $word->{'property'}{'detectedLanguages'}[0]{'languageCode'} || "no_lang";

                    my $word_text = "";
                    foreach my $symbol (@{ $word->{'symbols'} }) {
                        $word_text .= $symbol->{'text'};
                    }

                    $text_by_language{$language} .= $word_text . " ";
                }
            }
        }

        foreach my $language (keys %text_by_language) {
            $doc_obj->add_utf8_metadata($section, "z_" . $language, $text_by_language{$language});
        }

        my ($json_metaname, $has_json_metaname)
            = ($vision_type eq "enable_document_ocr")
            ? ("GVDocumentOCRJSON", "HasGoogleVision" . "DocumentOCRJSON")
            : ("GVImageOCRJSON",    "HasGoogleVision" . "ImageOCRJSON");

        $doc_obj->add_utf8_metadata($section, $json_metaname, $assoc_ofile);
        $doc_obj->add_utf8_metadata($section, $has_json_metaname, 1);
    }
    elsif ($vision_type eq "enable_image_labelling") {
        my $labels       = $decoded_json->{'labelAnnotations'};
        my @label_fields = ("score", "topicality", "mid");

        foreach my $label (@{$labels}) {
            my $description = $label->{'description'};

            # e.g. 'description'='Book' 'score'='0.9' 'topicality'='0.9' 'mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "description", $description);
            foreach my $field (@label_fields) {
                $doc_obj->add_utf8_metadata($section, $field, $label->{$field});
            }

            # e.g. 'descriptions'='Book' 'Book_score'='0.9' 'Book_topicality'='0.9' 'Book_mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "descriptions", $description);
            foreach my $field (@label_fields) {
                $doc_obj->add_utf8_metadata($section, $description . "_" . $field, $label->{$field});
            }
        }

        $doc_obj->add_utf8_metadata($section, "HasGoogleVisionImageLabellingJSON", 1);
        $doc_obj->add_utf8_metadata($section, "GVImageLabellingJSON", $assoc_ofile);
    }
}
325
# Begin a new (empty) Open Annotation list for a document.
#   $doc_obj - document whose OID is used in the list's '@id'
#   $section - accepted but not used
# Stores the list skeleton in $self->{'openannotation-list'} and the URI
# prefix ("http-greenstone://[<site>/]<collection>/") in
# $self->{'openannotation-uri-prefix'}.
#
# Because the prefix embeds the site and the GSDLCOLLECTION name, the
# generated JSON is bound to the site/collection where it was imported.
# If a collection is renamed at the file system level, then
#   (i)  the versions of openannotation-list*.json in the collection's
#        'cache' dir need to be removed
#   (ii) and the collection rebuilt
sub start_openannotation_list
{
    my $self = shift(@_);
    my ($doc_obj, $section) = @_;

    my $site    = $self->{'site'};
    my $collect = $ENV{'GSDLCOLLECTION'};

    my $uri_prefix = "http-greenstone://";
    if ((defined $site) && $site ne "") {
        $uri_prefix .= "${site}/";    # GS3 specific
    }
    $uri_prefix .= "${collect}/";

    my $OID = $doc_obj->get_OID();

    $self->{'openannotation-list'} = {
        "\@context" => "http://www.shared-canvas.org/ns/context.json",
        "\@id"      => "${uri_prefix}${OID}/openannotation-list.json",
        "\@type"    => "sc:AnnotationList",
        "resources" => []
    };
    $self->{'openannotation-uri-prefix'} = $uri_prefix;
}
360
361
# Turn each Google Vision OCR block into an Open Annotation resource and
# append it to the list started by start_openannotation_list().
#   $gv_blocks - array ref of Vision 'blocks' (paragraphs/words/symbols)
#   $doc_obj   - document whose OID is used in the generated URIs
#   $section   - section number; "" occurs when the document is a single
#                image, in which case the canvas is numbered 1
#
# Each generated resource has this shape (IIIF Presentation v2 style):
#   {
#     "@context": "http://iiif.io/api/presentation/2/context.json",
#     "@id": "<prefix><OID>[_<section>]/annotation/gv-block-<n>",
#     "@type": "oa:Annotation",
#     "motivation": [ "oa:commenting" ],
#     "on": [ { "@type": "oa:SpecificResource",
#               "full": "<prefix><OID>/canvas/<section>",
#               "selector": { "@type": "oa:Choice",
#                             "default": { oa:FragmentSelector, "xywh=x,y,w,h" },
#                             "item":    { oa:SvgSelector, "<svg>...</svg>" } } } ],
#     "resource": [ { "@type": "dctypes:Text", "chars": "<p>...</p>",
#                     "format": "text/html" } ]
#   }
# Alternatives that are deliberately not emitted: a "within" manifest entry
# on the target, the plain string form of "on"
# ("<canvas>#xywh=x,y,w,h"), and a "cnt:ContentAsText" text/plain body.
sub convert_gvocr_to_openannotation_resource
{
    my $self = shift(@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    if ($section ne "") {
        $OID_with_section = "${OID}_$section";
    }
    else {
        $section = 1;    # occurs when the document is a single image
    }

    my $list_resources = $self->{'openannotation-list'}->{'resources'};
    my $uri_prefix     = $self->{'openannotation-uri-prefix'};

    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$section";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        # Where on the canvas the block sits
        my $rect = $self->gv_ocr_bounding_box_rect($block);
        my ($x, $y, $w, $h) = ($rect->{'x_org'}, $rect->{'y_org'}, $rect->{'x_dim'}, $rect->{'y_dim'});
        my $y_bottom = $y + $h;

        # SVG outline of the same rectangle: start bottom-left, go up,
        # right, down, and close the path
        my $svg_path_d = "M${x},${y_bottom}v-${h}h${w}v${h}z";
        my $svg_selector_value
            = "<svg xmlns='http://www.w3.org/2000/svg'>"
            . "<path xmlns='http://www.w3.org/2000/svg' d='${svg_path_d}'"
            . " data-paper-data='{&quot;state&quot;:null}' fill='none' fill-rule='nonzero'"
            . " stroke='#008000' stroke-width='1' stroke-linecap='butt' stroke-linejoin='miter'"
            . " stroke-miterlimit='10' stroke-dasharray='' stroke-dashoffset='0'"
            . " font-family='none' font-weight='none' font-size='none' text-anchor='none'"
            . " style='mix-blend-mode: normal'/></svg>";

        my $annotation_target = [ {
            "\@type"   => "oa:SpecificResource",
            "full"     => $canvas_full_uri,
            "selector" => {
                "\@type"  => "oa:Choice",
                "default" => {
                    "\@type" => "oa:FragmentSelector",
                    "value"  => "xywh=${x},${y},${w},${h}"
                },
                "item" => {
                    "\@type" => "oa:SvgSelector",
                    "value"  => $svg_selector_value
                }
            }
        } ];

        # The block's text: one <p> per paragraph, words separated by a space
        my $block_text_html = "";

        foreach my $paragraph (@{ $block->{'paragraphs'} }) {
            my $para_text = "";

            foreach my $word (@{ $paragraph->{'words'} }) {
                my $word_text = "";
                foreach my $symbol (@{ $word->{'symbols'} }) {
                    $word_text .= $symbol->{'text'};
                }

                if ($para_text ne "") {
                    $para_text .= " ";
                }
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $annotation_body = [ {
            "\@type" => "dctypes:Text",
            "chars"  => "$block_text_html",
            "format" => "text/html"
        } ];

        my $openannotation_resource = {
            "\@context"  => "http://iiif.io/api/presentation/2/context.json",
            "\@id"       => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            "\@type"     => "oa:Annotation",
            "motivation" => [ "oa:commenting" ],
            "on"         => $annotation_target,
            "resource"   => $annotation_body
        };

        push(@$list_resources, $openannotation_resource);
    }
}
513
514
# Load a Google Vision document-OCR JSON file and append one Open
# Annotation resource per block of its first page to the current list.
#   $gv_dococr_json_filename - path of the Vision JSON output (UTF-8)
#   $doc_obj, $section       - passed on to the per-block converter
# Dies if the JSON file cannot be opened.
sub convert_and_append_openannotation_resources
{
    my $self = shift(@_);
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    # Only the first page of the full-text annotation is considered
    my $first_page = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks  = $first_page->{'blocks'};

    $self->convert_gvocr_to_openannotation_resource($gv_blocks, $doc_obj, $section);
}
535
536
537
# Serialise the current Open Annotation list to disk and reset the
# per-list state.
#   $doc_obj        - accepted but not used
#   $json_ofilename - path of the JSON file to (over)write
# Returns 1 on success, 0 if the file could not be opened or written (an
# error is printed to STDERR).  In either case
# $self->{'openannotation-list'} and $self->{'openannotation-uri-prefix'}
# are cleared afterwards.
sub end_openannotation_list
{
    my $self = shift (@_);
    my ($doc_obj,$json_ofilename) = @_;

    my $ret_status = 1;

    my $json_fh;
    if (!open($json_fh, ">", $json_ofilename)) {
        print STDERR "Error: Failed to save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        # JSON::encode_json() already returns UTF-8 encoded octets, so the
        # handle must stay a plain byte stream: pushing a ':utf8' layer on
        # top would encode every non-ASCII character a second time.
        binmode($json_fh);

        my $openannotation_list = $self->{'openannotation-list'};
        my $openannotation_list_json_text = JSON::encode_json($openannotation_list);

        my $printed_ok = print {$json_fh} $openannotation_list_json_text;
        my $closed_ok  = close($json_fh);    # buffered write errors surface here

        if (!$printed_ok || !$closed_ok) {
            print STDERR "Error: Failed to save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'openannotation-list'} = undef;
    $self->{'openannotation-uri-prefix'} = undef;

    return $ret_status;
}
565
566
# For every recorded document-OCR JSON file, make sure an up-to-date
# Open Annotation list exists in the cache and associate it with the
# document's top section.
#   $doc_obj                      - document to associate the lists with
#   $gv_dococr_json_filename_recs - array ref of { filename, section } records
# Returns 1 if every list was available/saved, 0 if any save failed.
sub openannotation_list_associate_json
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section = $gv_json_filename_rec->{'section'};

        # The cache location is derived from the FIRST record's filename, so
        # the lists for all sections end up in the same cache directory.
        # NOTE(review): presumably intentional (the per-section file names
        # keep them apart) -- confirm.
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_openannotation_json_ofile = "openannotation-list${section}.json";
        my $cached_openannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir,$assoc_openannotation_json_ofile);

        my $needs_json_regen = 0;

        if (!-f $cached_openannotation_json_ofilename) {
            $needs_json_regen = 1;
        }
        elsif (-M $gv_json_filename < -M $cached_openannotation_json_ofilename) {
            # -M yields a file's AGE, so a smaller value means more recently
            # modified: the Vision output is newer than the cached list,
            # which is therefore stale.
            $needs_json_regen = 1;
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " OpenAnnotation-List: Generating $cached_openannotation_json_ofilename\n";

            $self->start_openannotation_list($doc_obj);
            $self->convert_and_append_openannotation_resources($gv_json_filename, $doc_obj,$section);

            $saved_ok = $self->end_openannotation_list($doc_obj,$cached_openannotation_json_ofilename);
        }
        else {
            print $outhandle " OpenAnnotation-List: Cached file $cached_openannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_openannotation_json_ofilename,$assoc_openannotation_json_ofile,"application/json",$top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
632
633
# Generate and associate Open Annotation lists, but only when document
# OCR actually produced JSON output for this run.
#   $doc_obj - document to associate the generated lists with
# Returns 1 when there was nothing to do, otherwise the status returned by
# openannotation_list_associate_json().
sub opt_run_gen_openannotation
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # The record list only exists once run_gv_convert() has seen a
    # document-OCR result, so it may legitimately be undefined here; test
    # for that explicitly rather than dereferencing undef.
    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};
    my $num_gv_dococr_json_filename_recs
        = (ref($gv_dococr_json_filename_recs) eq 'ARRAY') ? scalar(@$gv_dococr_json_filename_recs) : 0;

    my $ret_val_ok = 1;

    if ($num_gv_dococr_json_filename_recs > 0) {
        $ret_val_ok = $self->openannotation_list_associate_json($doc_obj,$gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
650
651
652
# (In progress) Begin a new, empty Web Annotation list for a document.
#   $doc_obj - document whose OID is used in the list's '@id'
#   $section - accepted but not used
# Stores the list skeleton in $self->{'webannotation-list'} and the URI
# prefix ("http-greenstone://[<site>/]<collection>/") in
# $self->{'webannotation-uri-prefix'}.
#
# Because the prefix embeds the site and the GSDLCOLLECTION name, the
# generated JSON is bound to the site/collection where it was imported.
# If a collection is renamed at the file system level, then
#   (i)  the versions of webannotation-list*.json in the collection's
#        'cache' dir need to be removed
#   (ii) and the collection rebuilt
sub start_webannotation_list_INPROGRESS
{
    my $self = shift(@_);
    my ($doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my @prefix_parts = ("http-greenstone://");

    my $site = $self->{'site'};
    if ((defined $site) && $site ne "") {
        push(@prefix_parts, "${site}/");    # GS3 specific
    }

    my $collect = $ENV{'GSDLCOLLECTION'};
    push(@prefix_parts, "${collect}/");

    my $uri_prefix = join("", @prefix_parts);

    $self->{'webannotation-list'} = {
        "\@context" => "http://www.shared-canvas.org/ns/context.json",
        "\@id"      => "${uri_prefix}${OID}/webannotation-list.json",
        "\@type"    => "sc:AnnotationList",
        "resources" => []
    };
    $self->{'webannotation-uri-prefix'} = $uri_prefix;
}
686
687
688
689
# (In progress) Turn each Google Vision OCR block into a Web Annotation
# resource and append it to the list started by
# start_webannotation_list_INPROGRESS().
#   $gv_blocks - array ref of Vision 'blocks' (paragraphs/words/symbols)
#   $doc_obj   - document whose OID is used in the generated URIs
#   $section   - section number; "" occurs when the document is a single
#                image, in which case the canvas is numbered 1
#
# Details on the difference between OpenAnnotation and WebAnnotation are covered at
# https://www.google.com/search?q=iiif+simpleannotationserver&sxsrf=ALiCzsbIpm1YO0SYE9sCXBQ231_oyEmopw:1672137985013&source=lnms&tbm=vid&sa=X&ved=2ahUKEwizu_K0z5n8AhXF1DgGHQ7FCb4Q_AUoA3oECAEQBQ&biw=1536&bih=742&dpr=1.25#fpstate=ive&vld=cid:07a4e9d9,vid:gFNWWIe5QpM
sub convert_gvocr_to_webannotation_resource_INPROGRESS
{
    my $self = shift(@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    if ($section ne "") {
        $OID_with_section = "${OID}_$section";
    }
    else {
        $section = 1;    # occurs when the document is a single image
    }

    my $list_resources = $self->{'webannotation-list'}->{'resources'};
    my $uri_prefix     = $self->{'webannotation-uri-prefix'};

    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$section";
    my $manifest_id_uri = "${uri_prefix}${OID_with_section}/manifest";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        my $rect = $self->gv_ocr_bounding_box_rect($block);
        my ($x, $y, $w, $h) = ($rect->{'x_org'}, $rect->{'y_org'}, $rect->{'x_dim'}, $rect->{'y_dim'});

        # Needs updating -- see the 'on' target built by
        # convert_gvocr_to_openannotation_resource !!!!!!! *********
        my $annotation_target = [ {
            "type"     => "oa:SpecificResource",
            "full"     => $canvas_full_uri,
            "selector" => {
                "type"  => "oa:FragmentSelector",
                "value" => "xywh=${x},${y},${w},${h}"
            },
            "within" => {
                "id"   => $manifest_id_uri,
                "type" => "sc:Manifest"
            }
        } ];

        # The block's text: one <p> per paragraph, words separated by a space
        my $block_text_html = "";

        foreach my $paragraph (@{ $block->{'paragraphs'} }) {
            my $para_text = "";

            foreach my $word (@{ $paragraph->{'words'} }) {
                my $word_text = "";
                foreach my $symbol (@{ $word->{'symbols'} }) {
                    $word_text .= $symbol->{'text'};
                }

                if ($para_text ne "") {
                    $para_text .= " ";
                }
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $annotation_body = [ {
            "type"   => "TextualBody",
            "chars"  => "$block_text_html",
            "format" => "text/html"
        } ];

        my $webannotation_resource = {
            "\@context"  => "http://iiif.io/api/presentation/2/context.json",
            "id"         => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            "type"       => "Annotation",
            "motivation" => [ "commenting" ],
            "target"     => $annotation_target,
            "body"       => $annotation_body
        };

        push(@$list_resources, $webannotation_resource);
    }
}
777
778
# (In progress) Load a Google Vision document-OCR JSON file and append one
# Web Annotation resource per block of its first page to the current list.
#   $gv_dococr_json_filename - path of the Vision JSON output (UTF-8)
#   $doc_obj, $section       - passed on to the per-block converter
# Dies if the JSON file cannot be opened.
sub convert_and_append_webannotation_resources_INPROGRESS
{
    my $self = shift(@_);
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    my $json_fh;
    if (!open($json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)) {
        die("Can't open \"$gv_dococr_json_filename\": $!\n");
    }
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    # Only the first page of the full-text annotation is considered
    my $pages     = $decoded_json->{'fullTextAnnotation'}->{'pages'};
    my $gv_blocks = $pages->[0]->{'blocks'};

    $self->convert_gvocr_to_webannotation_resource_INPROGRESS($gv_blocks, $doc_obj, $section);
}
799
800
801
# (In progress) Serialise the current Web Annotation list to disk and
# reset the per-list state.
#   $doc_obj        - accepted but not used
#   $json_ofilename - path of the JSON file to (over)write
# Returns 1 on success, 0 if the file could not be opened or written (an
# error is printed to STDERR).  In either case
# $self->{'webannotation-list'} and $self->{'webannotation-uri-prefix'}
# are cleared afterwards.
sub end_webannotation_list_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj,$json_ofilename) = @_;

    my $ret_status = 1;

    my $json_fh;
    if (!open($json_fh, ">", $json_ofilename)) {
        print STDERR "Error: Failed to save Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        # JSON::encode_json() already returns UTF-8 encoded octets, so the
        # handle must stay a plain byte stream: pushing a ':utf8' layer on
        # top would encode every non-ASCII character a second time.
        binmode($json_fh);

        my $webannotation_list = $self->{'webannotation-list'};
        my $webannotation_list_json_text = JSON::encode_json($webannotation_list);

        my $printed_ok = print {$json_fh} $webannotation_list_json_text;
        my $closed_ok  = close($json_fh);    # buffered write errors surface here

        if (!$printed_ok || !$closed_ok) {
            print STDERR "Error: Failed to save Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'webannotation-list'} = undef;
    $self->{'webannotation-uri-prefix'} = undef;

    return $ret_status;
}
829
# (In progress) For every recorded document-OCR JSON file, make sure an
# up-to-date Web Annotation list exists in the cache and associate it with
# the document's top section.
#   $doc_obj                      - document to associate the lists with
#   $gv_dococr_json_filename_recs - array ref of { filename, section } records
# Returns 1 if every list was available/saved, 0 if any save failed.
sub webannotation_list_associate_json_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section = $gv_json_filename_rec->{'section'};

        # The cache location is derived from the FIRST record's filename, so
        # the lists for all sections end up in the same cache directory.
        # NOTE(review): presumably intentional (the per-section file names
        # keep them apart) -- confirm.
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_webannotation_json_ofile = "webannotation-list${section}.json";
        my $cached_webannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir,$assoc_webannotation_json_ofile);

        my $needs_json_regen = 0;

        if (!-f $cached_webannotation_json_ofilename) {
            $needs_json_regen = 1;
        }
        elsif (-M $gv_json_filename < -M $cached_webannotation_json_ofilename) {
            # -M yields a file's AGE, so a smaller value means more recently
            # modified: the Vision output is newer than the cached list,
            # which is therefore stale.
            $needs_json_regen = 1;
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " WebAnnotation-List: Generating $cached_webannotation_json_ofilename\n";

            $self->start_webannotation_list_INPROGRESS($doc_obj);
            $self->convert_and_append_webannotation_resources_INPROGRESS($gv_json_filename, $doc_obj,$section);

            $saved_ok = $self->end_webannotation_list_INPROGRESS($doc_obj,$cached_webannotation_json_ofilename);
        }
        else {
            print $outhandle " WebAnnotation-List: Cached file $cached_webannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_webannotation_json_ofilename,$assoc_webannotation_json_ofile,"application/json",$top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
895
896
# (In progress) Generate and associate Web Annotation lists, but only when
# document OCR actually produced JSON output for this run.
#   $doc_obj - document to associate the generated lists with
# Returns 1 when there was nothing to do, otherwise the status returned by
# webannotation_list_associate_json_INPROGRESS().
sub opt_run_gen_webannotation_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # The record list only exists once run_gv_convert() has seen a
    # document-OCR result, so it may legitimately be undefined here; test
    # for that explicitly rather than dereferencing undef.
    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};
    my $num_gv_dococr_json_filename_recs
        = (ref($gv_dococr_json_filename_recs) eq 'ARRAY') ? scalar(@$gv_dococr_json_filename_recs) : 0;

    my $ret_val_ok = 1;

    if ($num_gv_dococr_json_filename_recs > 0) {
        # The worker sub carries the _INPROGRESS suffix like the rest of the
        # Web Annotation code path; the unsuffixed name does not exist.
        $ret_val_ok = $self->webannotation_list_associate_json_INPROGRESS($doc_obj,$gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
913
914
9151;
916
Note: See TracBrowser for help on using the repository browser.