source: gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm@ 37117

Last change on this file since 37117 was 37117, checked in by davidb, 16 months ago

Allow for a page having no OCR'd text found

File size: 30.3 KB
Line 
1######################################################################
2#
3# GoogleVisionAPIConverter.pm -- helper plugin that allows other plugins
4# (such as ImagePlugin and PagedImagePlugin) to extend their
5# processing capability through sub-classing inheritance (such as
6# GoogleVisionImagePlugin and GoogleVisionPagedImagePlugin) to
7# expand the image processing capabilities at ingest time to
8# include the Google Vision API allowing for: metadata labelling
9# of objects within a scene; and OCR text recognition.
10#
11# A component of the Greenstone digital library software
12# from the New Zealand Digital Library Project at the
13# University of Waikato, New Zealand.
14#
15# Copyright (C) 1999 New Zealand Digital Library Project
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package GoogleVisionAPIConverter;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37no strict 'subs';
38
39use gsprintf;
40use FileUtils;
41
42##use ImagePlugin;
43use BaseMediaConverter;
44
45use utf8;
46use JSON; # qw( from_json, encode_json );
47
# Set up the inheritance chain at compile time: this converter is a
# BaseMediaConverter.
sub BEGIN {
    @GoogleVisionAPIConverter::ISA = qw(BaseMediaConverter);
}
51
# Arguments this converter contributes to any plugin that inherits from it.
# The 'desc' values are brace-wrapped keys (presumably looked up in a strings
# resource bundle -- confirm against the plugin framework).
# NOTE(review): the first 'desc' key is spelt "google_applicatio_credentials";
# it is kept as-is here and should be checked against the resource bundle
# before being corrected.
my $arguments = [
    {
        'name' => "google_application_credentials",
        'desc' => "{GoogleVisionAPIConverter.google_applicatio_credentials}",
        'type' => "string",
        'reqd' => "no",
        # File name; run_gv_convert() resolves it inside the collection's
        # 'etc' directory.
        'deft' => "google-sa-credentials-key.json",
    },
    {
        'name' => "enable_image_labelling",
        'desc' => "{GoogleVisionAPIConverter.enable_image_labelling}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "enable_image_ocr",
        'desc' => "{GoogleVisionAPIConverter.enable_image_ocr}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "enable_document_ocr",
        'desc' => "{GoogleVisionAPIConverter.enable_document_ocr}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Option record describing this converter; new() pushes it onto the
# caller-supplied "OptList".
my $options = {
    'name'     => "GoogleVisionAPIConverter",
    'desc'     => "{GoogleVisionAPIConverter.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
78
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to BaseMediaConverter
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive
#                      this converter's arguments and option record
# Returns the object built by BaseMediaConverter, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless($self, $class);
}
91
# Validates the combination of vision flags before delegating to the parent
# class's begin().  Image OCR and document OCR are mutually exclusive: if
# both are switched on, a usage message is written to STDERR and the process
# exits with status 2.
sub begin {
    my $self = shift;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    my $both_ocr_modes_on = $self->{'enable_image_ocr'} && $self->{'enable_document_ocr'};

    if ($both_ocr_modes_on) {
        my @usage_lines = (
            "Please use the following command syntax for vision types: (--enable_image_ocr | --enable_document_ocr) [--enable_image_labelling]\n",
            "\t\t --enable_image_ocr : optical character recognition for text within images\n",
            "\t\t --enable_document_ocr : optical character recognition for text within documents\n",
            "\t\t --enable_image_labelling : annotation labeling for objects within images\n",
        );

        foreach my $usage_line (@usage_lines) {
            print STDERR $usage_line;
        }
        exit(2);
    }

    # @_ still holds the arguments that followed $self
    $self->SUPER::begin(@_);
}
106
# Inspects one line of output and reports how it should be treated.
# Returns the list ($had_error, $generate_dot):
#   $had_error    - always 0; no error detection is performed here
#   $generate_dot - 1 when the line matches /^.*$/ (i.e. it contains no
#                   newline other than an optional trailing one), else 0
sub vision_monitor_line {
    my ($line) = @_;

    my $had_error = 0;
    my $generate_dot = ($line =~ m/^.*$/) ? 1 : 0;

    return ($had_error, $generate_dot);
}
120
# Runs every enabled Google Vision operation over one image file.
#   $filename    - full path of the image
#   $file        - file name handed on to run_vision() for its progress message
#   $doc_obj     - document object that receives text, metadata and
#                  associated files
#   $opt_section - optional section id; defaults to the document's top section
# For each enabled vision type the JSON output is cached under the file's
# cache directory and processed by run_vision().  Document-OCR outputs are
# additionally recorded in $self->{'gv-dococr-json-filename-recs'} so that
# annotation lists can be produced from them later.
# Always returns the string "json".
sub run_gv_convert {
    my $self = shift;
    my ($filename, $file, $doc_obj, $opt_section) = @_;

    my $section = $opt_section;
    $section = $doc_obj->get_top_section() unless (defined $section);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "----- GoogleVisionAPIConveter run_gv_convert -----\n";

    # Target ocr / labelling type(s), in the order they are to be run
    my @vision_types = grep { $self->{$_} }
        ("enable_image_labelling", "enable_image_ocr", "enable_document_ocr");

    if (@vision_types) {

        $self->init_cache_for_file($filename);
        my $cache_dir = $self->{'cached_dir'};

        # The credentials file lives in the collection's 'etc' directory
        my $credentials_filename
            = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", $self->{'google_application_credentials'});

        foreach my $vision_type (@vision_types) {

            my $ofile = "${vision_type}-google-vision-output.json";
            my $ofilename = &FileUtils::filenameConcatenate($cache_dir, $ofile);

            my $vision_cmd = "vision.py --$vision_type --credentials \"$credentials_filename\" \"$filename\" \"$ofilename\"";

            $self->run_vision($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section);

            next unless ($vision_type eq "enable_document_ocr");

            # Remember where the document-OCR JSON ended up, and for which section
            push(@{$self->{'gv-dococr-json-filename-recs'}},
                 { 'filename' => $ofilename, 'section' => $section });
        }
    }

    return "json";
}
169
# Computes the axis-aligned rectangle that encloses the 'boundingBox'
# vertices of a Google Vision text block.
#   $gv_block - hash ref with a 'boundingBox' => { 'vertices' => [ {x,y}, ... ] }
# Returns a hash ref { x_org, y_org, x_dim, y_dim }, or undef when the block
# carries no vertices.
#
# The 'x' or 'y' member of a vertex is sometimes absent from the JSON.
# An absent coordinate is taken to be 0 (zero-valued fields being left out
# of the serialised response) rather than being skipped: skipping it would
# make a box that starts at the image edge report the wrong origin and size,
# and would leave the origin undefined if the member was missing from every
# vertex.
sub gv_ocr_bounding_box_rect
{
    my $self = shift (@_);
    my ($gv_block,) = @_;

    my $gv_boundingBox = (ref($gv_block) eq 'HASH') ? $gv_block->{'boundingBox'} : undef;
    my $gv_vertices = (ref($gv_boundingBox) eq 'HASH') ? $gv_boundingBox->{'vertices'} : undef;

    # 'return undef' (not a bare return) keeps the original one-value result
    if (ref($gv_vertices) ne 'ARRAY' || scalar(@$gv_vertices) == 0) {
        return undef;
    }

    my $min_x = undef;
    my $min_y = undef;
    my $max_x = undef;
    my $max_y = undef;

    foreach my $gv_vertex (@$gv_vertices) {
        my $x = (defined $gv_vertex->{'x'}) ? $gv_vertex->{'x'} : 0;
        my $y = (defined $gv_vertex->{'y'}) ? $gv_vertex->{'y'} : 0;

        $min_x = $x if (!defined $min_x || ($x < $min_x));
        $max_x = $x if (!defined $max_x || ($x > $max_x));

        $min_y = $y if (!defined $min_y || ($y < $min_y));
        $max_y = $y if (!defined $max_y || ($y > $max_y));
    }

    # Dimensions are inclusive of both edge pixels, hence the +1
    my $bbox_rect = {
        "x_org" => $min_x,
        "y_org" => $min_y,
        "x_dim" => $max_x - $min_x + 1,
        "y_dim" => $max_y - $min_y + 1
    };

    return $bbox_rect;
}
220
# Runs one Google Vision operation (through the cached-command mechanism)
# and folds its JSON output into the document.
#   $file, $filename - short name (for the progress message) and full path
#                      of the source image
#   $ofile, $ofilename - short name and full path of the JSON output file
#   $vision_cmd      - the vision.py command line to run
#   $vision_type     - "enable_image_labelling", "enable_image_ocr" or
#                      "enable_document_ocr"
#   $doc_obj, $section - document object and the section being populated
# Dies if the JSON output file cannot be opened.
sub run_vision
{
    my $self = shift;
    my ($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section) = @_;

    my $print_info = {
        'message_prefix' => "GoogleVisionAPI",
        'message' => "Sending $file to GoogleVisionAPI using vision.py"
    };

    # The three results are captured but not examined any further here
    my ($vision_regenerated, $vision_result, $vision_error)
        = $self->run_cached_general_cmd($vision_cmd, $filename, $ofilename, $print_info);

    # Work out the name the JSON file is associated under:
    #  => 'enable_' prefix becomes 'gv_', '-google-vision' is dropped
    #  => the section number (dots turned into underscores) is slotted in
    #     ahead of the file extension to avoid clashes between sections
    (my $section_file_suffix = $section) =~ s/\./_/g;

    my $assoc_ofile = $ofile;
    $assoc_ofile =~ s/^enable_/gv_/;
    $assoc_ofile =~ s/-google-vision//;
    $assoc_ofile =~ s/\.(.*?)$/$section_file_suffix.$1/;

    $doc_obj->associate_file($ofilename, $assoc_ofile, "application/json", $section);

    # Slurp and decode the JSON output
    open(my $json_fh, "<:encoding(UTF-8)", $ofilename)
        or die("Can't open \"$ofilename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    my $is_document_ocr = ($vision_type eq "enable_document_ocr");

    if ($is_document_ocr || $vision_type eq "enable_image_ocr") {

        my $text_annotations = $decoded_json->{'textAnnotations'};

        if (defined $text_annotations) {
            # The first annotation carries the full OCR'd text
            my $ocr_text = $text_annotations->[0]->{'description'};
            $doc_obj->add_utf8_text($section, $ocr_text);

            # Gather the recognised words under their detected language
            my $blocks = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0]->{'blocks'};
            my %text_for_language;

            foreach my $block (@{$blocks}) {
                foreach my $paragraph (@{$block->{'paragraphs'}}) {
                    foreach my $word (@{$paragraph->{'words'}}) {
                        my $language = $word->{'property'}->{'detectedLanguages'}->[0]->{'languageCode'} || "no_lang";
                        my $word_text = join("", map { $_->{'text'} } @{$word->{'symbols'}});

                        $text_for_language{$language} .= $word_text . " ";
                    }
                }
            }

            foreach my $language (keys %text_for_language) {
                $doc_obj->add_utf8_metadata($section, "z_" . $language, $text_for_language{$language});
            }
        }

        # Even when no OCR'd text was detected, still record that the
        # Google Vision API was applied in search of text
        my ($has_json_metaname, $json_file_metaname) = $is_document_ocr
            ? ("HasGoogleVisionDocumentOCRJSON", "GVDocumentOCRJSON")
            : ("HasGoogleVisionImageOCRJSON", "GVImageOCRJSON");

        $doc_obj->add_utf8_metadata($section, $json_file_metaname, $assoc_ofile);
        $doc_obj->add_utf8_metadata($section, $has_json_metaname, 1);
    }
    elsif ($vision_type eq "enable_image_labelling") {
        my $labels = $decoded_json->{'labelAnnotations'};

        foreach my $label (@{$labels}) {
            my $description = $label->{'description'};

            # Generic form: 'description'='Book' 'score'='0.9' 'topicality'='0.9' 'mid'='/m/0123'
            foreach my $field ("description", "score", "topicality", "mid") {
                $doc_obj->add_utf8_metadata($section, $field, $label->{$field});
            }

            # Per-label form: 'descriptions'='Book' 'Book_score'='0.9' 'Book_topicality'='0.9' 'Book_mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "descriptions", $description);
            foreach my $field ("score", "topicality", "mid") {
                $doc_obj->add_utf8_metadata($section, $description . "_" . $field, $label->{$field});
            }
        }

        $doc_obj->add_utf8_metadata($section, "HasGoogleVisionImageLabellingJSON", 1);
        $doc_obj->add_utf8_metadata($section, "GVImageLabellingJSON", $assoc_ofile);
    }
}
331
# Starts a fresh, empty Open Annotation list for $doc_obj and stores it, with
# the URI prefix used to build it, on $self under 'openannotation-list' and
# 'openannotation-uri-prefix'.  $section is accepted but not used.
#
# The generated URIs embed the site and collection name, so the resulting
# JSON is bound to the site/collection where it was imported.
#  => if a collection is renamed at the file system level, then
#     (i)  the cached openannotation-list*.json files in the collection's
#          'cache' dir need to be removed
#     (ii) and the collection rebuilt
sub start_openannotation_list
{
    my $self = shift;
    my ($doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $site = $self->{'site'};
    my $collect = $ENV{'GSDLCOLLECTION'};

    my @uri_parts = ("http-greenstone://");
    push(@uri_parts, "${site}/") if (defined $site && $site ne ""); # GS3 specific
    push(@uri_parts, "${collect}/");
    my $uri_prefix = join("", @uri_parts);

    $self->{'openannotation-list'} = {
        "\@context" => "http://www.shared-canvas.org/ns/context.json",
        "\@id" => "${uri_prefix}${OID}/openannotation-list.json",
        "\@type" => "sc:AnnotationList",
        "resources" => []
    };
    $self->{'openannotation-uri-prefix'} = $uri_prefix;
}
366
367
# Turns every Google Vision OCR block into one Open Annotation resource and
# appends it to the 'resources' array of the list held in
# $self->{'openannotation-list'}.
#   $gv_blocks - array ref of Google Vision 'blocks'
#   $doc_obj   - document object (supplies the OID used in the URIs)
#   $section   - section id; "" occurs when the document is a single image,
#                in which case canvas 1 is targeted
#
# Shape of each generated resource:
#   {
#     "@context":   "http://iiif.io/api/presentation/2/context.json",
#     "@id":        "<prefix><OID>[_<section>]/annotation/gv-block-<n>",
#     "@type":      "oa:Annotation",
#     "motivation": [ "oa:commenting" ],
#     "on": [ {
#         "@type": "oa:SpecificResource",
#         "full":  "<prefix><OID>/canvas/<section>",
#         "selector": {
#             "@type":   "oa:Choice",
#             "default": { "@type": "oa:FragmentSelector", "value": "xywh=x,y,w,h" },
#             "item":    { "@type": "oa:SvgSelector",      "value": "<svg ...>" }
#         }
#     } ],
#     "resource": [ { "@type": "dctypes:Text", "chars": "<p>...</p>", "format": "text/html" } ]
#   }
# No "within" (sc:Manifest) member is emitted on the target.
sub convert_gvocr_to_openannotation_resource
{
    my $self = shift;
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    if ($section ne "") {
        $OID_with_section = "${OID}_$section";
    }
    else {
        $section = 1; # occurs when the document is a single image
    }

    my $resources = $self->{'openannotation-list'}->{'resources'};
    my $uri_prefix = $self->{'openannotation-uri-prefix'};

    # The same canvas is the target of every block in this section
    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$section";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        my $x = $bbox_rect->{'x_org'};
        my $y = $bbox_rect->{'y_org'};
        my $w = $bbox_rect->{'x_dim'};
        my $h = $bbox_rect->{'y_dim'};

        # The SVG path starts at the bottom-left corner of the box
        my $y_bottom = $y + $h;

        my $fragment_selector = {
            "\@type" => "oa:FragmentSelector",
            "value" => "xywh=${x},${y},${w},${h}"
        };

        my $svg_selector = {
            "\@type" => "oa:SvgSelector",
            "value" => "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns='http://www.w3.org/2000/svg' d='M${x},${y_bottom}v-${h}h${w}v${h}z' data-paper-data='{&quot;state&quot;:null}' fill='none' fill-rule='nonzero' stroke='#008000' stroke-width='1' stroke-linecap='butt' stroke-linejoin='miter' stroke-miterlimit='10' stroke-dasharray='' stroke-dashoffset='0' font-family='none' font-weight='none' font-size='none' text-anchor='none' style='mix-blend-mode: normal'/></svg>"
        };

        my $target = [ {
            "\@type" => "oa:SpecificResource",
            "full" => $canvas_full_uri,
            "selector" => {
                "\@type" => "oa:Choice",
                "default" => $fragment_selector,
                "item" => $svg_selector
            }
        } ];

        # One <p> element per paragraph; words are separated by single spaces
        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my $para_text = "";

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = join("", map { $_->{'text'} } @{$word->{'symbols'}});

                if ($para_text ne "") {
                    $para_text .= " ";
                }
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $annotation = {
            "\@context" => "http://iiif.io/api/presentation/2/context.json",
            "\@id" => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            "\@type" => "oa:Annotation",
            "motivation" => [ "oa:commenting" ],
            "on" => $target,
            "resource" => [ {
                "\@type" => "dctypes:Text",
                "chars" => "$block_text_html",
                "format" => "text/html"
            } ]
        };

        push(@$resources, $annotation);
    }
}
519
520
# Loads a Google Vision document-OCR JSON file and appends one Open
# Annotation resource per text block of its first page to the list currently
# being built on $self.
#   $gv_dococr_json_filename - path of the Google Vision JSON output
#   $doc_obj, $section       - handed on to
#                              convert_gvocr_to_openannotation_resource()
# Dies if the JSON file cannot be opened.
sub convert_and_append_openannotation_resources
{
    my $self = shift;
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole JSON file as UTF-8 text
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    # Only the blocks of the first page are used
    my $first_page = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks = $first_page->{'blocks'};

    $self->convert_gvocr_to_openannotation_resource($gv_blocks, $doc_obj, $section);
}
541
542
543
# Serialises the Open Annotation list held in $self->{'openannotation-list'}
# to $json_ofilename as JSON, then clears the list and its URI prefix from
# $self (whether or not the save succeeded).
#   $doc_obj        - not used here
#   $json_ofilename - path of the file to (over)write
# Returns 1 on success, 0 if the file could not be opened or written.
sub end_openannotation_list
{
    my $self = shift (@_);
    my ($doc_obj,$json_ofilename) = @_;

    my $ret_status = 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode.
    my $json_fh;
    if (!open($json_fh, ">", $json_ofilename)) {
        print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        # JSON::encode_json() returns a string that is *already* UTF-8
        # encoded, so it has to be written as raw bytes.  Pushing it through
        # a ':utf8' layer would encode it a second time and corrupt every
        # non-ASCII character.
        binmode($json_fh, ":raw");

        my $openannotation_list = $self->{'openannotation-list'};
        my $openannotation_list_json_bytes = JSON::encode_json($openannotation_list);

        my $print_ok = print {$json_fh} $openannotation_list_json_bytes;
        my $close_ok = close($json_fh); # buffered write errors surface here

        if (!$print_ok || !$close_ok) {
            print STDERR "Error: Failed to write Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'openannotation-list'} = undef;
    $self->{'openannotation-uri-prefix'} = undef;

    return $ret_status;
}
571
572
# For every recorded Google Vision document-OCR JSON file, makes sure an
# up-to-date Open Annotation list exists in the cache, and associates it
# with the top section of $doc_obj as "openannotation-list<section>.json".
#   $doc_obj                      - document object
#   $gv_dococr_json_filename_recs - array ref of { 'filename', 'section' }
# Returns 1 if every list was available/saved, 0 if any save failed.
sub openannotation_list_associate_json
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section = $gv_json_filename_rec->{'section'};

        # The cache location is derived from the *first* record's file name
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_openannotation_json_ofile = "openannotation-list${section}.json";
        my $cached_openannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir,$assoc_openannotation_json_ofile);

        # Regenerate unless a cached list exists that is at least as recent
        # as the Google Vision JSON it is derived from.  Note that -M gives
        # a file's *age* (in days), so a NEWER file has the SMALLER value.
        my $needs_json_regen = 1;

        if (-f $cached_openannotation_json_ofilename) {
            my $gv_json_age = -M $gv_json_filename;
            my $cached_json_age = -M $cached_openannotation_json_ofilename;

            if (defined $gv_json_age && defined $cached_json_age
                && $gv_json_age >= $cached_json_age) {
                $needs_json_regen = 0;
            }
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " OpenAnnotation-List: Generating $cached_openannotation_json_ofilename\n";

            $self->start_openannotation_list($doc_obj);
            $self->convert_and_append_openannotation_resources($gv_json_filename, $doc_obj,$section);

            $saved_ok = $self->end_openannotation_list($doc_obj,$cached_openannotation_json_ofilename);
        }
        else {
            print $outhandle " OpenAnnotation-List: Cached file $cached_openannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_openannotation_json_ofilename,$assoc_openannotation_json_ofile,"application/json",$top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
638
639
# Generates and associates Open Annotation lists for $doc_obj, but only if
# document-OCR JSON files were recorded in
# $self->{'gv-dococr-json-filename-recs'}.
# Returns the status of openannotation_list_associate_json(), or 1 when
# there was nothing to do.
sub opt_run_gen_openannotation
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};

    # The record list is absent when no document OCR has been run: treat
    # that explicitly as "no records" rather than dereferencing undef.
    my $num_gv_dococr_json_filename_recs
        = (ref($gv_dococr_json_filename_recs) eq 'ARRAY') ? scalar(@$gv_dococr_json_filename_recs) : 0;

    my $ret_val_ok = 1;

    if ($num_gv_dococr_json_filename_recs > 0) {
        $ret_val_ok = $self->openannotation_list_associate_json($doc_obj,$gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
656
657
658
# (In progress) Starts a fresh, empty Web Annotation list for $doc_obj and
# stores it, with the URI prefix used to build it, on $self under
# 'webannotation-list' and 'webannotation-uri-prefix'.  $section is accepted
# but not used.
#
# The generated URIs embed the site and collection name, so the resulting
# JSON is bound to the site/collection where it was imported.
#  => if a collection is renamed at the file system level, then
#     (i)  the cached webannotation-list*.json files in the collection's
#          'cache' dir need to be removed
#     (ii) and the collection rebuilt
sub start_webannotation_list_INPROGRESS
{
    my $self = shift;
    my ($doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $site = $self->{'site'};
    my $collect = $ENV{'GSDLCOLLECTION'};

    my @uri_parts = ("http-greenstone://");
    push(@uri_parts, "${site}/") if (defined $site && $site ne ""); # GS3 specific
    push(@uri_parts, "${collect}/");
    my $uri_prefix = join("", @uri_parts);

    $self->{'webannotation-list'} = {
        "\@context" => "http://www.shared-canvas.org/ns/context.json",
        "\@id" => "${uri_prefix}${OID}/webannotation-list.json",
        "\@type" => "sc:AnnotationList",
        "resources" => []
    };
    $self->{'webannotation-uri-prefix'} = $uri_prefix;
}
692
693
694
695
# (In progress) Turns every Google Vision OCR block into one Web Annotation
# resource and appends it to the 'resources' array of the list held in
# $self->{'webannotation-list'}.
#   $gv_blocks - array ref of Google Vision 'blocks'
#   $doc_obj   - document object (supplies the OID used in the URIs)
#   $section   - section id; "" occurs when the document is a single image,
#                in which case canvas 1 is targeted
#
# Details on the difference between OpenAnnotation and WebAnnotation are covered at
#   https://www.google.com/search?q=iiif+simpleannotationserver&sxsrf=ALiCzsbIpm1YO0SYE9sCXBQ231_oyEmopw:1672137985013&source=lnms&tbm=vid&sa=X&ved=2ahUKEwizu_K0z5n8AhXF1DgGHQ7FCb4Q_AUoA3oECAEQBQ&biw=1536&bih=742&dpr=1.25#fpstate=ive&vld=cid:07a4e9d9,vid:gFNWWIe5QpM
sub convert_gvocr_to_webannotation_resource_INPROGRESS
{
    my $self = shift;
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    if ($section ne "") {
        $OID_with_section = "${OID}_$section";
    }
    else {
        $section = 1; # occurs when the document is a single image
    }

    my $resources = $self->{'webannotation-list'}->{'resources'};
    my $uri_prefix = $self->{'webannotation-uri-prefix'};

    # Canvas and manifest are the same for every block in this section
    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$section";
    my $manifest_id_uri = "${uri_prefix}${OID_with_section}/manifest";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        my $x = $bbox_rect->{'x_org'};
        my $y = $bbox_rect->{'y_org'};
        my $w = $bbox_rect->{'x_dim'};
        my $h = $bbox_rect->{'y_dim'};

        # Needs updating -- see the selector built in
        # convert_gvocr_to_openannotation_resource() !!!!!!! *********
        my $target = [ {
            "type" => "oa:SpecificResource",
            "full" => $canvas_full_uri,
            "selector" => {
                "type" => "oa:FragmentSelector",
                "value" => "xywh=${x},${y},${w},${h}"
            },
            "within" => {
                "id" => $manifest_id_uri,
                "type" => "sc:Manifest"
            }
        } ];

        # One <p> element per paragraph; words are separated by single spaces
        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my $para_text = "";

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = join("", map { $_->{'text'} } @{$word->{'symbols'}});

                if ($para_text ne "") {
                    $para_text .= " ";
                }
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n$para_text\n</p>\n\n";
        }

        my $annotation = {
            "\@context" => "http://iiif.io/api/presentation/2/context.json",
            "id" => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            "type" => "Annotation",
            "motivation" => [ "commenting" ],
            "target" => $target,
            "body" => [ {
                "type" => "TextualBody",
                "chars" => "$block_text_html",
                "format" => "text/html"
            } ]
        };

        push(@$resources, $annotation);
    }
}
783
784
# (In progress) Loads a Google Vision document-OCR JSON file and appends one
# Web Annotation resource per text block of its first page to the list
# currently being built on $self.
#   $gv_dococr_json_filename - path of the Google Vision JSON output
#   $doc_obj, $section       - handed on to
#                              convert_gvocr_to_webannotation_resource_INPROGRESS()
# Dies if the JSON file cannot be opened.
sub convert_and_append_webannotation_resources_INPROGRESS
{
    my $self = shift;
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole JSON file as UTF-8 text
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    # Only the blocks of the first page are used
    my $first_page = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks = $first_page->{'blocks'};

    $self->convert_gvocr_to_webannotation_resource_INPROGRESS($gv_blocks, $doc_obj, $section);
}
805
806
807
# (In progress) Serialises the Web Annotation list held in
# $self->{'webannotation-list'} to $json_ofilename as JSON, then clears the
# list and its URI prefix from $self (whether or not the save succeeded).
#   $doc_obj        - not used here
#   $json_ofilename - path of the file to (over)write
# Returns 1 on success, 0 if the file could not be opened or written.
sub end_webannotation_list_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj,$json_ofilename) = @_;

    my $ret_status = 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode.
    my $json_fh;
    if (!open($json_fh, ">", $json_ofilename)) {
        print STDERR "Error: Failed save Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        # JSON::encode_json() returns a string that is *already* UTF-8
        # encoded, so it has to be written as raw bytes.  Pushing it through
        # a ':utf8' layer would encode it a second time and corrupt every
        # non-ASCII character.
        binmode($json_fh, ":raw");

        my $webannotation_list = $self->{'webannotation-list'};
        my $webannotation_list_json_bytes = JSON::encode_json($webannotation_list);

        my $print_ok = print {$json_fh} $webannotation_list_json_bytes;
        my $close_ok = close($json_fh); # buffered write errors surface here

        if (!$print_ok || !$close_ok) {
            print STDERR "Error: Failed to write Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'webannotation-list'} = undef;
    $self->{'webannotation-uri-prefix'} = undef;

    return $ret_status;
}
835
# (In progress) For every recorded Google Vision document-OCR JSON file,
# makes sure an up-to-date Web Annotation list exists in the cache, and
# associates it with the top section of $doc_obj as
# "webannotation-list<section>.json".
#   $doc_obj                      - document object
#   $gv_dococr_json_filename_recs - array ref of { 'filename', 'section' }
# Returns 1 if every list was available/saved, 0 if any save failed.
sub webannotation_list_associate_json_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section = $gv_json_filename_rec->{'section'};

        # The cache location is derived from the *first* record's file name
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_webannotation_json_ofile = "webannotation-list${section}.json";
        my $cached_webannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir,$assoc_webannotation_json_ofile);

        # Regenerate unless a cached list exists that is at least as recent
        # as the Google Vision JSON it is derived from.  Note that -M gives
        # a file's *age* (in days), so a NEWER file has the SMALLER value.
        my $needs_json_regen = 1;

        if (-f $cached_webannotation_json_ofilename) {
            my $gv_json_age = -M $gv_json_filename;
            my $cached_json_age = -M $cached_webannotation_json_ofilename;

            if (defined $gv_json_age && defined $cached_json_age
                && $gv_json_age >= $cached_json_age) {
                $needs_json_regen = 0;
            }
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " WebAnnotation-List: Generating $cached_webannotation_json_ofilename\n";

            $self->start_webannotation_list_INPROGRESS($doc_obj);
            $self->convert_and_append_webannotation_resources_INPROGRESS($gv_json_filename, $doc_obj,$section);

            $saved_ok = $self->end_webannotation_list_INPROGRESS($doc_obj,$cached_webannotation_json_ofilename);
        }
        else {
            print $outhandle " WebAnnotation-List: Cached file $cached_webannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_webannotation_json_ofilename,$assoc_webannotation_json_ofile,"application/json",$top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
901
902
# (In progress) Generates and associates Web Annotation lists for $doc_obj,
# but only if document-OCR JSON files were recorded in
# $self->{'gv-dococr-json-filename-recs'}.
# Returns the status of webannotation_list_associate_json_INPROGRESS(), or 1
# when there was nothing to do.
sub opt_run_gen_webannotation_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'};

    # The record list is absent when no document OCR has been run: treat
    # that explicitly as "no records" rather than dereferencing undef.
    my $num_gv_dococr_json_filename_recs
        = (ref($gv_dococr_json_filename_recs) eq 'ARRAY') ? scalar(@$gv_dococr_json_filename_recs) : 0;

    my $ret_val_ok = 1;

    if ($num_gv_dococr_json_filename_recs > 0) {
        # The worker method in this file carries the _INPROGRESS suffix;
        # there is no un-suffixed webannotation_list_associate_json() here.
        $ret_val_ok = $self->webannotation_list_associate_json_INPROGRESS($doc_obj,$gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
919
920
9211;
922
Note: See TracBrowser for help on using the repository browser.