source: gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm@ 37139

Last change on this file since 37139 was 37139, checked in by davidb, 16 months ago

Need to replace spaces with underscore _

File size: 30.6 KB
Line 
1######################################################################
2#
3# GoogleVisionAPIConverter.pm -- helper plugin that allows other plugins
4# (such as ImagePlugin and PagedImagePlugin) to extend their
5# processing capability through sub-classing inheritance (such as
6# GoogleVisionImagePlugin and GoogleVisionPagedImagePlugin) to
7# expand the image processing capabilities at ingest time to
8# include the Google Vision API allowing for: metadata labelling
9# of objects within a scene; and OCR text recognition.
10#
11# A component of the Greenstone digital library software
12# from the New Zealand Digital Library Project at the
13# University of Waikato, New Zealand.
14#
15# Copyright (C) 1999 New Zealand Digital Library Project
16#
17# This program is free software; you can redistribute it and/or modify
18# it under the terms of the GNU General Public License as published by
19# the Free Software Foundation; either version 2 of the License, or
20# (at your option) any later version.
21#
22# This program is distributed in the hope that it will be useful,
23# but WITHOUT ANY WARRANTY; without even the implied warranty of
24# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25# GNU General Public License for more details.
26#
27# You should have received a copy of the GNU General Public License
28# along with this program; if not, write to the Free Software
29# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30#
31###########################################################################
32
33package GoogleVisionAPIConverter;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37no strict 'subs';
38
39use gsprintf;
40use FileUtils;
41
42##use ImagePlugin;
43use BaseMediaConverter;
44
45use utf8;
46use JSON; # qw( from_json, encode_json );
47
# Set up the inheritance chain at compile time: this converter is a
# subclass of BaseMediaConverter.
BEGIN {
    @GoogleVisionAPIConverter::ISA = qw(BaseMediaConverter);
}
51
# Plugin arguments contributed by this converter: the name of the Google
# service-account credentials file, followed by one on/off flag per
# supported Google Vision operation.
my $arguments = [
    {
        'name' => "google_application_credentials",
        # NOTE(review): the resource key below is spelt "applicatio" (no
        # trailing 'n').  It is used verbatim as a lookup key, so it is left
        # untouched here -- confirm against the strings resource file before
        # correcting it.
        'desc' => "{GoogleVisionAPIConverter.google_applicatio_credentials}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "google-sa-credentials-key.json"
    },
    # The three flags share the same shape; only the name (and hence the
    # description key) differs.
    (map {
        +{
            'name' => $_,
            'desc' => "{GoogleVisionAPIConverter." . $_ . "}",
            'type' => "flag",
            'reqd' => "no"
        }
    } ("enable_image_labelling", "enable_image_ocr", "enable_document_ocr"))
];
72
# Option record for this converter; it is appended to the caller's
# "OptList" in new(), and 'args' points at the argument list declared
# for this package.
my $options = {
    'name'     => 'GoogleVisionAPIConverter',
    'desc'     => '{GoogleVisionAPIConverter.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments
};
78
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the BaseMediaConverter
#                      constructor
#   $hashArgOptLists - hash ref; this package's argument specs are appended
#                      to its "ArgList" entry and its option record to
#                      "OptList"
#
# Returns the object built by BaseMediaConverter, re-blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Explicit class-method call: the indirect-object form
    # ("new BaseMediaConverter(...)") depends on parser heuristics and can be
    # mis-parsed when a sub called 'new' is visible in the current package.
    my $self = BaseMediaConverter->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless($self, $class);
}
91
# Start-of-run hook.
#
# Image OCR and document OCR are mutually exclusive: when both flags are
# set a usage message is written to STDERR and the process exits with
# status 2.  Otherwise control is handed to the parent class's begin()
# with the arguments this method received (after $self).
sub begin {
    my $self = shift(@_);
    # @_ is deliberately left intact so that it can be forwarded as-is.

    my $both_ocr_modes = ($self->{'enable_image_ocr'} && $self->{'enable_document_ocr'});

    if (!$both_ocr_modes) {
        return $self->SUPER::begin(@_);
    }

    my $usage = "Please use the following command syntax for vision types: (--enable_image_ocr | --enable_document_ocr) [--enable_image_labelling]\n"
        . "\t\t --enable_image_ocr : optical character recognition for text within images\n"
        . "\t\t --enable_document_ocr : optical character recognition for text within documents\n"
        . "\t\t --enable_image_labelling : annotation labeling for objects within images\n";

    print STDERR $usage;
    exit(2);
}
106
# Inspect one line of output from the vision command.
#
# Returns a two-element list ($had_error, $generate_dot).  No line is ever
# reported as an error; a progress dot is requested whenever the line
# matches the pattern below.
sub vision_monitor_line {
    my ($line) = @_;

    my $generate_dot = ($line =~ m/^.*$/) ? 1 : 0;

    return (0, $generate_dot);
}
120
# Run every enabled Google Vision operation over one source file.
#
# Parameters:
#   $filename    - path of the file handed to vision.py (also used to pick
#                  the cache location)
#   $file        - name used in the progress message
#   $doc_obj     - document object that receives text/metadata
#   $opt_section - optional section id; defaults to the document's top
#                  section
#
# For each enabled operation vision.py is asked to write
# "<operation>-google-vision-output.json" into the cache directory and the
# result is ingested via run_vision().  For document OCR the output
# filename and section are additionally remembered in
# $self->{'gv-dococr-json-filename-recs'}.
#
# Always returns the string "json".
sub run_gv_convert {
    my $self = shift(@_);
    my ($filename, $file, $doc_obj, $opt_section) = @_;

    my $section = $opt_section;
    $section = $doc_obj->get_top_section() if (!defined $section);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "----- GoogleVisionAPIConveter run_gv_convert -----\n";

    # Enabled operations, in the fixed order: labelling, image OCR, document OCR
    my @vision_types = grep { $self->{$_} }
        ("enable_image_labelling", "enable_image_ocr", "enable_document_ocr");

    # Nothing switched on => nothing to do
    return "json" if (scalar(@vision_types) == 0);

    $self->init_cache_for_file($filename);
    my $cached_image_dir = $self->{'cached_dir'};

    # Credentials file lives in the collection's 'etc' directory
    my $credentials_filename = &FileUtils::filenameConcatenate(
        $ENV{'GSDLCOLLECTDIR'}, "etc", $self->{'google_application_credentials'});

    foreach my $vision_type (@vision_types) {

        my $ofile     = $vision_type . "-google-vision-output.json";
        my $ofilename = &FileUtils::filenameConcatenate($cached_image_dir, $ofile);

        my $vision_cmd = "vision.py --$vision_type --credentials \"$credentials_filename\" \"$filename\" \"$ofilename\"";

        $self->run_vision($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section);

        next if ($vision_type ne "enable_document_ocr");

        # Remember where the document-OCR JSON went, and for which section
        push(@{ $self->{'gv-dococr-json-filename-recs'} },
             { 'filename' => $ofilename, 'section' => $section });
    }

    return "json";
}
169
# Convert the 'boundingBox' of a Google Vision block into an axis-aligned
# rectangle.
#
# Parameters:
#   $gv_block - hash ref for one block of the Vision response; its
#               {'boundingBox'}->{'vertices'} entry is a list of
#               { 'x' => ..., 'y' => ... } hashes
#
# Returns a hash ref { x_org, y_org, x_dim, y_dim } covering all vertices,
# or undef when the block has no vertices.
#
# A vertex can arrive with its 'x' and/or 'y' key missing.  The missing
# coordinate is treated as 0, on the assumption that the JSON was written
# with zero-valued fields left out -- NOTE(review): confirm against the
# output of vision.py.  Skipping such vertices instead would ignore the
# left/top edge of any box touching x=0 or y=0 and yield a wrongly placed
# sliver of a rectangle.
sub gv_ocr_bounding_box_rect
{
    my $self = shift (@_);
    my ($gv_block) = @_;

    my $bbox_rect = undef;

    # Tolerate a block with no bounding box / vertex list at all
    my $gv_boundingBox = $gv_block->{'boundingBox'} || {};
    my $gv_vertices    = $gv_boundingBox->{'vertices'} || [];

    if (scalar(@$gv_vertices) > 0) {

        my ($min_x, $min_y, $max_x, $max_y);

        foreach my $gv_vertex (@$gv_vertices) {
            # An omitted coordinate stands for 0
            my $x = (defined $gv_vertex->{'x'}) ? $gv_vertex->{'x'} : 0;
            my $y = (defined $gv_vertex->{'y'}) ? $gv_vertex->{'y'} : 0;

            $min_x = $x if (!defined $min_x || ($x < $min_x));
            $max_x = $x if (!defined $max_x || ($x > $max_x));

            $min_y = $y if (!defined $min_y || ($y < $min_y));
            $max_y = $y if (!defined $max_y || ($y > $max_y));
        }

        # Dimensions are inclusive of both edge pixels, hence the +1
        $bbox_rect = {
            "x_org" => $min_x,
            "y_org" => $min_y,
            "x_dim" => $max_x - $min_x + 1,
            "y_dim" => $max_y - $min_y + 1
        };
    }

    return $bbox_rect;
}
220
# Run one Google Vision operation (through the command cache) and fold its
# JSON output into the document.
#
# Parameters:
#   $file        - name used in the progress message
#   $filename    - input file; passed to run_cached_general_cmd()
#   $ofile       - base name of the JSON output file
#   $ofilename   - full path of the JSON output file
#   $vision_cmd  - command line to execute
#   $vision_type - "enable_image_ocr", "enable_document_ocr" or
#                  "enable_image_labelling"
#   $doc_obj     - document object receiving text, metadata and the
#                  associated JSON file
#   $section     - section of $doc_obj to update
#
# Effects on $doc_obj:
#   * the JSON file is associated with $section under a name of the form
#     "gv_<type>-output<section>.json" (dots in the section id become '_')
#   * OCR: the full recognised text is appended to the section, the words
#     are grouped by detected language into "z_<lang>" metadata, and
#     GV{Document,Image}OCRJSON / HasGoogleVision{Document,Image}OCRJSON
#     metadata are set
#   * labelling: per-label description/score/topicality/mid metadata are
#     added, plus GVImageLabellingJSON / HasGoogleVisionImageLabellingJSON
#
# Dies if the JSON output file cannot be opened.
sub run_vision
{
    my $self = shift (@_);
    my ($file, $filename, $ofile, $ofilename, $vision_cmd, $vision_type, $doc_obj, $section) = @_;

    my $print_info = {
        'message_prefix' => "GoogleVisionAPI",
        'message' => "Sending $file to GoogleVisionAPI using vision.py"
    };

    my ($vision_regenerated, $vision_result, $vision_error)
        = $self->run_cached_general_cmd($vision_cmd, $filename, $ofilename, $print_info);

    # Need to work a bit harder in setting up the associated JSON file
    #   => strip off 'enable_' in favour of 'gv_'
    #   => add in section number as part of the file name to avoid clashes

    my $section_file_suffix = $section;
    $section_file_suffix =~ s/\./_/g;

    my $assoc_ofile = $ofile;
    $assoc_ofile =~ s/^enable_/gv_/;
    $assoc_ofile =~ s/-google-vision//;
    $assoc_ofile =~ s/\.(.*?)$/$section_file_suffix.$1/;

    $doc_obj->associate_file($ofilename, $assoc_ofile, "application/json", $section);

    my $json_text = do { # read in json file
        open(my $json_fh, "<:encoding(UTF-8)", $ofilename)
            or die("Can't open \"$ofilename\": $!\n");
        local $/;
        <$json_fh>
    };

    my $decoded_json = JSON::from_json($json_text);

    if ($vision_type eq "enable_document_ocr" || $vision_type eq "enable_image_ocr") {

        if (defined $decoded_json->{'textAnnotations'}) {
            # The first text annotation carries the full OCR content
            my $ocr_text = $decoded_json->{'textAnnotations'}->[0]->{'description'};
            $doc_obj->add_utf8_text($section, $ocr_text); # append text to section

            my $blocks = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0]->{'blocks'};

            # language code => space-separated words detected in that language
            my %text_and_language;

            foreach my $block (@{$blocks}) {
                foreach my $paragraph (@{$block->{'paragraphs'}}) {
                    foreach my $word (@{$paragraph->{'words'}}) {
                        my $detected_language = $word->{'property'}->{'detectedLanguages'}->[0]->{'languageCode'} || "no_lang";

                        my $word_text = "";
                        foreach my $letter (@{$word->{'symbols'}}) {
                            $word_text .= $letter->{'text'};
                        }

                        $text_and_language{$detected_language} .= $word_text . " ";
                    }
                }
            }

            foreach my $language_code (keys %text_and_language) {
                $doc_obj->add_utf8_metadata($section, "z_" . $language_code, $text_and_language{$language_code});
            }
        }

        # Note: Even if there is no actual OCR'd text detected (if test above),
        # still set metadata that shows that we applied the Google Vision API seeking text

        my $assoc_json_metaname = "HasGoogleVision";

        if ($vision_type eq "enable_document_ocr") {
            $assoc_json_metaname .= "DocumentOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVDocumentOCRJSON", $assoc_ofile);
        }
        else {
            # $vision_type eq "enable_image_ocr"
            $assoc_json_metaname .= "ImageOCRJSON";

            $doc_obj->add_utf8_metadata($section, "GVImageOCRJSON", $assoc_ofile);
        }

        $doc_obj->add_utf8_metadata($section, $assoc_json_metaname, 1);
    }
    elsif ($vision_type eq "enable_image_labelling") {
        my $labels = $decoded_json->{'labelAnnotations'};

        foreach my $label (@{$labels}) {
            # Write to metadata : 'description'='Book' 'score'='0.9' 'topicality'='0.9' 'mid'='/m/0123'
            $doc_obj->add_utf8_metadata($section, "description", $label->{'description'});
            $doc_obj->add_utf8_metadata($section, "score",       $label->{'score'});
            $doc_obj->add_utf8_metadata($section, "topicality",  $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, "mid",         $label->{'mid'});

            # Write to metadata, e.g.,: 'descriptions'='Book' 'Book_score'='0.9' 'Book_topicality'='0.9' 'Book_mid'='/m/0123'
            #
            # ... but first generate a 'safe' metadata name, derived from the metadata value for 'description'
            my $description_mdvalue = $label->{'description'};
            my $description_mdname  = $description_mdvalue;

            # Replace whitespace with underscores.  This must be the binding
            # operator '=~': a plain '=' would run the substitution against
            # $_ and assign its result (a match count or empty string) to
            # the name, so every label would share a meaningless prefix.
            $description_mdname =~ s/\s/_/g;

            $doc_obj->add_utf8_metadata($section, "descriptions", $description_mdvalue);
            $doc_obj->add_utf8_metadata($section, $description_mdname . "_score",      $label->{'score'});
            $doc_obj->add_utf8_metadata($section, $description_mdname . "_topicality", $label->{'topicality'});
            $doc_obj->add_utf8_metadata($section, $description_mdname . "_mid",        $label->{'mid'});
        }

        $doc_obj->add_utf8_metadata($section, "HasGoogleVisionImageLabellingJSON", 1);
        $doc_obj->add_utf8_metadata($section, "GVImageLabellingJSON", $assoc_ofile);
    }
}
337
# Begin a new, empty Open Annotation list for a document.
#
# Parameters:
#   $doc_obj - document object; only its OID is used
#   $section - accepted but not used
#
# Stores the list skeleton in $self->{'openannotation-list'} and the URI
# prefix used for every identifier in $self->{'openannotation-uri-prefix'}.
#
# The prefix has the form "http-greenstone://[<site>/]<collection>/", with
# the site taken from $self->{'site'} (GS3 specific) and the collection
# from the GSDLCOLLECTION environment variable.  The generated JSON is
# therefore bound to the site/collection it was imported into: if a
# collection is renamed at the file-system level, the
# openannotation-list*.json files in the collection's 'cache' directory
# need to be removed and the collection rebuilt.
sub start_openannotation_list
{
    my $self = shift(@_);
    my ($doc_obj, $section) = @_;

    my $OID     = $doc_obj->get_OID();
    my $site    = $self->{'site'};
    my $collect = $ENV{'GSDLCOLLECTION'};

    my @prefix_parts = ("http-greenstone://");
    push(@prefix_parts, "${site}/") if (defined $site && $site ne "");
    push(@prefix_parts, "${collect}/");

    my $uri_prefix = join("", @prefix_parts);

    $self->{'openannotation-list'} = {
        '@context'  => "http://www.shared-canvas.org/ns/context.json",
        '@id'       => $uri_prefix . $OID . "/openannotation-list.json",
        '@type'     => "sc:AnnotationList",
        'resources' => []
    };

    $self->{'openannotation-uri-prefix'} = $uri_prefix;
}
372
373
# Turn Google Vision OCR blocks into Open Annotation resources and append
# them to the list begun by start_openannotation_list().
#
# Parameters:
#   $gv_blocks - array ref of Vision 'blocks'
#   $doc_obj   - document object; only its OID is used
#   $section   - section id; "" occurs when the document is a single
#                image, in which case the canvas number defaults to 1
#
# One "oa:Annotation" is produced per block:
#   * '@id'      : <prefix><OID>[_<section>]/annotation/gv-block-<n>, n from 1
#   * 'on'       : the canvas <prefix><OID>/canvas/<section>, selected by an
#                  oa:Choice of an "xywh=" fragment selector (default) and
#                  an equivalent SVG path selector
#   * 'resource' : the block's text as HTML, one <p> per paragraph, words
#                  separated by single spaces
#
# Shape of a single annotation, for reference:
#   {
#     "@context": "http://iiif.io/api/presentation/2/context.json",
#     "@id": "https://iiif.harvardartmuseums.org/annotations/9641482",
#     "@type": "oa:Annotation",
#     "motivation": [ "oa:commenting" ],
#     "on": { "@type": "oa:SpecificResource", "full": "<canvas uri>",
#             "selector": { "@type": "oa:FragmentSelector",
#                           "value": "xywh=622,591,642,940" } },
#     "resource": [ { "@type": "dctypes:Text", "chars": "<p>...</p>",
#                     "format": "text/html" } ]
#   }
sub convert_gvocr_to_openannotation_resource
{
    my $self = shift(@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    $OID_with_section = "${OID}_$section" if ($section ne "");

    # Canvas number: falls back to 1 for a single-image document
    my $canvas_section = ($section eq "") ? 1 : $section;

    my $annotation_resources = $self->{'openannotation-list'}->{'resources'};
    my $uri_prefix           = $self->{'openannotation-uri-prefix'};

    # The same canvas is targeted by every block
    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$canvas_section";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        # ---- where on the canvas the block sits ----
        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        my $bb_x_org = $bbox_rect->{'x_org'};
        my $bb_y_org = $bbox_rect->{'y_org'};
        my $bb_x_dim = $bbox_rect->{'x_dim'};
        my $bb_y_dim = $bbox_rect->{'y_dim'};

        # The SVG path starts at the bottom-left corner of the box
        my $bb_y_org_plus_y_dim = $bb_y_org + $bb_y_dim;

        my $fragment_selector = {
            '@type' => "oa:FragmentSelector",
            'value' => "xywh=${bb_x_org},${bb_y_org},${bb_x_dim},${bb_y_dim}"
        };

        my $svg_selector = {
            '@type' => "oa:SvgSelector",
            'value' => "<svg xmlns='http://www.w3.org/2000/svg'><path xmlns='http://www.w3.org/2000/svg' d='M${bb_x_org},${bb_y_org_plus_y_dim}v-${bb_y_dim}h${bb_x_dim}v${bb_y_dim}z' data-paper-data='{&quot;state&quot;:null}' fill='none' fill-rule='nonzero' stroke='#008000' stroke-width='1' stroke-linecap='butt' stroke-linejoin='miter' stroke-miterlimit='10' stroke-dasharray='' stroke-dashoffset='0' font-family='none' font-weight='none' font-size='none' text-anchor='none' style='mix-blend-mode: normal'/></svg>"
        };

        # ---- what the block says ----
        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my $para_text = "";

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = "";
                $word_text .= $_->{'text'} foreach (@{$word->{'symbols'}});

                # single space between words, none before the first
                $para_text .= " " unless ($para_text eq "");
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n" . $para_text . "\n</p>\n\n";
        }

        # ---- assemble the annotation ----
        my $openannotation_resource = {
            '@context'   => "http://iiif.io/api/presentation/2/context.json",
            '@id'        => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            '@type'      => "oa:Annotation",
            'motivation' => [ "oa:commenting" ],
            'on'         => [ {
                '@type'    => "oa:SpecificResource",
                'full'     => $canvas_full_uri,
                'selector' => {
                    '@type'   => "oa:Choice",
                    'default' => $fragment_selector,
                    'item'    => $svg_selector
                }
            } ],
            'resource'   => [ {
                '@type'  => "dctypes:Text",
                'chars'  => "$block_text_html",
                'format' => "text/html"
            } ]
        };

        push(@$annotation_resources, $openannotation_resource);
    }
}
525
526
# Load a Google Vision document-OCR JSON file and append its blocks to the
# current Open Annotation list.
#
# Parameters:
#   $gv_dococr_json_filename - JSON file to read (decoded as UTF-8)
#   $doc_obj, $section       - passed on unchanged to
#                              convert_gvocr_to_openannotation_resource()
#
# Only the blocks of the first page of 'fullTextAnnotation' are used.
# Dies if the file cannot be opened.
sub convert_and_append_openannotation_resources
{
    my $self = shift(@_);
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole file in one go
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    my $first_page = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks  = $first_page->{'blocks'};

    $self->convert_gvocr_to_openannotation_resource($gv_blocks, $doc_obj, $section);
}
547
548
549
# Write the accumulated Open Annotation list out as JSON and reset the
# per-list state.
#
# Parameters:
#   $doc_obj        - accepted but not used
#   $json_ofilename - file to (over)write
#
# Returns 1 on success, 0 if the file could not be opened or fully
# written (an error message is printed to STDERR in that case).
# $self->{'openannotation-list'} and $self->{'openannotation-uri-prefix'}
# are cleared either way.
sub end_openannotation_list
{
    my $self = shift (@_);
    my ($doc_obj, $json_ofilename) = @_;

    my $ret_status = 1;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as part of the mode, and the handle cannot clash with any
    # other use of a global bareword handle.
    #
    # The handle is raw because JSON::encode_json() already returns UTF-8
    # encoded bytes; pushing those through a ':utf8' layer would encode them
    # a second time and corrupt every non-ASCII character.
    my $json_out_fh;
    if (!open($json_out_fh, ">:raw", $json_ofilename)) {
        print STDERR "Error: Failed save Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        my $openannotation_list = $self->{'openannotation-list'};
        my $openannotation_list_json_bytes = JSON::encode_json($openannotation_list);

        print $json_out_fh $openannotation_list_json_bytes;

        # Buffered write errors (e.g. disk full) only surface at close
        if (!close($json_out_fh)) {
            print STDERR "Error: Failed to finish writing Open Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'openannotation-list'} = undef;
    $self->{'openannotation-uri-prefix'} = undef;

    return $ret_status;
}
577
578
# For every recorded document-OCR JSON file, make sure an up-to-date
# "openannotation-list<section>.json" exists in the cache and associate it
# with the document's top section.
#
# Parameters:
#   $doc_obj                      - document object
#   $gv_dococr_json_filename_recs - array ref of
#                                   { 'filename' => ..., 'section' => ... }
#                                   records
#
# The annotation list is (re)generated when the cached file is missing or
# when the Vision JSON it derives from has been modified more recently
# than the cached file; otherwise the cached copy is reused.
#
# Returns 1 if every list was available/saved, 0 if any save failed.
sub openannotation_list_associate_json
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section          = $gv_json_filename_rec->{'section'};

        # The cache location is always derived from the first record's
        # filename, so every section's list ends up in the same directory.
        # NOTE(review): presumably intentional -- confirm.
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_openannotation_json_ofile = "openannotation-list${section}.json";
        my $cached_openannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir, $assoc_openannotation_json_ofile);

        my $needs_json_regen = 0;

        if (!-f $cached_openannotation_json_ofilename) {
            $needs_json_regen = 1;
        }
        elsif ((-M $gv_json_filename) < (-M $cached_openannotation_json_ofilename)) {
            # -M is the file's age, so a *smaller* value means *newer*:
            # regenerate only when the Vision JSON is newer than the cache.
            $needs_json_regen = 1;
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " OpenAnnotation-List: Generating $cached_openannotation_json_ofilename\n";

            $self->start_openannotation_list($doc_obj);
            $self->convert_and_append_openannotation_resources($gv_json_filename, $doc_obj, $section);

            $saved_ok = $self->end_openannotation_list($doc_obj, $cached_openannotation_json_ofilename);
        }
        else {
            print $outhandle " OpenAnnotation-List: Cached file $cached_openannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_openannotation_json_ofilename, $assoc_openannotation_json_ofile, "application/json", $top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
644
645
# Generate and associate Open Annotation lists, but only if document OCR
# produced any JSON files for this converter.
#
# Parameters:
#   $doc_obj - document object the lists are associated with
#
# Returns 1 when there was nothing to do, otherwise the status returned by
# openannotation_list_associate_json().
sub opt_run_gen_openannotation
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # The record list only exists once document OCR has been run; fall back
    # to an empty list rather than dereferencing an undefined value.
    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'} || [];

    my $ret_val_ok = 1;

    if (scalar(@$gv_dococr_json_filename_recs) > 0) {
        $ret_val_ok = $self->openannotation_list_associate_json($doc_obj, $gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
662
663
664
# (Work in progress) Begin a new, empty Web Annotation list for a document.
#
# Parameters:
#   $doc_obj - document object; only its OID is used
#   $section - accepted but not used
#
# Stores the list skeleton in $self->{'webannotation-list'} and the URI
# prefix used for every identifier in $self->{'webannotation-uri-prefix'}.
#
# The prefix has the form "http-greenstone://[<site>/]<collection>/", with
# the site taken from $self->{'site'} (GS3 specific) and the collection
# from the GSDLCOLLECTION environment variable.  The generated JSON is
# therefore bound to the site/collection it was imported into: if a
# collection is renamed at the file-system level, the
# webannotation-list*.json files in the collection's 'cache' directory
# need to be removed and the collection rebuilt.
sub start_webannotation_list_INPROGRESS
{
    my $self = shift(@_);
    my ($doc_obj, $section) = @_;

    my $OID     = $doc_obj->get_OID();
    my $site    = $self->{'site'};
    my $collect = $ENV{'GSDLCOLLECTION'};

    my @prefix_parts = ("http-greenstone://");
    push(@prefix_parts, "${site}/") if (defined $site && $site ne "");
    push(@prefix_parts, "${collect}/");

    my $uri_prefix = join("", @prefix_parts);

    $self->{'webannotation-list'} = {
        '@context'  => "http://www.shared-canvas.org/ns/context.json",
        '@id'       => $uri_prefix . $OID . "/webannotation-list.json",
        '@type'     => "sc:AnnotationList",
        'resources' => []
    };

    $self->{'webannotation-uri-prefix'} = $uri_prefix;
}
698
699
700
701
# (Work in progress) Turn Google Vision OCR blocks into Web Annotation
# resources and append them to the list begun by
# start_webannotation_list_INPROGRESS().
#
# Parameters:
#   $gv_blocks - array ref of Vision 'blocks'
#   $doc_obj   - document object; only its OID is used
#   $section   - section id; "" occurs when the document is a single
#                image, in which case the canvas number defaults to 1
#
# One "Annotation" is produced per block, with
#   * 'id'     : <prefix><OID>[_<section>]/annotation/gv-block-<n>, n from 1
#   * 'target' : the canvas plus an "xywh=" fragment selector, within the
#                manifest <prefix><OID>[_<section>]/manifest
#   * 'body'   : the block's text as HTML, one <p> per paragraph, words
#                separated by single spaces
#
# Details on difference between OpenAnnotation and WebAnnotation covered at
#   https://www.google.com/search?q=iiif+simpleannotationserver&sxsrf=ALiCzsbIpm1YO0SYE9sCXBQ231_oyEmopw:1672137985013&source=lnms&tbm=vid&sa=X&ved=2ahUKEwizu_K0z5n8AhXF1DgGHQ7FCb4Q_AUoA3oECAEQBQ&biw=1536&bih=742&dpr=1.25#fpstate=ive&vld=cid:07a4e9d9,vid:gFNWWIe5QpM
sub convert_gvocr_to_webannotation_resource_INPROGRESS
{
    my $self = shift(@_);
    my ($gv_blocks, $doc_obj, $section) = @_;

    my $OID = $doc_obj->get_OID();

    my $OID_with_section = $OID;
    $OID_with_section = "${OID}_$section" if ($section ne "");

    # Canvas number: falls back to 1 for a single-image document
    my $canvas_section = ($section eq "") ? 1 : $section;

    my $annotation_resources = $self->{'webannotation-list'}->{'resources'};
    my $uri_prefix           = $self->{'webannotation-uri-prefix'};

    # Canvas and manifest are the same for every block
    my $canvas_full_uri = "${uri_prefix}${OID}/canvas/$canvas_section";
    my $manifest_id_uri = "${uri_prefix}${OID_with_section}/manifest";

    my $block_num = 0;

    foreach my $block (@{$gv_blocks}) {
        $block_num++;

        # ---- where on the canvas the block sits ----
        my $bbox_rect = $self->gv_ocr_bounding_box_rect($block);
        my $bb_x_org = $bbox_rect->{'x_org'};
        my $bb_y_org = $bbox_rect->{'y_org'};
        my $bb_x_dim = $bbox_rect->{'x_dim'};
        my $bb_y_dim = $bbox_rect->{'y_dim'};

        # Needs updating -- compare with the 'on' structure built by
        # convert_gvocr_to_openannotation_resource() !!!!!!! *********
        my $webannotation_target = [ {
            'type'     => "oa:SpecificResource",
            'full'     => $canvas_full_uri,
            'selector' => {
                'type'  => "oa:FragmentSelector",
                'value' => "xywh=${bb_x_org},${bb_y_org},${bb_x_dim},${bb_y_dim}"
            },
            'within'   => {
                'id'   => $manifest_id_uri,
                'type' => "sc:Manifest"
            }
        } ];

        # ---- what the block says ----
        my $block_text_html = "";

        foreach my $paragraph (@{$block->{'paragraphs'}}) {
            my $para_text = "";

            foreach my $word (@{$paragraph->{'words'}}) {
                my $word_text = "";
                $word_text .= $_->{'text'} foreach (@{$word->{'symbols'}});

                # single space between words, none before the first
                $para_text .= " " unless ($para_text eq "");
                $para_text .= $word_text;
            }

            $block_text_html .= "<p>\n" . $para_text . "\n</p>\n\n";
        }

        # ---- assemble the annotation ----
        my $webannotation_resource = {
            '@context'   => "http://iiif.io/api/presentation/2/context.json",
            'id'         => "${uri_prefix}${OID_with_section}/annotation/gv-block-$block_num",
            'type'       => "Annotation",
            'motivation' => [ "commenting" ],
            'target'     => $webannotation_target,
            'body'       => [ {
                'type'   => "TextualBody",
                'chars'  => "$block_text_html",
                'format' => "text/html"
            } ]
        };

        push(@$annotation_resources, $webannotation_resource);
    }
}
789
790
# (Work in progress) Load a Google Vision document-OCR JSON file and append
# its blocks to the current Web Annotation list.
#
# Parameters:
#   $gv_dococr_json_filename - JSON file to read (decoded as UTF-8)
#   $doc_obj, $section       - passed on unchanged to
#                              convert_gvocr_to_webannotation_resource_INPROGRESS()
#
# Only the blocks of the first page of 'fullTextAnnotation' are used.
# Dies if the file cannot be opened.
sub convert_and_append_webannotation_resources_INPROGRESS
{
    my $self = shift(@_);
    my ($gv_dococr_json_filename, $doc_obj, $section) = @_;

    # Slurp the whole file in one go
    open(my $json_fh, "<:encoding(UTF-8)", $gv_dococr_json_filename)
        or die("Can't open \"$gv_dococr_json_filename\": $!\n");
    my $json_text = do { local $/; <$json_fh> };
    close($json_fh);

    my $decoded_json = JSON::from_json($json_text);

    my $first_page = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0];
    my $gv_blocks  = $first_page->{'blocks'};

    $self->convert_gvocr_to_webannotation_resource_INPROGRESS($gv_blocks, $doc_obj, $section);
}
811
812
813
# (Work in progress) Write the accumulated Web Annotation list out as JSON
# and reset the per-list state.
#
# Parameters:
#   $doc_obj        - accepted but not used
#   $json_ofilename - file to (over)write
#
# Returns 1 on success, 0 if the file could not be opened or fully
# written (an error message is printed to STDERR in that case).
# $self->{'webannotation-list'} and $self->{'webannotation-uri-prefix'}
# are cleared either way.
sub end_webannotation_list_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj, $json_ofilename) = @_;

    my $ret_status = 1;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as part of the mode, and the handle cannot clash with any
    # other use of a global bareword handle.
    #
    # The handle is raw because JSON::encode_json() already returns UTF-8
    # encoded bytes; pushing those through a ':utf8' layer would encode them
    # a second time and corrupt every non-ASCII character.
    my $json_out_fh;
    if (!open($json_out_fh, ">:raw", $json_ofilename)) {
        # Message names the list type actually being written here
        print STDERR "Error: Failed save Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
        $ret_status = 0;
    }
    else {
        my $webannotation_list = $self->{'webannotation-list'};
        my $webannotation_list_json_bytes = JSON::encode_json($webannotation_list);

        print $json_out_fh $webannotation_list_json_bytes;

        # Buffered write errors (e.g. disk full) only surface at close
        if (!close($json_out_fh)) {
            print STDERR "Error: Failed to finish writing Web Annotation List JSON to \"$json_ofilename\":\n $!\n";
            $ret_status = 0;
        }
    }

    $self->{'webannotation-list'} = undef;
    $self->{'webannotation-uri-prefix'} = undef;

    return $ret_status;
}
841
# (Work in progress) For every recorded document-OCR JSON file, make sure
# an up-to-date "webannotation-list<section>.json" exists in the cache and
# associate it with the document's top section.
#
# Parameters:
#   $doc_obj                      - document object
#   $gv_dococr_json_filename_recs - array ref of
#                                   { 'filename' => ..., 'section' => ... }
#                                   records
#
# The annotation list is (re)generated when the cached file is missing or
# when the Vision JSON it derives from has been modified more recently
# than the cached file; otherwise the cached copy is reused.
#
# Returns 1 if every list was available/saved, 0 if any save failed.
sub webannotation_list_associate_json_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj, $gv_dococr_json_filename_recs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $all_saved_ok = 1;

    for my $gv_json_filename_rec (@$gv_dococr_json_filename_recs) {
        my $gv_json_filename = $gv_json_filename_rec->{'filename'};
        my $section          = $gv_json_filename_rec->{'section'};

        # The cache location is always derived from the first record's
        # filename, so every section's list ends up in the same directory.
        # NOTE(review): presumably intentional -- confirm.
        my ($gv_dococr_filename_root) = ($gv_dococr_json_filename_recs->[0]->{'filename'} =~ m/^(.+)\.json$/);

        # sleight of hand so new directory spot in cache_dir picked out is where we want it!
        $gv_dococr_filename_root .= "/";

        $self->init_cache_for_file($gv_dococr_filename_root);
        my $cached_dir = $self->{'cached_dir'};

        my $assoc_webannotation_json_ofile = "webannotation-list${section}.json";
        my $cached_webannotation_json_ofilename = &FileUtils::filenameConcatenate($cached_dir, $assoc_webannotation_json_ofile);

        my $needs_json_regen = 0;

        if (!-f $cached_webannotation_json_ofilename) {
            $needs_json_regen = 1;
        }
        elsif ((-M $gv_json_filename) < (-M $cached_webannotation_json_ofilename)) {
            # -M is the file's age, so a *smaller* value means *newer*:
            # regenerate only when the Vision JSON is newer than the cache.
            $needs_json_regen = 1;
        }

        my $saved_ok = 1;

        if ($needs_json_regen) {

            print $outhandle " WebAnnotation-List: Generating $cached_webannotation_json_ofilename\n";

            $self->start_webannotation_list_INPROGRESS($doc_obj);
            $self->convert_and_append_webannotation_resources_INPROGRESS($gv_json_filename, $doc_obj, $section);

            $saved_ok = $self->end_webannotation_list_INPROGRESS($doc_obj, $cached_webannotation_json_ofilename);
        }
        else {
            print $outhandle " WebAnnotation-List: Cached file $cached_webannotation_json_ofilename already exists\n";
        }

        if ($saved_ok) {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->associate_file($cached_webannotation_json_ofilename, $assoc_webannotation_json_ofile, "application/json", $top_section);
        }
        else {
            $all_saved_ok = 0;
        }
    }

    return $all_saved_ok;
}
907
908
# (Work in progress) Generate and associate Web Annotation lists, but only
# if document OCR produced any JSON files for this converter.
#
# Parameters:
#   $doc_obj - document object the lists are associated with
#
# Returns 1 when there was nothing to do, otherwise the status returned by
# webannotation_list_associate_json_INPROGRESS().
sub opt_run_gen_webannotation_INPROGRESS
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # The record list only exists once document OCR has been run; fall back
    # to an empty list rather than dereferencing an undefined value.
    my $gv_dococr_json_filename_recs = $self->{'gv-dococr-json-filename-recs'} || [];

    my $ret_val_ok = 1;

    if (scalar(@$gv_dococr_json_filename_recs) > 0) {
        # The worker sub carries the _INPROGRESS suffix; this file defines no
        # method under the unsuffixed name.
        $ret_val_ok = $self->webannotation_list_associate_json_INPROGRESS($doc_obj, $gv_dococr_json_filename_recs);
    }

    return $ret_val_ok;
}
925
926
9271;
928
Note: See TracBrowser for help on using the repository browser.