Ignore:
Timestamp:
2023-01-07T11:42:21+13:00 (16 months ago)
Author:
davidb
Message:

Allow for a page having no OCR'd text found

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/structured-image/trunk/perllib/plugins/GoogleVisionAPIConverter.pm

    r37065 r37117  
    261261    my $ocr_text;
    262262    if ($vision_type eq "enable_document_ocr" || $vision_type eq "enable_image_ocr") {
    263         $ocr_text = $decoded_json->{'textAnnotations'}[0]{'description'}; # access full ocr content
    264         $doc_obj->add_utf8_text($section, $ocr_text); # append text to section
    265 
    266         my $blocks = $decoded_json->{'fullTextAnnotation'}{'pages'}[0]{'blocks'};
    267         my %text_and_language;
    268 
    269         foreach my $block (@{$blocks}) {
    270             foreach my $paragraph (@{$block->{'paragraphs'}}) {
    271                 foreach my $word (@{$paragraph->{'words'}}) {
    272                     my $detected_language = $word->{'property'}{'detectedLanguages'}[0]{'languageCode'} || "no_lang";
    273                     my $word_text = "";
    274                     foreach my $letter (@{$word->{'symbols'}}) {
    275                         $word_text .= $letter->{'text'};
    276                     }
    277                     $text_and_language{$detected_language} .= $word_text . " ";
    278                 }
    279             }
    280         }
    281    
    282         for (keys %text_and_language) {
    283             $doc_obj->add_utf8_metadata($section, "z_" . $_, $text_and_language{$_});
    284         }
    285 
     263
     264    if (defined $decoded_json->{'textAnnotations'}) {
     265        $ocr_text = $decoded_json->{'textAnnotations'}->[0]->{'description'}; # access full ocr content
     266        $doc_obj->add_utf8_text($section, $ocr_text); # append text to section
     267       
     268        my $blocks = $decoded_json->{'fullTextAnnotation'}->{'pages'}->[0]->{'blocks'};
     269        my %text_and_language;
     270
     271        foreach my $block (@{$blocks}) {
     272        foreach my $paragraph (@{$block->{'paragraphs'}}) {
     273            foreach my $word (@{$paragraph->{'words'}}) {
     274            my $detected_language = $word->{'property'}->{'detectedLanguages'}->[0]->{'languageCode'} || "no_lang";
     275            my $word_text = "";
     276            foreach my $letter (@{$word->{'symbols'}}) {
     277                $word_text .= $letter->{'text'};
     278            }
     279            $text_and_language{$detected_language} .= $word_text . " ";
     280            }
     281        }
     282        }
     283   
     284        for (keys %text_and_language) {
     285        $doc_obj->add_utf8_metadata($section, "z_" . $_, $text_and_language{$_});
     286        }
     287       
     288    }
     289
     290    # Note: Even if there is no actual OCR'd text detected (if test above),
     291    # still set metadata that shows that we applied the Google Vision API seeking text
    286292   
    287293    my $assoc_json_metaname = "HasGoogleVision";
Note: See TracChangeset for help on using the changeset viewer.