Ignore:
Timestamp:
2022-12-23T10:37:02+13:00 (14 months ago)
Author:
davidb
Message:

A variety of changes: added a call to post_process_doc_obj(), an existing base-class method that really should have been called here (not particularly relevant for PagedImagePlugin itself, but needed in the inherited GoogleVisionPagedImagePlugin version for Open Annotation JSON files to be produced); code also changed over to use add_dummy_text_if_empty(); some refactoring of methods to use supporting subroutines; adjustment of print statements that say 'processing by <plugin>' to use reflection (ref($self)) to determine the class name, allowing the names of inherited plugins to be used.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r37028 r37051  
    324324    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    325325
    326     print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
     326    my $toplevel_plugin_classname = ref($self);
     327    print $outhandle "$toplevel_plugin_classname processing \"$filename_full_path\"\n"
    327328    if $verbosity > 1;
    328     print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);
     329    print STDERR "<Processing n='$file' p='$toplevel_plugin_classname'>\n" if ($gli);
    329330   
    330331    $self->{'MaxImageWidth'} = 0;
     
    366367
    367368    $self->add_OID($doc_obj);
     369
     370    $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    368371    return (1,$doc_obj);
    369372}
     
    509512        $self->process_text ($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
    510513    } else {
    511         $self->add_dummy_text($doc_obj, $self->{'current_section'});
     514        # A plugin inheriting from this might be able to derive text from the image
     515        # (e.g., through GoogleVisionAPI), and so don't just assume there is no
     516        # text for the image -- check its text length, and only set the dummy
     517        # text if it is zero
     518        $self->add_dummy_text_if_empty($doc_obj, $self->{'current_section'});
    512519    }
    513520    } elsif ($element eq "Metadata") {
     
    564571   
    565572    # create a new document
    566     $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    567     # TODO is file filenmae_no_path??
    568     $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
    569 
     573    #$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
     574    ## TODO is file filename_no_path??
     575    #$self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
     576
     577    # create a new document
     578    my $doc_obj = $self->init_new_doc_item($self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
     579   
    570580    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    571581    $self->{'xml_file_dir'} = $dir;
     
    614624    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
    615625    $self->{'MaxImageWidth'} = undef;
    616     $self->{'MaxImageHeight'} = undef;
    617    
     626    $self->{'MaxImageHeight'} = undef;   
    618627}
    619628
     
    687696}
    688697
    689 sub process_item {
    690     my $self = shift (@_);
    691     my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;
    692 
    693     my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
     698sub init_new_doc_item
     699{   
     700    my $self = shift (@_);
     701    my ($filename_full_path, $processor, $metadata) = @_;
     702
     703    my $doc_obj = new doc($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    694704    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
     705
     706    return $doc_obj;
     707}
     708
     709sub read_and_process_itemtxt
     710{
     711    my $self = shift (@_);
     712    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata, $doc_obj) = @_;
     713
    695714    my $topsection = $doc_obj->get_top_section();
    696     # simple item files are always paged unless user specified
    697     if ($self->{'documenttype'} eq "auto") {
    698     $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged");
    699     } else {
    700     $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'});
    701     }
     715   
    702716    open (ITEMFILE, "<:encoding(UTF-8)", $filename_full_path) || die "couldn't open $filename_full_path\n";
    703717    my $line = "";
     
    742756        if (!defined $result2) {
    743757            print "PagedImagePlugin: couldn't process text file \"$dir.$txtname\" for item \"$filename_full_path\"\n";
    744             $self->add_dummy_text($doc_obj, $cursection);
     758            $self->add_dummy_text_if_empty($doc_obj, $cursection);
    745759        }
    746760        } else {
    747761        # otherwise add in some dummy text
    748         $self->add_dummy_text($doc_obj, $cursection);
     762        $self->add_dummy_text_if_empty($doc_obj, $cursection);
    749763        }
    750764    }
     
    753767    close ITEMFILE;
    754768
     769    return $num;
     770}
     771
     772   
     773sub process_item {
     774    my $self = shift (@_);
     775    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;
     776
     777    # create a new document
     778    #my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
     779    #$self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
     780
     781    # create a new document
     782    my $doc_obj = $self->init_new_doc_item($filename_full_path, $processor, $metadata);
     783       
     784    my $num_pages = $self->read_and_process_itemtxt($filename_full_path, $dir, $filename_no_path, $processor, $metadata, $doc_obj);
     785   
     786    my $topsection = $doc_obj->get_top_section();
     787   
     788    # simple item files are always paged unless user specified
     789    if ($self->{'documenttype'} eq "auto") {
     790    $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged");
     791    } else {
     792    $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'});
     793    }
     794   
    755795    # add numpages metadata
    756     $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
     796    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num_pages");
    757797
    758798    $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
     
    760800    $self->{'MaxImageWidth'} = undef;
    761801    $self->{'MaxImageHeight'} = undef;
    762 
    763802
    764803    return $doc_obj;
Note: See TracChangeset for help on using the changeset viewer.