Changeset 37051
- Timestamp: 2022-12-23T10:37:02+13:00 (11 months ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm
r37028 r37051 324 324 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 325 325 326 print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n" 326 my $toplevel_plugin_classname = ref($self); 327 print $outhandle "$toplevel_plugin_classname processing \"$filename_full_path\"\n" 327 328 if $verbosity > 1; 328 print STDERR "<Processing n='$file' p=' PagedImagePlugin'>\n" if ($gli);329 print STDERR "<Processing n='$file' p='$toplevel_plugin_classname'>\n" if ($gli); 329 330 330 331 $self->{'MaxImageWidth'} = 0; … … 366 367 367 368 $self->add_OID($doc_obj); 369 370 $self->post_process_doc_obj($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); 368 371 return (1,$doc_obj); 369 372 } … … 509 512 $self->process_text ($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'}); 510 513 } else { 511 $self->add_dummy_text($doc_obj, $self->{'current_section'}); 514 # A plugin inheriting from this might be able to derive text from the image 515 # (e.g., through GoogleVisionAPI), and so don't just assume there is no 516 # text for the image -- check its text length, and only set the dummy 517 # text if it is zero 518 $self->add_dummy_text_if_empty($doc_obj, $self->{'current_section'}); 512 519 } 513 520 } elsif ($element eq "Metadata") { … … 564 571 565 572 # create a new document 566 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'}); 567 # TODO is file filenmae_no_path?? 568 $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'}); 569 573 #$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'}); 574 ## TODO is file filename_no_path?? 
575 #$self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'}); 576 577 # create a new document 578 my $doc_obj = $self->init_new_doc_item($self->{'filename'}, $self->{'processor'}, $self->{'metadata'}); 579 570 580 my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/; 571 581 $self->{'xml_file_dir'} = $dir; … … 614 624 $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'}); 615 625 $self->{'MaxImageWidth'} = undef; 616 $self->{'MaxImageHeight'} = undef; 617 626 $self->{'MaxImageHeight'} = undef; 618 627 } 619 628 … … 687 696 } 688 697 689 sub process_item { 690 my $self = shift (@_); 691 my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_; 692 693 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 698 sub init_new_doc_item 699 { 700 my $self = shift (@_); 701 my ($filename_full_path, $processor, $metadata) = @_; 702 703 my $doc_obj = new doc($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 694 704 $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata); 705 706 return $doc_obj; 707 } 708 709 sub read_and_process_itemtxt 710 { 711 my $self = shift (@_); 712 my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata, $doc_obj) = @_; 713 695 714 my $topsection = $doc_obj->get_top_section(); 696 # simple item files are always paged unless user specified 697 if ($self->{'documenttype'} eq "auto") { 698 $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged"); 699 } else { 700 $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'}); 701 } 715 702 716 open (ITEMFILE, "<:encoding(UTF-8)", $filename_full_path) || die "couldn't open $filename_full_path\n"; 703 717 my $line = ""; … … 742 756 if (!defined $result2) { 743 757 print "PagedImagePlugin: couldn't process text file \"$dir.$txtname\" for 
item \"$filename_full_path\"\n"; 744 $self->add_dummy_text ($doc_obj, $cursection);758 $self->add_dummy_text_if_empty($doc_obj, $cursection); 745 759 } 746 760 } else { 747 761 # otherwise add in some dummy text 748 $self->add_dummy_text ($doc_obj, $cursection);762 $self->add_dummy_text_if_empty($doc_obj, $cursection); 749 763 } 750 764 } … … 753 767 close ITEMFILE; 754 768 769 return $num; 770 } 771 772 773 sub process_item { 774 my $self = shift (@_); 775 my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_; 776 777 # create a new document 778 #my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 779 #$self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata); 780 781 # create a new document 782 my $doc_obj = $self->init_new_doc_item($filename_full_path, $processor, $metadata); 783 784 my $num_pages = $self->read_and_process_itemtxt($filename_full_path, $dir, $filename_no_path, $processor, $metadata, $doc_obj); 785 786 my $topsection = $doc_obj->get_top_section(); 787 788 # simple item files are always paged unless user specified 789 if ($self->{'documenttype'} eq "auto") { 790 $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged"); 791 } else { 792 $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'}); 793 } 794 755 795 # add numpages metadata 756 $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num ");796 $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num_pages"); 757 797 758 798 $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'}); … … 760 800 $self->{'MaxImageWidth'} = undef; 761 801 $self->{'MaxImageHeight'} = undef; 762 763 802 764 803 return $doc_obj;
Note: See TracChangeset for help on using the changeset viewer.