# PJPlugin -- PagedImagePlugin variant for the pei-jones written-works
# collection.
#
# The overrides in this file differ from a plain item-file reader in two
# ways that are visible here:
#   * every <name>value metadata line is stored under the "pj." prefix,
#     and values that are exactly "unknown" are dropped;
#   * image names listed in the item file have "tif" rewritten to "jpg",
#     so the jpg versions of the page images are used.

package PJPlugin;

use PagedImagePlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
no strict 'subs';

sub BEGIN {
    @PJPlugin::ISA = ('PagedImagePlugin');
}

# This plugin adds no arguments of its own.
my $arguments = [];

my $options = { 'name'     => "PJPlugin",
                'desc'     => "PagedImagePlugin variant for pei-jones written-works collection",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to PagedImagePlugin->new
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive
#                      this plugin's arguments and options
# Returns the PagedImagePlugin object re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

# Private helper (plain function, not a method): split one page line of an
# item file into its fields.
#
# A page line looks like  page:imagefilename:textfilename:r  where the
# trailing r is optional and means "rotate the image 180 deg".
#
# Returns ($pagenum, $imgname, $txtname, $rotate); fields missing from the
# line come back undefined.
sub _pj_parse_page_line {
    my ($line) = @_;

    $line =~ s/^\s+//; # remove space at the front
    $line =~ s/\s+$//; # remove space at the end

    my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

    # PJ: use jpg versions instead of tif versions.
    # NOTE(review): this rewrites "tif" anywhere in the name, not only in
    # the extension -- confirm that is intended before tightening it.
    $imgname =~ s/tif/jpg/g if defined $imgname;

    return ($pagenum, $imgname, $txtname, $rotate);
}

# Build a document object from an item file.
#   $filename_full_path - item file to read (opened as UTF-8)
#   $dir                - prefix prepended directly to image/text names
#   $filename_no_path   - unused here
#   $processor, $metadata - handed to set_initial_doc_fields
# Returns the populated doc object.  Dies if the item file cannot be
# opened.
#
# process_image, process_text, add_dummy_text and set_initial_doc_fields
# are not defined in this file; presumably they are inherited from
# PagedImagePlugin.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    if ($self->{'documenttype'} eq "auto") {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'});
    }

    open (my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        || die "couldn't open $filename_full_path: $!\n";

    my $num = 0; # number of page lines seen
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name  = $1;
            my $meta_value = $2;
            # PJ mod: don't add in unknown values
            if ($meta_value !~ /^unknown$/) {
                # set all metadata at pj.
                $doc_obj->set_utf8_metadata_element ($topsection, "pj.".$meta_name, $meta_value);
            }
            next;
        }

        # anything else is treated as a page line
        $num++;
        my ($pagenum, $imgname, $txtname, $rotate) = _pj_parse_page_line($line);
        print STDERR "new img name=" . (defined $imgname ? $imgname : "") . "\n";

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result2) {
                # report the same path that was handed to process_text
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        } else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth",  $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    # clear the per-document maxima once they have been recorded
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}

# Read an item file and register every image and text file it names via
# block_raw_filename.
#   $filename_full_path - item file to read
#   $dir                - directory joined to each name with
#                         FileUtils::filenameConcatenate
#   $block_hash         - passed through to block_raw_filename
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # three-argument open: the file name can never be parsed as a mode
    open (my $item_fh, "<", $filename_full_path)
        || die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/); # ignore metadata lines

        my ($pagenum, $imgname, $txtname, $rotate) = _pj_parse_page_line($line);

        # find the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $imgname));
        }

        # find the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $txtname));
        }
    }

    close $item_fh;
}

1;