[31892] | 1 | package PJPlugin;
|
---|
| 2 |
|
---|
| 3 | use PagedImagePlugin;
|
---|
| 4 |
|
---|
| 5 | use strict;
|
---|
| 6 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
| 7 | no strict 'subs';
|
---|
| 8 |
|
---|
# Set up inheritance at compile time: PJPlugin derives from PagedImagePlugin.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}
| 12 |
|
---|
# Argument definitions specific to this plugin: none.
my $arguments = [];

# Option record for this plugin; new() pushes it onto the shared "OptList"
# and pushes the (empty) $arguments list onto the shared "ArgList".
my $options = {
    'name'     => 'PJPlugin',
    'desc'     => 'PagedImagePlugin variant for pei-jones written-works collection',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
| 20 |
|
---|
# Constructor.
#
# Registers this plugin and then delegates the actual construction to
# PagedImagePlugin:
#   - appends $class to @$pluginlist;
#   - appends this plugin's argument definitions to
#     $hashArgOptLists->{"ArgList"} and its option record to
#     $hashArgOptLists->{"OptList"};
#   - builds the object with PagedImagePlugin->new and re-blesses it into
#     $class.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names (modified in place)
#   $inputargs       - passed through unchanged to PagedImagePlugin->new
#   $hashArgOptLists - hash ref holding the "ArgList" / "OptList" array refs
#                      (modified in place)
#
# Returns the new object, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call; the indirect-object form
    # "new PagedImagePlugin(...)" depends on how the parser guesses the
    # meaning of the bareword and is best avoided.
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
| 33 |
|
---|
| 34 |
|
---|
# Build a document object from a PJ item file.
#
# The item file is read line by line through a UTF-8 decoding layer:
#   - lines containing no word character, and lines starting with '#',
#     are skipped;
#   - "<name> value" lines become metadata on the top section, stored under
#     "pj.<name>", unless the value is exactly "unknown";
#   - every other line is taken to be "page:imagefilename:textfilename:r"
#     (the trailing r is optional and means rotate the image 180 deg) and
#     yields one new child section whose Title is the page field.
#
# Parameters:
#   $filename_full_path - path of the item file to read
#   $dir                - prefix prepended directly (no separator is added)
#                         to the image / text names listed in the item file
#   $filename_no_path   - not used by this method
#   $processor          - passed through to set_initial_doc_fields
#   $metadata           - passed through to set_initial_doc_fields
#
# Returns the populated doc object.
# Dies if the item file cannot be opened.
# Side effect: $self->{'MaxImageWidth'} and $self->{'MaxImageHeight'} are
# copied into the document and then reset to undef.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    my $doc_type = ($self->{'documenttype'} eq "auto") ? "paged" : $self->{'documenttype'};
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $doc_type);

    # Lexical handle: a bareword ITEMFILE would be a package global shared
    # with every other sub that uses the same name.
    open(my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path: $!\n";

    my $num = 0;    # number of page lines seen; stored as NumPages below
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my ($meta_name, $meta_value) = ($1, $2);

            # PJ mod: don't add in unknown values, and set all metadata
            # in the pj. namespace
            if ($meta_value !~ /^unknown$/) {
                $doc_obj->set_utf8_metadata_element($topsection, "pj.".$meta_name, $meta_value);
            }
            next;
        }

        $num++;

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # PJ: use jpg versions instead of tif versions (currently disabled)
        #$imgname =~ s/tif/jpg/g;

        # create a new section for each page; the page number becomes the Title
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $image_result = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $image_result) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there, otherwise (or on failure)
        # add in some dummy text
        if (defined $txtname && $txtname ne "") {
            my $text_result = $self->process_text($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!defined $text_result) {
                # Report the same path that was passed to process_text; a
                # "." inside a double-quoted string is a literal dot, not
                # the concatenation operator.
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    # record the image size limits on the document, then clear them on the
    # plugin object
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
| 120 |
|
---|
# Register every image and text file named in an item file with
# block_raw_filename.
#
# The item file is scanned with the same line rules as process_item: lines
# without a word character, '#' comment lines and "<name> value" metadata
# lines are skipped; every remaining line is taken to be
# "page:imagefilename:textfilename:r". Each non-empty image / text name is
# joined to $dir with FileUtils::filenameConcatenate and handed to
# $self->block_raw_filename together with $block_hash.
#
# Parameters:
#   $filename_full_path - path of the item file to scan
#   $dir                - directory the listed file names are relative to
#   $block_hash         - passed through to block_raw_filename
#
# Dies if the item file cannot be opened.
#
# NOTE(review): the file is read here without a decoding layer, whereas
# process_item reads it as UTF-8. Presumably that is intentional because the
# names go to block_raw_filename - confirm.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open with an explicit read mode: with the two-argument
    # form, leading/trailing whitespace or mode characters ('>', '|', ...)
    # in the file name would change what gets opened.
    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                          # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/);    # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # PJ: use jpg versions instead of tif versions (currently disabled)
        #$imgname =~ s/tif/jpg/g;

        # block the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $imgname));
        }
        # block the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $txtname));
        }
    }

    # Kept as the final statement so the sub's return value is unchanged.
    close $item_fh;
}
| 154 |
|
---|
| 155 | 1;
|
---|