[31892] | 1 | package PJPlugin;
|
---|
| 2 |
|
---|
| 3 | use PagedImagePlugin;
|
---|
| 4 |
|
---|
| 5 | use strict;
|
---|
| 6 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
| 7 | no strict 'subs';
|
---|
| 8 |
|
---|
# Establish the inheritance chain at compile time: PJPlugin is a
# PagedImagePlugin and only overrides the methods defined in this file.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}
| 12 |
|
---|
# This plugin declares no arguments of its own; anything it accepts comes
# from the classes it inherits from.
my $arguments = [];

# Plugin description record. new() pushes it onto the shared "OptList".
my $options = {
    'name'     => "PJPlugin",
    'desc'     => "PagedImagePlugin variant for pei-jones written-works collection",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Directory prefix (with trailing slash) that is prepended to the image and
# text file names listed in an item file, and that begin() stores as
# 'base_dir'.
my $files_dir = "/nzdl-storage/other-projects/pei-jones/Jones_Collection/";
| 22 |
|
---|
# Called with ($pluginfo, $base_dir, $processor, $maxdocs). The only thing
# this override does is record the fixed collection directory $files_dir as
# 'base_dir' (used for the file cache, per the original comment); the
# $base_dir passed in is deliberately not used.
#
# NOTE(review): this override does not chain to the parent class's begin();
# confirm that skipping the inherited set-up is intended.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->{'base_dir'} = $files_dir;
}
| 30 |
|
---|
| 31 |
|
---|
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by PagedImagePlugin's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this plugin and its (empty) argument/option descriptions
    # before delegating to the parent constructor.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method-call syntax instead of the indirect-object form
    # "new PagedImagePlugin(...)", which Perl can mis-parse when a sub
    # called "new" or "PagedImagePlugin" is visible in this package.
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
| 44 |
|
---|
| 45 |
|
---|
# Build a document object from a ".item" file.
#
# Parameters (after $self):
#   $filename_full_path - path of the item file to read
#   $dir                - accepted for interface compatibility; not used here
#   $filename_no_path   - accepted for interface compatibility; not used here
#   $processor          - passed through to set_initial_doc_fields()
#   $metadata           - passed through to set_initial_doc_fields()
#
# Item file format, as parsed below:
#   * lines containing no word character, and lines starting with '#',
#     are skipped;
#   * "<name> value" lines become top-section metadata "pj.name", except
#     when the value is exactly "unknown";
#   * every other line is treated as "page:imagefilename:textfilename:r" and
#     produces one new section under the top section. The trailing "r" is
#     optional (per the original comment it means "rotate the image 180
#     degrees"; it is passed to process_image() unchanged).
#
# Image and text file names are resolved against the fixed $files_dir, not
# against the directory of the item file.
#
# Returns the populated doc object. Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    my $doc_type = ($self->{'documenttype'} eq "auto") ? "paged" : $self->{'documenttype'};
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $doc_type);

    # Lexical handle: the old package-global ITEMFILE stayed open while
    # process_image()/process_text() ran and could be clobbered by any other
    # code using the same bareword.
    open(my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path\n";

    my $num = 0;    # number of page lines seen; becomes NumPages
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my ($meta_name, $meta_value) = ($1, $2);
            # PJ mod: don't add in unknown values, and set all metadata at pj.
            if ($meta_value !~ /^unknown$/) {
                $doc_obj->set_utf8_metadata_element($topsection, "pj." . $meta_name, $meta_value);
            }
            next;
        }

        # Anything else is a page line.
        $num++;
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # PJ: the tif -> jpg file name substitution is currently disabled.
        #$imgname =~ s/tif/jpg/g;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        print STDERR "image file $files_dir$imgname\n";

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($files_dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$files_dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there, otherwise add dummy text
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text($files_dir . $txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result2) {
                # Report the path that was actually attempted; the message
                # used to contain a stray literal "." between the directory
                # and the file name.
                print "PagedImagePlugin: couldn't process text file \"$files_dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    # Record the MaxImageWidth/MaxImageHeight values held on $self, then
    # reset them so they do not carry over to the next item.
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
| 133 |
|
---|
# Read an item file and register every image and text file it names with
# block_raw_filename(), so those files are recorded in $block_hash.
#
# Parameters (after $self):
#   $filename_full_path - path of the item file to scan
#   $dir                - directory the listed file names are joined to
#   $block_hash         - passed through to block_raw_filename()
#
# Comment lines ('#'), lines without a word character and "<name> value"
# metadata lines are skipped; every other line is expected to look like
# "page:imagefilename:textfilename:r".
#
# Dies if the item file cannot be opened.
#
# NOTE(review): paths are built from $dir here, whereas process_item()
# reads the same files from the fixed $files_dir - confirm this is intended.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open with a lexical handle: the two-argument form
    # parses a mode out of the file name and strips leading/trailing
    # whitespace, so unusual item file names were opened incorrectly.
    # No encoding layer is applied, matching the original raw read.
    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                         # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/);    # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # PJ: the tif -> jpg file name substitution is currently disabled.
        #$imgname =~ s/tif/jpg/g;

        # block the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $imgname));
        }
        # block the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $txtname));
        }
    }
    close $item_fh;
}
| 167 |
|
---|
| 168 | 1;
|
---|