1 | package PJPlugin;
|
---|
2 |
|
---|
3 | use PagedImagePlugin;
|
---|
4 |
|
---|
5 | use strict;
|
---|
6 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
7 | no strict 'subs';
|
---|
8 |
|
---|
# Compile-time setup: declare PJPlugin as a subclass of PagedImagePlugin
# by installing the parent class name into this package's @ISA.
BEGIN {
    @PJPlugin::ISA = qw(PagedImagePlugin);
}
|
---|
12 |
|
---|
# Plugin-specific argument descriptions. PJPlugin declares none of its own;
# new() still appends this (empty) list to the shared "ArgList".
my $arguments = [];

# Option record that new() appends to the shared "OptList".
# NOTE(review): the exact meaning of the 'abstract' and 'inherits' flags is
# defined by the plugin framework, not in this file -- confirm there.
my $options = {
    name     => "PJPlugin",
    desc     => "PagedImagePlugin variant for pei-jones written-works collection",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};

# Fixed directory holding the collection's image and text files. It is stored
# as the plugin's base_dir in begin() and prefixed to every image/text name
# read from an item file in process_item().
my $files_dir = "/nzdl-storage/other-projects/pei-jones/Jones_Collection/";
|
---|
22 |
|
---|
# begin($pluginfo, $base_dir, $processor, $maxdocs)
#
# Records the directory used as this plugin's base_dir. The $base_dir passed
# in by the caller is deliberately not used: the hard-wired $files_dir of this
# module is stored instead. None of the other arguments are consulted, and no
# parent-class begin() is invoked from here.
#
# The sub has no explicit return; its value is that of the final assignment.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # Save base_dir for use in file cache
    $self->{'base_dir'} = $files_dir;
}
|
---|
30 |
|
---|
31 |
|
---|
# new($pluginlist, $inputargs, $hashArgOptLists)
#
# Constructor. Registers this class and its argument/option descriptions in
# the structures supplied by the caller, builds the object via the
# PagedImagePlugin constructor, and re-blesses it into $class.
#
#   $pluginlist       array ref; the class name is appended to it
#   $inputargs        passed through untouched to PagedImagePlugin->new
#   $hashArgOptLists  hash ref; this plugin's entries are appended to its
#                     "ArgList" and "OptList" array refs
#
# Returns the new object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call. The former indirect-object form
    # ("new PagedImagePlugin(...)") relies on a parser heuristic that is
    # easy to trip in a package which itself defines a sub called "new".
    my $self = PagedImagePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
|
---|
44 |
|
---|
45 |
|
---|
# process_item($filename_full_path, $dir, $filename_no_path, $processor, $metadata)
#
# Builds a document object from an item file and returns it.
#
# The item file is read as UTF-8, one entry per line:
#   * lines without any word character, and lines starting with '#', are
#     skipped;
#   * "<name> value" lines become top-section metadata "pj.<name>", except
#     when the value is exactly "unknown";
#   * every other line is treated as a page description of the form
#     page:imagefilename:textfilename:r  (the trailing rotate flag is
#     optional and is handed to process_image). Each page gets its own
#     section whose Title is the page field.
#
# Image and text names are resolved against the module-level $files_dir,
# not against $dir. $dir and $filename_no_path are accepted for interface
# compatibility but not used here.
#
# After reading, NumPages, MaxImageWidth and MaxImageHeight are written to the
# top section and the two Max* fields on $self are reset to undef.
#
# Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    # Direct method call rather than indirect-object syntax ("new doc (...)"),
    # which is ambiguous in a package that defines its own "new".
    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    if ($self->{'documenttype'} eq "auto") {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $self->{'documenttype'});
    }

    # Lexical handle: it cannot collide with the handle used by
    # scan_item_for_files_to_block and is released when it goes out of scope.
    open(my $item_fh, "<:encoding(UTF-8)", $filename_full_path)
        or die "couldn't open $filename_full_path: $!\n";

    my $num = 0;    # number of page lines seen
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name  = $1;
            my $meta_value = $2;

            # PJ mod: don't add in unknown values, and put all metadata
            # in the pj. namespace.
            if ($meta_value !~ /^unknown$/) {
                $doc_obj->set_utf8_metadata_element($topsection, "pj." . $meta_name, $meta_value);
            }
            next;
        }

        # Anything else is a page line:
        # page:imagefilename:textfilename:r - the r is optional -> means
        # rotate the image 180 deg
        $num++;
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # (PJ: a tif -> jpg rename of $imgname used to live here; it is
        # currently disabled.)

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # Progress trace; emitted for every page line, before the image name
        # is checked.
        print STDERR "image file $files_dir$imgname\n";

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($files_dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$files_dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text($files_dir . $txtname, $txtname, $doc_obj, $cursection);

            if (!defined $result2) {
                # Report the path that was actually tried. The earlier
                # message interpolated "$files_dir.$txtname", inserting a
                # stray '.' between directory and file name.
                print "PagedImagePlugin: couldn't process text file \"$files_dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
        else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }

    close($item_fh);

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth",  $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    # Reset so the values do not carry over to whatever is processed next.
    $self->{'MaxImageWidth'}  = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
|
---|
133 |
|
---|
# scan_item_for_files_to_block($filename_full_path, $dir, $block_hash)
#
# Reads an item file and, for every page line
# (page:imagefilename:textfilename:r), registers the image and text files it
# names via block_raw_filename(). Lines without a word character, comment
# lines (leading '#') and "<name> value" metadata lines are skipped.
#
# File names are joined onto $dir with FileUtils::filenameConcatenate.
# NOTE(review): process_item() resolves the same names against the
# module-level $files_dir instead of $dir -- confirm the difference is
# intended.
#
# Dies if the item file cannot be opened. There is no explicit return; the
# sub's value is the result of the final close().
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    # Three-argument open with a lexical handle: the path is taken literally
    # (no mode characters or surrounding whitespace interpreted from the file
    # name) and the handle cannot clash with any other open handle.
    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                           # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/);     # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # (PJ: a tif -> jpg rename of $imgname used to live here; it is
        # currently disabled.)

        # Block the image file first, then the text file, when present.
        for my $name ($imgname, $txtname) {
            next unless defined $name && $name ne "";
            $self->block_raw_filename($block_hash, FileUtils::filenameConcatenate($dir, $name));
        }
    }

    close($item_fh);
}
|
---|
167 |
|
---|
168 | 1;
|
---|