source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 24290

Last change on this file since 24290 was 24290, checked in by sjm84, 13 years ago

Several changes to how Greenstone hashes PDF files and also added several more options to the EmbeddedMetadataPlugin

File size: 7.9 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin that extracts metadata embedded in files (EXIF and other tag groups) using Image::ExifTool
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31
32use Encode;
33use Image::ExifTool qw(:Public);
34use strict;
35
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Compile-time setup: make this package a subclass of BasePlugin and
# switch STDERR to UTF-8 output.
BEGIN {
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
    binmode(STDERR, ":utf8");
}
44
# Choices offered for the input_encoding option: "auto" first, followed by
# every entry of BasePlugin's encoding list.
my $encoding_plus_auto_list = [
    {
        name => "auto",
        desc => "{ReadTextFile.input_encoding.auto}",
    },
    @{$BasePlugin::encoding_list},
];
49
# Options specific to this plugin.  new() appends them to the "ArgList"
# entry it hands to BasePlugin.
my $arguments = [
    # Pattern on which extracted values are split; the empty default
    # disables splitting.
    {
        name => "metadata_field_separator",
        desc => "{HTMLPlugin.metadata_field_separator}",
        type => "string",
        deft => "",
    },
    # Encoding used to decode tag names and values; "auto" is handled as
    # utf8 by extractEmbeddedMetadata.
    {
        name => "input_encoding",
        desc => "{ReadTextFile.input_encoding}",
        type => "enum",
        list => $encoding_plus_auto_list,
        reqd => "no",
        deft => "auto",
    },
    # When set, multi-valued tags are joined with join_character before
    # any splitting on metadata_field_separator takes place.
    {
        name => "join_before_split",
        desc => "{EmbeddedMetadataPlugin.join_before_split}",
        type => "flag",
    },
    {
        name => "join_character",
        desc => "{EmbeddedMetadataPlugin.join_character}",
        type => "string",
        deft => " ",
    },
    # gsSafe strips leading and trailing whitespace when this is "true".
    {
        name => "trim_whitespace",
        desc => "{EmbeddedMetadataPlugin.trim_whitespace}",
        type => "enum",
        list => [
            { name => "true",  desc => "{common.true}" },
            { name => "false", desc => "{common.false}" },
        ],
        deft => "true",
    },
];
78
# Description of this plugin; new() appends it to the "OptList" entry it
# hands to BasePlugin.
my $options = {
    name     => "EmbeddedMetadataPlugin",
    desc     => "{EmbeddedMetadataPlugin.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
85
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BasePlugin->new
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries before BasePlugin sees them
#
# Returns the object built by BasePlugin, re-blessed into $class, with a
# configured Image::ExifTool instance stored under $self->{'exiftool'}.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # One ExifTool object is created here and reused for every file.
    # Options() only changes a setting when it is given a name/value pair,
    # so only options that are actually being set are listed here.
    my $exifTool = Image::ExifTool->new();
    $exifTool->Options(
        Duplicates => 0,    # keep a single value per tag name
        PrintConv  => 0,    # return raw values, not print-converted ones
        Unknown    => 1,    # also extract tags ExifTool has no name for
    );
    $self->{'exiftool'} = $exifTool;

    return bless $self, $class;
}
107
108
# Default regular expression selecting the files this plugin processes.
# Every filename matches for now; whether to narrow it is still undecided.
# A narrower pattern that was tried and left disabled:
#   q^(?i)\.(wma|wmv|jpe?g|gif)$^
sub get_default_process_exp()
{
    my $match_everything = ".*";
    return $match_everything;
}
115
116
117# This plugin doesn't block any files
118#sub get_default_block_exp()
119#{
120# return '';
121#}
122
123
# Append $text to the list referenced by $values and return how many values
# were appended.  When $separator is defined, $text is first split on it and
# pieces without any non-whitespace character are dropped; otherwise $text
# is stored as a single value.  Every stored value goes through gsSafe.
sub _addMetadataValues
{
    my ($self, $values, $text, $separator) = @_;

    my @pieces = defined $separator
        ? grep { /\S/ } split($separator, $text)
        : ($text);

    push(@$values, map { $self->gsSafe($_) } @pieces);
    return scalar(@pieces);
}

# Extract every tag ExifTool finds in a file and register the values as
# extra metadata for that file.
#
# Arguments:
#   $file          - name used (after util::filename_to_regex) as the key
#                    into $extrametadata
#   $filename      - path handed to ExifTool for reading
#   $extrametadata - hash ref mapping file keys to { field => [values] }
#   $extrametakeys - array ref listing the keys of $extrametadata
#
# Values are stored under field names of the form "ex.<group>.<tag>".
# Metadata already registered under the same key is kept: new values are
# appended to it instead of replacing it, and the key is only added to
# $extrametakeys the first time it is seen.
#
# The return value is not meaningful.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();
    my $outhandle = $self->{'outhandle'};
    my $exiftool = $self->{'exiftool'};
    my $join_before_split = $self->{'join_before_split'};
    my $metadata_count = 0;

    # An unset or empty separator means values are never split.
    my $separator = $self->{'metadata_field_separator'};
    if (defined $separator && $separator eq "") {
        undef $separator;
    }

    # "auto" (or no setting at all) is handled as utf8.
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    foreach my $group (Image::ExifTool::GetAllGroups(0)) {

        # Restrict extraction to one group at a time so that every value
        # can be labelled with the group it came from.
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Tags are visited in the order they were found in the file
        foreach my $found_tag ($exiftool->GetFoundTags('File')) {

            # Strip any numbering suffix
            (my $tag = $found_tag) =~ s/^([^\s]+)\s.*$/$1/i;

            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            # Decode the field name before it is used as a hash key so the
            # list is created under the same key the values are stored in.
            my $field = Encode::decode($encoding, "ex.$group.$tag");
            $exif_metadata{$field} ||= [];
            my $values = $exif_metadata{$field};

            if (ref $value eq 'SCALAR') {
                # Scalar refs are not stored as-is: only a short
                # description of the data is recorded, never split.
                my $description = ($$value =~ /^Binary data/)
                    ? "($$value)"
                    : "(Binary data " . length($$value) . " bytes)";
                push(@$values, $self->gsSafe($description));
                ++$metadata_count;
            }
            elsif (ref $value eq 'ARRAY') {
                # Decode into a copy so ExifTool's own list is not modified.
                my @decoded = map { Encode::decode($encoding, $_) } @$value;

                if ($join_before_split) {
                    # The join character is only inserted once something
                    # has been accumulated, so leading empty values do not
                    # produce a leading join character.
                    my $joined = "";
                    foreach my $piece (@decoded) {
                        if ($joined ne "") {
                            $joined .= $self->{'join_character'};
                        }
                        $joined .= $piece;
                    }
                    $metadata_count += $self->_addMetadataValues($values, $joined, $separator);
                }
                else {
                    foreach my $piece (@decoded) {
                        $metadata_count += $self->_addMetadataValues($values, $piece, $separator);
                    }
                }
            }
            else {
                my $decoded = Encode::decode($encoding, $value);
                $metadata_count += $self->_addMetadataValues($values, $decoded, $separator);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Protect windows directory chars \
    $file = &util::filename_to_regex($file);

    # Associate the metadata now, merging with anything already registered
    # for this key rather than overwriting it.
    my $existing = $extrametadata->{$file};
    if (defined $existing) {
        foreach my $field (keys %exif_metadata) {
            push(@{$existing->{$field}}, @{$exif_metadata{$field}});
        }
    }
    else {
        $extrametadata->{$file} = \%exif_metadata;
        push(@$extrametakeys, $file);
    }

    return;
}
263
264
# Metadata pass for a single file: when $file is a plain file this plugin
# can process, its embedded metadata is extracted into $extrametadata and
# $extrametakeys.  Always returns undef.
#
# Of the arguments, only $base_dir, $file, $extrametakeys, $extrametadata
# and $gli are used here.
sub metadata_read()
{
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $name_without_path) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    return undef unless (-f $full_path);
    return undef unless ($self->can_process_this_file($full_path));

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "EmbeddedMetadataPlugin: processing $file\n";
    }

    # The path-less name becomes the metadata key; the full path is what
    # gets read.
    $self->extractEmbeddedMetadata($name_without_path, $full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}
286
287
# Not used: this plugin does its work in metadata_read.  Always returns
# undef.
sub process() { return undef; }
293
# Make a metadata value safe to store: brackets, braces, parentheses,
# angle brackets and commas are replaced by numeric character entities,
# and leading/trailing whitespace is removed when the trim_whitespace
# option is "true".  Returns the cleaned text.
sub gsSafe() {
    my ($self, $text) = @_;

    # Potentially problematic characters and their replacements
    my %entity_for = (
        '(' => '&#40;',
        ')' => '&#41;',
        ',' => '&#44;',
        '<' => '&#60;',
        '>' => '&#62;',
        '[' => '&#91;',
        ']' => '&#93;',
        '{' => '&#123;',
        '}' => '&#125;',
    );

    # None of the replacements contains a character that is itself
    # replaced, so a single pass gives the same result as one pass per
    # character.
    $text =~ s/([(),<>\[\]{}])/$entity_for{$1}/g;

    if ($self->{'trim_whitespace'} eq "true") {
        for ($text) {
            s/^\s+//;
            s/\s+$//;
        }
    }

    return $text;
}
317
3181;
Note: See TracBrowser for help on using the repository browser.