source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 24763

Last change on this file since 24763 was 24487, checked in by sjm84, 13 years ago

Added two new options in EmbeddedMetadataPlugin for filtering the embedded sets that are returned

File size: 10.7 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31
32use Encode;
33use Image::ExifTool qw(:Public);
34use strict;
35
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Compile-time setup: make STDERR emit UTF-8 and declare BasePlugin as the
# parent class of this plugin.
BEGIN
{
    binmode(STDERR, ":utf8");
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
}
44
# Choices offered for the 'input_encoding' argument: "auto" first, followed
# by every entry of BasePlugin's encoding list.
my $encoding_plus_auto_list = [
    {
        'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}",
    },
    @{$BasePlugin::encoding_list},
];
49
# Command-line / configuration arguments understood by this plugin.
# Each entry is a hash describing one argument (name, description key,
# type and, where given, default value and list of allowed values).
my $arguments = [
    {
        'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto",
    },
    {
        'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag",
    },
    {
        'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " ",
    },
    {
        'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => [
            { 'name' => "true",  'desc' => "{common.true}" },
            { 'name' => "false", 'desc' => "{common.false}" },
        ],
        'deft' => "true",
    },
    {
        'name' => "set_filter_list",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
        'type' => "string",
    },
    {
        'name' => "set_filter_regexp",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
        'type' => "string",
        # If changing this default, also need to update the constructor
        'deft' => ".*",
    },
];
87
# Plugin description record; the constructor appends it to the "OptList"
# of the option lists it is given.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
94
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasePlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Besides building the BasePlugin object, the constructor creates the
# Image::ExifTool instance stored in $self->{'exiftool'} and, when
# 'set_filter_list' is given, converts that comma separated list of set
# names into the regular expression stored in $self->{'set_filter_regexp'}
# (the form consulted by checkAgainstFilters).
#
# Returns the object blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create a new Image::ExifTool object
    my $exifTool = Image::ExifTool->new();
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # NOTE(review): called with a name only, so this appears to read the
    # 'Verbose' option rather than set it -- confirm the intent.
    $exifTool->Options('Verbose');
    $self->{'exiftool'} = $exifTool;

    my $setFilterList = $self->{'set_filter_list'};
    if ((defined $setFilterList) && ($setFilterList ne ""))
    {
        # A non-default regexp given together with a list is ignored
        my $givenRegexp = $self->{'set_filter_regexp'};
        if ((defined $givenRegexp) && ($givenRegexp ne ".*") && ($givenRegexp ne ""))
        {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
            print $outhandle " defaulting to 'set_filter_list'\n";
        }

        my @sets_bracketed;
        foreach my $set (split(/,/, $setFilterList))
        {
            # Tolerate "EXIF, XMP" style lists
            $set =~ s/^\s+//;
            $set =~ s/\s+$//;

            # The "ex." prefix is optional in the list
            $set =~ s/^ex\.//;

            # An empty entry would otherwise produce a pattern that
            # matches every field
            next if ($set eq "");

            # Set names are literal text, not patterns: escape them, and
            # escape the dot of the "ex." prefix as well
            push(@sets_bracketed, '(ex\.' . quotemeta($set) . ')');
        }

        $self->{'set_filter_regexp'} = join("|", @sets_bracketed);
    }

    return bless $self, $class;
}
139
140
# Default expression for the files this plugin processes: everything.
# Need to think some more about this; a narrower candidate would be
#   q^(?i)\.(wma|wmv|jpe?g|gif)$^
sub get_default_process_exp()
{
    my $match_any_file = ".*";
    return $match_any_file;
}
147
148
149# This plugin doesn't block any files
150#sub get_default_block_exp()
151#{
152# return '';
153#}
154
# Plugins that rely on more than process_exp (eg XML plugins) can override
# this method.  This plugin always answers 0: it processes metadata, not
# the file itself.
sub can_process_this_file {
    my ($self) = @_;

    return 0;
}
162
# Even if a plugin can extract metadata in its metadata_read pass, the
# answer here is 'undef' so that processing of the file continues down the
# pipeline and other plugins also get the opportunity to locate metadata
# and set it up in the extrametakeys variables that are passed around.
#
# This plugin looks for metadata in any file through its metadata_read().
sub can_process_this_file_for_metadata {
    my ($self) = @_;

    return undef;
}
178
# Decide whether a metadata field name passes the set filter.
#
#   $name - the field name to test
#
# When $self->{'set_filter_regexp'} is undefined or empty every name is
# accepted (returns 1).  Otherwise the result is that of matching $name
# against the regexp, ignoring case.
sub checkAgainstFilters
{
    my ($self, $name) = @_;

    my $pattern = $self->{'set_filter_regexp'};

    # No filter configured: let everything through
    return 1 unless ((defined $pattern) && ($pattern ne ""));

    return ($name =~ m/($pattern)/i);
}
194
# Read the metadata embedded in one file with ExifTool and record it in
# the extra-metadata tables.
#
# Arguments:
#   $file          - name under which the metadata is stored; it is passed
#                    through util::filename_to_regex before being used as
#                    the key into $extrametadata
#   $filename      - path handed to ExifTool for reading
#   $extrametadata - hash ref: key => { field name => [ values ] }
#   $extrametakeys - array ref of the keys of $extrametadata; the key is
#                    appended when a new entry is created
#
# Every tag is stored under the field name "ex.<group>.<tag>".  Fields
# rejected by checkAgainstFilters are skipped altogether, so they neither
# create an entry nor add to the count that is reported.  Text values are
# decoded using 'input_encoding' ("auto" is treated as utf8); when
# 'metadata_field_separator' is non-empty they are split on it and blank
# pieces are dropped.  List values are either handled element by element
# or, with 'join_before_split', first joined with 'join_character'.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $exiftool = $self->{'exiftool'};

    my $metadata_count = 0;

    # An empty separator means "do not split values"
    my $separator = $self->{'metadata_field_separator'};
    if ($separator eq "") {
        undef $separator;
    }

    # The same for every tag, so work it out once
    my $encoding = $self->{'input_encoding'};
    if ($encoding eq "auto") {
        $encoding = "utf8";
    }

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Extract meta information from the file, one group at a time
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;
            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            my $field = Encode::decode($encoding, "ex.$group.$tag");

            # Filter once per field, before anything is stored or counted
            next unless $self->checkAgainstFilters($field);

            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }
            my $values = $exif_metadata{$field};

            if (ref $value eq 'SCALAR') {
                # Scalar reference: store a short description instead of
                # the referenced data; it is never split
                my $description;
                if ($$value =~ /^Binary data/) {
                    $description = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $description = "(Binary data $len bytes)";
                }
                $metadata_count += $self->_appendEmbeddedValues($values, $description, undef);
            }
            elsif (ref $value eq 'ARRAY') {
                # Decode into a copy so the list ExifTool returned is
                # left untouched
                my @decoded = map { Encode::decode($encoding, $_) } @$value;

                if ($self->{'join_before_split'}) {
                    # The join character is only inserted once some text
                    # has been collected
                    my $allvals = "";
                    foreach my $v (@decoded) {
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $v;
                    }
                    $metadata_count += $self->_appendEmbeddedValues($values, $allvals, $separator);
                }
                else {
                    foreach my $v (@decoded) {
                        $metadata_count += $self->_appendEmbeddedValues($values, $v, $separator);
                    }
                }
            }
            else {
                my $text = Encode::decode($encoding, $value);
                $metadata_count += $self->_appendEmbeddedValues($values, $text, $separator);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Protect windows directory chars \
    $file = &util::filename_to_regex($file);

    # Associate the metadata now

    if (defined $extrametadata->{$file}) {
        print STDERR "\n**** Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 2;

        my $file_metadata_table = $extrametadata->{$file};

        foreach my $metaname (keys %exif_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
        }

        # no need to push $file on to $extrametakeys as it is already in the list
    }
    else {
        $extrametadata->{$file} = \%exif_metadata;
        push(@$extrametakeys, $file);
    }

}

# Private helper for extractEmbeddedMetadata: append $text to the array
# referenced by $values after passing it through gsSafe.  When $separator
# is defined, $text is first split on it and only the pieces containing a
# non-whitespace character are kept.  Returns the number of values added.
sub _appendEmbeddedValues
{
    my $self = shift(@_);
    my ($values, $text, $separator) = @_;

    if (!defined $separator) {
        push(@$values, $self->gsSafe($text));
        return 1;
    }

    my $added = 0;
    foreach my $piece (split($separator, $text)) {
        next unless ($piece =~ /\S/);
        push(@$values, $self->gsSafe($piece));
        ++$added;
    }
    return $added;
}
348
349
# Metadata pass for one file: hand every regular file to
# extractEmbeddedMetadata so its embedded metadata ends up in
# $extrametadata / $extrametakeys.
#
# Always returns undef, whether or not anything was extracted.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    return undef unless (-f $full_path);

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "EmbeddedMetadataPlugin: processing $file\n";
    }

    $self->extractEmbeddedMetadata($leaf_name, $full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}
371
# This plugin does nothing in the read pass; it always returns undef.
sub read
{
    return undef;
}
376
# Not used by this plugin; always returns undef.
sub process
{
    return undef;
}
382
# Make a metadata value safe to store.
#
#   $text - the value to clean up
#
# Round, angle, square and curly brackets and commas are replaced by their
# numeric character entities.  When the 'trim_whitespace' option is "true",
# leading and trailing whitespace is removed as well.
# Returns the cleaned-up text.
sub gsSafe() {
    my ($self, $text) = @_;

    # Potentially problematic characters and the entity standing in for each
    my %entity_for = (
        '(' => '&#40;',
        ')' => '&#41;',
        ',' => '&#44;',
        '<' => '&#60;',
        '>' => '&#62;',
        '[' => '&#91;',
        ']' => '&#93;',
        '{' => '&#123;',
        '}' => '&#125;',
    );
    $text =~ s/([(),<>\[\]{}])/$entity_for{$1}/g;

    if ($self->{'trim_whitespace'} eq "true") {
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
    }

    return $text;
}
406
4071;
Note: See TracBrowser for help on using the repository browser.