source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 24951

Last change on this file since 24951 was 24951, checked in by ak19, 12 years ago

All perlcode that accesses extrametakeys, extrametadata, extrametafile data structures has been moved into a new perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (using consistent slashes, like URL style slashes).

File size: 10.9 KB
RevLine 
[18516]1###########################################################################
2#
[20862]3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
[18516]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
[20862]28package EmbeddedMetadataPlugin;
[18516]29
30use BasePlugin;
[24951]31use extrametautil;
[18516]32
[23810]33use Encode;
[18516]34use Image::ExifTool qw(:Public);
35use strict;
36
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39
40sub BEGIN
41{
[24290]42 @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
43 binmode(STDERR, ":utf8");
[18516]44}
45
[24290]46my $encoding_plus_auto_list = [{
47 'name' => "auto",
48 'desc' => "{ReadTextFile.input_encoding.auto}" }];
[23810]49push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
[18516]50
[24290]51my $arguments = [{
52 'name' => "metadata_field_separator",
[22451]53 'desc' => "{HTMLPlugin.metadata_field_separator}",
54 'type' => "string",
[24290]55 'deft' => ""
56 },{
57 'name' => "input_encoding",
[23810]58 'desc' => "{ReadTextFile.input_encoding}",
59 'type' => "enum",
60 'list' => $encoding_plus_auto_list,
61 'reqd' => "no",
[24290]62 'deft' => "auto"
63 },{
64 'name' => "join_before_split",
65 'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
66 'type' => "flag"
67 },{
68 'name' => "join_character",
69 'desc' => "{EmbeddedMetadataPlugin.join_character}",
70 'type' => "string",
71 'deft' => " "
72 },{
73 'name' => "trim_whitespace",
74 'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
75 'type' => "enum",
76 'list' => [{'name' => "true", 'desc' => "{common.true}"}, {'name' => "false", 'desc' => "{common.false}"}],
77 'deft' => "true"
[24487]78 },{
79 'name' => "set_filter_list",
80 'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
81 'type' => "string"
82 },{
83 'name' => "set_filter_regexp",
84 'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
85 'type' => "string",
86 'deft' => ".*" #If changing this default, also need to update the constructor
[24290]87 }];
[18516]88
[24290]89my $options = {
90 'name' => "EmbeddedMetadataPlugin",
91 'desc' => "{EmbeddedMetadataPlugin.desc}",
92 'abstract' => "no",
93 'inherits' => "yes",
94 'args' => $arguments };
[22451]95
[18516]96sub new()
97{
98 my ($class) = shift (@_);
99 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
100 push(@$pluginlist, $class);
101
102 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
103 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
104
105 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
106
107 # Create a new Image::ExifTool object
108 my $exifTool = new Image::ExifTool;
109 $exifTool->Options(Duplicates => 0);
[24290]110 $exifTool->Options(PrintConv => 0);
[18516]111 $exifTool->Options(Unknown => 1);
112 $exifTool->Options('Verbose');
113 $self->{'exiftool'} = $exifTool;
[24487]114
115 my $setFilterList = $self->{'set_filter_list'};
116 my $setFilterRegexp = $self->{'set_filter_regexp'};
117 if ((defined $setFilterList) && ($setFilterList ne ""))
118 {
119 if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
120 {
121 my $outhandle = $self->{'outhandle'};
122 print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
123 print $outhandle " defaulting to 'set_filter_list'\n";
124 }
[18516]125
[24487]126 my @sets = split(/,/,$setFilterList);
127 my @sets_bracketed;
128 foreach my $s (@sets)
129 {
130 $s =~ s/^(ex\.)?(.*)$/(ex.$2)/;
131 push (@sets_bracketed, $s);
132 }
133
134 my $setFilterRegexp = join("|",@sets_bracketed);
135 $self->{'set_filter_regexp'} = $setFilterRegexp;
136 }
137
[18516]138 return bless $self, $class;
139}
140
141
142# Need to think some more about this
[20927]143sub get_default_process_exp()
144{
[22552]145 return ".*";
[20927]146 #q^(?i)\.(wma|wmv|jpe?g|gif)$^;
147}
[18516]148
149
[24796]150# This plugin blocks *.oai files, so that they can be processed by OAIPlugin
151# even if OAIPlugin comes later in the pipeline than EmbeddedMetadataPlugin.
152sub get_default_block_exp()
153{
154 return q^(?i)\.(oai)$^;
155}
[18516]156
[24403]157# plugins that rely on more than process_exp (eg XML plugins) can override this method
158sub can_process_this_file {
159 my $self = shift(@_);
[18516]160
[24403]161 # we process metadata, not the file
162 return 0;
163}
164
165# Even if a plugin can extract metadata in its metadata_read pass,
166# make the default return 'undef' so processing of the file continues
167# down the pipeline, so other plugins can also have the opportunity to
168# locate metadata and set it up in the extrametakeys variables that
169# are passed around.
170
171sub can_process_this_file_for_metadata {
172 my $self = shift(@_);
173
174 # this plugin will look for metadata in any file through its
[24414]175 # metadata_read(). Returning undef here means anything else further
[24403]176 # down the pipeline can do the same
177
178 return undef;
179}
180
[24487]181sub checkAgainstFilters
182{
183 my $self = shift(@_);
184 my $name = shift(@_);
185
186 my $setFilterRegexp = $self->{'set_filter_regexp'};
187 if((defined $setFilterRegexp) && ($setFilterRegexp ne ""))
188 {
189 return ($name =~ m/($setFilterRegexp)/i);
190 }
191 else
192 {
193 return 1;
194 }
195}
[24403]196
[18516]197sub extractEmbeddedMetadata()
198{
199 my $self = shift(@_);
200 my ($file, $filename, $extrametadata, $extrametakeys) = @_;
[22074]201
[18516]202 my %exif_metadata = ();
203
204 my $verbosity = $self->{'verbosity'};
205 my $outhandle = $self->{'outhandle'};
206
207 my $metadata_count = 0;
208
[22451]209 my $separator = $self->{'metadata_field_separator'};
210 if ($separator eq "") {
[24290]211 undef $separator;
[22451]212 }
213
[18516]214 my @group_list = Image::ExifTool::GetAllGroups(0);
[24290]215 foreach my $group (@group_list) {
216## print STDERR "**** group = $group\n";
[18516]217
[24290]218 # Extract meta information from an image
219 $self->{'exiftool'}->Options(Group0 => [$group]);
220 $self->{'exiftool'}->ExtractInfo($filename);
[18516]221
[24290]222 # Get list of tags in the order they were found in the file
223 my @tag_list = $self->{'exiftool'}->GetFoundTags('File');
224 foreach my $tag (@tag_list) {
[18516]225
226 # Strip any numbering suffix
[24290]227 $tag =~ s/^([^\s]+)\s.*$/$1/i;
228 my $value = $self->{'exiftool'}->GetValue($tag);
229 if (defined $value && $value =~ /[a-z0-9]+/i) {
230 my $field = "ex.$group.$tag";
[23810]231
[24290]232 my $encoding = $self->{'input_encoding'};
233 if($encoding eq "auto")
234 {
235 $encoding = "utf8"
236 }
[18516]237
[24290]238 if (!defined $exif_metadata{$field})
239 {
240 $exif_metadata{$field} = [];
241 }
[18516]242
[24290]243 $field = Encode::decode($encoding,$field);
244 my $metadata_done = 0;
245 if (ref $value eq 'SCALAR') {
246 if ($$value =~ /^Binary data/) {
247 $value = "($$value)";
248 }
[22448]249 else {
[24290]250 my $len = length($$value);
251 $value = "(Binary data $len bytes)";
252 }
253 }
254 elsif (ref $value eq 'ARRAY') {
255 $metadata_done = 1;
256
257 my $allvals = "";
258 foreach my $v (@$value) {
259 $v = Encode::decode($encoding,$v);
260
261 if(!$self->{'join_before_split'}){
262 if (defined $separator) {
263 my @vs = split($separator, $v);
264 foreach my $val (@vs) {
265 if ($val =~ /\S/) {
[24487]266 push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
[24290]267 ++$metadata_count;
268 }
269 }
270 }
271 else
272 {
[24487]273 push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
[24290]274 ++$metadata_count;
275 }
276 }
277 else{
278 if($allvals ne ""){
279 $allvals = $allvals . $self->{'join_character'};
280 }
281 $allvals = $allvals . $v;
282 }
283 }
284
285 if($self->{'join_before_split'}){
286 if (defined $separator) {
287 my @vs = split($separator, $allvals);
288 foreach my $val (@vs) {
289 if ($val =~ /\S/) {
[24487]290 push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
[24290]291 ++$metadata_count;
292 }
293 }
294 }
295 else
296 {
[24487]297 push (@{$exif_metadata{$field}}, $self->gsSafe($allvals)) if $self->checkAgainstFilters($field);
[24290]298 ++$metadata_count;
299 }
300 }
301 }
302 else {
303 $value = Encode::decode($encoding,$value);
304 if (defined $separator) {
305 my @vs = split($separator, $value);
306 $metadata_done = 1;
307 foreach my $v (@vs) {
308 if ($v =~ /\S/) {
[24487]309 push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
[24290]310 ++$metadata_count;
311 }
312 }
313 }
314 }
315 if (!$metadata_done) {
[24487]316 push (@{$exif_metadata{$field}}, $self->gsSafe($value)) if $self->checkAgainstFilters($field);
[24290]317 ++$metadata_count;
318 }
[22451]319 }
320 }
[24290]321 }
[18516]322
323 if ($metadata_count > 0) {
[24290]324 print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
[18516]325 }
326
[19811]327 # Protect windows directory chars \
[24951]328 $file = &util::filename_to_regex($file); ####
[22074]329
[19811]330 # Associate the metadata now
331
[24951]332 if (defined &extrametautil::getmetadata($extrametadata, $file)) {
[24419]333 print STDERR "\n**** Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 2;
[19811]334
[24951]335 my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);
[24419]336
337 foreach my $metaname (keys %exif_metadata) {
338 # will create new entry if one does not already exist
339 push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
340 }
341
342 # no need to push $file on to $extrametakeys as it is already in the list
343 }
344 else {
[24951]345 &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
346 &extrametautil::addmetakey($extrametakeys, $file);
[24419]347 }
348
[18516]349}
350
351
[24404]352sub metadata_read
[18516]353{
[24349]354 my $self = shift (@_);
355 my ($pluginfo, $base_dir, $file, $block_hash,
356 $extrametakeys, $extrametadata, $extrametafile,
357 $processor, $gli, $aux) = @_;
358
359 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
360
361 # we don't want to process directories
[24414]362 if (!-f $filename_full_path) {
[24290]363 return undef;
[24349]364 }
365 print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli);
366 print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
367
368 $self->extractEmbeddedMetadata($filename_no_path,$filename_full_path,
369 $extrametadata,$extrametakeys);
370
371 return undef;
[18516]372}
373
[24403]374sub read
375{
376 return undef;
377}
[18516]378
[24404]379sub process
[18516]380{
381 # not used
382 return undef;
383}
384
[24290]385sub gsSafe() {
386 my $self = shift(@_);
387 my ($text) = @_;
388
389 # Replace potentially problematic characters
390 $text =~ s/\(/&#40;/g;
391 $text =~ s/\)/&#41;/g;
392 $text =~ s/,/&#44;/g;
393 $text =~ s/\</&#60;/g;
394 $text =~ s/\>/&#62;/g;
395 $text =~ s/\[/&#91;/g;
396 $text =~ s/\]/&#93;/g;
397 $text =~ s/\{/&#123;/g;
398 $text =~ s/\}/&#125;/g;
399 # Done
400
401 if ($self->{'trim_whitespace'} eq "true"){
402 $text =~ s/^\s+//;
403 $text =~ s/\s+$//;
404 }
405
406 return $text;
407}
[18516]408
4091;
Note: See TracBrowser for help on using the repository browser.