########################################################################### # # EmbeddedMetadataPlugin.pm -- A plugin for EXIF # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright 2007 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

# EmbeddedMetadataPlugin reads metadata embedded in files (via
# Image::ExifTool) during the metadata_read pass and records it as
# "ex.<group>.<tag>" fields in the extrametadata structure supplied by the
# caller.  It does not process documents itself.

package EmbeddedMetadataPlugin;

use BasePlugin;
use Encode;
use Image::ExifTool qw(:Public);
use strict;

no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
    binmode(STDERR, ":utf8");
}

# The encodings offered by BasePlugin, preceded by an "auto" choice.
my $encoding_plus_auto_list =
    [ { 'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}" } ];
push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});

# Plugin-specific arguments.
my $arguments =
    [ { 'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto" },
      { 'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag" },
      { 'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " " },
      { 'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => [{'name' => "true", 'desc' => "{common.true}"},
                   {'name' => "false", 'desc' => "{common.false}"}],
        'deft' => "true" } ];

my $options = { 'name'     => "EmbeddedMetadataPlugin",
                'desc'     => "{EmbeddedMetadataPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BasePlugin->new
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
# Returns the object built by BasePlugin, re-blessed into $class, with a
# configured Image::ExifTool instance stored under 'exiftool'.
sub new()
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit Class->new rather than indirect-object "new Class", which is
    # ambiguous to the parser when a sub called new() exists in this package.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create a new Image::ExifTool object
    my $exifTool = Image::ExifTool->new;
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # NOTE(review): with a single argument Options() only reads the current
    # setting; the result is discarded, so this does not enable verbose
    # output.  Kept as in the original - confirm whether a value was intended.
    $exifTool->Options('Verbose');
    $self->{'exiftool'} = $exifTool;

    return bless $self, $class;
}

# Need to think some more about this
# Returns the default filename pattern handled by this plugin: everything.
sub get_default_process_exp()
{
    return ".*";
    #q^(?i)\.(wma|wmv|jpe?g|gif)$^;
}

# This plugin doesn't block any files
#sub get_default_block_exp()
#{
#    return '';
#}

# Private helper: append metadata values to the array referenced by $values.
#   $values    - array ref to append to
#   $text      - the (already decoded) text to store
#   $separator - split pattern, or undef to store $text as one value
# When a separator is given, $text is split on it and every piece containing
# a non-whitespace character is stored; otherwise $text is stored whole.
# Every stored value is passed through gsSafe.  Returns the number of values
# appended.
sub _store_values
{
    my $self = shift(@_);
    my ($values, $text, $separator) = @_;

    if (!defined $separator) {
        push(@$values, $self->gsSafe($text));
        return 1;
    }

    my $added = 0;
    foreach my $piece (split($separator, $text)) {
        next unless ($piece =~ /\S/);
        push(@$values, $self->gsSafe($piece));
        ++$added;
    }
    return $added;
}

# Extract the embedded metadata of one file and register it with the caller.
#   $file          - file name without path; used (after regex-escaping) as
#                    the key under which the metadata is stored
#   $filename      - full path handed to ExifTool
#   $extrametadata - hash ref; receives { escaped $file => { field => [values] } }
#   $extrametakeys - array ref; the escaped $file key is appended to it
# Field names have the form "ex.<group>.<tag>".
sub extractEmbeddedMetadata()
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();
    my $outhandle = $self->{'outhandle'};
    my $metadata_count = 0;

    # An empty separator means "do not split values".
    my $separator = $self->{'metadata_field_separator'};
    if ($separator eq "") {
        undef $separator;
    }

    # The encoding does not change per tag, so resolve it once.
    my $encoding = $self->{'input_encoding'};
    if ($encoding eq "auto") {
        $encoding = "utf8";
    }

    my $exifTool = $self->{'exiftool'};
    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {
        ## print STDERR "**** group = $group\n";

        # Extract meta information from the file, restricted to this group
        $exifTool->Options(Group0 => [$group]);
        $exifTool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exifTool->GetFoundTags('File');
        foreach my $tag (@tag_list) {
            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;

            my $value = $exifTool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            # Decode the field name before using it as a key, so the list is
            # created under the same key the values are pushed to.
            my $field = Encode::decode($encoding, "ex.$group.$tag");
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }
            my $values = $exif_metadata{$field};

            if (ref $value eq 'SCALAR') {
                # Scalar refs are not stored verbatim; a short description
                # is stored instead and is never split.
                my $description;
                if ($$value =~ /^Binary data/) {
                    $description = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $description = "(Binary data $len bytes)";
                }
                $metadata_count += $self->_store_values($values, $description, undef);
            }
            elsif (ref $value eq 'ARRAY') {
                # Decode into copies: writing back through the loop alias
                # would alter the list owned by ExifTool, and decoding an
                # already decoded string can croak on wide characters.
                my @decoded = ();
                foreach my $raw (@$value) {
                    my $copy = $raw;
                    push(@decoded, Encode::decode($encoding, $copy));
                }

                if ($self->{'join_before_split'}) {
                    # Join first, then split the joined text.  The join
                    # character is only inserted once some text has been
                    # accumulated, so leading empty items add nothing.
                    my $allvals = "";
                    foreach my $item (@decoded) {
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $item;
                    }
                    $metadata_count += $self->_store_values($values, $allvals, $separator);
                }
                else {
                    foreach my $item (@decoded) {
                        $metadata_count += $self->_store_values($values, $item, $separator);
                    }
                }
            }
            else {
                my $decoded = Encode::decode($encoding, $value);
                $metadata_count += $self->_store_values($values, $decoded, $separator);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Protect windows directory chars (backslashes) in the key
    $file = &util::filename_to_regex($file);

    # Associate the metadata now
    $extrametadata->{$file} = \%exif_metadata;
    push(@$extrametakeys, $file);
}

# Metadata pass entry point: extracts embedded metadata for $file when it is
# a plain file that can_process_this_file() accepts.  Always returns undef.
sub metadata_read()
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys,
        $extrametadata, $extrametafile, $processor, $gli, $aux) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    if (!-f $filename_full_path || !$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    print STDERR "\n\n" if ($gli);
    print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    $self->extractEmbeddedMetadata($filename_no_path, $filename_full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}

# not used
sub process()
{
    return undef;
}

# Make a metadata value safe to store.
#   $text - the value to clean
# Replaces ( ) , < > [ ] { } with HTML numeric character references
# (e.g. "(" becomes "&#40;") and, when the trim_whitespace option is "true",
# strips leading and trailing whitespace.  Returns the cleaned text.
sub gsSafe()
{
    my $self = shift(@_);
    my ($text) = @_;

    # Replace potentially problematic characters.  One pass is enough
    # because the replacement text contains none of the matched characters.
    $text =~ s/([(),<>\[\]{}])/'&#' . ord($1) . ';'/ge;
    # Done

    if ($self->{'trim_whitespace'} eq "true") {
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
    }

    return $text;
}

1;