###########################################################################
#
# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright 2007 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package EmbeddedMetadataPlugin;

use BasePlugin;
use extrametautil;
use util;

use Encode;
use Image::ExifTool qw(:Public);
use strict;

no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
    binmode(STDERR, ":utf8");
}

# The input_encoding choices: "auto" followed by everything BasePlugin offers.
my $encoding_plus_auto_list =
    [ { 'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}" } ];
push(@{$encoding_plus_auto_list}, @{$BasePlugin::encoding_list});

my $arguments =
    [ { 'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto" },
      { 'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag" },
      { 'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " " },
      { 'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => [ { 'name' => "true",  'desc' => "{common.true}" },
                    { 'name' => "false", 'desc' => "{common.false}" } ],
        'deft' => "true" },
      { 'name' => "set_filter_list",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
        'type' => "string" },
      { 'name' => "set_filter_regexp",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
        'type' => "string",
        'deft' => ".*" # If changing this default, also need to update the constructor
      } ];

my $options = { 'name'     => "EmbeddedMetadataPlugin",
                'desc'     => "{EmbeddedMetadataPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#
# Registers this plugin's arguments/options with the lists handed in by the
# caller, builds the BasePlugin part of the object, creates the
# Image::ExifTool instance stored in $self->{'exiftool'}, and, when
# 'set_filter_list' is given, turns that comma-separated list into the
# alternation stored in $self->{'set_filter_regexp'}.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Direct method-call syntax: the indirect-object form ("new BasePlugin")
    # is ambiguous inside a package that itself defines new().
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create a new Image::ExifTool object
    my $exifTool = Image::ExifTool->new;
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # NOTE(review): no value is passed here, so this looks like a read of the
    # 'Verbose' option rather than a change to it -- confirm the intent.
    $exifTool->Options('Verbose');
    $self->{'exiftool'} = $exifTool;

    my $setFilterList = $self->{'set_filter_list'};
    my $setFilterRegexp = $self->{'set_filter_regexp'};
    if ((defined $setFilterList) && ($setFilterList ne "")) {
        # The list wins when both a list and a non-default regexp are given.
        if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne "")) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
            print $outhandle " defaulting to 'set_filter_list'\n";
        }

        # Each set name becomes "(ex.<name>)"; a leading "ex." is optional
        # in the user's input.
        my @sets = split(/,/, $setFilterList);
        my @sets_bracketed;
        foreach my $s (@sets) {
            $s =~ s/^(ex\.)?(.*)$/(ex.$2)/;
            push (@sets_bracketed, $s);
        }

        $self->{'set_filter_regexp'} = join("|", @sets_bracketed);
    }

    return bless $self, $class;
}

# Need to think some more about this
sub get_default_process_exp {
    return ".*";
    #q^(?i)\.(wma|wmv|jpe?g|gif)$^;
}

# plugins that rely on more than process_exp (eg XML plugins) can override this method
sub can_process_this_file {
    my $self = shift(@_);

    # we process metadata, not the file
    return 0;
}

# Even if a plugin can extract metadata in its metadata_read pass,
# make the default return 'undef' so processing of the file continues
# down the pipeline, so other plugins can also have the opportunity to
# locate metadata and set it up in the extrametakeys variables that
# are passed around.
sub can_process_this_file_for_metadata {
    my $self = shift(@_);

    # this plugin will look for metadata in any file through its
    # metadata_read(). Returning undef here means anything else further
    # down the pipeline can do the same
    return undef;
}

# Returns true when the metadata name $name is accepted by the
# case-insensitive 'set_filter_regexp'; an undefined or empty regexp
# accepts every name.
sub checkAgainstFilters {
    my $self = shift(@_);
    my $name = shift(@_);

    my $setFilterRegexp = $self->{'set_filter_regexp'};
    if ((defined $setFilterRegexp) && ($setFilterRegexp ne "")) {
        return ($name =~ m/($setFilterRegexp)/i);
    }
    else {
        return 1;
    }
}

# Private helper (plain function, not a method): turns one decoded value
# into the list of pieces to store. Without a separator the value is kept
# whole; with one it is split and pieces containing no non-whitespace
# character are dropped.
sub _split_metadata_value {
    my ($value, $separator) = @_;

    return ($value) unless defined $separator;
    return grep { /\S/ } split($separator, $value);
}

# Reads the embedded metadata of $filename with ExifTool, one group at a
# time, and records it in the extrametadata structures under the key
# derived from $file.
#
#   $file           - file name without path (used to build the lookup key)
#   $filename       - full path handed to ExifTool
#   $extrametadata  - metadata store, accessed through extrametautil
#   $extrametakeys  - key list, accessed through extrametautil
#
# Metadata names have the form "ex.<group>.<tag>". Names rejected by
# checkAgainstFilters() are skipped completely, so they are neither
# counted in the progress message nor stored as empty entries.
sub extractEmbeddedMetadata {
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    my $metadata_count = 0;

    # An empty separator means "do not split values".
    my $separator = $self->{'metadata_field_separator'};
    if (!defined $separator || $separator eq "") {
        undef $separator;
    }

    # "auto" is treated as utf8; resolved once instead of once per tag.
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    my $exiftool = $self->{'exiftool'};

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {
        ## print STDERR "**** group = $group\n";

        # Extract meta information from an image, restricted to this group
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $found_tag (@tag_list) {
            # Strip any numbering suffix
            my $tag = $found_tag;
            $tag =~ s/^([^\s]+)\s.*$/$1/i;

            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            my $field = Encode::decode($encoding, "ex.$group.$tag");

            # Filter once per field, before any decoding or counting.
            next unless $self->checkAgainstFilters($field);

            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }

            my @pieces;
            if (ref $value eq 'SCALAR') {
                # A scalar reference is treated as binary data: store a
                # short description, never the data, and never split it.
                if ($$value =~ /^Binary data/) {
                    @pieces = ("($$value)");
                }
                else {
                    my $len = length($$value);
                    @pieces = ("(Binary data $len bytes)");
                }
            }
            elsif (ref $value eq 'ARRAY') {
                # Decode into copies so the list owned by ExifTool is not
                # modified through the loop alias.
                my @decoded;
                foreach my $v (@$value) {
                    my $octets = $v;
                    push(@decoded, Encode::decode($encoding, $octets));
                }

                if ($self->{'join_before_split'}) {
                    # Join first (no join character while the accumulated
                    # text is still empty), then split the joined text.
                    my $allvals = "";
                    foreach my $v (@decoded) {
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $v;
                    }
                    @pieces = _split_metadata_value($allvals, $separator);
                }
                else {
                    @pieces = map { _split_metadata_value($_, $separator) } @decoded;
                }
            }
            else {
                @pieces = _split_metadata_value(Encode::decode($encoding, $value), $separator);
            }

            foreach my $piece (@pieces) {
                push (@{$exif_metadata{$field}}, $self->gsSafe($piece));
                ++$metadata_count;
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
    # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
    # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
    $file = &util::filepath_to_url_format($file);
    $file = &util::filename_to_regex($file);

    # Associate the metadata now
    my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);
    if (defined $file_metadata_table) {
        print STDERR "\n**** EmbeddedMetadataPlugin: Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 3;

        foreach my $metaname (keys %exif_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
        }

        # no need to push $file on to $extrametakeys as it is already in the list
    }
    else {
        &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
        &extrametautil::addmetakey($extrametakeys, $file);
    }
}

# Metadata pass: extracts embedded metadata from regular files and always
# returns undef.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    if (!-f $filename_full_path) {
        return undef;
    }

    print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli);
    print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;

    $self->extractEmbeddedMetadata($filename_no_path, $filename_full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}

# This plugin never handles the document itself.
sub read {
    return undef;
}

sub process {
    # not used
    return undef;
}

# Returns $text with the characters ( ) , < > [ ] { } replaced by decimal
# character references, and with leading/trailing whitespace removed when
# the 'trim_whitespace' option is "true".
sub gsSafe {
    my $self = shift(@_);
    my ($text) = @_;

    # Replace potentially problematic characters
    $text =~ s/\(/&#40;/g;
    $text =~ s/\)/&#41;/g;
    $text =~ s/,/&#44;/g;
    $text =~ s/</&#60;/g;
    $text =~ s/>/&#62;/g;
    $text =~ s/\[/&#91;/g;
    $text =~ s/\]/&#93;/g;
    $text =~ s/\{/&#123;/g;
    $text =~ s/\}/&#125;/g;
    # Done

    if ($self->{'trim_whitespace'} eq "true") {
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
    }

    return $text;
}

1;