source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 25332

Last change on this file since 25332 was 25332, checked in by kjdon, 12 years ago

if we block OAI files, then they won't get processed by OAIPlugin, so removing the block exp

File size: 11.1 KB
RevLine 
[18516]1###########################################################################
2#
[20862]3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
[18516]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
[20862]28package EmbeddedMetadataPlugin;
[18516]29
30use BasePlugin;
[24951]31use extrametautil;
[24971]32use util;
[18516]33
[23810]34use Encode;
[18516]35use Image::ExifTool qw(:Public);
36use strict;
37
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40
41sub BEGIN
42{
[24290]43 @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
44 binmode(STDERR, ":utf8");
[18516]45}
46
[24290]47my $encoding_plus_auto_list = [{
48 'name' => "auto",
49 'desc' => "{ReadTextFile.input_encoding.auto}" }];
[23810]50push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
[18516]51
[24290]52my $arguments = [{
53 'name' => "metadata_field_separator",
[22451]54 'desc' => "{HTMLPlugin.metadata_field_separator}",
55 'type' => "string",
[24290]56 'deft' => ""
57 },{
58 'name' => "input_encoding",
[23810]59 'desc' => "{ReadTextFile.input_encoding}",
60 'type' => "enum",
61 'list' => $encoding_plus_auto_list,
62 'reqd' => "no",
[24290]63 'deft' => "auto"
64 },{
65 'name' => "join_before_split",
66 'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
67 'type' => "flag"
68 },{
69 'name' => "join_character",
70 'desc' => "{EmbeddedMetadataPlugin.join_character}",
71 'type' => "string",
72 'deft' => " "
73 },{
74 'name' => "trim_whitespace",
75 'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
76 'type' => "enum",
77 'list' => [{'name' => "true", 'desc' => "{common.true}"}, {'name' => "false", 'desc' => "{common.false}"}],
78 'deft' => "true"
[24487]79 },{
80 'name' => "set_filter_list",
81 'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
82 'type' => "string"
83 },{
84 'name' => "set_filter_regexp",
85 'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
86 'type' => "string",
87 'deft' => ".*" #If changing this default, also need to update the constructor
[24290]88 }];
[18516]89
[24290]90my $options = {
91 'name' => "EmbeddedMetadataPlugin",
92 'desc' => "{EmbeddedMetadataPlugin.desc}",
93 'abstract' => "no",
94 'inherits' => "yes",
95 'args' => $arguments };
[22451]96
[18516]97sub new()
98{
99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
102
103 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
104 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
105
106 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
107
108 # Create a new Image::ExifTool object
109 my $exifTool = new Image::ExifTool;
110 $exifTool->Options(Duplicates => 0);
[24290]111 $exifTool->Options(PrintConv => 0);
[18516]112 $exifTool->Options(Unknown => 1);
113 $exifTool->Options('Verbose');
114 $self->{'exiftool'} = $exifTool;
[24487]115
116 my $setFilterList = $self->{'set_filter_list'};
117 my $setFilterRegexp = $self->{'set_filter_regexp'};
118 if ((defined $setFilterList) && ($setFilterList ne ""))
119 {
120 if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
121 {
122 my $outhandle = $self->{'outhandle'};
123 print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
124 print $outhandle " defaulting to 'set_filter_list'\n";
125 }
[18516]126
[24487]127 my @sets = split(/,/,$setFilterList);
128 my @sets_bracketed;
129 foreach my $s (@sets)
130 {
131 $s =~ s/^(ex\.)?(.*)$/(ex.$2)/;
132 push (@sets_bracketed, $s);
133 }
134
135 my $setFilterRegexp = join("|",@sets_bracketed);
136 $self->{'set_filter_regexp'} = $setFilterRegexp;
137 }
138
[18516]139 return bless $self, $class;
140}
141
142
143# Need to think some more about this
[20927]144sub get_default_process_exp()
145{
[22552]146 return ".*";
[20927]147 #q^(?i)\.(wma|wmv|jpe?g|gif)$^;
148}
[18516]149
[24403]150# plugins that rely on more than process_exp (eg XML plugins) can override this method
151sub can_process_this_file {
152 my $self = shift(@_);
[18516]153
[24403]154 # we process metadata, not the file
155 return 0;
156}
157
158# Even if a plugin can extract metadata in its metadata_read pass,
159# make the default return 'undef' so processing of the file continues
160# down the pipeline, so other plugins can also have the opportunity to
161# locate metadata and set it up in the extrametakeys variables that
162# are passed around.
163
164sub can_process_this_file_for_metadata {
165 my $self = shift(@_);
166
167 # this plugin will look for metadata in any file through its
[24414]168 # metadata_read(). Returning undef here means anything else further
[24403]169 # down the pipeline can do the same
170
171 return undef;
172}
173
[24487]174sub checkAgainstFilters
175{
176 my $self = shift(@_);
177 my $name = shift(@_);
178
179 my $setFilterRegexp = $self->{'set_filter_regexp'};
180 if((defined $setFilterRegexp) && ($setFilterRegexp ne ""))
181 {
182 return ($name =~ m/($setFilterRegexp)/i);
183 }
184 else
185 {
186 return 1;
187 }
188}
[24403]189
[18516]190sub extractEmbeddedMetadata()
191{
192 my $self = shift(@_);
193 my ($file, $filename, $extrametadata, $extrametakeys) = @_;
[22074]194
[18516]195 my %exif_metadata = ();
196
197 my $verbosity = $self->{'verbosity'};
198 my $outhandle = $self->{'outhandle'};
199
200 my $metadata_count = 0;
201
[22451]202 my $separator = $self->{'metadata_field_separator'};
203 if ($separator eq "") {
[24290]204 undef $separator;
[22451]205 }
206
[18516]207 my @group_list = Image::ExifTool::GetAllGroups(0);
[24290]208 foreach my $group (@group_list) {
209## print STDERR "**** group = $group\n";
[18516]210
[24290]211 # Extract meta information from an image
212 $self->{'exiftool'}->Options(Group0 => [$group]);
213 $self->{'exiftool'}->ExtractInfo($filename);
[18516]214
[24290]215 # Get list of tags in the order they were found in the file
216 my @tag_list = $self->{'exiftool'}->GetFoundTags('File');
217 foreach my $tag (@tag_list) {
[18516]218
219 # Strip any numbering suffix
[24290]220 $tag =~ s/^([^\s]+)\s.*$/$1/i;
221 my $value = $self->{'exiftool'}->GetValue($tag);
222 if (defined $value && $value =~ /[a-z0-9]+/i) {
223 my $field = "ex.$group.$tag";
[25332]224
[24290]225 my $encoding = $self->{'input_encoding'};
226 if($encoding eq "auto")
227 {
228 $encoding = "utf8"
229 }
[18516]230
[24290]231 if (!defined $exif_metadata{$field})
232 {
233 $exif_metadata{$field} = [];
234 }
[18516]235
[24290]236 $field = Encode::decode($encoding,$field);
237 my $metadata_done = 0;
238 if (ref $value eq 'SCALAR') {
239 if ($$value =~ /^Binary data/) {
240 $value = "($$value)";
241 }
[22448]242 else {
[24290]243 my $len = length($$value);
244 $value = "(Binary data $len bytes)";
245 }
246 }
247 elsif (ref $value eq 'ARRAY') {
248 $metadata_done = 1;
249
250 my $allvals = "";
251 foreach my $v (@$value) {
252 $v = Encode::decode($encoding,$v);
253
254 if(!$self->{'join_before_split'}){
255 if (defined $separator) {
256 my @vs = split($separator, $v);
257 foreach my $val (@vs) {
258 if ($val =~ /\S/) {
[24487]259 push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
[24290]260 ++$metadata_count;
261 }
262 }
263 }
264 else
265 {
[24487]266 push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
[24290]267 ++$metadata_count;
268 }
269 }
270 else{
271 if($allvals ne ""){
272 $allvals = $allvals . $self->{'join_character'};
273 }
274 $allvals = $allvals . $v;
275 }
276 }
277
278 if($self->{'join_before_split'}){
279 if (defined $separator) {
280 my @vs = split($separator, $allvals);
281 foreach my $val (@vs) {
282 if ($val =~ /\S/) {
[24487]283 push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
[24290]284 ++$metadata_count;
285 }
286 }
287 }
288 else
289 {
[24487]290 push (@{$exif_metadata{$field}}, $self->gsSafe($allvals)) if $self->checkAgainstFilters($field);
[24290]291 ++$metadata_count;
292 }
293 }
294 }
295 else {
296 $value = Encode::decode($encoding,$value);
297 if (defined $separator) {
298 my @vs = split($separator, $value);
299 $metadata_done = 1;
300 foreach my $v (@vs) {
301 if ($v =~ /\S/) {
[24487]302 push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
[24290]303 ++$metadata_count;
304 }
305 }
306 }
307 }
308 if (!$metadata_done) {
[24487]309 push (@{$exif_metadata{$field}}, $self->gsSafe($value)) if $self->checkAgainstFilters($field);
[24290]310 ++$metadata_count;
311 }
[22451]312 }
313 }
[24290]314 }
[18516]315
316 if ($metadata_count > 0) {
[24290]317 print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
[18516]318 }
319
[24971]320 # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
321 # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
322 # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
323 $file = &util::filepath_to_url_format($file);
324 $file = &util::filename_to_regex($file);
[22074]325
[19811]326 # Associate the metadata now
327
[24951]328 if (defined &extrametautil::getmetadata($extrametadata, $file)) {
[25332]329 print STDERR "\n**** EmbeddedMetadataPlugin: Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 3;
[19811]330
[24951]331 my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);
[24419]332
333 foreach my $metaname (keys %exif_metadata) {
334 # will create new entry if one does not already exist
335 push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
336 }
337
338 # no need to push $file on to $extrametakeys as it is already in the list
339 }
340 else {
[24951]341 &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
342 &extrametautil::addmetakey($extrametakeys, $file);
[24419]343 }
344
[18516]345}
346
347
[24404]348sub metadata_read
[18516]349{
[24349]350 my $self = shift (@_);
351 my ($pluginfo, $base_dir, $file, $block_hash,
352 $extrametakeys, $extrametadata, $extrametafile,
353 $processor, $gli, $aux) = @_;
354
355 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
356
357 # we don't want to process directories
[24414]358 if (!-f $filename_full_path) {
[24290]359 return undef;
[24349]360 }
361 print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli);
362 print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
363
364 $self->extractEmbeddedMetadata($filename_no_path,$filename_full_path,
365 $extrametadata,$extrametakeys);
366
367 return undef;
[18516]368}
369
[24403]370sub read
371{
372 return undef;
373}
[18516]374
[24404]375sub process
[18516]376{
377 # not used
378 return undef;
379}
380
[24290]381sub gsSafe() {
382 my $self = shift(@_);
383 my ($text) = @_;
384
385 # Replace potentially problematic characters
386 $text =~ s/\(/&#40;/g;
387 $text =~ s/\)/&#41;/g;
388 $text =~ s/,/&#44;/g;
389 $text =~ s/\</&#60;/g;
390 $text =~ s/\>/&#62;/g;
391 $text =~ s/\[/&#91;/g;
392 $text =~ s/\]/&#93;/g;
393 $text =~ s/\{/&#123;/g;
394 $text =~ s/\}/&#125;/g;
395 # Done
396
397 if ($self->{'trim_whitespace'} eq "true"){
398 $text =~ s/^\s+//;
399 $text =~ s/\s+$//;
400 }
401
402 return $text;
403}
[18516]404
4051;
Note: See TracBrowser for help on using the repository browser.