root/main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm @ 24951

Revision 24951, 10.9 KB (checked in by ak19, 8 years ago)

All perlcode that accesses extrametakeys, extrametadata, extrametafile data structures has been moved into a new perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (using consistent slashes, like URL style slashes).

Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31use extrametautil;
32
33use Encode;
34use Image::ExifTool qw(:Public);
35use strict;
36
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39
40sub BEGIN
41{
42    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
43    binmode(STDERR, ":utf8");
44}
45
46my $encoding_plus_auto_list = [{
47        'name' => "auto",
48        'desc' => "{ReadTextFile.input_encoding.auto}" }];
49push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
50
51my $arguments = [{
52    'name' => "metadata_field_separator",
53    'desc' => "{HTMLPlugin.metadata_field_separator}",
54    'type' => "string",
55    'deft' => ""
56    },{
57    'name' => "input_encoding",
58    'desc' => "{ReadTextFile.input_encoding}",
59    'type' => "enum",
60    'list' => $encoding_plus_auto_list,
61    'reqd' => "no",
62    'deft' => "auto"
63    },{
64    'name' => "join_before_split",
65    'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
66    'type' => "flag"
67    },{
68    'name' => "join_character",
69    'desc' => "{EmbeddedMetadataPlugin.join_character}",
70    'type' => "string",
71    'deft' => " "
72    },{
73    'name' => "trim_whitespace",
74    'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
75    'type' => "enum",
76    'list' => [{'name' => "true", 'desc' => "{common.true}"}, {'name' => "false", 'desc' => "{common.false}"}],
77    'deft' => "true"
78    },{
79    'name' => "set_filter_list",
80    'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
81    'type' => "string"
82    },{
83    'name' => "set_filter_regexp",
84    'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
85    'type' => "string",
86    'deft' => ".*" #If changing this default, also need to update the constructor
87    }];
88
89my $options = {
90    'name'     => "EmbeddedMetadataPlugin",
91    'desc'     => "{EmbeddedMetadataPlugin.desc}",
92    'abstract' => "no",
93    'inherits' => "yes",
94    'args'     => $arguments };
95
96sub new()
97{
98    my ($class) = shift (@_);
99    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
100    push(@$pluginlist, $class);
101
102    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
103    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
104
105    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
106
107    # Create a new Image::ExifTool object
108    my $exifTool = new Image::ExifTool;
109    $exifTool->Options(Duplicates => 0);
110    $exifTool->Options(PrintConv => 0);
111    $exifTool->Options(Unknown => 1);
112    $exifTool->Options('Verbose');
113    $self->{'exiftool'} = $exifTool;
114   
115    my $setFilterList = $self->{'set_filter_list'};
116    my $setFilterRegexp = $self->{'set_filter_regexp'};
117    if ((defined $setFilterList) && ($setFilterList ne ""))
118    {
119        if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
120        {
121            my $outhandle = $self->{'outhandle'};
122            print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
123            print $outhandle "         defaulting to 'set_filter_list'\n";
124        }
125
126        my @sets = split(/,/,$setFilterList);
127        my @sets_bracketed;
128        foreach my $s (@sets)
129        {
130            $s =~ s/^(ex\.)?(.*)$/(ex.$2)/;
131            push (@sets_bracketed, $s);
132        }
133
134        my $setFilterRegexp = join("|",@sets_bracketed);
135        $self->{'set_filter_regexp'} = $setFilterRegexp;
136    }
137
138    return bless $self, $class;
139}
140
141
142# Need to think some more about this
143sub get_default_process_exp()
144{
145    return ".*";
146    #q^(?i)\.(wma|wmv|jpe?g|gif)$^;
147}
148
149
150# This plugin blocks *.oai files, so that they can be processed by OAIPlugin
151# even if OAIPlugin comes later in the pipeline than EmbeddedMetadataPlugin.
152sub get_default_block_exp()
153{
154    return q^(?i)\.(oai)$^;
155}
156
157# plugins that rely on more than process_exp (eg XML plugins) can override this method
158sub can_process_this_file {
159    my $self = shift(@_);
160
161    # we process metadata, not the file
162    return 0;   
163}
164
165# Even if a plugin can extract metadata in its metadata_read pass,
166# make the default return 'undef' so processing of the file continues
167# down the pipeline, so other plugins can also have the opportunity to
168# locate metadata and set it up in the extrametakeys variables that
169# are passed around.
170
171sub can_process_this_file_for_metadata {
172    my $self = shift(@_);
173
174    # this plugin will look for metadata in any file through its
175    # metadata_read(). Returning undef here means anything else further
176    # down the pipeline can do the same
177
178    return undef;
179}
180
181sub checkAgainstFilters
182{
183    my $self = shift(@_);
184    my $name = shift(@_);
185   
186    my $setFilterRegexp = $self->{'set_filter_regexp'};
187    if((defined $setFilterRegexp) && ($setFilterRegexp ne ""))
188    {
189        return ($name =~ m/($setFilterRegexp)/i);
190    }
191    else
192    {
193        return 1;
194    }
195}
196
197sub extractEmbeddedMetadata()
198{
199    my $self = shift(@_);
200    my ($file, $filename, $extrametadata, $extrametakeys) = @_;
201 
202    my %exif_metadata = ();
203
204    my $verbosity = $self->{'verbosity'};
205    my $outhandle = $self->{'outhandle'};
206
207    my $metadata_count = 0;
208   
209    my $separator = $self->{'metadata_field_separator'};
210    if ($separator eq "") {
211        undef $separator;
212    }
213
214    my @group_list = Image::ExifTool::GetAllGroups(0);
215    foreach my $group (@group_list) {
216##  print STDERR "**** group = $group\n";
217
218        # Extract meta information from an image
219        $self->{'exiftool'}->Options(Group0 => [$group]);
220        $self->{'exiftool'}->ExtractInfo($filename);
221
222        # Get list of tags in the order they were found in the file
223        my @tag_list = $self->{'exiftool'}->GetFoundTags('File');
224        foreach my $tag (@tag_list) {
225
226            # Strip any numbering suffix
227            $tag =~ s/^([^\s]+)\s.*$/$1/i;
228            my $value = $self->{'exiftool'}->GetValue($tag);
229            if (defined $value && $value =~ /[a-z0-9]+/i) {
230                my $field = "ex.$group.$tag";
231       
232                my $encoding = $self->{'input_encoding'};
233                if($encoding eq "auto")
234                {
235                    $encoding = "utf8"
236                }
237
238                if (!defined $exif_metadata{$field})
239                {
240                    $exif_metadata{$field} = [];
241                }
242
243                $field = Encode::decode($encoding,$field);
244                my $metadata_done = 0;
245                if (ref $value eq 'SCALAR') {
246                    if ($$value =~ /^Binary data/) {
247                        $value = "($$value)";
248                    }
249                    else {
250                        my $len = length($$value);
251                        $value = "(Binary data $len bytes)";
252                    }
253                }
254                elsif (ref $value eq 'ARRAY') {
255                    $metadata_done = 1;
256                   
257                    my $allvals = "";
258                    foreach my $v (@$value) {
259                        $v = Encode::decode($encoding,$v);
260                       
261                        if(!$self->{'join_before_split'}){
262                            if (defined $separator) {
263                                my @vs = split($separator, $v);
264                                foreach my $val (@vs) {
265                                    if ($val =~ /\S/) {
266                                        push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
267                                        ++$metadata_count;
268                                    }
269                                }
270                            }
271                            else
272                            {
273                                push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
274                                ++$metadata_count;
275                            }
276                        }
277                        else{
278                            if($allvals ne ""){
279                                $allvals = $allvals . $self->{'join_character'};
280                            }
281                            $allvals = $allvals . $v;
282                        }
283                    }
284                   
285                    if($self->{'join_before_split'}){
286                        if (defined $separator) {
287                            my @vs = split($separator, $allvals);
288                            foreach my $val (@vs) {
289                                if ($val =~ /\S/) {
290                                    push (@{$exif_metadata{$field}}, $self->gsSafe($val)) if $self->checkAgainstFilters($field);
291                                    ++$metadata_count;
292                                }
293                            }
294                        }
295                        else
296                        {
297                            push (@{$exif_metadata{$field}}, $self->gsSafe($allvals)) if $self->checkAgainstFilters($field);
298                            ++$metadata_count;
299                        }
300                    }
301                }
302                else {
303                    $value = Encode::decode($encoding,$value);
304                    if (defined $separator) {
305                        my @vs = split($separator, $value);
306                        $metadata_done = 1;
307                        foreach my $v (@vs) {
308                            if ($v =~ /\S/) {
309                                push (@{$exif_metadata{$field}}, $self->gsSafe($v)) if $self->checkAgainstFilters($field);
310                                ++$metadata_count;
311                            }
312                        }
313                    }
314                }
315                if (!$metadata_done) {
316                    push (@{$exif_metadata{$field}}, $self->gsSafe($value)) if $self->checkAgainstFilters($field);
317                    ++$metadata_count;
318                }
319            }
320        }
321    }
322
323    if ($metadata_count > 0) {
324        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
325    }
326
327    # Protect windows directory chars \
328    $file = &util::filename_to_regex($file); ####
329   
330    # Associate the metadata now
331
332    if (defined &extrametautil::getmetadata($extrametadata, $file)) {
333    print STDERR "\n****  Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 2;
334
335    my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);
336
337    foreach my $metaname (keys %exif_metadata) {
338        # will create new entry if one does not already exist
339        push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});       
340    }
341
342    # no need to push $file on to $extrametakeys as it is already in the list
343    }
344    else {
345    &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
346    &extrametautil::addmetakey($extrametakeys, $file);
347    }
348
349}
350
351
352sub metadata_read
353{
354    my $self = shift (@_);
355    my ($pluginfo, $base_dir, $file, $block_hash,
356    $extrametakeys, $extrametadata, $extrametafile,
357    $processor, $gli, $aux) = @_;
358 
359    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
360   
361    # we don't want to process directories
362    if (!-f $filename_full_path) {
363    return undef;
364    }
365    print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli);
366    print STDERR "EmbeddedMetadataPlugin: processing $file\n" if ($self->{'verbosity'}) > 1;
367   
368    $self->extractEmbeddedMetadata($filename_no_path,$filename_full_path,
369                   $extrametadata,$extrametakeys);
370   
371    return undef;
372}
373
374sub read
375{
376    return undef;
377}
378
379sub process
380{
381    # not used
382    return undef;
383}
384
385sub gsSafe() {
386    my $self = shift(@_);
387    my ($text) = @_;
388   
389    # Replace potentially problematic characters
390    $text =~ s/\(/&#40;/g;
391    $text =~ s/\)/&#41;/g;
392    $text =~ s/,/&#44;/g;
393    $text =~ s/\</&#60;/g;
394    $text =~ s/\>/&#62;/g;
395    $text =~ s/\[/&#91;/g;
396    $text =~ s/\]/&#93;/g;
397    $text =~ s/\{/&#123;/g;
398    $text =~ s/\}/&#125;/g;
399    # Done
400   
401    if ($self->{'trim_whitespace'} eq "true"){
402        $text =~ s/^\s+//;
403        $text =~ s/\s+$//;
404    }
405   
406    return $text;
407}
408
4091;
Note: See TracBrowser for help on using the browser.