root/main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm @ 24414

Revision 24414, 8.7 KB (checked in by ak19, 8 years ago)

To do with EmbeddedMetadataPlugin: 1. mkcol.pl and GLI changes put the plugin in the bottom four plugins of the plugin pipeline. 2. EmbeddedMetadataPlugin and PDFPlugin are modified to work together again after the recent changes (introduction of the overridable BasePlugin method can_process_file_for_metadata), which were needed to get the EmbeddedMetadataPlugin and OAIPlugin to work together.

Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31
32use Encode;
33use Image::ExifTool qw(:Public);
34use strict;
35
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Compile-time setup: switch STDERR to the :utf8 layer and declare
# BasePlugin as this package's parent class.
BEGIN
{
    binmode(STDERR, ":utf8");
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
}
44
# Choices offered for the input_encoding argument: an "auto" entry
# followed by every entry of BasePlugin's encoding list.
my $encoding_plus_auto_list = [
    { 'name' => "auto",
      'desc' => "{ReadTextFile.input_encoding.auto}" },
    @{$BasePlugin::encoding_list},
];
49
# Arguments this plugin contributes to the plugin argument list.
# Each entry is a hash with a name, a description key, a type and,
# where applicable, a default ('deft') and a list of allowed values.
my $arguments = [
    # Separator used to split a single metadata value into several;
    # the empty default means "do not split".
    { 'name' => "metadata_field_separator",
      'desc' => "{HTMLPlugin.metadata_field_separator}",
      'type' => "string",
      'deft' => "" },

    # Encoding used when decoding extracted values; "auto" is treated
    # as utf8 by extractEmbeddedMetadata.
    { 'name' => "input_encoding",
      'desc' => "{ReadTextFile.input_encoding}",
      'type' => "enum",
      'list' => $encoding_plus_auto_list,
      'reqd' => "no",
      'deft' => "auto" },

    # For list-valued tags: join all values first, then split the result.
    { 'name' => "join_before_split",
      'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
      'type' => "flag" },

    # Character placed between values when join_before_split is set.
    { 'name' => "join_character",
      'desc' => "{EmbeddedMetadataPlugin.join_character}",
      'type' => "string",
      'deft' => " " },

    # Whether gsSafe strips leading and trailing whitespace from values.
    { 'name' => "trim_whitespace",
      'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
      'type' => "enum",
      'list' => [
          { 'name' => "true",  'desc' => "{common.true}" },
          { 'name' => "false", 'desc' => "{common.false}" },
      ],
      'deft' => "true" },
];
78
# Option record describing this plugin; new() appends it to the
# "OptList" it hands on to BasePlugin.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
85
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BasePlugin->new
#   $hashArgOptLists - hash ref; this plugin's argument list is appended
#                      to its "ArgList" entry and its option record to
#                      its "OptList" entry
#
# Returns the object built by BasePlugin->new, re-blessed into $class,
# with a configured Image::ExifTool instance stored under 'exiftool'.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if defined $arguments;
    push(@{$hashArgOptLists->{"OptList"}}, $options)      if defined $options;

    # Direct method-call syntax instead of "new BasePlugin(...)", whose
    # parsing depends on what names are known at compile time.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # One ExifTool object is created here and reused for every file.
    my $exifTool = Image::ExifTool->new();
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # Note: Options() given only an option name just returns the current
    # setting, so no single-argument 'Verbose' call is made here.
    $self->{'exiftool'} = $exifTool;

    return bless $self, $class;
}
107
108
# Default process expression: a pattern that matches every filename.
# (Still under consideration; a narrower candidate was
#  q^(?i)\.(wma|wmv|jpe?g|gif)$^ .)
sub get_default_process_exp()
{
    my $match_everything = ".*";
    return $match_everything;
}
115
116
117# This plugin doesn't block any files
118#sub get_default_block_exp()
119#{
120#    return '';
121#}
122
# This plugin only contributes metadata; it never claims a file for
# document processing, so the answer is always 0.
sub can_process_this_file {
    my ($self) = @_;

    my $claims_file = 0;
    return $claims_file;
}
130
# Always answers undef.
#
# This plugin looks for metadata in any file via metadata_read().
# Answering undef lets the file continue down the pipeline, so that
# other plugins also get the chance to locate metadata and add it to
# the extrametakeys variables that are passed around.
sub can_process_this_file_for_metadata {
    my ($self) = @_;

    return undef;
}
146
147
# Private helper: store one (already decoded) metadata value.
#
# When $separator is defined, $text is split on it and every piece that
# contains a non-whitespace character is stored; otherwise $text is
# stored as a single value.  Each stored value is first passed through
# gsSafe().  Values are appended to the array referenced by $values_ref.
#
# Returns the number of values appended.
sub _storeMetadataValue
{
    my $self = shift(@_);
    my ($values_ref, $text, $separator) = @_;

    if (!defined $separator) {
        push(@$values_ref, $self->gsSafe($text));
        return 1;
    }

    my $stored = 0;
    foreach my $piece (split($separator, $text)) {
        if ($piece =~ /\S/) {
            push(@$values_ref, $self->gsSafe($piece));
            ++$stored;
        }
    }
    return $stored;
}

# Extract the metadata embedded in one file with ExifTool and record it
# in the extra-metadata structures.
#
# Arguments:
#   $file          - file name without path; converted with
#                    util::filename_to_regex and used as the key
#   $filename      - full path handed to ExifTool
#   $extrametadata - hash ref: key => { field name => [values] }
#   $extrametakeys - array ref of keys present in $extrametadata
#
# Every tag becomes a field named "ex.<group>.<tag>".  If an entry for
# the file already exists in $extrametadata, the new values are merged
# into it instead of replacing it.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();
    my $outhandle = $self->{'outhandle'};
    my $metadata_count = 0;

    # An empty (or missing) separator means "do not split values".
    my $separator = $self->{'metadata_field_separator'};
    if (defined $separator && $separator eq "") {
        undef $separator;
    }

    # The encoding does not change per tag, so work it out once.
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    my $exifTool = $self->{'exiftool'};

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Extract meta information for this family-0 group only
        $exifTool->Options(Group0 => [$group]);
        $exifTool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exifTool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;
            my $value = $exifTool->GetValue($tag);

            # Ignore tags with no value or no alphanumeric content
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            # Decode the field name first so that the list is created
            # and filled under one and the same hash key.
            my $field = Encode::decode($encoding, "ex.$group.$tag");
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }
            my $values_ref = $exif_metadata{$field};

            if (ref $value eq 'SCALAR') {
                # Scalar refs carry binary data: store a short
                # description instead of the data itself.
                my $summary;
                if ($$value =~ /^Binary data/) {
                    $summary = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $summary = "(Binary data $len bytes)";
                }
                push(@$values_ref, $self->gsSafe($summary));
                ++$metadata_count;
            }
            elsif (ref $value eq 'ARRAY') {
                # Decode into a copy so the array returned by ExifTool
                # is left unmodified.
                my @decoded = map { Encode::decode($encoding, $_) } @$value;

                if ($self->{'join_before_split'}) {
                    # Join all values, then treat the result as one value.
                    # The join character is only inserted once something
                    # has been accumulated, so leading empty values do
                    # not produce a leading join character.
                    my $allvals = "";
                    foreach my $v (@decoded) {
                        if ($allvals ne "") {
                            $allvals .= $self->{'join_character'};
                        }
                        $allvals .= $v;
                    }
                    $metadata_count +=
                        $self->_storeMetadataValue($values_ref, $allvals, $separator);
                }
                else {
                    foreach my $v (@decoded) {
                        $metadata_count +=
                            $self->_storeMetadataValue($values_ref, $v, $separator);
                    }
                }
            }
            else {
                my $decoded = Encode::decode($encoding, $value);
                $metadata_count +=
                    $self->_storeMetadataValue($values_ref, $decoded, $separator);
            }
        }
    }

    if ($metadata_count > 0) {
        print {$outhandle} " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Protect windows directory chars \
    $file = util::filename_to_regex($file);

    # Associate the metadata now.  If something is already stored for
    # this file, add to it rather than overwriting it, and do not
    # register the key a second time.
    my $existing = $extrametadata->{$file};
    if (defined $existing) {
        foreach my $metaname (keys %exif_metadata) {
            my $new_values = $exif_metadata{$metaname};
            next unless @$new_values;
            push(@{$existing->{$metaname}}, @$new_values);
        }
    }
    else {
        $extrametadata->{$file} = \%exif_metadata;
        push(@$extrametakeys, $file);
    }

    return;
}
287
288
# Metadata pass for one pipeline entry.
#
# Resolves $file against $base_dir, and for anything that is a plain
# file hands it to extractEmbeddedMetadata so that its embedded metadata
# ends up in $extrametadata / $extrametakeys.  Always returns undef,
# whether or not the entry was a plain file.
sub metadata_read
{
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    unless (-f $full_path) {
        return undef;
    }

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "EmbeddedMetadataPlugin: processing $file\n";
    }

    $self->extractEmbeddedMetadata($leaf_name, $full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}
310
# Does nothing and always returns undef: this plugin does its work in
# metadata_read().
sub read
{
    my $nothing = undef;
    return $nothing;
}
315
# Not used by this plugin; always returns undef.
sub process
{
    my $nothing = undef;
    return $nothing;
}
321
# Make a metadata value safe to store.
#
# Replaces each of ( ) , < > [ ] { } with its decimal numeric character
# reference and, when the plugin's trim_whitespace setting is the string
# "true", strips leading and trailing whitespace.  Returns the new text.
sub gsSafe() {
    my ($self, $text) = @_;

    # Potentially problematic characters and their replacements.  No
    # replacement contains any of these characters, so one pass suffices.
    my %reference_for = (
        '(' => '&#40;',
        ')' => '&#41;',
        ',' => '&#44;',
        '<' => '&#60;',
        '>' => '&#62;',
        '[' => '&#91;',
        ']' => '&#93;',
        '{' => '&#123;',
        '}' => '&#125;',
    );
    $text =~ s/([(),<>\[\]{}])/$reference_for{$1}/g;

    my $wants_trim = ($self->{'trim_whitespace'} eq "true");
    if ($wants_trim) {
        for ($text) {
            s/^\s+//;
            s/\s+$//;
        }
    }

    return $text;
}
345
3461;
Note: See TracBrowser for help on using the browser.