source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 24419

Last change on this file since 24419 was 24419, checked in by ak19, 13 years ago
  1. Dr Bainbridge fixed another bug with the EmbeddedMetadataPlugin and its interaction with OAIPlugin: extracted metadata is now merged into any earlier extracted metadata table, so that ex.dc.* meta extracted by the OAIPlugin higher up in the plugin pipeline is preserved after EmbeddedMetaPlug is through with the file. 2. Removed recent PDFPlugin commit where can_process_this_file_for_metadata was overridden. It isn't needed and may have adverse effects.
File size: 9.2 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31
32use Encode;
33use Image::ExifTool qw(:Public);
34use strict;
35
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Compile-time setup for the package: emit STDERR as UTF-8 and register
# BasePlugin as the parent class.
BEGIN
{
    binmode(STDERR, ":utf8");
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
}
44
# Choices offered for the input_encoding argument: an "auto" entry first,
# followed by every encoding listed in $BasePlugin::encoding_list.
my $auto_encoding_entry = {
    'name' => "auto",
    'desc' => "{ReadTextFile.input_encoding.auto}",
};
my $encoding_plus_auto_list = [$auto_encoding_entry];
push(@{$encoding_plus_auto_list}, @{$BasePlugin::encoding_list});
49
# The two choices of the trim_whitespace enum.
my $trim_whitespace_choices = [
    { 'name' => "true",  'desc' => "{common.true}" },
    { 'name' => "false", 'desc' => "{common.false}" },
];

# Argument descriptions for this plugin. The 'desc' values are lookup keys
# in curly braces, not literal descriptions.
my $arguments = [
    {
        'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto",
    },
    {
        'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag",
    },
    {
        'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " ",
    },
    {
        'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => $trim_whitespace_choices,
        'deft' => "true",
    },
];
78
# Plugin description record; new() pushes it onto the "OptList" it is given.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
85
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BasePlugin->new
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by BasePlugin, re-blessed into $class, with a
# configured Image::ExifTool instance stored under the 'exiftool' key.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Direct method-call syntax rather than the ambiguous "new Class(...)" form.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create and configure the ExifTool object shared by all extraction calls.
    # The Verbose option is left at ExifTool's default.
    my $exifTool = Image::ExifTool->new;
    $exifTool->Options(Duplicates => 0);  # one value per tag name
    $exifTool->Options(PrintConv => 0);   # raw values, no print conversion
    $exifTool->Options(Unknown => 1);     # also extract unknown tags
    $self->{'exiftool'} = $exifTool;

    return bless $self, $class;
}
107
108
# Default process expression for this plugin: a pattern that matches every
# filename.
# Need to think some more about this; a narrower candidate was
#   q^(?i)\.(wma|wmv|jpe?g|gif)$^
sub get_default_process_exp()
{
    my $match_everything = ".*";
    return $match_everything;
}
115
116
117# This plugin doesn't block any files
118#sub get_default_block_exp()
119#{
120# return '';
121#}
122
# Plugins that rely on more than process_exp (eg XML plugins) can override
# this method. This plugin only handles metadata, never the file itself, so
# the answer is always 0.
sub can_process_this_file {
    my ($self) = @_;

    my $processes_file_content = 0;
    return $processes_file_content;
}
130
# Even if a plugin can extract metadata in its metadata_read pass, the
# default is to return 'undef' so that processing of the file continues
# down the pipeline. That gives other plugins the opportunity to locate
# metadata and set it up in the extrametakeys variables that are passed
# around.
#
# This plugin looks for metadata in any file through its metadata_read(),
# so it always returns undef here.
sub can_process_this_file_for_metadata {
    my ($self) = @_;

    return undef;
}
146
147
# Private helper: append the value(s) held in $text to the array referenced
# by $slot, passing each through gsSafe().
#
# With a defined $separator the text is split on it and only pieces
# containing a non-whitespace character are kept. With no separator the
# whole text is stored as a single value.
#
# Returns the number of values appended.
sub _append_metadata_values
{
    my ($self, $slot, $text, $separator) = @_;

    if (!defined $separator) {
        push(@$slot, $self->gsSafe($text));
        return 1;
    }

    my $appended = 0;
    foreach my $piece (split($separator, $text)) {
        next unless $piece =~ /\S/;
        push(@$slot, $self->gsSafe($piece));
        ++$appended;
    }
    return $appended;
}

# Extract the embedded metadata of one file with ExifTool and record it in
# the extra-metadata tables.
#
# Arguments:
#   $file          - file name used as the key into $extrametadata (after
#                    being passed through util::filename_to_regex)
#   $filename      - path handed to ExifTool for reading
#   $extrametadata - hash ref: file key -> { metadata name -> [values] }
#   $extrametakeys - array ref of file keys; $file is appended only when it
#                    has no entry in $extrametadata yet
#
# Every value is stored under the name "ex.<group>.<tag>". If $extrametadata
# already has an entry for the file, the new values are appended to it
# rather than replacing it.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $exiftool  = $self->{'exiftool'};

    my $metadata_count = 0;

    # An empty (or missing) separator means "do not split values".
    my $separator = $self->{'metadata_field_separator'};
    if (!defined $separator || $separator eq "") {
        undef $separator;
    }

    # The encoding does not change per tag, so resolve it once up front.
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Restrict ExifTool to the current family-0 group and (re)read the file.
        # NOTE(review): the file is re-parsed once per group; whether
        # ExtractInfo can be hoisted out of this loop depends on how ExifTool
        # caches its found-tag list -- confirm before changing.
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;

            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            # Decode the field name BEFORE creating its slot, so the slot
            # that is initialised is the same one the values are pushed to.
            my $field = Encode::decode($encoding, "ex.$group.$tag");
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }
            my $slot = $exif_metadata{$field};

            if (ref $value eq 'SCALAR') {
                # Scalar refs are stored as a short placeholder, never split.
                my $placeholder;
                if ($$value =~ /^Binary data/) {
                    $placeholder = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $placeholder = "(Binary data $len bytes)";
                }
                push(@$slot, $self->gsSafe($placeholder));
                ++$metadata_count;
            }
            elsif (ref $value eq 'ARRAY') {
                my @decoded = map { Encode::decode($encoding, $_) } @$value;

                if ($self->{'join_before_split'}) {
                    # Join all elements first, then split the joined text.
                    my $allvals = "";
                    foreach my $v (@decoded) {
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $v;
                    }
                    $metadata_count +=
                        $self->_append_metadata_values($slot, $allvals, $separator);
                }
                else {
                    # Split (or store) each element on its own.
                    foreach my $v (@decoded) {
                        $metadata_count +=
                            $self->_append_metadata_values($slot, $v, $separator);
                    }
                }
            }
            else {
                my $text = Encode::decode($encoding, $value);
                $metadata_count +=
                    $self->_append_metadata_values($slot, $text, $separator);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Protect windows directory chars \
    $file = &util::filename_to_regex($file);

    # Associate the metadata now

    if (defined $extrametadata->{$file}) {
        print STDERR "\n**** Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 2;

        my $file_metadata_table = $extrametadata->{$file};

        foreach my $metaname (keys %exif_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
        }

        # no need to push $file on to $extrametakeys as it is already in the list
    }
    else {
        $extrametadata->{$file} = \%exif_metadata;
        push(@$extrametakeys, $file);
    }

}
301
302
# Metadata pass for one file: if $file (relative to $base_dir) is a regular
# file, extract its embedded metadata into $extrametadata / $extrametakeys.
# Always returns undef; of the arguments only $base_dir, $file,
# $extrametakeys, $extrametadata and $gli are used here.
sub metadata_read
{
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    # we don't want to process directories
    return undef unless -f $full_path;

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "EmbeddedMetadataPlugin: processing $file\n";
    }

    $self->extractEmbeddedMetadata($leaf_name, $full_path,
                                   $extrametadata, $extrametakeys);

    return undef;
}
324
# Does nothing and returns undef, whatever arguments it is given.
sub read
{
    my $nothing = undef;
    return $nothing;
}
329
# Not used by this plugin; always returns undef.
sub process
{
    my $nothing = undef;
    return $nothing;
}
335
# Return a copy of $text with potentially problematic characters replaced
# by decimal character entities. When the plugin's trim_whitespace setting
# is "true", leading and trailing whitespace is removed as well.
sub gsSafe() {
    my ($self, $text) = @_;

    # Pattern / entity pairs, applied in this order.
    my @entity_for = (
        [ qr/\(/, '&#40;'  ],
        [ qr/\)/, '&#41;'  ],
        [ qr/,/,  '&#44;'  ],
        [ qr/\</, '&#60;'  ],
        [ qr/\>/, '&#62;'  ],
        [ qr/\[/, '&#91;'  ],
        [ qr/\]/, '&#93;'  ],
        [ qr/\{/, '&#123;' ],
        [ qr/\}/, '&#125;' ],
    );
    foreach my $pair (@entity_for) {
        my ($pattern, $entity) = @$pair;
        $text =~ s/$pattern/$entity/g;
    }

    if ($self->{'trim_whitespace'} eq "true") {
        for ($text) {
            s/^\s+//;
            s/\s+$//;
        }
    }

    return $text;
}
359
3601;
Note: See TracBrowser for help on using the repository browser.