source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm

Last change on this file was 36470, checked in by davidb, 20 months ago

Tweaks after refactoring. Causes 'use strict' to report error as using variables that don't exist. Changes looked over by Kathy/David, yet to be tested however

File size: 15.0 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BaseImporter;
31use MetadataRead;
32use util;
33
34use Encode;
35use Image::ExifTool qw(:Public);
36use strict;
37
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40
# Compile-time setup for the package: declares the parent classes and
# switches STDERR to the :utf8 layer before any other code in the file runs.
BEGIN
{
    binmode(STDERR, ":utf8");

    # MetadataRead is listed first, so it is searched before BaseImporter
    # during method resolution.
    @EmbeddedMetadataPlugin::ISA = ('MetadataRead', 'BaseImporter');
}
46
# Choices offered for the 'input_encoding' argument: an "auto" entry
# followed by every entry of $CommonUtil::encoding_list.
my $encoding_plus_auto_list = [
    { 'name' => "auto",
      'desc' => "{ReadTextFile.input_encoding.auto}" },
    @{$CommonUtil::encoding_list},
];
51
# Argument descriptions for this plugin. Each entry is a hash holding the
# argument's name, a description key, its type and (where given) a default
# value ('deft'), a list of allowed values ('list') and a 'reqd' setting.
my $arguments = [
    # Separator used to split one extracted value into several values
    { 'name' => "metadata_field_separator",
      'desc' => "{HTMLPlugin.metadata_field_separator}",
      'type' => "string",
      'deft' => "" },

    # Encoding used to decode extracted field names and values
    { 'name' => "input_encoding",
      'desc' => "{ReadTextFile.input_encoding}",
      'type' => "enum",
      'list' => $encoding_plus_auto_list,
      'reqd' => "no",
      'deft' => "auto" },

    # When set, multi-valued fields are joined before being split
    { 'name' => "join_before_split",
      'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
      'type' => "flag" },

    # String placed between values when they are joined
    { 'name' => "join_character",
      'desc' => "{EmbeddedMetadataPlugin.join_character}",
      'type' => "string",
      'deft' => " " },

    # Regex. Set to .* to apply join_before_split to all meta fields
    # (that are arrays of values)
    { 'name' => "apply_join_before_split_to_metafields",
      'desc' => "{EmbeddedMetadataPlugin.apply_join_before_split_to_metafields}",
      'type' => "string",
      'reqd' => "no",
      'deft' => ".*Keywords" },

    # Whether leading/trailing whitespace is stripped from values
    { 'name' => "trim_whitespace",
      'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
      'type' => "enum",
      'list' => [
          { 'name' => "true",  'desc' => "{common.true}" },
          { 'name' => "false", 'desc' => "{common.false}" },
      ],
      'deft' => "true" },

    # Comma separated list of metadata sets to keep
    { 'name' => "set_filter_list",
      'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
      'type' => "string" },

    # Regex selecting which fields to keep.
    # If changing this default, also need to update the constructor
    { 'name' => "set_filter_regexp",
      'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
      'type' => "string",
      'deft' => ".*" },
];
95
# Plugin description record: ties the plugin's name and description key
# to the argument list declared in $arguments.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
102
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BaseImporter constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by BaseImporter, blessed into $class, with an
# Image::ExifTool instance stored under 'exiftool'. When 'set_filter_list'
# names at least one metadata set, 'set_filter_regexp' is replaced by a
# regular expression built from that list.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create a new Image::ExifTool object
    my $exifTool = Image::ExifTool->new;
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # NOTE(review): called with an option name but no value; the ExifTool
    # documentation suggests this only reads the option - confirm whether
    # setting 'Verbose' was intended.
    $exifTool->Options('Verbose');
    $self->{'exiftool'} = $exifTool;

    my $setFilterList = $self->{'set_filter_list'};
    if ((defined $setFilterList) && ($setFilterList ne ""))
    {
        # Turn "XMP, ex.IPTC" into the alternatives "(ex\.XMP)" and
        # "(ex\.IPTC)". Whitespace around each name is dropped and blank
        # entries are skipped: a blank entry would otherwise produce an
        # alternative that matches every field and so disable the filter.
        my @sets_bracketed;
        foreach my $set (split(/,/, $setFilterList))
        {
            $set =~ s/^\s+//;
            $set =~ s/\s+$//;
            $set =~ s/^ex\.//;
            next if ($set eq "");

            # The dot after "ex" is escaped so it only matches a literal dot
            push (@sets_bracketed, "(ex\\.$set)");
        }

        if (scalar(@sets_bracketed) > 0)
        {
            my $setFilterRegexp = $self->{'set_filter_regexp'};
            if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
            {
                my $outhandle = $self->{'outhandle'};
                print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
                print $outhandle "         defaulting to 'set_filter_list'\n";
            }

            $self->{'set_filter_regexp'} = join("|", @sets_bracketed);
        }
    }

    return bless $self, $class;
}
147
# Records the 'maxdocs' limit and resets the count of scanned files.
#
# Most plugins either have no metadata_read() phase or get through it
# quickly, so it adds little to the cost of building a collection.
# EmbeddedMetadataPlugin is different: the ExifTool module it relies on
# makes a fairly detailed scan of every file matching the plugin's
# process expression, which slows down quick builds done with '-maxdocs'.
#
# To help that special case, the plugin keeps track of how many files it
# has scanned and, once the number reaches 'maxdocs', makes
# can_process_this_file_for_metadata() answer 'not recognized' so that no
# further scanning happens. Because 'maxdocs' is not one of the standard
# parameters passed to metadata_read(), it is stored in the object here
# for use later on.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->{'maxdocs'} = $maxdocs;
    $self->{'exif_scanned_count'} = 0;
}
178
179
180# Need to think some more about this
sub get_default_process_exp()
{
    # Case-insensitive match on the file extensions handled by default.
    # (A catch-all ".*" was the earlier alternative.)
    return '(?i)\.(jpe?g|gif|png|tiff|pdf)$';
}
186
187# plugins that rely on more than process_exp (eg XML plugins) can override this method
sub can_process_this_file {
    my ($self) = @_;

    # Always answer no: this plugin deals with a file's metadata,
    # not with the file itself.
    return 0;
}
194
195# Even if a plugin can extract metadata in its metadata_read pass,
196# make the default return 'undef' so processing of the file continues
197# down the pipeline, so other plugins can also have the opportunity to
198# locate metadata and set it up in the extrametakeys variables that
199# are passed around.
200
# Decides whether metadata_read() should scan $filename with ExifTool.
#
# Parameters:
#   $filename - path of the file being considered
#
# Returns 1 when the file matches the plugin's process expression and the
# 'maxdocs' scan limit has not been used up, and 0 otherwise.
#
# Because this plugin has its own custom metadata_read(), neither answer
# stops the file being considered by other plugins in the pipeline. This
# matters for a file like metadata.xml, which is of no interest here but
# must still reach the plugin that reads it (MetadataPlugin in this case).
sub can_process_this_file_for_metadata {
    my $self = shift(@_);
    my ($filename) = (@_);

    # Special case: directories are skipped unless explicitly allowed
    if (-d $filename && !$self->{'can_process_directories'}) {
        return 0;
    }

    # Files that do not match the process expression are never scanned,
    # so they are rejected before the scan counter is touched and do not
    # use up any of the 'maxdocs' allowance.
    my $process_exp = $self->{'process_exp'};
    if (!defined $process_exp || $process_exp eq "" || $filename !~ /$process_exp/) {
        return 0;
    }

    # An undefined 'maxdocs' (begin() not called yet) is treated the same
    # as -1: no limit on the number of files scanned.
    my $maxdocs = $self->{'maxdocs'};
    if (defined $maxdocs && $maxdocs != -1) {
        $self->{'exif_scanned_count'}++;
        if ($self->{'exif_scanned_count'} > $maxdocs) {
            # Above the limit of files to scan
            return 0;
        }
    }

    return 1;
}
242
# Tests a metadata field name against the 'set_filter_regexp' setting.
# Returns 1 when no filter expression is set; otherwise returns the
# result of a case-insensitive match of $name against the expression.
sub checkAgainstFilters
{
    my ($self, $name) = @_;

    my $pattern = $self->{'set_filter_regexp'};

    # No filter configured: every field is accepted
    return 1 unless ((defined $pattern) && ($pattern ne ""));

    return ($name =~ m/($pattern)/i);
}
258
# Adds one value for $field to the metadata table, provided the field
# passes checkAgainstFilters().
#
# Parameters:
#   $field             - metadata field name
#   $val               - the value to store (passed through gsSafe())
#   $exif_metadata_ref - hash ref mapping field name => array ref of values
#
# Returns the number of values counted as extracted from the file:
# 1 when the value was stored, 0 when the field was filtered out.
#
# Side values: a field whose name contains "GPSPosition" also yields
# "Latitude" and "Longitude", and one containing "GPSDateTime" also yields
# "Date" (yyyymmdd). These are not included in the returned count, and
# they are only added when the value has the expected shape, so a
# malformed value never produces empty side values.
sub filtered_add_metadata
{
    my $self = shift(@_);
    my ($field,$val,$exif_metadata_ref) = @_;

    if (!$self->checkAgainstFilters($field)) {
        return 0;
    }

    push (@{$exif_metadata_ref->{$field}}, $self->gsSafe($val));

    if ($field =~ m/GPSPosition/) {
        # split(' ') ignores leading whitespace, so the first piece is
        # always the latitude
        my ($lat,$long) = split(' ', $val);

        if (defined $lat && defined $long) {
            push (@{$exif_metadata_ref->{"Longitude"}}, $self->gsSafe($long));
            push (@{$exif_metadata_ref->{"Latitude"}}, $self->gsSafe($lat));
        }
    }

    if ($field =~ m/GPSDateTime/) {
        my ($date) = split(' ', $val);

        # Only a yyyy:mm:dd date is turned into a Date value
        if (defined $date && $date =~ m/^(\d{4}):(\d{2}):(\d{2})$/) {
            push (@{$exif_metadata_ref->{"Date"}}, $self->gsSafe("$1$2$3"));
        }
    }

    return 1;
}
298
299
# Private helper: stores $text under $field, first splitting it on
# $separator when one is defined.
#
# With a separator, each piece containing at least one non-whitespace
# character is added on its own; without one, $text is added as it is.
# Returns the number of values filtered_add_metadata() reported as added.
sub _add_separated_values
{
    my $self = shift(@_);
    my ($field, $text, $separator, $exif_metadata_ref) = @_;

    if (!defined $separator) {
        return $self->filtered_add_metadata($field, $text, $exif_metadata_ref);
    }

    my $added = 0;
    foreach my $piece (split($separator, $text)) {
        next unless ($piece =~ /\S/);
        $added += $self->filtered_add_metadata($field, $piece, $exif_metadata_ref);
    }

    return $added;
}

# Scans one file with ExifTool and stores what it finds as extra metadata
# for that file.
#
# Parameters:
#   $file          - file name without path; used (after conversion by
#                    util::raw_filename_to_unicode) as the name the
#                    metadata is stored against
#   $filename      - full path of the file handed to ExifTool
#   $extrametadata, $extrametakeys, $extrametafile
#                  - passed through to store_meta_in_extrametadata()
#
# Fields are named "ex.<group>.<tag>". Values are decoded with the
# configured input encoding ("auto" is treated as utf8), optionally split
# on 'metadata_field_separator', and multi-valued fields are optionally
# joined first ('join_before_split'). Returns whatever
# store_meta_in_extrametadata() returns.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys, $extrametafile) = @_;

    my %exif_metadata = ();

    my $outhandle = $self->{'outhandle'};

    my $metadata_count = 0;

    # An empty separator means "do not split"
    my $separator = $self->{'metadata_field_separator'};
    if (defined $separator && $separator eq "") {
        undef $separator;
    }

    # Check if join_before_split is only applied to specified fields or all fields
    my $apply_join_before_split_to_metafields = $self->{'apply_join_before_split_to_metafields'};
    if (defined $apply_join_before_split_to_metafields && $apply_join_before_split_to_metafields eq "") {
        undef $apply_join_before_split_to_metafields;
    }

    # The encoding is the same for every tag, so work it out once
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    my $exiftool = $self->{'exiftool'};

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Extract meta information from the file, restricted to this group
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;

            # Only values containing at least one letter or digit are kept
            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            my $field = "ex.$group.$tag";

            # Every field seen gets an entry, even if all of its values
            # end up being filtered out below
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }

            $field = Encode::decode($encoding,$field);

            if (ref $value eq 'SCALAR') {
                # A scalar reference is reported as a short placeholder
                # instead of its content; it is neither decoded nor split
                my $summary;
                if ($$value =~ /^Binary data/) {
                    $summary = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $summary = "(Binary data $len bytes)";
                }
                $metadata_count += $self->filtered_add_metadata($field,$summary,\%exif_metadata);
            }
            elsif (ref $value eq 'ARRAY') {
                # If join_before_split is set it applies to all metadata
                # fields, unless specific fields were named, in which case
                # it applies only to fields matching that expression
                my $join_before_split_for_this_field = $self->{'join_before_split'};
                if ($join_before_split_for_this_field && $apply_join_before_split_to_metafields && $field !~ m/$apply_join_before_split_to_metafields/) {
                    undef $join_before_split_for_this_field;
                }

                if ($join_before_split_for_this_field) {
                    # Glue the values together, then treat the result as
                    # one value. The join character is only inserted once
                    # something has been accumulated.
                    my $allvals = "";
                    foreach my $v (@$value) {
                        # Decode into a copy so the list owned by ExifTool
                        # is left untouched
                        my $decoded = Encode::decode($encoding,$v);
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $decoded;
                    }
                    $metadata_count += $self->_add_separated_values($field,$allvals,$separator,\%exif_metadata);
                }
                else {
                    # Each value is handled on its own
                    foreach my $v (@$value) {
                        my $decoded = Encode::decode($encoding,$v);
                        $metadata_count += $self->_add_separated_values($field,$decoded,$separator,\%exif_metadata);
                    }
                }
            }
            else {
                # Plain value
                my $decoded = Encode::decode($encoding,$value);
                $metadata_count += $self->_add_separated_values($field,$decoded,$separator,\%exif_metadata);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    $file = &util::raw_filename_to_unicode(&util::filename_head($filename), $file);

    $self->store_meta_in_extrametadata($file, \%exif_metadata, undef, undef, $extrametakeys, $extrametadata, $extrametafile);
}
445
446
# metadata_read() pass for this plugin: scans the file with ExifTool when
# can_process_this_file_for_metadata() accepts it. Always returns undef so
# the file is also considered by other plugins in the metadata_read()
# pipeline.
sub metadata_read
{
    my ($self,
        $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $no_path) = &util::get_full_filenames($base_dir, $file);

    # Directories and other unsuitable files are weeded out by
    # can_process_this_file_for_metadata(); such files are not scanned
    # with ExifTool here.
    unless ($self->can_process_this_file_for_metadata($full_path)) {
        return undef;
    }

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "EmbeddedMetadataPlugin: processing $file\n";
    }

    $self->extractEmbeddedMetadata($no_path, $full_path,
                                   $extrametadata, $extrametakeys, $extrametafile);

    return undef;
}
482
# This plugin only takes part in the metadata_read() pass, so read()
# does nothing and returns undef.
sub read
{
    return undef;
}
487
# Not used by this plugin; returns undef.
sub process
{
    return undef;
}
493
# Makes a metadata value safe to store: the characters ( ) , < > [ ] { }
# are each replaced by their decimal character reference (for example
# "(" becomes "&#40;"), and surrounding whitespace is removed when the
# 'trim_whitespace' option is "true". Returns the resulting text.
sub gsSafe() {
    my ($self, $text) = @_;

    # One pass over the text; the reference number is the character's code
    $text =~ s/([(),<>\[\]{}])/'&#' . ord($1) . ';'/ge;

    if ($self->{'trim_whitespace'} eq "true") {
        for ($text) {
            s/^\s+//;
            s/\s+$//;
        }
    }

    return $text;
}
517
5181;
Note: See TracBrowser for help on using the repository browser.