source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 26146

Last change on this file since 26146 was 26146, checked in by davidb, 12 years ago

Refinement to EmbeddedMetadataPlugin that allows it to operate with maxdocs, for faster prototype building/rebuilding. Rather than passing every file through metadata_read(), this plugin now abides by its process expression, which (in this mode) has been changed from '*' back to a more conservative one that looks for images and PDF docs.

File size: 14.5 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BasePlugin;
31use extrametautil;
32use util;
33
34use Encode;
35use Image::ExifTool qw(:Public);
36use strict;
37
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40
# Compile-time setup: declare BasePlugin as this package's parent class
# and switch STDERR to UTF-8 output.
BEGIN
{
    @EmbeddedMetadataPlugin::ISA = ('BasePlugin');
    binmode(STDERR, ":utf8");
}
46
# Choices offered for the 'input_encoding' argument: an "auto" entry
# followed by every entry of BasePlugin's encoding list.
my $encoding_plus_auto_list = [
    {
        'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}"
    },
    @{$BasePlugin::encoding_list},
];
51
# Descriptions of the command-line/configuration arguments this plugin
# accepts.  The list is appended to the shared "ArgList" in the
# constructor.  'desc' values are keys of the form {Plugin.name}.
my $arguments = [
    {
        'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => ""
    },
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto"
    },
    {
        'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag"
    },
    {
        'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " "
    },
    {
        'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => [
            { 'name' => "true",  'desc' => "{common.true}" },
            { 'name' => "false", 'desc' => "{common.false}" },
        ],
        'deft' => "true"
    },
    {
        'name' => "set_filter_list",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
        'type' => "string"
    },
    {
        'name' => "set_filter_regexp",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
        'type' => "string",
        'deft' => ".*" # If changing this default, also need to update the constructor
    },
];
89
# Plugin description record; appended to the shared "OptList" in the
# constructor.  'args' points at the argument list defined for this plugin.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
96
# Constructor.
#
# Arguments (after the class name), all handed on to BasePlugin's
# constructor:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed to BasePlugin unchanged
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the plugin object blessed into $class.  The object carries a
# configured Image::ExifTool instance in $self->{'exiftool'} and, when
# 'set_filter_list' names at least one set, a 'set_filter_regexp' built
# from that list.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # One ExifTool object is created here and reused for every file.
    # Options() called with only a name merely reads that setting, so
    # only name/value pairs are issued.
    my $exifTool = Image::ExifTool->new;
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    $self->{'exiftool'} = $exifTool;

    my $setFilterList = $self->{'set_filter_list'};
    if ((defined $setFilterList) && ($setFilterList ne ""))
    {
        # Turn "IPTC, ex.XMP" into "(ex.IPTC)|(ex.XMP)".  Whitespace
        # around each name is dropped and blank entries are skipped: a
        # blank entry would otherwise become "(ex.)", which matches every
        # field name and silently disables the filter.
        my @sets_bracketed;
        foreach my $set (split(/,/, $setFilterList))
        {
            $set =~ s/^\s+//;
            $set =~ s/\s+$//;
            $set =~ s/^ex\.//;
            next if $set eq "";
            push(@sets_bracketed, "(ex.$set)");
        }

        if (@sets_bracketed)
        {
            my $setFilterRegexp = $self->{'set_filter_regexp'};
            if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
            {
                my $outhandle = $self->{'outhandle'};
                print {$outhandle} "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
                print {$outhandle} " defaulting to 'set_filter_list'\n";
            }

            $self->{'set_filter_regexp'} = join("|", @sets_bracketed);
        }
    }

    return bless $self, $class;
}
141
# Called with ($pluginfo, $base_dir, $processor, $maxdocs); records
# $maxdocs on the object and resets the count of files this plugin has
# looked at.
#
# For most plugins the metadata_read() phase either does not exist or is
# very fast, so it is not an undue burden on collection building.
# EmbeddedMetadataPlugin bucks this trend: the ExifTool module it relies
# on makes a fairly detailed scan of every file that matches the plugin's
# process expression, which hampers quick collection building with
# '-maxdocs'.
#
# The approach is to count the files EmbeddedMetadataPlugin has been
# offered and, once that count passes 'maxdocs', have
# can_process_this_file_for_metadata() always answer 'not recognised' so
# no further scanning happens.  Because 'maxdocs' is not one of the
# parameters passed to metadata_read(), it is stored on the object here
# for use later on.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->{'maxdocs'} = $maxdocs;
    $self->{'exif_scanned_count'} = 0;
}
172
173
# Returns the default process expression: a case-insensitive match on the
# file extensions this plugin scans for embedded metadata (JPEG, GIF,
# PNG, TIFF and PDF).  Both ".tif" and ".tiff" are accepted.
#
# Need to think some more about this; an earlier version returned ".*".
sub get_default_process_exp
{
    return q^(?i)\.(jpe?g|gif|png|tiff?|pdf)$^;
}
180
# Always answers 0: this plugin only contributes metadata during the
# metadata_read() pass and never claims a file as a document.
sub can_process_this_file {
    my ($self) = @_;

    # we process metadata, not the file
    return 0;
}
188
# Decides whether metadata_read() should scan $filename with ExifTool.
#
# Returns 1 when the file should be scanned, 0 otherwise.  The answer is
# 0 when:
#   - $filename is a directory and 'can_process_directories' is not set;
#   - a 'maxdocs' limit is in force (defined and not -1) and more than
#     that many files have already been offered to this method;
#   - the file name does not match the plugin's 'process_exp'.
#
# A 0 here only stops this plugin scanning the file.  metadata_read()
# returns undef either way, so a file like metadata.xml (of no interest
# here) is still passed on to the plugin that needs it.
sub can_process_this_file_for_metadata {
    my ($self, $filename) = @_;

    if (-d $filename && !$self->{'can_process_directories'}) {
        return 0;
    }

    # An undefined 'maxdocs' (begin() not called, or called without a
    # limit) means "no limit".  Without the defined() test the numeric
    # comparisons below treat undef as 0 and refuse every file.
    my $maxdocs = $self->{'maxdocs'};
    if (defined $maxdocs && $maxdocs != -1) {
        # Every file offered counts towards the limit, whether or not it
        # goes on to match the process expression.
        $self->{'exif_scanned_count'}++;
        if ($self->{'exif_scanned_count'} > $maxdocs) {
            # Above the limit of files to scan
            return 0;
        }
    }

    my $process_exp = $self->{'process_exp'};
    if (defined $process_exp && $process_exp ne "" && $filename =~ /$process_exp/) {
        return 1;
    }

    return 0;
}
236
# Tests a metadata field name against the plugin's 'set_filter_regexp'.
#
# Returns 1 when no filter is configured (undefined or empty); otherwise
# returns the result of a case-insensitive, unanchored match of $name
# against the filter expression.
sub checkAgainstFilters
{
    my ($self, $name) = @_;

    my $filter = $self->{'set_filter_regexp'};
    if ((!defined $filter) || ($filter eq ""))
    {
        return 1;
    }

    return ($name =~ m/($filter)/i);
}
252
# Adds one metadata value to the table if its field passes the set filter.
#
# Arguments:
#   $field             - metadata field name
#   $val               - the value to store (made safe with gsSafe())
#   $exif_metadata_ref - hash ref mapping field name to an array of values
#
# Returns 1 if the value was stored under $field, 0 if the field was
# filtered out.
#
# Two kinds of field also produce derived values "on the side":
#   - a field whose name contains "GPSPosition" adds "Latitude" and
#     "Longitude" from the first two whitespace-separated tokens;
#   - a field whose name contains "GPSDateTime" adds "Date" as yyyymmdd,
#     taken from a leading yyyy:mm:dd token.
# Derived values are not included in the returned count, and they are
# only added when the value actually has the expected shape.
sub filtered_add_metadata
{
    my ($self, $field, $val, $exif_metadata_ref) = @_;

    if (!$self->checkAgainstFilters($field)) {
        return 0;
    }

    push(@{$exif_metadata_ref->{$field}}, $self->gsSafe($val));

    if ($field =~ m/GPSPosition/ && defined $val) {
        # split ' ' ignores leading whitespace, so the first token is
        # always the latitude.
        my ($lat, $long) = split(' ', $val);

        if (defined $lat && defined $long) {
            push(@{$exif_metadata_ref->{"Longitude"}}, $self->gsSafe($long));
            push(@{$exif_metadata_ref->{"Latitude"}}, $self->gsSafe($lat));
        }
    }

    if ($field =~ m/GPSDateTime/ && defined $val) {
        my ($date) = split(' ', $val);

        # Only emit a Date when the token really is yyyy:mm:dd; otherwise
        # an empty Date value would be stored.
        if (defined $date && $date =~ m/^(\d{4}):(\d{2}):(\d{2})$/) {
            push(@{$exif_metadata_ref->{"Date"}}, $self->gsSafe("$1$2$3"));
        }
    }

    return 1;
}
292
293
# Private helper: stores $text under $field, honouring the optional field
# separator.
#
# With no separator the whole of $text is stored.  With a separator,
# $text is split on it and every piece containing a non-whitespace
# character is stored separately.  Returns the number of values that
# filtered_add_metadata() reported as stored.
sub _add_split_metadata
{
    my ($self, $field, $text, $separator, $exif_metadata_ref) = @_;

    if (!defined $separator) {
        return $self->filtered_add_metadata($field, $text, $exif_metadata_ref);
    }

    my $added = 0;
    foreach my $piece (split($separator, $text)) {
        next unless $piece =~ /\S/;
        $added += $self->filtered_add_metadata($field, $piece, $exif_metadata_ref);
    }
    return $added;
}

# Scans one file with ExifTool and records what it finds in the
# extra-metadata structures.
#
# Arguments:
#   $file           - file name without path; after conversion to URL
#                     format and then to a regex it is the key under which
#                     the metadata is stored
#   $filename       - full path handed to ExifTool
#   $extrametadata  - extra-metadata table (accessed via extrametautil)
#   $extrametakeys  - list of keys into that table (via extrametautil)
#
# Every tag is stored under the field name "ex.<group>.<tag>".  Values
# that are scalar references are stored as a "(Binary data ...)"
# placeholder; array values are stored element by element, or joined
# first when 'join_before_split' is set; everything else is stored as a
# single (optionally split) string.
#
# Returns the number of metadata values stored.
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $exiftool  = $self->{'exiftool'};

    my $metadata_count = 0;

    # An empty separator means "do not split".
    my $separator = $self->{'metadata_field_separator'};
    if (defined $separator && $separator eq "") {
        undef $separator;
    }

    # "auto" (or no setting at all) is treated as UTF-8.
    my $encoding = $self->{'input_encoding'};
    if (!defined $encoding || $encoding eq "auto") {
        $encoding = "utf8";
    }

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Restrict extraction to this family-0 group, then scan the file.
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;

            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            my $field = "ex.$group.$tag";
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }
            $field = Encode::decode($encoding, $field);

            if (ref $value eq 'SCALAR') {
                # Store a short placeholder rather than the referenced
                # data itself; placeholders are never split.
                my $placeholder;
                if ($$value =~ /^Binary data/) {
                    $placeholder = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $placeholder = "(Binary data $len bytes)";
                }
                $metadata_count += $self->filtered_add_metadata($field, $placeholder, \%exif_metadata);
            }
            elsif (ref $value eq 'ARRAY') {
                my $join_first = $self->{'join_before_split'};
                my $allvals = "";

                foreach my $v (@$value) {
                    $v = Encode::decode($encoding, $v);

                    if ($join_first) {
                        # Build one string; the join character is only
                        # inserted once something has been collected.
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $v;
                    }
                    else {
                        $metadata_count += $self->_add_split_metadata($field, $v, $separator, \%exif_metadata);
                    }
                }

                if ($join_first) {
                    $metadata_count += $self->_add_split_metadata($field, $allvals, $separator, \%exif_metadata);
                }
            }
            else {
                $value = Encode::decode($encoding, $value);
                $metadata_count += $self->_add_split_metadata($field, $value, $separator, \%exif_metadata);
            }
        }
    }

    if ($metadata_count > 0) {
        print {$outhandle} " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Indexing into the extrameta data structures requires the filename's
    # style of slashes to be in URL format.  The name is then converted to
    # a regex to protect special characters such as the brackets in
    # "C:\Program Files (x86)\Greenstone".
    $file = &util::filepath_to_url_format($file);
    $file = &util::filename_to_regex($file);

    # Associate the metadata now
    my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);
    if (defined $file_metadata_table) {
        print STDERR "\n**** EmbeddedMetadataPlugin: Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 3;

        foreach my $metaname (keys %exif_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
        }

        # no need to push $file on to $extrametakeys as it is already in the list
    }
    else {
        &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
        &extrametautil::addmetakey($extrametakeys, $file);
    }

    return $metadata_count;
}
444
445
# metadata_read() pass for one file.
#
# Resolves $file against $base_dir and, if
# can_process_this_file_for_metadata() accepts the full path, scans it
# with extractEmbeddedMetadata(), which adds what it finds to
# $extrametadata / $extrametakeys.  The remaining parameters are accepted
# but not used, apart from $gli, which enables a progress line on STDERR.
#
# Always returns undef, whether or not the file was scanned, so the file
# is still considered by the other plugins in the metadata_read()
# pipeline.  The directory test that used to live here is now done in
# can_process_this_file_for_metadata().
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    if ($self->can_process_this_file_for_metadata($filename_full_path)) {
        print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n" if ($gli);
        print STDERR "EmbeddedMetadataPlugin: processing $file\n" if $self->{'verbosity'} > 1;

        $self->extractEmbeddedMetadata($filename_no_path, $filename_full_path,
                                       $extrametadata, $extrametakeys);
    }

    # Scanned with ExifTool or not, let the other plugins consider it too.
    return undef;
}
481
# This plugin does nothing in the read() pass; it always returns undef.
sub read
{
    return undef;
}
486
# Not used by this plugin; always returns undef.
sub process
{
    return undef;
}
492
# Makes a metadata value safe to store.
#
# Each of the characters ( ) , < > [ ] { } is replaced by its decimal
# character reference (for example "(" becomes "&#40;").  When the
# plugin's 'trim_whitespace' setting is "true", leading and trailing
# whitespace is then removed.
#
# Returns the converted text; an undefined value is returned unchanged.
sub gsSafe {
    my ($self, $text) = @_;

    return $text unless defined $text;

    # Replace potentially problematic characters in a single pass; ord()
    # gives exactly the code used in each character reference.
    $text =~ s/([(),<>\[\]{}])/'&#' . ord($1) . ';'/ge;

    if ($self->{'trim_whitespace'} eq "true") {
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
    }

    return $text;
}
516
5171;
Note: See TracBrowser for help on using the repository browser.