source: main/trunk/greenstone2/perllib/plugins/EmbeddedMetadataPlugin.pm@ 32122

Last change on this file since 32122 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

File size: 14.9 KB
Line 
1###########################################################################
2#
3# EmbeddedMetadataPlugin.pm -- A plugin for EXIF
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 2007 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package EmbeddedMetadataPlugin;
29
30use BaseImporter;
31use extrametautil;
32use util;
33
34use Encode;
35use Image::ExifTool qw(:Public);
36use strict;
37
38no strict 'refs'; # allow filehandles to be variables and viceversa
39
40
# Compile-time setup for the package: registers BaseImporter as the
# parent class and applies the :utf8 layer to STDERR.
BEGIN {
    @EmbeddedMetadataPlugin::ISA = ('BaseImporter');
    binmode(STDERR, ":utf8");
}
46
# Choices offered for the 'input_encoding' option: the special value
# "auto" first, followed by every entry of $CommonUtil::encoding_list.
my $encoding_plus_auto_list = [
    {
        'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}",
    },
    @{$CommonUtil::encoding_list},
];
51
# Descriptions of the options this plugin accepts. The list is appended
# to the caller's "ArgList" by the constructor.
my $arguments = [
    # Pattern used to split a single extracted value into several values
    # (empty means "do not split").
    {
        'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "",
    },
    # Encoding used to decode extracted values; "auto" is treated as utf8.
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto",
    },
    # For multi-valued tags: join all values first, then split the result.
    {
        'name' => "join_before_split",
        'desc' => "{EmbeddedMetadataPlugin.join_before_split}",
        'type' => "flag",
    },
    # Text placed between values when join_before_split is set.
    {
        'name' => "join_character",
        'desc' => "{EmbeddedMetadataPlugin.join_character}",
        'type' => "string",
        'deft' => " ",
    },
    # Whether leading/trailing whitespace is stripped from stored values.
    {
        'name' => "trim_whitespace",
        'desc' => "{EmbeddedMetadataPlugin.trim_whitespace}",
        'type' => "enum",
        'list' => [
            { 'name' => "true",  'desc' => "{common.true}" },
            { 'name' => "false", 'desc' => "{common.false}" },
        ],
        'deft' => "true",
    },
    # Comma separated list of metadata sets to keep.
    {
        'name' => "set_filter_list",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_list}",
        'type' => "string",
    },
    # Regular expression that a field name must match to be kept.
    {
        'name' => "set_filter_regexp",
        'desc' => "{EmbeddedMetadataPlugin.set_filter_regexp}",
        'type' => "string",
        'deft' => ".*",   # If changing this default, also need to update the constructor
    },
];
89
# Plugin description record. The constructor appends it to the caller's
# "OptList"; 'args' points at the argument descriptions in $arguments.
my $options = {
    'name'     => "EmbeddedMetadataPlugin",
    'desc'     => "{EmbeddedMetadataPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
96
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - handed unchanged to the BaseImporter constructor
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays receive
#                      the file-level $arguments and $options of this plugin
#
# Returns the object built by BaseImporter, blessed into $class, with
#   'exiftool'          - a configured Image::ExifTool instance
#   'set_filter_regexp' - replaced by a regexp derived from 'set_filter_list'
#                         whenever that list names at least one set
sub new
{
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    # Create a new Image::ExifTool object
    my $exifTool = Image::ExifTool->new();
    $exifTool->Options(Duplicates => 0);
    $exifTool->Options(PrintConv => 0);
    $exifTool->Options(Unknown => 1);
    # NOTE(review): no value is supplied here, so per the ExifTool
    # documentation this call only reads the current 'Verbose' setting.
    # Kept as in the original - confirm whether verbose output was intended.
    $exifTool->Options('Verbose');
    $self->{'exiftool'} = $exifTool;

    my $setFilterList = $self->{'set_filter_list'};
    if ((defined $setFilterList) && ($setFilterList ne ""))
    {
        # Turn "IPTC, ex.XMP" into "(ex\.IPTC)|(ex\.XMP)".
        my @sets_bracketed;
        foreach my $set (split(/,/, $setFilterList))
        {
            $set =~ s/^\s+//;
            $set =~ s/\s+$//;
            $set =~ s/^ex\.//;

            # An empty entry would yield "(ex.)", which matches every
            # field name and so silently disables the filter.
            next if $set eq "";

            # Escape the dot of the "ex." prefix so it only matches a
            # literal dot. The set name itself is left untouched.
            push(@sets_bracketed, "(ex\\.$set)");
        }

        if (@sets_bracketed)
        {
            my $setFilterRegexp = $self->{'set_filter_regexp'};
            if ((defined $setFilterRegexp) && ($setFilterRegexp ne ".*") && ($setFilterRegexp ne ""))
            {
                my $outhandle = $self->{'outhandle'};
                print $outhandle "Warning: can only specify 'set_filter_list' or 'set_filter_regexp'\n";
                print $outhandle " defaulting to 'set_filter_list'\n";
            }

            $self->{'set_filter_regexp'} = join("|", @sets_bracketed);
        }
    }

    return bless $self, $class;
}
141
# Records the 'maxdocs' limit and resets the count of files scanned.
#
# Arguments (after $self): $pluginfo, $base_dir, $processor, $maxdocs.
# Only $maxdocs is used.
#
# For most plugins the metadata_read() phase either does not exist or is
# very fast, so it is no burden on collection building. This plugin is
# different: the ExifTool module it relies on makes a fairly detailed scan
# of every file that matches the process expression, which hampers quick
# builds with '-maxdocs'.
#
# To help that special case, the plugin keeps track of how many files it
# has scanned and, once 'maxdocs' is reached, makes
# can_process_this_file_for_metadata() answer 'not recognized' so that no
# further scanning takes place. Because 'maxdocs' is not one of the
# parameters passed to metadata_read(), its value is stored in the object
# here so that it is available later on.
sub begin {
    my ($self, undef, undef, undef, $maxdocs) = @_;

    $self->{'maxdocs'} = $maxdocs;
    $self->{'exif_scanned_count'} = 0;
}
172
173
# Returns the default process expression: a case-insensitive match on the
# file extensions jpg, jpeg, gif, png, tiff and pdf.
# Need to think some more about this (a catch-all ".*" was the
# alternative considered).
sub get_default_process_exp()
{
    my $default_exp = '(?i)\.(jpe?g|gif|png|tiff|pdf)$';
    return $default_exp;
}
180
# Always answers 0: this plugin processes metadata, not the file itself.
sub can_process_this_file {
    my ($self) = @_;

    return 0;
}
188
# Decides whether metadata_read() should scan $filename with ExifTool.
#
# Argument: $filename - path of the file (or directory) being considered.
# Returns 1 when the file should be scanned, 0 otherwise.
#
# The answer is 0 when
#   - $filename is a directory and 'can_process_directories' is not set,
#   - $filename does not match 'process_exp' (or 'process_exp' is empty),
#   - a 'maxdocs' limit other than -1 is set and that many matching files
#     have already been accepted.
#
# Note: because this plugin has its own custom metadata_read(), a 'no'
# here does not stop the file being considered by other plugins in the
# metadata_read() pipeline. That is needed so a file like metadata.xml
# (normally of no interest to this plugin) still reaches the plugin that
# does need to read it (MetadataPlugin in this case). Likewise a 'yes'
# does not stop other plugins from considering the file.
sub can_process_this_file_for_metadata {
    my $self = shift(@_);
    my ($filename) = (@_);

    if (-d $filename && !$self->{'can_process_directories'}) {
        return 0;
    }

    # Check the process expression before touching the counter, so that
    # files this plugin would never scan do not use up the 'maxdocs' budget.
    my $process_exp = $self->{'process_exp'};
    unless (defined $process_exp && $process_exp ne "" && $filename =~ /$process_exp/) {
        return 0;
    }

    # An undefined limit (begin() not called) is treated as "no limit",
    # the same as -1.
    my $maxdocs = $self->{'maxdocs'};
    if (defined $maxdocs && $maxdocs != -1) {
        $self->{'exif_scanned_count'}++;
        if ($self->{'exif_scanned_count'} > $maxdocs) {
            # Above the limit of files to scan
            return 0;
        }
    }

    return 1;
}
236
# Tests a metadata field name against the 'set_filter_regexp' setting.
#
# Argument: $name - the field name to test.
# Returns 1 when no filter is configured (undefined or empty); otherwise
# the result of a case-insensitive match of $name against the filter.
sub checkAgainstFilters
{
    my ($self, $name) = @_;

    my $filter = $self->{'set_filter_regexp'};

    # No filter configured: every field is accepted.
    return 1 unless (defined $filter) && ($filter ne "");

    return ($name =~ m/($filter)/i);
}
252
# Stores one metadata value if its field name passes checkAgainstFilters().
#
# Arguments:
#   $field             - metadata field name
#   $val               - the value to store
#   $exif_metadata_ref - hash ref mapping field name => array ref of values
#
# Returns 1 when the value was stored under $field, 0 when it was filtered
# out. Values are passed through gsSafe() before being stored.
#
# Two kinds of field also produce derived values 'on the side'. These are
# not included in the returned count, which tracks only the items
# extracted from the file:
#   - a field name containing "GPSPosition" adds "Latitude" and "Longitude"
#     from the first two whitespace separated tokens of $val;
#   - a field name containing "GPSDateTime" adds "Date" as yyyymmdd when
#     the first token of $val has the form yyyy:mm:dd.
# A derived value is only added when it can actually be parsed, so that no
# empty or undefined entries end up in the table.
sub filtered_add_metadata
{
    my $self = shift(@_);
    my ($field, $val, $exif_metadata_ref) = @_;

    return 0 unless $self->checkAgainstFilters($field);

    push(@{$exif_metadata_ref->{$field}}, $self->gsSafe($val));

    if ($field =~ m/GPSPosition/) {
        # split ' ' ignores leading whitespace, unlike split /\s+/
        my ($lat, $long) = split(' ', $val);

        if (defined $lat && defined $long) {
            push(@{$exif_metadata_ref->{"Longitude"}}, $self->gsSafe($long));
            push(@{$exif_metadata_ref->{"Latitude"}}, $self->gsSafe($lat));
        }
    }

    if ($field =~ m/GPSDateTime/) {
        my ($date) = split(' ', $val);

        if (defined $date && $date =~ m/^(\d{4}):(\d{2}):(\d{2})$/) {
            push(@{$exif_metadata_ref->{"Date"}}, $self->gsSafe("$1$2$3"));
        }
    }

    return 1;
}
292
293
# Private helper: stores $text under $field, first splitting it on
# $separator when a separator is defined. Pieces without any
# non-whitespace character are skipped. Returns the number of values that
# filtered_add_metadata() accepted.
sub _add_separated_metadata
{
    my ($self, $field, $text, $separator, $exif_metadata_ref) = @_;

    if (!defined $separator) {
        return $self->filtered_add_metadata($field, $text, $exif_metadata_ref);
    }

    my $added = 0;
    foreach my $piece (split($separator, $text)) {
        next unless $piece =~ /\S/;
        $added += $self->filtered_add_metadata($field, $piece, $exif_metadata_ref);
    }
    return $added;
}

# Scans one file with ExifTool and records what it finds in the
# extrametadata structures.
#
# Arguments:
#   $file          - file name without path; converted below into the key
#                    used to index $extrametadata
#   $filename      - full path of the file handed to ExifTool
#   $extrametadata - structure accessed via extrametautil::getmetadata()
#                    and extrametautil::setmetadata()
#   $extrametakeys - key list updated via extrametautil::addmetakey()
#
# Every value is stored under the field name "ex.<group>.<tag>". The file
# is extracted once for each family-0 group reported by
# Image::ExifTool::GetAllGroups(0), with the Group0 option restricted to
# that group.
#
# Option handling:
#   input_encoding           - encoding used to decode field names and
#                              values ("auto" is treated as utf8)
#   metadata_field_separator - when non-empty, values are split on it
#   join_before_split /
#   join_character           - for multi-valued tags, join the values
#                              first and split the joined text
sub extractEmbeddedMetadata
{
    my $self = shift(@_);
    my ($file, $filename, $extrametadata, $extrametakeys) = @_;

    my %exif_metadata = ();

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $exiftool  = $self->{'exiftool'};

    my $metadata_count = 0;

    my $separator = $self->{'metadata_field_separator'};
    if ($separator eq "") {
        undef $separator;
    }

    # The encoding does not depend on the tag, so work it out once.
    my $encoding = $self->{'input_encoding'};
    if ($encoding eq "auto") {
        $encoding = "utf8";
    }

    my @group_list = Image::ExifTool::GetAllGroups(0);
    foreach my $group (@group_list) {

        # Extract meta information from an image
        $exiftool->Options(Group0 => [$group]);
        $exiftool->ExtractInfo($filename);

        # Get list of tags in the order they were found in the file
        my @tag_list = $exiftool->GetFoundTags('File');
        foreach my $tag (@tag_list) {

            # Strip any numbering suffix
            $tag =~ s/^([^\s]+)\s.*$/$1/i;
            my $value = $exiftool->GetValue($tag);
            next unless (defined $value && $value =~ /[a-z0-9]+/i);

            my $field = "ex.$group.$tag";

            # An entry is created for the field even if every value is
            # later filtered out.
            if (!defined $exif_metadata{$field}) {
                $exif_metadata{$field} = [];
            }

            $field = Encode::decode($encoding, $field);

            if (ref $value eq 'SCALAR') {
                # Scalar reference: store a short description in place of
                # the referenced data. It is never split on the separator.
                my $summary;
                if ($$value =~ /^Binary data/) {
                    $summary = "($$value)";
                }
                else {
                    my $len = length($$value);
                    $summary = "(Binary data $len bytes)";
                }
                $metadata_count += $self->filtered_add_metadata($field, $summary, \%exif_metadata);
            }
            elsif (ref $value eq 'ARRAY') {
                if ($self->{'join_before_split'}) {
                    # Join all values, then split/add the joined text. The
                    # join character is only inserted once some text has
                    # been collected.
                    my $allvals = "";
                    foreach my $v (@$value) {
                        $v = Encode::decode($encoding, $v);
                        if ($allvals ne "") {
                            $allvals = $allvals . $self->{'join_character'};
                        }
                        $allvals = $allvals . $v;
                    }
                    $metadata_count += $self->_add_separated_metadata($field, $allvals, $separator, \%exif_metadata);
                }
                else {
                    foreach my $v (@$value) {
                        $v = Encode::decode($encoding, $v);
                        $metadata_count += $self->_add_separated_metadata($field, $v, $separator, \%exif_metadata);
                    }
                }
            }
            else {
                $value = Encode::decode($encoding, $value);
                $metadata_count += $self->_add_separated_metadata($field, $value, $separator, \%exif_metadata);
            }
        }
    }

    if ($metadata_count > 0) {
        print $outhandle " Extracted $metadata_count pieces of metadata from $filename EXIF block\n";
    }

    # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
    # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
    # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
    #
    # The trace of each conversion step is diagnostic output only, so it is
    # limited to high verbosity, like the merge message further down.
    my $trace = (defined $verbosity && $verbosity > 3);
    print STDERR "file = $file " . &unicode::debug_unicode_string($file) if $trace;
    $file = &util::raw_filename_to_unicode(&util::filename_head($filename), $file);
    print STDERR "$file ". &unicode::debug_unicode_string($file) if $trace;
    $file = &util::filepath_to_url_format($file);
    print STDERR "$file " . &unicode::debug_unicode_string($file) if $trace;
    $file = &util::filename_to_regex($file);
    print STDERR "$file ".&unicode::debug_unicode_string($file) ."\n" if $trace;

    # Associate the metadata now

    if (defined &extrametautil::getmetadata($extrametadata, $file)) {
        print STDERR "\n**** EmbeddedMetadataPlugin: Need to merge new metadata with existing stored metadata: file = $file\n" if $verbosity > 3;

        my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $file);

        foreach my $metaname (keys %exif_metadata) {
            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}}, @{$exif_metadata{$metaname}});
        }

        # no need to push $file on to $extrametakeys as it is already in the list
    }
    else {
        &extrametautil::setmetadata($extrametadata, $file, \%exif_metadata);
        &extrametautil::addmetakey($extrametakeys, $file);
    }

}
449
450
# metadata_read() pass for this plugin.
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $extrametakeys, $extrametadata, $extrametafile, $processor, $gli, $aux.
# Used here: $base_dir and $file (to locate the file), $extrametakeys and
# $extrametadata (handed to extractEmbeddedMetadata), and $gli (enables
# the <Processing ...> progress line).
#
# Always returns undef, whether or not the file was scanned, so that the
# other plugins in the metadata_read() pipeline also get to consider it.
# Directories and files over the scan limit are rejected inside
# can_process_this_file_for_metadata(), not here.
sub metadata_read
{
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    my ($full_path, $name_only) = &util::get_full_filenames($base_dir, $file);

    # Only files accepted by the check below are scanned with ExifTool
    if ($self->can_process_this_file_for_metadata($full_path)) {

        if ($gli) {
            print STDERR "\n<Processing n='$file' p='EmbeddedMetadataPlugin'>\n";
        }
        if ($self->{'verbosity'} > 1) {
            print STDERR "EmbeddedMetadataPlugin: processing $file\n";
        }

        $self->extractEmbeddedMetadata($name_only, $full_path,
                                       $extrametadata, $extrametakeys);
    }

    return undef;
}
486
# The read() pass does nothing in this plugin; it always returns undef.
sub read
{
    my $nothing = undef;
    return $nothing;
}
491
# Not used by this plugin; always returns undef.
sub process
{
    my $nothing = undef;
    return $nothing;
}
497
# Makes a metadata value safe to store.
#
# Argument: $text - the value to clean up.
# Returns $text with each of the characters ( ) , < > [ ] { } replaced by
# its decimal character reference (for example "(" becomes "&#40;") and,
# when the 'trim_whitespace' option is "true", with leading and trailing
# whitespace removed.
sub gsSafe() {
    my ($self, $text) = @_;

    # Replace potentially problematic characters. The reference number is
    # simply the character's code point.
    $text =~ s/([(),<>\[\]{}])/'&#' . ord($1) . ';'/ge;

    if ($self->{'trim_whitespace'} eq "true") {
        for ($text) {
            s/^\s+//;
            s/\s+$//;
        }
    }

    return $text;
}
521
5221;
Note: See TracBrowser for help on using the repository browser.