source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 2228

Last change on this file since 2228 was 2228, checked in by paynter, 23 years ago

The -use_metadata_files option tells RecPlug to read any metadata XML files
it finds in the import directories and to add their metadata to the files
they describe. A few other changes: metadata is passed recursively down
the directory tree, collect.cfg-line arguments are parsed correctly, and
block expressions are inherited from BasPlug and blocked appropriately
(default block_exp: CVS).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata" that specifies
31# metadata for the files (and subdirectories) in the directory. It will
32# also look in any file of the form *.metadata for metadata about the file
33# with the same prefix.
34#
35# Here's an example of a metadata file that cuses theree metadata structures
36# (ignore the # characters):
37
38#<metadata>
39# <filename>nugget.*</filename>
40# <Title>Nugget Point, The Catlins</Title>
41# <Place mode=accumulate>Nugget Point</Place>
42#</metadata>
43#
44#<metadata>
45# <filename>nugget-point-1.jpg</filename>
46# <Title>Nugget Point Lighthouse, The Catlins</Title>
47# <Subject>Lighthouse</Subject>
48#</metadata>
49#
50#<metadata>
51# <filename>kaka-point-dir</filename>
52# <Title>Kaka Point, The Catlins</Title>
53#</metadata>
54
55# Metadata elements are read and applied to files in the order they appear
56# in the file. The directory's "metadata" file is erad first, and then any
57# other files of the form "*.metadata" are read in alphabetical order.
58#
59# The filename element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression, so
61# <filename>nugget.*</filename> indicates that the first metadata record
62# applies to every subfile that starts with "nugget". For these files, a
63# Title metadata element is set, overriding any old value that the Title
64# might have had.
65#
66# Occasionally, we want to have multiple metadata values applied to a
67# document; in this case we use the "mode=accumulate" attribute of the
68# particular metadata item. In the first metadata element above, the
69# "Place" metadata is accumulating, and is therefore given several values.
70# If we wanted to override these values and use a single metadata element
71# again, we could write <Place mode=override>New Zealand</Place> instead.
72# Remember: every element is assumed to be in override mode unless you
73# specify otherwise, so if you want to accumulate metadata for some field,
74# every occurance must have "mode=accumulate" specified.
75#
76# The second metadata element applies to a specific file, called
77# nugget-point-1.jpg. This element overrides the Title set in the first
78# element above, and adds a "Subject" ,etadata field.
79#
80# The third and fional metadata element sets metadata for a subdirectory
81# rather than a file. The metadata specified (a Title) will be passed into
82# the subdirectory and applied to every file that occurs in the
83# subdirectory (and to every subsubdirectory and its contents, and so on)
84# unless the metadata is explictly overridden later in the import.
85
86
87
88package RecPlug;
89
90use BasPlug;
91use plugin;
92use util;
93
94
95BEGIN {
96 @ISA = ('BasPlug');
97}
98
99sub print_usage {
100 my ($plugin_name) = @_;
101
102 print STDERR "
103 usage: plugin RecPlug [options]
104
105 -use_metadata_files Read metadata from metadata XML files.
106
107"
108}
109
110sub new {
111 my $class = shift (@_);
112 my $self = new BasPlug ($class, @_);
113
114 if (!parsargv::parse(\@_,
115 q^use_metadata_files^, \$self->{'use_metadata_files'},
116 "allow_extra_options")) {
117 print STDERR "\nRecPlug uses an incorrect option.\n";
118 print STDERR "Check your collect.cfg configuration file.\n\n";
119 &print_usage("RecPlug");
120 die "\n";
121 }
122
123 return bless $self, $class;
124}
125
126# return 1 if this class might recurse using $pluginfo
127sub is_recursive {
128 my $self = shift (@_);
129
130 return 1;
131}
132
133sub get_default_block_exp {
134 my $self = shift (@_);
135
136 return 'CVS';
137}
138
139# return number of files processed, undef if can't process
140# Note that $base_dir might be "" and that $file might
141# include directories
142
143# This function passes around metadata hash structures. Metadata hash
144# structures are hashes that map from a (scalar) key (the metadata element
145# name) to either a scalar metadata value or a reference to an array of
146# such values.
147
148sub read {
149 my $self = shift (@_);
150 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
151
152 my $outhandle = $self->{'outhandle'};
153 my $verbosity = $self->{'verbosity'};
154 my $read_metadata_files = $self->{'use_metadata_files'};
155
156 # Calculate the directory name and ensure it is a directory and
157 # that it is not explicitly blocked.
158 $file =~ s/^[\/\\]+//;
159 my $dirname = &util::filename_cat ($base_dir, $file);
160 return undef unless (-d $dirname);
161 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
162
163
164 # check to make sure we're not reading the archives or index directory
165 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
166 if ($dirname =~ m%^${gsdlhome}/.*?/import.*?/(archives|index)$%) {
167 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
168 return 0;
169 }
170
171 # check to see we haven't got a cyclic path...
172 if ($dirname =~ m%(/.*){,41}%) {
173 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
174 return 0;
175 }
176
177 # check to see we haven't got a cyclic path...
178 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
179 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
180 return 0;
181 }
182
183 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
184 print $outhandle "RecPlug: metadata passed in: ",
185 join(", ", keys %$in_metadata), "\n";
186 }
187
188 # Recur over directory contents.
189 my (@dir, $subfile);
190 my $count = 0;
191 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
192
193 # find all the files in the directory
194 if (!opendir (DIR, $dirname)) {
195 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
196 return undef;
197 }
198 @dir = readdir (DIR);
199 closedir (DIR);
200
201 # read XML metadata files (if supplied)
202 my $additionalmetadata = 0; # is there extra metadata available?
203 my %extrametadata; # maps from filespec to extra metadata keys
204 my @extrametakeys; # keys of %extrametadata in order read
205
206 if ($read_metadata_files) {
207
208 # first read the directory "metadata" file
209 my $metadatafile = &util::filename_cat ($dirname, 'metadata');
210 if (-e $metadatafile) {
211 print $outhandle "RecPlug: found metadata in $metadatafile\n"
212 if ($verbosity);
213 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
214 $additionalmetadata = 1;
215 }
216
217 # then read any files with names of the form *.metadata
218 foreach $subfile (sort @dir) {
219 next unless ($subfile =~ /^.*\.metadata$/);
220 $metadatafile = &util::filename_cat ($dirname, $subfile);
221 print $outhandle "RecPlug: found metadata in $metadatafile\n"
222 if ($verbosity);
223 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
224 $additionalmetadata = 1;
225 }
226 }
227
228 # import each of the files in the directory
229 my $out_metadata;
230 foreach $subfile (@dir) {
231
232 last if ($maxdocs != -1 && $count >= $maxdocs);
233 next if ($subfile =~ /^\.\.?$/);
234 next if ($read_metadata_files && $subfile =~ /metadata$/);
235 print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
236
237 # Make a copy of $in_metadata to pass to $subfile
238 $out_metadata = {};
239 &combine_metadata_structures($out_metadata, $in_metadata);
240
241 # Next add metadata read in XML files (if it is supplied)
242 if ($additionalmetadata == 1) {
243
244 my ($filespec, $mdref);
245 foreach $filespec (@extrametakeys) {
246 if ($subfile =~ /$filespec/) {
247 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
248 if ($verbosity > 2);
249 $mdref = $extrametadata{$filespec};
250 &combine_metadata_structures($out_metadata, $mdref);
251 }
252 }
253 }
254
255 # Recursively read each $subfile
256 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
257 $count += &plugin::read ($pluginfo, $base_dir,
258 &util::filename_cat($file, $subfile),
259 $out_metadata, $processor, $maxdocs);
260 }
261 return $count;
262
263}
264
265
266
267# Read a manually-constructed metadata file and store the data
268# it contains in the $metadataerf structure.
269#
270# (metadataref is a reference to a hash whose keys are filenames
271# and whose values are metadata hash structures.)
272
273sub read_metadata_file {
274 my ($filename, $metadataref, $metakeysref) = @_;
275
276 my ($metadatafiletext, $metatext);
277 my ($target, $targetdataref, $default_target, $tag, $key, $value);
278
279 # Read the file
280 open(MTDT, "<$filename");
281 $metadatafiletext = join(' ', <MTDT>);
282 $metadatafiletext =~ s/\s+/ /go;
283 close MTDT;
284
285 # set default filespec for *.metadata files
286 if ($filename =~ /\.metadata$/) {
287 $default_target = $filename;
288 $default_target =~ s/.*\///o;
289 $default_target =~ s/\.metadata$//;
290 } else {
291 $default_target = '';
292 }
293
294 # split the file into sections on "metadata" tag
295 foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) {
296 # print "metadata text: $metatext\n";
297
298 # split the metadata set into sections on each field tag
299 $target = $default_target;
300 $targetdataref = {};
301 foreach $tag (split(/</, $metatext)) {
302 next if ($tag =~ m"^/");
303 next if ($tag !~ m/>/);
304
305 ($key, $value) = split(/>/, $tag);
306 # print "$key -> $value\n";
307
308 if ($key eq 'filename') {
309 $target = $value;
310 } else {
311
312 # a metadata field can be flagged as accumulated or overridden
313 my $accumulateflag = 0;
314 my $overrideflag = 0;
315 if ($key =~ / mode=a.*/io) {
316 $accumulateflag = 1;
317 } elsif ($key =~ / mode=o.*/io) {
318 $overrideflag = 1;
319 }
320 $key =~ s/ mode=.*$//io;
321
322 # set the metadata value, using an array for accumulating fields
323 # and a scalar for override fields
324 if ($accumulateflag) {
325 # the accumulate flag directs us to accumulate metadata values
326 if (!defined $targetdataref->{$key}) {
327 # there is no existing value for this field
328 $targetdataref->{$key} = [$value];
329 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
330 # we already have an array of values for this field
331 my $aref = $targetdataref->{$key};
332 push @$aref, $value;
333 } else {
334 # we have a scalar for this field - convert to array
335 $targetdataref->{$key} = [$targetdataref->{$key}, $value];
336 }
337 } elsif ($overrideflag) {
338 # the override flag directs us to override exising values
339 $targetdataref->{$key} = $value;
340 } elsif (!defined $targetdataref->{$key}) {
341 # there is no flag, and no existing value: default to override mode
342 # In the future, I should let the user specify the default mode.
343 $targetdataref->{$key} = $value;
344 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
345 # there is no flag, and we're already in accumulate mode
346 my $aref = $targetdataref->{$key};
347 push @$aref, $value;
348 } else {
349 # there is no flag, and we're already in override mode
350 $targetdataref->{$key} = $value;
351 }
352 }
353 }
354
355 # store this metadata information in the metadata ref
356 if ($target) {
357 push @$metakeysref, $target;
358 $metadataref->{$target} = $targetdataref;
359 }
360 }
361}
362
363
364# Combine two metadata structures. Given two references to metadata
365# element structures, add every field of the second ($mdref2) to the first
366# ($mdref1).
367#
368# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
369#
370# We have to be acreful about the way we merge metadata when one metadata
371# structure is in "override" mode and one is in "merge" mode. In fact, we
372# use the mode from the second structure, $mdref2, because it is generally
373# defined later (lower in the directory structure) and is therefore more
374# "local" to the document concerned.
375#
376# Another issue is the use of references to pass metadata around. If we
377# simply copy one metadata structure reference to another, then we're
378# effectively justr copyinga pointer, and changes to the new referene
379# will affect the old (copied) one also. This also applies to ARRAY
380# references used as metadata element values (hence the "clonedata"
381# function below).
382
383sub combine_metadata_structures {
384 my ($mdref1, $mdref2) = @_;
385 my ($key, $value1, $value2);
386
387 foreach $key (keys %$mdref2) {
388
389 $value1 = $mdref1->{$key};
390 $value2 = $mdref2->{$key};
391
392 # If there is no existing value for this metadata field in
393 # $mdref1, so we simply copy the value from $mdref2 over.
394 if (!defined $value1) {
395 $mdref1->{$key} = &clonedata($value2);
396 }
397 # Otherwise we have to add the new values to the existing ones.
398 # If the second structure is accumulated, then acculate all the
399 # values into the first structure
400 elsif ((ref $value2) eq "ARRAY") {
401 # If the first metadata element is a scalar we have to
402 # convert it into an array before we add anything more.
403 if ((ref $value1) ne 'ARRAY') {
404 $mdref1->{$key} = [$value1];
405 $value1 = $mdref1->{$key};
406 }
407 # Now add the value(s) from the second array to the first
408 $value2 = &clonedata($value2);
409 push @$value1, @$value2;
410 }
411 # Finally, If the second structure is not an array erference, we
412 # know it is in override mode, so override the first structure.
413 else {
414 $mdref1->{$key} = &clonedata($value2);
415 }
416 }
417}
418
419
420# Make a "cloned" copy of a metadata value.
421# This is trivial for a simple scalar value,
422# but not for an array reference.
423
424sub clonedata {
425 my ($value) = @_;
426 my $result;
427
428 if ((ref $value) eq 'ARRAY') {
429 $result = [];
430 foreach my $item (@$value) {
431 push @$result, $item;
432 }
433 } else {
434 $result = $value;
435 }
436 return $result;
437}
438
439
4401;
441
442
443
Note: See TracBrowser for help on using the repository browser.