source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 2795

Last change on this file since 2795 was 2795, checked in by sjboddie, 23 years ago

Got ZIPPlug working under under windows

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata" that specifies
31# metadata for the files (and subdirectories) in the directory. It will
32# also look in any file of the form *.metadata for metadata about the file
33# with the same prefix.
34#
35# Here's an example of a metadata file that cuses theree metadata structures
36# (ignore the # characters):
37
38#<metadata>
39# <filename>nugget.*</filename>
40# <Title>Nugget Point, The Catlins</Title>
41# <Place mode=accumulate>Nugget Point</Place>
42#</metadata>
43#
44#<metadata>
45# <filename>nugget-point-1.jpg</filename>
46# <Title>Nugget Point Lighthouse, The Catlins</Title>
47# <Subject>Lighthouse</Subject>
48#</metadata>
49#
50#<metadata>
51# <filename>kaka-point-dir</filename>
52# <Title>Kaka Point, The Catlins</Title>
53#</metadata>
54
55# Metadata elements are read and applied to files in the order they appear
56# in the file. The directory's "metadata" file is erad first, and then any
57# other files of the form "*.metadata" are read in alphabetical order.
58#
59# The filename element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression, so
61# <filename>nugget.*</filename> indicates that the first metadata record
62# applies to every subfile that starts with "nugget". For these files, a
63# Title metadata element is set, overriding any old value that the Title
64# might have had.
65#
66# Occasionally, we want to have multiple metadata values applied to a
67# document; in this case we use the "mode=accumulate" attribute of the
68# particular metadata item. In the first metadata element above, the
69# "Place" metadata is accumulating, and is therefore given several values.
70# If we wanted to override these values and use a single metadata element
71# again, we could write <Place mode=override>New Zealand</Place> instead.
72# Remember: every element is assumed to be in override mode unless you
73# specify otherwise, so if you want to accumulate metadata for some field,
74# every occurance must have "mode=accumulate" specified.
75#
76# The second metadata element applies to a specific file, called
77# nugget-point-1.jpg. This element overrides the Title set in the first
78# element above, and adds a "Subject" ,etadata field.
79#
80# The third and fional metadata element sets metadata for a subdirectory
81# rather than a file. The metadata specified (a Title) will be passed into
82# the subdirectory and applied to every file that occurs in the
83# subdirectory (and to every subsubdirectory and its contents, and so on)
84# unless the metadata is explictly overridden later in the import.
85
86
87
88package RecPlug;
89
90use BasPlug;
91use plugin;
92use util;
93
94
95BEGIN {
96 @ISA = ('BasPlug');
97}
98
99sub print_usage {
100 my ($plugin_name) = @_;
101
102 print STDERR "
103 usage: plugin RecPlug [options]
104
105 -use_metadata_files Read metadata from metadata XML files.
106
107"
108}
109
110sub new {
111 my $class = shift (@_);
112 my $self = new BasPlug ($class, @_);
113
114 if (!parsargv::parse(\@_,
115 q^use_metadata_files^, \$self->{'use_metadata_files'},
116 "allow_extra_options")) {
117 print STDERR "\nRecPlug uses an incorrect option.\n";
118 print STDERR "Check your collect.cfg configuration file.\n\n";
119 &print_usage("RecPlug");
120 die "\n";
121 }
122
123 return bless $self, $class;
124}
125
126# return 1 if this class might recurse using $pluginfo
127sub is_recursive {
128 my $self = shift (@_);
129
130 return 1;
131}
132
133sub get_default_block_exp {
134 my $self = shift (@_);
135
136 return 'CVS';
137}
138
139# return number of files processed, undef if can't process
140# Note that $base_dir might be "" and that $file might
141# include directories
142
143# This function passes around metadata hash structures. Metadata hash
144# structures are hashes that map from a (scalar) key (the metadata element
145# name) to either a scalar metadata value or a reference to an array of
146# such values.
147
148sub read {
149 my $self = shift (@_);
150 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
151
152 my $outhandle = $self->{'outhandle'};
153 my $verbosity = $self->{'verbosity'};
154 my $read_metadata_files = $self->{'use_metadata_files'};
155
156 # Calculate the directory name and ensure it is a directory and
157 # that it is not explicitly blocked.
158 my $dirname = $file;
159 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
160 return undef unless (-d $dirname);
161 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
162
163 # check to make sure we're not reading the archives or index directory
164 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
165 if ($dirname =~ m%^${gsdlhome}/.*?/import.*?/(archives|index)$%) {
166 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
167 return 0;
168 }
169
170 # check to see we haven't got a cyclic path...
171 if ($dirname =~ m%(/.*){,41}%) {
172 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
173 return 0;
174 }
175
176 # check to see we haven't got a cyclic path...
177 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
178 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
179 return 0;
180 }
181
182 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
183 print $outhandle "RecPlug: metadata passed in: ",
184 join(", ", keys %$in_metadata), "\n";
185 }
186
187 # Recur over directory contents.
188 my (@dir, $subfile);
189 my $count = 0;
190 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
191
192 # find all the files in the directory
193 if (!opendir (DIR, $dirname)) {
194 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
195 return undef;
196 }
197 @dir = readdir (DIR);
198 closedir (DIR);
199
200 # read XML metadata files (if supplied)
201 my $additionalmetadata = 0; # is there extra metadata available?
202 my %extrametadata; # maps from filespec to extra metadata keys
203 my @extrametakeys; # keys of %extrametadata in order read
204
205 if ($read_metadata_files) {
206
207 # first read the directory "metadata" file
208 my $metadatafile = &util::filename_cat ($dirname, 'metadata');
209 if (-e $metadatafile) {
210 print $outhandle "RecPlug: found metadata in $metadatafile\n"
211 if ($verbosity);
212 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
213 $additionalmetadata = 1;
214 }
215
216 # then read any files with names of the form *.metadata
217 foreach $subfile (sort @dir) {
218 next unless ($subfile =~ /^.*\.metadata$/);
219 $metadatafile = &util::filename_cat ($dirname, $subfile);
220 print $outhandle "RecPlug: found metadata in $metadatafile\n"
221 if ($verbosity);
222 &read_metadata_file($metadatafile, \%extrametadata, \@extrametakeys);
223 $additionalmetadata = 1;
224 }
225 }
226
227 # import each of the files in the directory
228 my $out_metadata;
229 foreach $subfile (@dir) {
230
231 last if ($maxdocs != -1 && $count >= $maxdocs);
232 next if ($subfile =~ /^\.\.?$/);
233 next if ($read_metadata_files && $subfile =~ /metadata$/);
234 print "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
235
236 # Make a copy of $in_metadata to pass to $subfile
237 $out_metadata = {};
238 &combine_metadata_structures($out_metadata, $in_metadata);
239
240 # Next add metadata read in XML files (if it is supplied)
241 if ($additionalmetadata == 1) {
242
243 my ($filespec, $mdref);
244 foreach $filespec (@extrametakeys) {
245 if ($subfile =~ /$filespec/) {
246 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
247 if ($verbosity > 2);
248 $mdref = $extrametadata{$filespec};
249 &combine_metadata_structures($out_metadata, $mdref);
250 }
251 }
252 }
253
254 # Recursively read each $subfile
255 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
256 $count += &plugin::read ($pluginfo, $base_dir,
257 &util::filename_cat($file, $subfile),
258 $out_metadata, $processor, $maxdocs);
259 }
260 return $count;
261
262}
263
264
265
266# Read a manually-constructed metadata file and store the data
267# it contains in the $metadataerf structure.
268#
269# (metadataref is a reference to a hash whose keys are filenames
270# and whose values are metadata hash structures.)
271
272sub read_metadata_file {
273 my ($filename, $metadataref, $metakeysref) = @_;
274
275 my ($metadatafiletext, $metatext);
276 my ($target, $targetdataref, $default_target, $tag, $key, $value);
277
278 # Read the file
279 open(MTDT, "<$filename");
280 $metadatafiletext = join(' ', <MTDT>);
281 $metadatafiletext =~ s/\s+/ /go;
282 close MTDT;
283
284 # set default filespec for *.metadata files
285 if ($filename =~ /\.metadata$/) {
286 $default_target = $filename;
287 $default_target =~ s/.*\///o;
288 $default_target =~ s/\.metadata$//;
289 } else {
290 $default_target = '';
291 }
292
293 # split the file into sections on "metadata" tag
294 foreach $metatext (split(/\<metadata\>/, $metadatafiletext)) {
295 # print "metadata text: $metatext\n";
296
297 # split the metadata set into sections on each field tag
298 $target = $default_target;
299 $targetdataref = {};
300 foreach $tag (split(/</, $metatext)) {
301 next if ($tag =~ m"^/");
302 next if ($tag !~ m/>/);
303
304 ($key, $value) = split(/>/, $tag);
305 # print "$key -> $value\n";
306
307 if ($key eq 'filename') {
308 $target = $value;
309 } else {
310
311 # a metadata field can be flagged as accumulated or overridden
312 my $accumulateflag = 0;
313 my $overrideflag = 0;
314 if ($key =~ / mode=a.*/io) {
315 $accumulateflag = 1;
316 } elsif ($key =~ / mode=o.*/io) {
317 $overrideflag = 1;
318 }
319 $key =~ s/ mode=.*$//io;
320
321 # set the metadata value, using an array for accumulating fields
322 # and a scalar for override fields
323 if ($accumulateflag) {
324 # the accumulate flag directs us to accumulate metadata values
325 if (!defined $targetdataref->{$key}) {
326 # there is no existing value for this field
327 $targetdataref->{$key} = [$value];
328 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
329 # we already have an array of values for this field
330 my $aref = $targetdataref->{$key};
331 push @$aref, $value;
332 } else {
333 # we have a scalar for this field - convert to array
334 $targetdataref->{$key} = [$targetdataref->{$key}, $value];
335 }
336 } elsif ($overrideflag) {
337 # the override flag directs us to override exising values
338 $targetdataref->{$key} = $value;
339 } elsif (!defined $targetdataref->{$key}) {
340 # there is no flag, and no existing value: default to override mode
341 # In the future, I should let the user specify the default mode.
342 $targetdataref->{$key} = $value;
343 } elsif (ref ($targetdataref->{$key}) eq "ARRAY") {
344 # there is no flag, and we're already in accumulate mode
345 my $aref = $targetdataref->{$key};
346 push @$aref, $value;
347 } else {
348 # there is no flag, and we're already in override mode
349 $targetdataref->{$key} = $value;
350 }
351 }
352 }
353
354 # store this metadata information in the metadata ref
355 if ($target) {
356 push @$metakeysref, $target;
357 $metadataref->{$target} = $targetdataref;
358 }
359 }
360}
361
362
363# Combine two metadata structures. Given two references to metadata
364# element structures, add every field of the second ($mdref2) to the first
365# ($mdref1).
366#
367# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
368#
369# We have to be acreful about the way we merge metadata when one metadata
370# structure is in "override" mode and one is in "merge" mode. In fact, we
371# use the mode from the second structure, $mdref2, because it is generally
372# defined later (lower in the directory structure) and is therefore more
373# "local" to the document concerned.
374#
375# Another issue is the use of references to pass metadata around. If we
376# simply copy one metadata structure reference to another, then we're
377# effectively justr copyinga pointer, and changes to the new referene
378# will affect the old (copied) one also. This also applies to ARRAY
379# references used as metadata element values (hence the "clonedata"
380# function below).
381
382sub combine_metadata_structures {
383 my ($mdref1, $mdref2) = @_;
384 my ($key, $value1, $value2);
385
386 foreach $key (keys %$mdref2) {
387
388 $value1 = $mdref1->{$key};
389 $value2 = $mdref2->{$key};
390
391 # If there is no existing value for this metadata field in
392 # $mdref1, so we simply copy the value from $mdref2 over.
393 if (!defined $value1) {
394 $mdref1->{$key} = &clonedata($value2);
395 }
396 # Otherwise we have to add the new values to the existing ones.
397 # If the second structure is accumulated, then acculate all the
398 # values into the first structure
399 elsif ((ref $value2) eq "ARRAY") {
400 # If the first metadata element is a scalar we have to
401 # convert it into an array before we add anything more.
402 if ((ref $value1) ne 'ARRAY') {
403 $mdref1->{$key} = [$value1];
404 $value1 = $mdref1->{$key};
405 }
406 # Now add the value(s) from the second array to the first
407 $value2 = &clonedata($value2);
408 push @$value1, @$value2;
409 }
410 # Finally, If the second structure is not an array erference, we
411 # know it is in override mode, so override the first structure.
412 else {
413 $mdref1->{$key} = &clonedata($value2);
414 }
415 }
416}
417
418
419# Make a "cloned" copy of a metadata value.
420# This is trivial for a simple scalar value,
421# but not for an array reference.
422
423sub clonedata {
424 my ($value) = @_;
425 my $result;
426
427 if ((ref $value) eq 'ARRAY') {
428 $result = [];
429 foreach my $item (@$value) {
430 push @$result, $item;
431 }
432 } else {
433 $result = $value;
434 }
435 return $result;
436}
437
438
4391;
440
441
442
Note: See TracBrowser for help on using the repository browser.