source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 2925

Last change on this file since 2925 was 2925, checked in by sjboddie, 22 years ago

Altered the format of the GreenstoneArchive and GreenstoneDirectoryMetadata
XML files slightly (they're now called Archive and DirectoryMetadata
respectively).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata.xml" that specifies
31# metadata for the files (and subdirectories) in the directory.
32#
33# Here's an example of a metadata file that uses three FileSet structures
34# (ignore the # characters):
35
36#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
37#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
38#<DirectoryMetadata>
39# <FileSet>
40# <FileName>nugget.*</FileName>
41# <Description>
42# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
43# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
44# </Description>
45# </FileSet>
46# <FileSet>
47# <FileName>nugget-point-1.jpg</FileName>
48# <Description>
49# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
50# <Metadata name="Subject">Lighthouse</Metadata>
51# </Description>
52# </FileSet>
53# <FileSet>
54# <FileName>kaka-point-dir</FileName>
55# <Description>
56# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
57# </Description>
58# </FileSet>
59#</DirectoryMetadata>
60
61# Metadata elements are read and applied to files in the order they appear
62# in the file.
63#
64# The FileName element describes the subfiles in the directory that the
65# metadata applies to as a perl regular expression (a FileSet group may
66# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
67# indicates that the metadata records in the following Description block
68# apply to every subfile that starts with "nugget". For these files, a
69# Title metadata element is set, overriding any old value that the Title
70# might have had.
71#
72# Occasionally, we want to have multiple metadata values applied to a
73# document; in this case we use the "mode=accumulate" attribute of the
74# particular Metadata element. In the second metadata element of the first
75# FileSet above, the "Place" metadata is accumulating, and may therefore be
76# given several values. If we wanted to override these values and use a
77# single metadata element again, we could set the mode attribute to
78# "override" instead. Remember: every element is assumed to be in override
79# mode unless you specify otherwise, so if you want to accumulate metadata
80# for some field, every occurance must have "mode=accumulate" specified.
81#
82# The second FileSet element above applies to a specific file, called
83# nugget-point-1.jpg. This element overrides the Title metadata set in the
84# first FileSet, and adds a "Subject" metadata field.
85#
86# The third and final FileSet sets metadata for a subdirectory rather than
87# a file. The metadata specified (a Title) will be passed into the
88# subdirectory and applied to every file that occurs in the subdirectory
89# (and to every subsubdirectory and its contents, and so on) unless the
90# metadata is explictly overridden later in the import.
91
92
93
94package RecPlug;
95
96use BasPlug;
97use plugin;
98use util;
99
100
101BEGIN {
102 @ISA = ('BasPlug');
103 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
104}
105
106use XML::Parser;
107
108sub print_usage {
109 my ($plugin_name) = @_;
110
111 print STDERR "
112 usage: plugin RecPlug [options]
113
114 -use_metadata_files Read metadata from metadata XML files.
115
116"
117}
118
119my ($self);
120sub new {
121 my $class = shift (@_);
122
123 # $self is global for use within subroutines called by XML::Parser
124 $self = new BasPlug ($class, @_);
125
126 if (!parsargv::parse(\@_,
127 q^use_metadata_files^, \$self->{'use_metadata_files'},
128 "allow_extra_options")) {
129 print STDERR "\nRecPlug uses an incorrect option.\n";
130 print STDERR "Check your collect.cfg configuration file.\n\n";
131 &print_usage("RecPlug");
132 die "\n";
133 }
134
135 if ($self->{'use_metadata_files'}) {
136 # create XML::Parser object for parsing metadata.xml files
137 my $parser = new XML::Parser('Style' => 'Stream',
138 'Handlers' => {'Char' => \&Char,
139 'Doctype' => \&Doctype
140 });
141 $self->{'parser'} = $parser;
142 $self->{'in_filename'} = 0;
143
144 }
145
146 return bless $self, $class;
147}
148
149# return 1 if this class might recurse using $pluginfo
150sub is_recursive {
151 my $self = shift (@_);
152
153 return 1;
154}
155
156sub get_default_block_exp {
157 my $self = shift (@_);
158
159 return 'CVS';
160}
161
162# return number of files processed, undef if can't process
163# Note that $base_dir might be "" and that $file might
164# include directories
165
166# This function passes around metadata hash structures. Metadata hash
167# structures are hashes that map from a (scalar) key (the metadata element
168# name) to either a scalar metadata value or a reference to an array of
169# such values.
170
171sub read {
172 my $self = shift (@_);
173 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
174
175 my $outhandle = $self->{'outhandle'};
176 my $verbosity = $self->{'verbosity'};
177 my $read_metadata_files = $self->{'use_metadata_files'};
178
179 # Calculate the directory name and ensure it is a directory and
180 # that it is not explicitly blocked.
181 my $dirname = $file;
182 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
183 return undef unless (-d $dirname);
184 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
185
186 # check to make sure we're not reading the archives or index directory
187 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
188 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
189 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
190 return 0;
191 }
192
193 # check to see we haven't got a cyclic path...
194 if ($dirname =~ m%(/.*){,41}%) {
195 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
196 return 0;
197 }
198
199 # check to see we haven't got a cyclic path...
200 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
201 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
202 return 0;
203 }
204
205 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
206 print $outhandle "RecPlug: metadata passed in: ",
207 join(", ", keys %$in_metadata), "\n";
208 }
209
210 # Recur over directory contents.
211 my (@dir, $subfile);
212 my $count = 0;
213 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
214
215 # find all the files in the directory
216 if (!opendir (DIR, $dirname)) {
217 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
218 return undef;
219 }
220 @dir = readdir (DIR);
221 closedir (DIR);
222
223 # read XML metadata files (if supplied)
224 my $additionalmetadata = 0; # is there extra metadata available?
225 my %extrametadata; # maps from filespec to extra metadata keys
226 my @extrametakeys; # keys of %extrametadata in order read
227
228 if ($read_metadata_files) {
229
230 # read the directory "metadata.xml" file
231 my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml');
232 if (-e $metadatafile) {
233 print $outhandle "RecPlug: found metadata in $metadatafile\n"
234 if ($verbosity);
235 $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys);
236 $additionalmetadata = 1;
237 }
238 }
239
240 # import each of the files in the directory
241 my $out_metadata;
242 foreach $subfile (@dir) {
243
244 last if ($maxdocs != -1 && $count >= $maxdocs);
245 next if ($subfile =~ /^\.\.?$/);
246 next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
247 print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
248
249 # Make a copy of $in_metadata to pass to $subfile
250 $out_metadata = {};
251 &combine_metadata_structures($out_metadata, $in_metadata);
252
253 # Next add metadata read in XML files (if it is supplied)
254 if ($additionalmetadata == 1) {
255
256 my ($filespec, $mdref);
257 foreach $filespec (@extrametakeys) {
258 if ($subfile =~ /^$filespec$/) {
259 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
260 if ($verbosity > 2);
261 $mdref = $extrametadata{$filespec};
262 &combine_metadata_structures($out_metadata, $mdref);
263 }
264 }
265 }
266
267 # Recursively read each $subfile
268 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
269 $count += &plugin::read ($pluginfo, $base_dir,
270 &util::filename_cat($file, $subfile),
271 $out_metadata, $processor, $maxdocs);
272 }
273 return $count;
274
275}
276
277
278
279# Read a manually-constructed metadata file and store the data
280# it contains in the $metadataref structure.
281#
282# (metadataref is a reference to a hash whose keys are filenames
283# and whose values are metadata hash structures.)
284
285sub read_metadata_xml_file {
286 my $self = shift(@_);
287 my ($filename, $metadataref, $metakeysref) = @_;
288 $self->{'metadataref'} = $metadataref;
289 $self->{'metakeysref'} = $metakeysref;
290
291 eval {
292 $self->{'parser'}->parsefile($filename);
293 };
294 if ($@) {
295 my $outhandle = $self->{'outhandle'};
296 print $outhandle "RecPlug: Warning: Ignoring $filename because it is not a well formed metadata.xml file\n";
297 return;
298 }
299}
300
301sub Doctype {
302 my ($expat, $name, $sysid, $pubid, $internal) = @_;
303
304 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
305 # to be processed as well as the "DirectoryMetadata" files which should now
306 # be created by import.pl
307 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
308}
309
310sub StartTag {
311 my ($expat, $element) = @_;
312
313 if ($element eq "FileSet") {
314 $self->{'saved_targets'} = [];
315 $self->{'saved_metadata'} = {};
316 }
317 elsif ($element eq "FileName") {
318 $self->{'in_filename'} = 1;
319 }
320 elsif ($element eq "Metadata") {
321 $self->{'metadata_name'} = $_{'name'};
322 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
323 $self->{'metadata_accumulate'} = 1;
324 } else {
325 $self->{'metadata_accumulate'} = 0;
326 }
327 }
328}
329
330sub EndTag {
331 my ($expat, $element) = @_;
332
333 if ($element eq "FileSet") {
334 push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
335 foreach my $target (@{$self->{'saved_targets'}}) {
336 $self->{'metadataref'}->{$target} = $self->{'saved_metadata'};
337 }
338 }
339 elsif ($element eq "FileName") {
340 $self->{'in_filename'} = 0;
341 }
342 elsif ($element eq "Metadata") {
343 $self->{'metadata_name'} = "";
344 }
345
346}
347
348sub Text {
349
350 if ($self->{'in_filename'}) {
351 # $_ == FileName content
352 push (@{$self->{'saved_targets'}}, $_);
353 }
354 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
355 # $_ == Metadata content
356 my $mname = $self->{'metadata_name'};
357 if (defined $self->{'saved_metadata'}->{$mname}) {
358 if ($self->{'metadata_accumulate'}) {
359 # accumulate mode - add value to existing value(s)
360 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
361 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
362 } else {
363 $self->{'saved_metadata'}->{$mname} =
364 [$self->{'saved_metadata'}->{$mname}, $_];
365 }
366 } else {
367 # override mode
368 $self->{'saved_metadata'}->{$mname} = $_;
369 }
370 } else {
371 if ($self->{'metadata_accumulate'}) {
372 # accumulate mode - add value into (currently empty) array
373 $self->{'saved_metadata'}->{$mname} = [$_];
374 } else {
375 # override mode
376 $self->{'saved_metadata'}->{$mname} = $_;
377 }
378 }
379 }
380}
381
382# This Char function overrides the one in XML::Parser::Stream to overcome a
383# problem where $expat->{Text} is treated as the return value, slowing
384# things down significantly in some cases.
385sub Char {
386 $_[0]->{'Text'} .= $_[1];
387 return undef;
388}
389
390# Combine two metadata structures. Given two references to metadata
391# element structures, add every field of the second ($mdref2) to the first
392# ($mdref1).
393#
394# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
395#
396# We have to be acreful about the way we merge metadata when one metadata
397# structure is in "override" mode and one is in "merge" mode. In fact, we
398# use the mode from the second structure, $mdref2, because it is generally
399# defined later (lower in the directory structure) and is therefore more
400# "local" to the document concerned.
401#
402# Another issue is the use of references to pass metadata around. If we
403# simply copy one metadata structure reference to another, then we're
404# effectively justr copyinga pointer, and changes to the new referene
405# will affect the old (copied) one also. This also applies to ARRAY
406# references used as metadata element values (hence the "clonedata"
407# function below).
408
409sub combine_metadata_structures {
410 my ($mdref1, $mdref2) = @_;
411 my ($key, $value1, $value2);
412
413 foreach $key (keys %$mdref2) {
414
415 $value1 = $mdref1->{$key};
416 $value2 = $mdref2->{$key};
417
418 # If there is no existing value for this metadata field in
419 # $mdref1, so we simply copy the value from $mdref2 over.
420 if (!defined $value1) {
421 $mdref1->{$key} = &clonedata($value2);
422 }
423 # Otherwise we have to add the new values to the existing ones.
424 # If the second structure is accumulated, then acculate all the
425 # values into the first structure
426 elsif ((ref $value2) eq "ARRAY") {
427 # If the first metadata element is a scalar we have to
428 # convert it into an array before we add anything more.
429 if ((ref $value1) ne 'ARRAY') {
430 $mdref1->{$key} = [$value1];
431 $value1 = $mdref1->{$key};
432 }
433 # Now add the value(s) from the second array to the first
434 $value2 = &clonedata($value2);
435 push @$value1, @$value2;
436 }
437 # Finally, If the second structure is not an array erference, we
438 # know it is in override mode, so override the first structure.
439 else {
440 $mdref1->{$key} = &clonedata($value2);
441 }
442 }
443}
444
445
446# Make a "cloned" copy of a metadata value.
447# This is trivial for a simple scalar value,
448# but not for an array reference.
449
450sub clonedata {
451 my ($value) = @_;
452 my $result;
453
454 if ((ref $value) eq 'ARRAY') {
455 $result = [];
456 foreach my $item (@$value) {
457 push @$result, $item;
458 }
459 } else {
460 $result = $value;
461 }
462 return $result;
463}
464
465
4661;
Note: See TracBrowser for help on using the repository browser.