source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 4785

Last change on this file since 4785 was 4785, checked in by mdewsnip, 21 years ago

Commented out print_usage functions - plugins should now call $self->print_txt_usage() to display their usage text. Updates to the options of a plugin should be made in the $options and $arguments data structures at the top of the plugin.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata.xml" that specifies
31# metadata for the files (and subdirectories) in the directory.
32#
33# Here's an example of a metadata file that uses three FileSet structures
34# (ignore the # characters):
35
36#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
37#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
38#<DirectoryMetadata>
39# <FileSet>
40# <FileName>nugget.*</FileName>
41# <Description>
42# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
43# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
44# </Description>
45# </FileSet>
46# <FileSet>
47# <FileName>nugget-point-1.jpg</FileName>
48# <Description>
49# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
50# <Metadata name="Subject">Lighthouse</Metadata>
51# </Description>
52# </FileSet>
53# <FileSet>
54# <FileName>kaka-point-dir</FileName>
55# <Description>
56# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
57# </Description>
58# </FileSet>
59#</DirectoryMetadata>
60
61# Metadata elements are read and applied to files in the order they appear
62# in the file.
63#
64# The FileName element describes the subfiles in the directory that the
65# metadata applies to as a perl regular expression (a FileSet group may
66# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
67# indicates that the metadata records in the following Description block
68# apply to every subfile that starts with "nugget". For these files, a
69# Title metadata element is set, overriding any old value that the Title
70# might have had.
71#
72# Occasionally, we want to have multiple metadata values applied to a
73# document; in this case we use the "mode=accumulate" attribute of the
74# particular Metadata element. In the second metadata element of the first
75# FileSet above, the "Place" metadata is accumulating, and may therefore be
76# given several values. If we wanted to override these values and use a
77# single metadata element again, we could set the mode attribute to
78# "override" instead. Remember: every element is assumed to be in override
79# mode unless you specify otherwise, so if you want to accumulate metadata
80# for some field, every occurance must have "mode=accumulate" specified.
81#
82# The second FileSet element above applies to a specific file, called
83# nugget-point-1.jpg. This element overrides the Title metadata set in the
84# first FileSet, and adds a "Subject" metadata field.
85#
86# The third and final FileSet sets metadata for a subdirectory rather than
87# a file. The metadata specified (a Title) will be passed into the
88# subdirectory and applied to every file that occurs in the subdirectory
89# (and to every subsubdirectory and its contents, and so on) unless the
90# metadata is explictly overridden later in the import.
91
92
93
94package RecPlug;
95
96use BasPlug;
97use plugin;
98use util;
99
100
101BEGIN {
102 @ISA = ('BasPlug');
103 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
104}
105
106use XML::Parser;
107
108my $arguments =
109 [ { 'name' => "block_exp",
110 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
111 'type' => "string",
112 'deft' => &get_default_block_exp(),
113 'reqd' => "no" },
114 { 'name' => "use_metadata_files",
115 'desc' => "Read metadata from metadata XML files.",
116 'type' => "flag",
117 'reqd' => "no" } ];
118
119my $options = { 'name' => "RecPlug",
120 'desc' => "RecPlug is a plugin which recurses through directories processing each file it finds.",
121 'inherits' => "yes",
122 'args' => $arguments };
123
124# sub print_usage {
125# my ($plugin_name) = @_;
126
127# print STDERR "
128# usage: plugin RecPlug [options]
129
130# -use_metadata_files Read metadata from metadata XML files.
131
132# "
133# }
134
135my ($self);
136sub new {
137 my $class = shift (@_);
138
139 # $self is global for use within subroutines called by XML::Parser
140 $self = new BasPlug ($class, @_);
141
142 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
143 my $option_list = $self->{'option_list'};
144 push( @{$option_list}, $options );
145
146 if (!parsargv::parse(\@_,
147 q^use_metadata_files^, \$self->{'use_metadata_files'},
148 "allow_extra_options")) {
149 print STDERR "\nRecPlug uses an incorrect option.\n";
150 print STDERR "Check your collect.cfg configuration file.\n\n";
151 $self->print_txt_usage();
152 die "\n";
153 }
154
155 if ($self->{'use_metadata_files'}) {
156 # create XML::Parser object for parsing metadata.xml files
157 my $parser = new XML::Parser('Style' => 'Stream',
158 'Handlers' => {'Char' => \&Char,
159 'Doctype' => \&Doctype
160 });
161 $self->{'parser'} = $parser;
162 $self->{'in_filename'} = 0;
163
164 }
165
166 return bless $self, $class;
167}
168
169# return 1 if this class might recurse using $pluginfo
170sub is_recursive {
171 my $self = shift (@_);
172
173 return 1;
174}
175
176sub get_default_block_exp {
177 my $self = shift (@_);
178
179 return 'CVS';
180}
181
182# return number of files processed, undef if can't process
183# Note that $base_dir might be "" and that $file might
184# include directories
185
186# This function passes around metadata hash structures. Metadata hash
187# structures are hashes that map from a (scalar) key (the metadata element
188# name) to either a scalar metadata value or a reference to an array of
189# such values.
190
191sub read {
192 my $self = shift (@_);
193 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
194
195 my $outhandle = $self->{'outhandle'};
196 my $verbosity = $self->{'verbosity'};
197 my $read_metadata_files = $self->{'use_metadata_files'};
198
199 # Calculate the directory name and ensure it is a directory and
200 # that it is not explicitly blocked.
201 my $dirname = $file;
202 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
203 return undef unless (-d $dirname);
204 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
205
206 # check to make sure we're not reading the archives or index directory
207 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
208 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
209 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
210 return 0;
211 }
212
213 # check to see we haven't got a cyclic path...
214 if ($dirname =~ m%(/.*){,41}%) {
215 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
216 return 0;
217 }
218
219 # check to see we haven't got a cyclic path...
220 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
221 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
222 return 0;
223 }
224
225 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
226 print $outhandle "RecPlug: metadata passed in: ",
227 join(", ", keys %$in_metadata), "\n";
228 }
229
230 # Recur over directory contents.
231 my (@dir, $subfile);
232 my $count = 0;
233 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
234
235 # find all the files in the directory
236 if (!opendir (DIR, $dirname)) {
237 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
238 return undef;
239 }
240 @dir = readdir (DIR);
241 closedir (DIR);
242
243 # read XML metadata files (if supplied)
244 my $additionalmetadata = 0; # is there extra metadata available?
245 my %extrametadata; # maps from filespec to extra metadata keys
246 my @extrametakeys; # keys of %extrametadata in order read
247
248 if ($read_metadata_files) {
249
250 # read the directory "metadata.xml" file
251 my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml');
252 if (-e $metadatafile) {
253 print $outhandle "RecPlug: found metadata in $metadatafile\n"
254 if ($verbosity);
255 $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys);
256 $additionalmetadata = 1;
257 }
258 }
259
260 # import each of the files in the directory
261 my $out_metadata;
262 foreach $subfile (@dir) {
263
264 last if ($maxdocs != -1 && $count >= $maxdocs);
265 next if ($subfile =~ /^\.\.?$/);
266 next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
267
268 # check for a symlink pointing back to a leading directory
269 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
270 # readlink gives a "fatal error" on systems that don't implement
271 # symlinks. This assumes the the -l test above would fail on those.
272 my $linkdest=readlink "$dirname/$subfile";
273 if (!defined ($linkdest)) {
274 # system error - file not found?
275 warn "RecPlug: symlink problem - $!";
276 } else {
277 # see if link points to current or a parent directory
278 if ($linkdest =~ m@^[\./\\]+$@ ||
279 index($dirname, $linkdest) != -1) {
280 warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
281 next;
282 ;
283 }
284 }
285 }
286
287 print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
288
289 # Make a copy of $in_metadata to pass to $subfile
290 $out_metadata = {};
291 &combine_metadata_structures($out_metadata, $in_metadata);
292
293 # Next add metadata read in XML files (if it is supplied)
294 if ($additionalmetadata == 1) {
295
296 my ($filespec, $mdref);
297 foreach $filespec (@extrametakeys) {
298 if ($subfile =~ /^$filespec$/) {
299 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
300 if ($verbosity > 2);
301 $mdref = $extrametadata{$filespec};
302 &combine_metadata_structures($out_metadata, $mdref);
303 }
304 }
305 }
306
307 # Recursively read each $subfile
308 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
309 $count += &plugin::read ($pluginfo, $base_dir,
310 &util::filename_cat($file, $subfile),
311 $out_metadata, $processor, $maxdocs);
312 }
313 return $count;
314
315}
316
317
318
319# Read a manually-constructed metadata file and store the data
320# it contains in the $metadataref structure.
321#
322# (metadataref is a reference to a hash whose keys are filenames
323# and whose values are metadata hash structures.)
324
325sub read_metadata_xml_file {
326 my $self = shift(@_);
327 my ($filename, $metadataref, $metakeysref) = @_;
328 $self->{'metadataref'} = $metadataref;
329 $self->{'metakeysref'} = $metakeysref;
330
331 eval {
332 $self->{'parser'}->parsefile($filename);
333 };
334 if ($@) {
335 die "RecPlug: ERROR $filename is not a well formed metadata.xml file ($@)\n";
336 }
337}
338
339sub Doctype {
340 my ($expat, $name, $sysid, $pubid, $internal) = @_;
341
342 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
343 # to be processed as well as the "DirectoryMetadata" files which should now
344 # be created by import.pl
345 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
346}
347
348sub StartTag {
349 my ($expat, $element) = @_;
350
351 if ($element eq "FileSet") {
352 $self->{'saved_targets'} = [];
353 $self->{'saved_metadata'} = {};
354 }
355 elsif ($element eq "FileName") {
356 $self->{'in_filename'} = 1;
357 }
358 elsif ($element eq "Metadata") {
359 $self->{'metadata_name'} = $_{'name'};
360 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
361 $self->{'metadata_accumulate'} = 1;
362 } else {
363 $self->{'metadata_accumulate'} = 0;
364 }
365 }
366}
367
368sub EndTag {
369 my ($expat, $element) = @_;
370
371 if ($element eq "FileSet") {
372 push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
373 foreach my $target (@{$self->{'saved_targets'}}) {
374 $self->{'metadataref'}->{$target} = $self->{'saved_metadata'};
375 }
376 }
377 elsif ($element eq "FileName") {
378 $self->{'in_filename'} = 0;
379 }
380 elsif ($element eq "Metadata") {
381 $self->{'metadata_name'} = "";
382 }
383
384}
385
386sub Text {
387
388 if ($self->{'in_filename'}) {
389 # $_ == FileName content
390 push (@{$self->{'saved_targets'}}, $_);
391 }
392 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
393 # $_ == Metadata content
394 my $mname = $self->{'metadata_name'};
395 if (defined $self->{'saved_metadata'}->{$mname}) {
396 if ($self->{'metadata_accumulate'}) {
397 # accumulate mode - add value to existing value(s)
398 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
399 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
400 } else {
401 $self->{'saved_metadata'}->{$mname} =
402 [$self->{'saved_metadata'}->{$mname}, $_];
403 }
404 } else {
405 # override mode
406 $self->{'saved_metadata'}->{$mname} = $_;
407 }
408 } else {
409 if ($self->{'metadata_accumulate'}) {
410 # accumulate mode - add value into (currently empty) array
411 $self->{'saved_metadata'}->{$mname} = [$_];
412 } else {
413 # override mode
414 $self->{'saved_metadata'}->{$mname} = $_;
415 }
416 }
417 }
418}
419
420# This Char function overrides the one in XML::Parser::Stream to overcome a
421# problem where $expat->{Text} is treated as the return value, slowing
422# things down significantly in some cases.
423sub Char {
424 $_[0]->{'Text'} .= $_[1];
425 return undef;
426}
427
428# Combine two metadata structures. Given two references to metadata
429# element structures, add every field of the second ($mdref2) to the first
430# ($mdref1).
431#
432# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
433#
434# We have to be acreful about the way we merge metadata when one metadata
435# structure is in "override" mode and one is in "merge" mode. In fact, we
436# use the mode from the second structure, $mdref2, because it is generally
437# defined later (lower in the directory structure) and is therefore more
438# "local" to the document concerned.
439#
440# Another issue is the use of references to pass metadata around. If we
441# simply copy one metadata structure reference to another, then we're
442# effectively justr copyinga pointer, and changes to the new referene
443# will affect the old (copied) one also. This also applies to ARRAY
444# references used as metadata element values (hence the "clonedata"
445# function below).
446
447sub combine_metadata_structures {
448 my ($mdref1, $mdref2) = @_;
449 my ($key, $value1, $value2);
450
451 foreach $key (keys %$mdref2) {
452
453 $value1 = $mdref1->{$key};
454 $value2 = $mdref2->{$key};
455
456 # If there is no existing value for this metadata field in
457 # $mdref1, so we simply copy the value from $mdref2 over.
458 if (!defined $value1) {
459 $mdref1->{$key} = &clonedata($value2);
460 }
461 # Otherwise we have to add the new values to the existing ones.
462 # If the second structure is accumulated, then acculate all the
463 # values into the first structure
464 elsif ((ref $value2) eq "ARRAY") {
465 # If the first metadata element is a scalar we have to
466 # convert it into an array before we add anything more.
467 if ((ref $value1) ne 'ARRAY') {
468 $mdref1->{$key} = [$value1];
469 $value1 = $mdref1->{$key};
470 }
471 # Now add the value(s) from the second array to the first
472 $value2 = &clonedata($value2);
473 push @$value1, @$value2;
474 }
475 # Finally, If the second structure is not an array erference, we
476 # know it is in override mode, so override the first structure.
477 else {
478 $mdref1->{$key} = &clonedata($value2);
479 }
480 }
481}
482
483
484# Make a "cloned" copy of a metadata value.
485# This is trivial for a simple scalar value,
486# but not for an array reference.
487
488sub clonedata {
489 my ($value) = @_;
490 my $result;
491
492 if ((ref $value) eq 'ARRAY') {
493 $result = [];
494 foreach my $item (@$value) {
495 push @$result, $item;
496 }
497 } else {
498 $result = $value;
499 }
500 return $result;
501}
502
503
5041;
Note: See TracBrowser for help on using the repository browser.