source: trunk/gsdl/perllib/plugins/RecPlug.pm@ 5295

Last change on this file since 5295 was 5295, checked in by mdewsnip, 21 years ago

Added one tiny little option to help the GLI out with monitoring the progress of importing.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1###########################################################################
2#
3# RecPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# RecPlug is a plugin which recurses through directories processing
27# each file it finds.
28
29# RecPlug has one option: use_metadata_files. When this is set, it will
30# check each directory for an XML file called "metadata.xml" that specifies
31# metadata for the files (and subdirectories) in the directory.
32#
33# Here's an example of a metadata file that uses three FileSet structures
34# (ignore the # characters):
35
36#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
37#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
38#<DirectoryMetadata>
39# <FileSet>
40# <FileName>nugget.*</FileName>
41# <Description>
42# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
43# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
44# </Description>
45# </FileSet>
46# <FileSet>
47# <FileName>nugget-point-1.jpg</FileName>
48# <Description>
49# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
50# <Metadata name="Subject">Lighthouse</Metadata>
51# </Description>
52# </FileSet>
53# <FileSet>
54# <FileName>kaka-point-dir</FileName>
55# <Description>
56# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
57# </Description>
58# </FileSet>
59#</DirectoryMetadata>
60
61# Metadata elements are read and applied to files in the order they appear
62# in the file.
63#
64# The FileName element describes the subfiles in the directory that the
65# metadata applies to as a perl regular expression (a FileSet group may
66# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
67# indicates that the metadata records in the following Description block
68# apply to every subfile that starts with "nugget". For these files, a
69# Title metadata element is set, overriding any old value that the Title
70# might have had.
71#
72# Occasionally, we want to have multiple metadata values applied to a
73# document; in this case we use the "mode=accumulate" attribute of the
74# particular Metadata element. In the second metadata element of the first
75# FileSet above, the "Place" metadata is accumulating, and may therefore be
76# given several values. If we wanted to override these values and use a
77# single metadata element again, we could set the mode attribute to
78# "override" instead. Remember: every element is assumed to be in override
79# mode unless you specify otherwise, so if you want to accumulate metadata
80# for some field, every occurance must have "mode=accumulate" specified.
81#
82# The second FileSet element above applies to a specific file, called
83# nugget-point-1.jpg. This element overrides the Title metadata set in the
84# first FileSet, and adds a "Subject" metadata field.
85#
86# The third and final FileSet sets metadata for a subdirectory rather than
87# a file. The metadata specified (a Title) will be passed into the
88# subdirectory and applied to every file that occurs in the subdirectory
89# (and to every subsubdirectory and its contents, and so on) unless the
90# metadata is explictly overridden later in the import.
91
92
93
94package RecPlug;
95
96use BasPlug;
97use plugin;
98use util;
99
100
101BEGIN {
102 @ISA = ('BasPlug');
103 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
104}
105
106use XML::Parser;
107
108my $arguments =
109 [ { 'name' => "block_exp",
110 'desc' => "{BasPlug.block_exp}",
111 'type' => "string",
112 'deft' => &get_default_block_exp(),
113 'reqd' => "no" },
114 { 'name' => "use_metadata_files",
115 'desc' => "{RecPlug.use_metadata_files}",
116 'type' => "flag",
117 'reqd' => "no" } ];
118
119my $options = { 'name' => "RecPlug",
120 'desc' => "RecPlug is a plugin which recurses through directories processing each file it finds.",
121 'inherits' => "yes",
122 'args' => $arguments };
123
124# sub print_usage {
125# my ($plugin_name) = @_;
126
127# print STDERR "
128# usage: plugin RecPlug [options]
129
130# -use_metadata_files Read metadata from metadata XML files.
131
132# "
133# }
134
135my ($self);
136sub new {
137 my $class = shift (@_);
138
139 # $self is global for use within subroutines called by XML::Parser
140 $self = new BasPlug ($class, @_);
141
142 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
143 my $option_list = $self->{'option_list'};
144 push( @{$option_list}, $options );
145
146 if (!parsargv::parse(\@_,
147 q^use_metadata_files^, \$self->{'use_metadata_files'},
148 q^show_progress^, \$self->{'show_progress'}, # Undocumented (for GLI)
149 "allow_extra_options")) {
150 print STDERR "\nRecPlug uses an incorrect option.\n";
151 print STDERR "Check your collect.cfg configuration file.\n\n";
152 $self->print_txt_usage(""); # Use default resource bundle
153 die "\n";
154 }
155
156 if ($self->{'use_metadata_files'}) {
157 # create XML::Parser object for parsing metadata.xml files
158 my $parser = new XML::Parser('Style' => 'Stream',
159 'Handlers' => {'Char' => \&Char,
160 'Doctype' => \&Doctype
161 });
162 $self->{'parser'} = $parser;
163 $self->{'in_filename'} = 0;
164
165 }
166
167 return bless $self, $class;
168}
169
170# return 1 if this class might recurse using $pluginfo
171sub is_recursive {
172 my $self = shift (@_);
173
174 return 1;
175}
176
177sub get_default_block_exp {
178 my $self = shift (@_);
179
180 return 'CVS';
181}
182
183# return number of files processed, undef if can't process
184# Note that $base_dir might be "" and that $file might
185# include directories
186
187# This function passes around metadata hash structures. Metadata hash
188# structures are hashes that map from a (scalar) key (the metadata element
189# name) to either a scalar metadata value or a reference to an array of
190# such values.
191
192sub read {
193 my $self = shift (@_);
194 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_;
195
196 my $outhandle = $self->{'outhandle'};
197 my $verbosity = $self->{'verbosity'};
198 my $read_metadata_files = $self->{'use_metadata_files'};
199
200 # Calculate the directory name and ensure it is a directory and
201 # that it is not explicitly blocked.
202 my $dirname = $file;
203 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
204 return undef unless (-d $dirname);
205 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
206
207 # check to make sure we're not reading the archives or index directory
208 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
209 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
210 print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
211 return 0;
212 }
213
214 # check to see we haven't got a cyclic path...
215 if ($dirname =~ m%(/.*){,41}%) {
216 print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
217 return 0;
218 }
219
220 # check to see we haven't got a cyclic path...
221 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
222 print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
223 return 0;
224 }
225
226 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
227 print $outhandle "RecPlug: metadata passed in: ",
228 join(", ", keys %$in_metadata), "\n";
229 }
230
231 # Recur over directory contents.
232 my (@dir, $subfile);
233 my $count = 0;
234 print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
235
236 # find all the files in the directory
237 if (!opendir (DIR, $dirname)) {
238 print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
239 return undef;
240 }
241 @dir = readdir (DIR);
242 closedir (DIR);
243
244 # read XML metadata files (if supplied)
245 my $additionalmetadata = 0; # is there extra metadata available?
246 my %extrametadata; # maps from filespec to extra metadata keys
247 my @extrametakeys; # keys of %extrametadata in order read
248
249 if ($read_metadata_files) {
250
251 # read the directory "metadata.xml" file
252 my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml');
253 if (-e $metadatafile) {
254 print $outhandle "RecPlug: found metadata in $metadatafile\n"
255 if ($verbosity);
256 $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys);
257 $additionalmetadata = 1;
258 }
259 }
260
261 # import each of the files in the directory
262 my $out_metadata;
263 foreach $subfile (@dir) {
264
265 last if ($maxdocs != -1 && $count >= $maxdocs);
266 next if ($subfile =~ /^\.\.?$/);
267 next if ($read_metadata_files && $subfile =~ /metadata\.xml$/);
268
269 # check for a symlink pointing back to a leading directory
270 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
271 # readlink gives a "fatal error" on systems that don't implement
272 # symlinks. This assumes the the -l test above would fail on those.
273 my $linkdest=readlink "$dirname/$subfile";
274 if (!defined ($linkdest)) {
275 # system error - file not found?
276 warn "RecPlug: symlink problem - $!";
277 } else {
278 # see if link points to current or a parent directory
279 if ($linkdest =~ m@^[\./\\]+$@ ||
280 index($dirname, $linkdest) != -1) {
281 warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
282 next;
283 ;
284 }
285 }
286 }
287
288 print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
289
290 # Make a copy of $in_metadata to pass to $subfile
291 $out_metadata = {};
292 &combine_metadata_structures($out_metadata, $in_metadata);
293
294 # Next add metadata read in XML files (if it is supplied)
295 if ($additionalmetadata == 1) {
296
297 my ($filespec, $mdref);
298 foreach $filespec (@extrametakeys) {
299 if ($subfile =~ /^$filespec$/) {
300 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
301 if ($verbosity > 2);
302 $mdref = $extrametadata{$filespec};
303 &combine_metadata_structures($out_metadata, $mdref);
304 }
305 }
306 }
307
308 # Recursively read each $subfile
309 print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
310 print $outhandle "RecPlug - $subfile\n" if ($self->{'show_progress'});
311
312 $count += &plugin::read ($pluginfo, $base_dir,
313 &util::filename_cat($file, $subfile),
314 $out_metadata, $processor, $maxdocs);
315 }
316 return $count;
317
318}
319
320
321
322# Read a manually-constructed metadata file and store the data
323# it contains in the $metadataref structure.
324#
325# (metadataref is a reference to a hash whose keys are filenames
326# and whose values are metadata hash structures.)
327
328sub read_metadata_xml_file {
329 my $self = shift(@_);
330 my ($filename, $metadataref, $metakeysref) = @_;
331 $self->{'metadataref'} = $metadataref;
332 $self->{'metakeysref'} = $metakeysref;
333
334 eval {
335 $self->{'parser'}->parsefile($filename);
336 };
337 if ($@) {
338 die "RecPlug: ERROR $filename is not a well formed metadata.xml file ($@)\n";
339 }
340}
341
342sub Doctype {
343 my ($expat, $name, $sysid, $pubid, $internal) = @_;
344
345 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
346 # to be processed as well as the "DirectoryMetadata" files which should now
347 # be created by import.pl
348 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
349}
350
351sub StartTag {
352 my ($expat, $element) = @_;
353
354 if ($element eq "FileSet") {
355 $self->{'saved_targets'} = [];
356 $self->{'saved_metadata'} = {};
357 }
358 elsif ($element eq "FileName") {
359 $self->{'in_filename'} = 1;
360 }
361 elsif ($element eq "Metadata") {
362 $self->{'metadata_name'} = $_{'name'};
363 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
364 $self->{'metadata_accumulate'} = 1;
365 } else {
366 $self->{'metadata_accumulate'} = 0;
367 }
368 }
369}
370
371sub EndTag {
372 my ($expat, $element) = @_;
373
374 if ($element eq "FileSet") {
375 push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
376 foreach my $target (@{$self->{'saved_targets'}}) {
377 $self->{'metadataref'}->{$target} = $self->{'saved_metadata'};
378 }
379 }
380 elsif ($element eq "FileName") {
381 $self->{'in_filename'} = 0;
382 }
383 elsif ($element eq "Metadata") {
384 $self->{'metadata_name'} = "";
385 }
386
387}
388
389sub Text {
390
391 if ($self->{'in_filename'}) {
392 # $_ == FileName content
393 push (@{$self->{'saved_targets'}}, $_);
394 }
395 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
396 # $_ == Metadata content
397 my $mname = $self->{'metadata_name'};
398 if (defined $self->{'saved_metadata'}->{$mname}) {
399 if ($self->{'metadata_accumulate'}) {
400 # accumulate mode - add value to existing value(s)
401 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
402 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
403 } else {
404 $self->{'saved_metadata'}->{$mname} =
405 [$self->{'saved_metadata'}->{$mname}, $_];
406 }
407 } else {
408 # override mode
409 $self->{'saved_metadata'}->{$mname} = $_;
410 }
411 } else {
412 if ($self->{'metadata_accumulate'}) {
413 # accumulate mode - add value into (currently empty) array
414 $self->{'saved_metadata'}->{$mname} = [$_];
415 } else {
416 # override mode
417 $self->{'saved_metadata'}->{$mname} = $_;
418 }
419 }
420 }
421}
422
423# This Char function overrides the one in XML::Parser::Stream to overcome a
424# problem where $expat->{Text} is treated as the return value, slowing
425# things down significantly in some cases.
426sub Char {
427 $_[0]->{'Text'} .= $_[1];
428 return undef;
429}
430
431# Combine two metadata structures. Given two references to metadata
432# element structures, add every field of the second ($mdref2) to the first
433# ($mdref1).
434#
435# Afterwards $mdref1 will be updated, and $mdref2 will be unchanged.
436#
437# We have to be acreful about the way we merge metadata when one metadata
438# structure is in "override" mode and one is in "merge" mode. In fact, we
439# use the mode from the second structure, $mdref2, because it is generally
440# defined later (lower in the directory structure) and is therefore more
441# "local" to the document concerned.
442#
443# Another issue is the use of references to pass metadata around. If we
444# simply copy one metadata structure reference to another, then we're
445# effectively justr copyinga pointer, and changes to the new referene
446# will affect the old (copied) one also. This also applies to ARRAY
447# references used as metadata element values (hence the "clonedata"
448# function below).
449
450sub combine_metadata_structures {
451 my ($mdref1, $mdref2) = @_;
452 my ($key, $value1, $value2);
453
454 foreach $key (keys %$mdref2) {
455
456 $value1 = $mdref1->{$key};
457 $value2 = $mdref2->{$key};
458
459 # If there is no existing value for this metadata field in
460 # $mdref1, so we simply copy the value from $mdref2 over.
461 if (!defined $value1) {
462 $mdref1->{$key} = &clonedata($value2);
463 }
464 # Otherwise we have to add the new values to the existing ones.
465 # If the second structure is accumulated, then acculate all the
466 # values into the first structure
467 elsif ((ref $value2) eq "ARRAY") {
468 # If the first metadata element is a scalar we have to
469 # convert it into an array before we add anything more.
470 if ((ref $value1) ne 'ARRAY') {
471 $mdref1->{$key} = [$value1];
472 $value1 = $mdref1->{$key};
473 }
474 # Now add the value(s) from the second array to the first
475 $value2 = &clonedata($value2);
476 push @$value1, @$value2;
477 }
478 # Finally, If the second structure is not an array erference, we
479 # know it is in override mode, so override the first structure.
480 else {
481 $mdref1->{$key} = &clonedata($value2);
482 }
483 }
484}
485
486
487# Make a "cloned" copy of a metadata value.
488# This is trivial for a simple scalar value,
489# but not for an array reference.
490
491sub clonedata {
492 my ($value) = @_;
493 my $result;
494
495 if ((ref $value) eq 'ARRAY') {
496 $result = [];
497 foreach my $item (@$value) {
498 push @$result, $item;
499 }
500 } else {
501 $result = $value;
502 }
503 return $result;
504}
505
506
5071;
Note: See TracBrowser for help on using the repository browser.