########################################################################### # # RecPlug.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # RecPlug is a plugin which recurses through directories processing # each file it finds. # RecPlug has one option: use_metadata_files. When this is set, it will # check each directory for an XML file called "metadata.xml" that specifies # metadata for the files (and subdirectories) in the directory. # # Here's an example of a metadata file that uses three FileSet structures # (ignore the # characters): # # # # # nugget.* # # Nugget Point, The Catlins # Nugget Point # # # # nugget-point-1.jpg # # Nugget Point Lighthouse, The Catlins # Lighthouse # # # # kaka-point-dir # # Kaka Point, The Catlins # # # # Metadata elements are read and applied to files in the order they appear # in the file. # # The FileName element describes the subfiles in the directory that the # metadata applies to as a perl regular expression (a FileSet group may # contain multiple FileName elements). So, nugget.* # indicates that the metadata records in the following Description block # apply to every subfile that starts with "nugget". For these files, a # Title metadata element is set, overriding any old value that the Title # might have had. # # Occasionally, we want to have multiple metadata values applied to a # document; in this case we use the "mode=accumulate" attribute of the # particular Metadata element. In the second metadata element of the first # FileSet above, the "Place" metadata is accumulating, and may therefore be # given several values. If we wanted to override these values and use a # single metadata element again, we could set the mode attribute to # "override" instead. Remember: every element is assumed to be in override # mode unless you specify otherwise, so if you want to accumulate metadata # for some field, every occurance must have "mode=accumulate" specified. # # The second FileSet element above applies to a specific file, called # nugget-point-1.jpg. This element overrides the Title metadata set in the # first FileSet, and adds a "Subject" metadata field. # # The third and final FileSet sets metadata for a subdirectory rather than # a file. The metadata specified (a Title) will be passed into the # subdirectory and applied to every file that occurs in the subdirectory # (and to every subsubdirectory and its contents, and so on) unless the # metadata is explictly overridden later in the import. package RecPlug; use BasPlug; use plugin; use util; BEGIN { @ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } use XML::Parser; my $arguments = [ { 'name' => "block_exp", 'desc' => "{BasPlug.block_exp}", 'type' => "string", 'deft' => &get_default_block_exp(), 'reqd' => "no" }, { 'name' => "use_metadata_files", 'desc' => "{RecPlug.use_metadata_files}", 'type' => "flag", 'reqd' => "no" } ]; my $options = { 'name' => "RecPlug", 'desc' => "RecPlug is a plugin which recurses through directories processing each file it finds.", 'inherits' => "yes", 'args' => $arguments }; # sub print_usage { # my ($plugin_name) = @_; # print STDERR " # usage: plugin RecPlug [options] # -use_metadata_files Read metadata from metadata XML files. # " # } my ($self); sub new { my $class = shift (@_); # $self is global for use within subroutines called by XML::Parser $self = new BasPlug ($class, @_); # 14-05-02 To allow for proper inheritance of arguments - John Thompson my $option_list = $self->{'option_list'}; push( @{$option_list}, $options ); if (!parsargv::parse(\@_, q^use_metadata_files^, \$self->{'use_metadata_files'}, q^show_progress^, \$self->{'show_progress'}, # Undocumented (for GLI) "allow_extra_options")) { print STDERR "\nRecPlug uses an incorrect option.\n"; print STDERR "Check your collect.cfg configuration file.\n\n"; $self->print_txt_usage(""); # Use default resource bundle die "\n"; } if ($self->{'use_metadata_files'}) { # create XML::Parser object for parsing metadata.xml files my $parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'Doctype' => \&Doctype }); $self->{'parser'} = $parser; $self->{'in_filename'} = 0; } return bless $self, $class; } # return 1 if this class might recurse using $pluginfo sub is_recursive { my $self = shift (@_); return 1; } sub get_default_block_exp { my $self = shift (@_); return 'CVS'; } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories # This function passes around metadata hash structures. Metadata hash # structures are hashes that map from a (scalar) key (the metadata element # name) to either a scalar metadata value or a reference to an array of # such values. sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $verbosity = $self->{'verbosity'}; my $read_metadata_files = $self->{'use_metadata_files'}; # Calculate the directory name and ensure it is a directory and # that it is not explicitly blocked. my $dirname = $file; $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; return undef unless (-d $dirname); return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); # check to make sure we're not reading the archives or index directory my $gsdlhome = quotemeta($ENV{'GSDLHOME'}); if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) { print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%(/.*){,41}%) { print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) { print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n"; return 0; } if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) { print $outhandle "RecPlug: metadata passed in: ", join(", ", keys %$in_metadata), "\n"; } # Recur over directory contents. my (@dir, $subfile); my $count = 0; print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity); # find all the files in the directory if (!opendir (DIR, $dirname)) { print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n"; return undef; } @dir = readdir (DIR); closedir (DIR); # read XML metadata files (if supplied) my $additionalmetadata = 0; # is there extra metadata available? my %extrametadata; # maps from filespec to extra metadata keys my @extrametakeys; # keys of %extrametadata in order read if ($read_metadata_files) { # read the directory "metadata.xml" file my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml'); if (-e $metadatafile) { print $outhandle "RecPlug: found metadata in $metadatafile\n" if ($verbosity); $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys); $additionalmetadata = 1; } } # import each of the files in the directory my $out_metadata; foreach $subfile (@dir) { last if ($maxdocs != -1 && $count >= $maxdocs); next if ($subfile =~ /^\.\.?$/); next if ($read_metadata_files && $subfile =~ /metadata\.xml$/); # check for a symlink pointing back to a leading directory if (-d "$dirname/$subfile" && -l "$dirname/$subfile") { # readlink gives a "fatal error" on systems that don't implement # symlinks. This assumes the the -l test above would fail on those. my $linkdest=readlink "$dirname/$subfile"; if (!defined ($linkdest)) { # system error - file not found? warn "RecPlug: symlink problem - $!"; } else { # see if link points to current or a parent directory if ($linkdest =~ m@^[\./\\]+$@ || index($dirname, $linkdest) != -1) { warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n"; next; ; } } } print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2); # Make a copy of $in_metadata to pass to $subfile $out_metadata = {}; &combine_metadata_structures($out_metadata, $in_metadata); # Next add metadata read in XML files (if it is supplied) if ($additionalmetadata == 1) { my ($filespec, $mdref); foreach $filespec (@extrametakeys) { if ($subfile =~ /^$filespec$/) { print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" if ($verbosity > 2); $mdref = $extrametadata{$filespec}; &combine_metadata_structures($out_metadata, $mdref); } } } # Recursively read each $subfile print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2); print $outhandle "RecPlug - $subfile\n" if ($self->{'show_progress'}); $count += &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile), $out_metadata, $processor, $maxdocs); } return $count; } # Read a manually-constructed metadata file and store the data # it contains in the $metadataref structure. # # (metadataref is a reference to a hash whose keys are filenames # and whose values are metadata hash structures.) sub read_metadata_xml_file { my $self = shift(@_); my ($filename, $metadataref, $metakeysref) = @_; $self->{'metadataref'} = $metadataref; $self->{'metakeysref'} = $metakeysref; eval { $self->{'parser'}->parsefile($filename); }; if ($@) { die "RecPlug: ERROR $filename is not a well formed metadata.xml file ($@)\n"; } } sub Doctype { my ($expat, $name, $sysid, $pubid, $internal) = @_; # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files # to be processed as well as the "DirectoryMetadata" files which should now # be created by import.pl die if ($name !~ /^(Greenstone)?DirectoryMetadata$/); } sub StartTag { my ($expat, $element) = @_; if ($element eq "FileSet") { $self->{'saved_targets'} = []; $self->{'saved_metadata'} = {}; } elsif ($element eq "FileName") { $self->{'in_filename'} = 1; } elsif ($element eq "Metadata") { $self->{'metadata_name'} = $_{'name'}; if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) { $self->{'metadata_accumulate'} = 1; } else { $self->{'metadata_accumulate'} = 0; } } } sub EndTag { my ($expat, $element) = @_; if ($element eq "FileSet") { push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}}); foreach my $target (@{$self->{'saved_targets'}}) { $self->{'metadataref'}->{$target} = $self->{'saved_metadata'}; } } elsif ($element eq "FileName") { $self->{'in_filename'} = 0; } elsif ($element eq "Metadata") { $self->{'metadata_name'} = ""; } } sub Text { if ($self->{'in_filename'}) { # $_ == FileName content push (@{$self->{'saved_targets'}}, $_); } elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") { # $_ == Metadata content my $mname = $self->{'metadata_name'}; if (defined $self->{'saved_metadata'}->{$mname}) { if ($self->{'metadata_accumulate'}) { # accumulate mode - add value to existing value(s) if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") { push (@{$self->{'saved_metadata'}->{$mname}}, $_); } else { $self->{'saved_metadata'}->{$mname} = [$self->{'saved_metadata'}->{$mname}, $_]; } } else { # override mode $self->{'saved_metadata'}->{$mname} = $_; } } else { if ($self->{'metadata_accumulate'}) { # accumulate mode - add value into (currently empty) array $self->{'saved_metadata'}->{$mname} = [$_]; } else { # override mode $self->{'saved_metadata'}->{$mname} = $_; } } } } # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { $_[0]->{'Text'} .= $_[1]; return undef; } # Combine two metadata structures. Given two references to metadata # element structures, add every field of the second ($mdref2) to the first # ($mdref1). # # Afterwards $mdref1 will be updated, and $mdref2 will be unchanged. # # We have to be acreful about the way we merge metadata when one metadata # structure is in "override" mode and one is in "merge" mode. In fact, we # use the mode from the second structure, $mdref2, because it is generally # defined later (lower in the directory structure) and is therefore more # "local" to the document concerned. # # Another issue is the use of references to pass metadata around. If we # simply copy one metadata structure reference to another, then we're # effectively justr copyinga pointer, and changes to the new referene # will affect the old (copied) one also. This also applies to ARRAY # references used as metadata element values (hence the "clonedata" # function below). sub combine_metadata_structures { my ($mdref1, $mdref2) = @_; my ($key, $value1, $value2); foreach $key (keys %$mdref2) { $value1 = $mdref1->{$key}; $value2 = $mdref2->{$key}; # If there is no existing value for this metadata field in # $mdref1, so we simply copy the value from $mdref2 over. if (!defined $value1) { $mdref1->{$key} = &clonedata($value2); } # Otherwise we have to add the new values to the existing ones. # If the second structure is accumulated, then acculate all the # values into the first structure elsif ((ref $value2) eq "ARRAY") { # If the first metadata element is a scalar we have to # convert it into an array before we add anything more. if ((ref $value1) ne 'ARRAY') { $mdref1->{$key} = [$value1]; $value1 = $mdref1->{$key}; } # Now add the value(s) from the second array to the first $value2 = &clonedata($value2); push @$value1, @$value2; } # Finally, If the second structure is not an array erference, we # know it is in override mode, so override the first structure. else { $mdref1->{$key} = &clonedata($value2); } } } # Make a "cloned" copy of a metadata value. # This is trivial for a simple scalar value, # but not for an array reference. sub clonedata { my ($value) = @_; my $result; if ((ref $value) eq 'ARRAY') { $result = []; foreach my $item (@$value) { push @$result, $item; } } else { $result = $value; } return $result; } 1;