########################################################################### # # RecPlug.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # RecPlug is a plugin which recurses through directories processing # each file it finds. # RecPlug has one option: use_metadata_files. When this is set, it will # check each directory for an XML file called "metadata.xml" that specifies # metadata for the files (and subdirectories) in the directory. # # Here's an example of a metadata file that uses three FileSet structures # (ignore the # characters): # # # # # nugget.* # # Nugget Point, The Catlins # Nugget Point # # # # nugget-point-1.jpg # # Nugget Point Lighthouse, The Catlins # Lighthouse # # # # kaka-point-dir # # Kaka Point, The Catlins # # # # Metadata elements are read and applied to files in the order they appear # in the file. # # The FileName element describes the subfiles in the directory that the # metadata applies to as a perl regular expression (a FileSet group may # contain multiple FileName elements). So, nugget.* # indicates that the metadata records in the following Description block # apply to every subfile that starts with "nugget". For these files, a # Title metadata element is set, overriding any old value that the Title # might have had. # # Occasionally, we want to have multiple metadata values applied to a # document; in this case we use the "mode=accumulate" attribute of the # particular Metadata element. In the second metadata element of the first # FileSet above, the "Place" metadata is accumulating, and may therefore be # given several values. If we wanted to override these values and use a # single metadata element again, we could set the mode attribute to # "override" instead. Remember: every element is assumed to be in override # mode unless you specify otherwise, so if you want to accumulate metadata # for some field, every occurance must have "mode=accumulate" specified. # # The second FileSet element above applies to a specific file, called # nugget-point-1.jpg. This element overrides the Title metadata set in the # first FileSet, and adds a "Subject" metadata field. # # The third and final FileSet sets metadata for a subdirectory rather than # a file. The metadata specified (a Title) will be passed into the # subdirectory and applied to every file that occurs in the subdirectory # (and to every subsubdirectory and its contents, and so on) unless the # metadata is explictly overridden later in the import. package RecPlug; use BasPlug; use plugin; use util; BEGIN { @ISA = ('BasPlug'); unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); } use XML::Parser; my $arguments = [ { 'name' => "block_exp", 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 'type' => "string", 'deft' => &get_default_block_exp(), 'reqd' => "no" }, { 'name' => "use_metadata_files", 'desc' => "Read metadata from metadata XML files.", 'type' => "flag", 'reqd' => "no" } ]; my $options = { 'name' => "RecPlug", 'desc' => "RecPlug is a plugin which recurses through directories processing each file it finds.", 'inherits' => "yes", 'args' => $arguments }; # sub print_usage { # my ($plugin_name) = @_; # print STDERR " # usage: plugin RecPlug [options] # -use_metadata_files Read metadata from metadata XML files. # " # } my ($self); sub new { my $class = shift (@_); # $self is global for use within subroutines called by XML::Parser $self = new BasPlug ($class, @_); # 14-05-02 To allow for proper inheritance of arguments - John Thompson my $option_list = $self->{'option_list'}; push( @{$option_list}, $options ); if (!parsargv::parse(\@_, q^use_metadata_files^, \$self->{'use_metadata_files'}, "allow_extra_options")) { print STDERR "\nRecPlug uses an incorrect option.\n"; print STDERR "Check your collect.cfg configuration file.\n\n"; $self->print_txt_usage(); die "\n"; } if ($self->{'use_metadata_files'}) { # create XML::Parser object for parsing metadata.xml files my $parser = new XML::Parser('Style' => 'Stream', 'Handlers' => {'Char' => \&Char, 'Doctype' => \&Doctype }); $self->{'parser'} = $parser; $self->{'in_filename'} = 0; } return bless $self, $class; } # return 1 if this class might recurse using $pluginfo sub is_recursive { my $self = shift (@_); return 1; } sub get_default_block_exp { my $self = shift (@_); return 'CVS'; } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories # This function passes around metadata hash structures. Metadata hash # structures are hashes that map from a (scalar) key (the metadata element # name) to either a scalar metadata value or a reference to an array of # such values. sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs) = @_; my $outhandle = $self->{'outhandle'}; my $verbosity = $self->{'verbosity'}; my $read_metadata_files = $self->{'use_metadata_files'}; # Calculate the directory name and ensure it is a directory and # that it is not explicitly blocked. my $dirname = $file; $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; return undef unless (-d $dirname); return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); # check to make sure we're not reading the archives or index directory my $gsdlhome = quotemeta($ENV{'GSDLHOME'}); if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) { print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%(/.*){,41}%) { print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) { print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n"; return 0; } if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) { print $outhandle "RecPlug: metadata passed in: ", join(", ", keys %$in_metadata), "\n"; } # Recur over directory contents. my (@dir, $subfile); my $count = 0; print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity); # find all the files in the directory if (!opendir (DIR, $dirname)) { print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n"; return undef; } @dir = readdir (DIR); closedir (DIR); # read XML metadata files (if supplied) my $additionalmetadata = 0; # is there extra metadata available? my %extrametadata; # maps from filespec to extra metadata keys my @extrametakeys; # keys of %extrametadata in order read if ($read_metadata_files) { # read the directory "metadata.xml" file my $metadatafile = &util::filename_cat ($dirname, 'metadata.xml'); if (-e $metadatafile) { print $outhandle "RecPlug: found metadata in $metadatafile\n" if ($verbosity); $self->read_metadata_xml_file($metadatafile, \%extrametadata, \@extrametakeys); $additionalmetadata = 1; } } # import each of the files in the directory my $out_metadata; foreach $subfile (@dir) { last if ($maxdocs != -1 && $count >= $maxdocs); next if ($subfile =~ /^\.\.?$/); next if ($read_metadata_files && $subfile =~ /metadata\.xml$/); # check for a symlink pointing back to a leading directory if (-d "$dirname/$subfile" && -l "$dirname/$subfile") { # readlink gives a "fatal error" on systems that don't implement # symlinks. This assumes the the -l test above would fail on those. my $linkdest=readlink "$dirname/$subfile"; if (!defined ($linkdest)) { # system error - file not found? warn "RecPlug: symlink problem - $!"; } else { # see if link points to current or a parent directory if ($linkdest =~ m@^[\./\\]+$@ || index($dirname, $linkdest) != -1) { warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n"; next; ; } } } print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2); # Make a copy of $in_metadata to pass to $subfile $out_metadata = {}; &combine_metadata_structures($out_metadata, $in_metadata); # Next add metadata read in XML files (if it is supplied) if ($additionalmetadata == 1) { my ($filespec, $mdref); foreach $filespec (@extrametakeys) { if ($subfile =~ /^$filespec$/) { print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" if ($verbosity > 2); $mdref = $extrametadata{$filespec}; &combine_metadata_structures($out_metadata, $mdref); } } } # Recursively read each $subfile print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2); $count += &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile), $out_metadata, $processor, $maxdocs); } return $count; } # Read a manually-constructed metadata file and store the data # it contains in the $metadataref structure. # # (metadataref is a reference to a hash whose keys are filenames # and whose values are metadata hash structures.) sub read_metadata_xml_file { my $self = shift(@_); my ($filename, $metadataref, $metakeysref) = @_; $self->{'metadataref'} = $metadataref; $self->{'metakeysref'} = $metakeysref; eval { $self->{'parser'}->parsefile($filename); }; if ($@) { die "RecPlug: ERROR $filename is not a well formed metadata.xml file ($@)\n"; } } sub Doctype { my ($expat, $name, $sysid, $pubid, $internal) = @_; # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files # to be processed as well as the "DirectoryMetadata" files which should now # be created by import.pl die if ($name !~ /^(Greenstone)?DirectoryMetadata$/); } sub StartTag { my ($expat, $element) = @_; if ($element eq "FileSet") { $self->{'saved_targets'} = []; $self->{'saved_metadata'} = {}; } elsif ($element eq "FileName") { $self->{'in_filename'} = 1; } elsif ($element eq "Metadata") { $self->{'metadata_name'} = $_{'name'}; if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) { $self->{'metadata_accumulate'} = 1; } else { $self->{'metadata_accumulate'} = 0; } } } sub EndTag { my ($expat, $element) = @_; if ($element eq "FileSet") { push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}}); foreach my $target (@{$self->{'saved_targets'}}) { $self->{'metadataref'}->{$target} = $self->{'saved_metadata'}; } } elsif ($element eq "FileName") { $self->{'in_filename'} = 0; } elsif ($element eq "Metadata") { $self->{'metadata_name'} = ""; } } sub Text { if ($self->{'in_filename'}) { # $_ == FileName content push (@{$self->{'saved_targets'}}, $_); } elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") { # $_ == Metadata content my $mname = $self->{'metadata_name'}; if (defined $self->{'saved_metadata'}->{$mname}) { if ($self->{'metadata_accumulate'}) { # accumulate mode - add value to existing value(s) if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") { push (@{$self->{'saved_metadata'}->{$mname}}, $_); } else { $self->{'saved_metadata'}->{$mname} = [$self->{'saved_metadata'}->{$mname}, $_]; } } else { # override mode $self->{'saved_metadata'}->{$mname} = $_; } } else { if ($self->{'metadata_accumulate'}) { # accumulate mode - add value into (currently empty) array $self->{'saved_metadata'}->{$mname} = [$_]; } else { # override mode $self->{'saved_metadata'}->{$mname} = $_; } } } } # This Char function overrides the one in XML::Parser::Stream to overcome a # problem where $expat->{Text} is treated as the return value, slowing # things down significantly in some cases. sub Char { $_[0]->{'Text'} .= $_[1]; return undef; } # Combine two metadata structures. Given two references to metadata # element structures, add every field of the second ($mdref2) to the first # ($mdref1). # # Afterwards $mdref1 will be updated, and $mdref2 will be unchanged. # # We have to be acreful about the way we merge metadata when one metadata # structure is in "override" mode and one is in "merge" mode. In fact, we # use the mode from the second structure, $mdref2, because it is generally # defined later (lower in the directory structure) and is therefore more # "local" to the document concerned. # # Another issue is the use of references to pass metadata around. If we # simply copy one metadata structure reference to another, then we're # effectively justr copyinga pointer, and changes to the new referene # will affect the old (copied) one also. This also applies to ARRAY # references used as metadata element values (hence the "clonedata" # function below). sub combine_metadata_structures { my ($mdref1, $mdref2) = @_; my ($key, $value1, $value2); foreach $key (keys %$mdref2) { $value1 = $mdref1->{$key}; $value2 = $mdref2->{$key}; # If there is no existing value for this metadata field in # $mdref1, so we simply copy the value from $mdref2 over. if (!defined $value1) { $mdref1->{$key} = &clonedata($value2); } # Otherwise we have to add the new values to the existing ones. # If the second structure is accumulated, then acculate all the # values into the first structure elsif ((ref $value2) eq "ARRAY") { # If the first metadata element is a scalar we have to # convert it into an array before we add anything more. if ((ref $value1) ne 'ARRAY') { $mdref1->{$key} = [$value1]; $value1 = $mdref1->{$key}; } # Now add the value(s) from the second array to the first $value2 = &clonedata($value2); push @$value1, @$value2; } # Finally, If the second structure is not an array erference, we # know it is in override mode, so override the first structure. else { $mdref1->{$key} = &clonedata($value2); } } } # Make a "cloned" copy of a metadata value. # This is trivial for a simple scalar value, # but not for an array reference. sub clonedata { my ($value) = @_; my $result; if ((ref $value) eq 'ARRAY') { $result = []; foreach my $item (@$value) { push @$result, $item; } } else { $result = $value; } return $result; } 1;