########################################################################### # # RecPlug.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # RecPlug is a plugin which recurses through directories processing # each file it finds. # RecPlug has one option: use_metadata_files. When this is set, it will # check each directory for an XML file called "metadata.xml" that specifies # metadata for the files (and subdirectories) in the directory. # # Here's an example of a metadata file that uses three FileSet structures # (ignore the # characters): # # # # # nugget.* # # Nugget Point, The Catlins # Nugget Point # # # # nugget-point-1.jpg # # Nugget Point Lighthouse, The Catlins # Lighthouse # # # # kaka-point-dir # # Kaka Point, The Catlins # # # # Metadata elements are read and applied to files in the order they appear # in the file. # # The FileName element describes the subfiles in the directory that the # metadata applies to as a perl regular expression (a FileSet group may # contain multiple FileName elements). So, nugget.* # indicates that the metadata records in the following Description block # apply to every subfile that starts with "nugget". For these files, a # Title metadata element is set, overriding any old value that the Title # might have had. # # Occasionally, we want to have multiple metadata values applied to a # document; in this case we use the "mode=accumulate" attribute of the # particular Metadata element. In the second metadata element of the first # FileSet above, the "Place" metadata is accumulating, and may therefore be # given several values. If we wanted to override these values and use a # single metadata element again, we could set the mode attribute to # "override" instead. Remember: every element is assumed to be in override # mode unless you specify otherwise, so if you want to accumulate metadata # for some field, every occurance must have "mode=accumulate" specified. # # The second FileSet element above applies to a specific file, called # nugget-point-1.jpg. This element overrides the Title metadata set in the # first FileSet, and adds a "Subject" metadata field. # # The third and final FileSet sets metadata for a subdirectory rather than # a file. The metadata specified (a Title) will be passed into the # subdirectory and applied to every file that occurs in the subdirectory # (and to every subsubdirectory and its contents, and so on) unless the # metadata is explictly overridden later in the import. package RecPlug; use BasPlug; use plugin; use util; use metadatautil; use File::Basename; use strict; no strict 'refs'; use Encode; BEGIN { @RecPlug::ISA = ('BasPlug'); } my $arguments = [ { 'name' => "block_exp", 'desc' => "{BasPlug.block_exp}", 'type' => "regexp", 'deft' => &get_default_block_exp(), 'reqd' => "no" }, # this option has been deprecated. leave it here for now so we can warn people not to use it { 'name' => "use_metadata_files", 'desc' => "{RecPlug.use_metadata_files}", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" }, { 'name' => "recheck_directories", 'desc' => "{RecPlug.recheck_directories}", 'type' => "flag", 'reqd' => "no" } ]; my $options = { 'name' => "RecPlug", 'desc' => "{RecPlug.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); if ($self->{'info_only'}) { # don't worry about any options or initialisations etc return bless $self, $class; } # we have left this option in so we can warn people who are still using it if ($self->{'use_metadata_files'}) { die "ERROR: RecPlug -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n"; } $self->{'subdir_extrametakeys'} = {}; return bless $self, $class; } sub begin { my $self = shift (@_); my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; my $proc_package_name = ref $processor; if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) { # Only lookup timestamp info for import.pl, and only if incremental is set my $output_dir = $processor->getoutputdir(); my $archives_inf = &util::filename_cat($output_dir,"archives.inf"); if ( -e $archives_inf ) { $self->{'inf_timestamp'} = -M $archives_inf; } } $self->SUPER::begin($pluginfo, $base_dir, $processor, $maxdocs); } # return 1 if this class might recurse using $pluginfo sub is_recursive { my $self = shift (@_); return 1; } sub get_default_block_exp { my $self = shift (@_); return '(CVS|\.svn)'; } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories # This function passes around metadata hash structures. Metadata hash # structures are hashes that map from a (scalar) key (the metadata element # name) to either a scalar metadata value or a reference to an array of # such values. sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_; my $outhandle = $self->{'outhandle'}; my $verbosity = $self->{'verbosity'}; my $read_metadata_files = $self->{'use_metadata_files'}; # Calculate the directory name and ensure it is a directory and # that it is not explicitly blocked. my $dirname = $file; $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; return undef unless (-d $dirname); return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); # check to make sure we're not reading the archives or index directory my $gsdlhome = quotemeta($ENV{'GSDLHOME'}); if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) { print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%(/.*){,41}%) { print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n"; return 0; } # check to see we haven't got a cyclic path... if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) { print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n"; return 0; } if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) { print $outhandle "RecPlug: metadata passed in: ", join(", ", keys %$in_metadata), "\n"; } # Recur over directory contents. my (@dir, $subfile); my $count = 0; print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity); # find all the files in the directory if (!opendir (DIR, $dirname)) { if ($gli) { print STDERR "\n"; } print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n"; return -1; # error in processing } @dir = readdir (DIR); closedir (DIR); # Re-order the files in the list so any directories ending with .all are moved to the end for (my $i = scalar(@dir) - 1; $i >= 0; $i--) { if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) { push(@dir, splice(@dir, $i, 1)); } } # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read my $additionalmetadata = 0; # is there extra metadata available? my %extrametadata; # maps from filespec to extra metadata keys my @extrametakeys; # keys of %extrametadata in order read my $os_dirsep = &util::get_os_dirsep(); my $dirsep = &util::get_dirsep(); my $base_dir_regexp = $base_dir; $base_dir_regexp =~ s/\//$os_dirsep/g; my $local_dirname = $dirname; $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//; $local_dirname .= $dirsep; if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) { my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname}; foreach my $ek (@$extrakeys) { my $extrakeys_re = $ek->{'re'}; my $extrakeys_md = $ek->{'md'}; push(@extrametakeys,$extrakeys_re); $extrametadata{$extrakeys_re} = $extrakeys_md; } delete($self->{'subdir_extrametakeys'}->{$local_dirname}); } # apply metadata pass for each of the files in the directory my $out_metadata; my $num_files = scalar(@dir); for (my $i = 0; $i < scalar(@dir); $i++) { my $subfile = $dir[$i]; my $this_file_base_dir = $base_dir; last if ($maxdocs != -1 && $count >= $maxdocs); next if ($subfile =~ m/^\.\.?$/); # Recursively read each $subfile print $outhandle "RecPlug metadata recurring: $subfile\n" if ($verbosity > 2); $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir, &util::filename_cat($file, $subfile), $out_metadata, \@extrametakeys, \%extrametadata, $processor, $maxdocs, $gli); $additionalmetadata = 1; } # filter out any extrametakeys that mention subdirectories and store # for later use (i.e. when that sub-directory is being processed) foreach my $ek (@extrametakeys) { my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek); $extrakey_dir =~ s/\\\./\./g; # remove RE syntax my $dirsep_re = &util::get_re_dirsep(); if ($ek =~ m/$dirsep_re/) { # specifies at least one directory my $md = $extrametadata{$ek}; my $subdir_extrametakeys = $self->{'subdir_extrametakeys'}; my $subdir_rec = { 're' => $subdir_re, 'md' => $md }; push(@{$subdir_extrametakeys->{$extrakey_dir}},$subdir_rec); } } # import each of the files in the directory $count=0; for (my $i = 0; $i <= scalar(@dir); $i++) { # When every file in the directory has been done, pause for a moment (figuratively!) # If the -recheck_directories argument hasn't been provided, stop now (default) # Otherwise, re-read the contents of the directory to check for new files # Any new files are added to the @dir list and are processed as normal # This is necessary when documents to be indexed are specified in bibliographic DBs # These files are copied/downloaded and stored in a new folder at import time if ($i == $num_files) { last unless $self->{'recheck_directories'}; # Re-read the files in the directory to see if there are any new files last if (!opendir (DIR, $dirname)); my @dirnow = readdir (DIR); closedir (DIR); # We're only interested if there are more files than there were before last if (scalar(@dirnow) <= scalar(@dir)); # Any new files are added to the end of @dir to get processed by the loop my $j; foreach my $subfilenow (@dirnow) { for ($j = 0; $j < $num_files; $j++) { last if ($subfilenow eq $dir[$j]); } if ($j == $num_files) { # New file push(@dir, $subfilenow); } } # When the new files have been processed, check again $num_files = scalar(@dir); } my $subfile = $dir[$i]; my $this_file_base_dir = $base_dir; last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs); next if ($subfile =~ /^\.\.?$/); # Follow Windows shortcuts if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) { require Win32::Shortcut; my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile)); if ($shortcut) { # The file to be processed is now the target of the shortcut $this_file_base_dir = ""; $file = ""; $subfile = $shortcut->Path; } } # check for a symlink pointing back to a leading directory if (-d "$dirname/$subfile" && -l "$dirname/$subfile") { # readlink gives a "fatal error" on systems that don't implement # symlinks. This assumes the the -l test above would fail on those. my $linkdest=readlink "$dirname/$subfile"; if (!defined ($linkdest)) { # system error - file not found? warn "RecPlug: symlink problem - $!"; } else { # see if link points to current or a parent directory if ($linkdest =~ m@^[\./\\]+$@ || index($dirname, $linkdest) != -1) { warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n"; next; ; } } } print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2); # Make a copy of $in_metadata to pass to $subfile $out_metadata = {}; &metadatautil::combine_metadata_structures($out_metadata, $in_metadata); ## encode the filename as perl5 doesn't handle unicode filenames my $tmpfile = Encode::encode_utf8($subfile); # Next add metadata read in XML files (if it is supplied) if ($additionalmetadata == 1) { my ($filespec, $mdref); foreach $filespec (@extrametakeys) { ## use the utf8 encoded filename to do the filename comparison if ($tmpfile =~ /^$filespec$/) { print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" if ($verbosity > 2); $mdref = $extrametadata{$filespec}; &metadatautil::combine_metadata_structures($out_metadata, $mdref); } } } my $file_subfile = &util::filename_cat($file, $subfile); my $filename_subfile = &util::filename_cat($this_file_base_dir,$file_subfile); if (defined $self->{'inf_timestamp'}) { my $inf_timestamp = $self->{'inf_timestamp'}; if (! -d $filename_subfile) { my $filename_timestamp = -M $filename_subfile; if ($filename_timestamp > $inf_timestamp) { # filename has been around for longer than inf ##### print $outhandle "**** Skipping $subfile\n"; next; } } } # Recursively read each $subfile print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2); $count += &plugin::read ($pluginfo, $this_file_base_dir, $file_subfile, $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli); } return $count; } 1;