###########################################################################
#
# RecPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# RecPlug is a plugin which recurses through directories processing
# each file it finds.
# RecPlug has one option: use_metadata_files. When this is set, it will
# check each directory for an XML file called "metadata.xml" that specifies
# metadata for the files (and subdirectories) in the directory.
#
# Here's an example of a metadata file that uses three FileSet structures
# (ignore the # characters):
#
#
#
#
# nugget.*
#
# Nugget Point, The Catlins
# Nugget Point
#
#
#
# nugget-point-1.jpg
#
# Nugget Point Lighthouse, The Catlins
# Lighthouse
#
#
#
# kaka-point-dir
#
# Kaka Point, The Catlins
#
#
#
# Metadata elements are read and applied to files in the order they appear
# in the file.
#
# The FileName element describes the subfiles in the directory that the
# metadata applies to as a perl regular expression (a FileSet group may
# contain multiple FileName elements). So, nugget.*
# indicates that the metadata records in the following Description block
# apply to every subfile that starts with "nugget". For these files, a
# Title metadata element is set, overriding any old value that the Title
# might have had.
#
# Occasionally, we want to have multiple metadata values applied to a
# document; in this case we use the "mode=accumulate" attribute of the
# particular Metadata element. In the second metadata element of the first
# FileSet above, the "Place" metadata is accumulating, and may therefore be
# given several values. If we wanted to override these values and use a
# single metadata element again, we could set the mode attribute to
# "override" instead. Remember: every element is assumed to be in override
# mode unless you specify otherwise, so if you want to accumulate metadata
# for some field, every occurance must have "mode=accumulate" specified.
#
# The second FileSet element above applies to a specific file, called
# nugget-point-1.jpg. This element overrides the Title metadata set in the
# first FileSet, and adds a "Subject" metadata field.
#
# The third and final FileSet sets metadata for a subdirectory rather than
# a file. The metadata specified (a Title) will be passed into the
# subdirectory and applied to every file that occurs in the subdirectory
# (and to every subsubdirectory and its contents, and so on) unless the
# metadata is explictly overridden later in the import.
package RecPlug;
use BasPlug;
use plugin;
use util;
use metadatautil;
use File::Basename;
use strict;
no strict 'refs';
use Encode;
BEGIN {
@RecPlug::ISA = ('BasPlug');
}
my $arguments =
[ { 'name' => "block_exp",
'desc' => "{BasPlug.block_exp}",
'type' => "regexp",
'deft' => &get_default_block_exp(),
'reqd' => "no" },
# this option has been deprecated. leave it here for now so we can warn people not to use it
{ 'name' => "use_metadata_files",
'desc' => "{RecPlug.use_metadata_files}",
'type' => "flag",
'reqd' => "no",
'hiddengli' => "yes" },
{ 'name' => "recheck_directories",
'desc' => "{RecPlug.recheck_directories}",
'type' => "flag",
'reqd' => "no" } ];
my $options = { 'name' => "RecPlug",
'desc' => "{RecPlug.desc}",
'abstract' => "no",
'inherits' => "yes",
'args' => $arguments };
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
if ($self->{'info_only'}) {
# don't worry about any options or initialisations etc
return bless $self, $class;
}
# we have left this option in so we can warn people who are still using it
if ($self->{'use_metadata_files'}) {
die "ERROR: RecPlug -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
}
$self->{'subdir_extrametakeys'} = {};
return bless $self, $class;
}
sub begin {
my $self = shift (@_);
my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
my $proc_package_name = ref $processor;
if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
# Only lookup timestamp info for import.pl, and only if incremental is set
my $output_dir = $processor->getoutputdir();
my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
if ( -e $archives_inf ) {
$self->{'inf_timestamp'} = -M $archives_inf;
}
}
$self->SUPER::begin($pluginfo, $base_dir, $processor, $maxdocs);
}
# return 1 if this class might recurse using $pluginfo
sub is_recursive {
my $self = shift (@_);
return 1;
}
sub get_default_block_exp {
my $self = shift (@_);
return '(CVS|\.svn)';
}
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
# This function passes around metadata hash structures. Metadata hash
# structures are hashes that map from a (scalar) key (the metadata element
# name) to either a scalar metadata value or a reference to an array of
# such values.
sub read {
my $self = shift (@_);
my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
my $outhandle = $self->{'outhandle'};
my $verbosity = $self->{'verbosity'};
my $read_metadata_files = $self->{'use_metadata_files'};
# Calculate the directory name and ensure it is a directory and
# that it is not explicitly blocked.
my $dirname = $file;
$dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
return undef unless (-d $dirname);
return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
# check to make sure we're not reading the archives or index directory
my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
return 0;
}
# check to see we haven't got a cyclic path...
if ($dirname =~ m%(/.*){,41}%) {
print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
return 0;
}
# check to see we haven't got a cyclic path...
if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
return 0;
}
if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
print $outhandle "RecPlug: metadata passed in: ",
join(", ", keys %$in_metadata), "\n";
}
# Recur over directory contents.
my (@dir, $subfile);
my $count = 0;
print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
# find all the files in the directory
if (!opendir (DIR, $dirname)) {
if ($gli) {
print STDERR "\n";
}
print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
return -1; # error in processing
}
@dir = readdir (DIR);
closedir (DIR);
# Re-order the files in the list so any directories ending with .all are moved to the end
for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
push(@dir, splice(@dir, $i, 1));
}
}
# setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
my $additionalmetadata = 0; # is there extra metadata available?
my %extrametadata; # maps from filespec to extra metadata keys
my @extrametakeys; # keys of %extrametadata in order read
my $os_dirsep = &util::get_os_dirsep();
my $dirsep = &util::get_dirsep();
my $base_dir_regexp = $base_dir;
$base_dir_regexp =~ s/\//$os_dirsep/g;
my $local_dirname = $dirname;
$local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
$local_dirname .= $dirsep;
if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
foreach my $ek (@$extrakeys) {
my $extrakeys_re = $ek->{'re'};
my $extrakeys_md = $ek->{'md'};
push(@extrametakeys,$extrakeys_re);
$extrametadata{$extrakeys_re} = $extrakeys_md;
}
delete($self->{'subdir_extrametakeys'}->{$local_dirname});
}
# apply metadata pass for each of the files in the directory
my $out_metadata;
my $num_files = scalar(@dir);
for (my $i = 0; $i < scalar(@dir); $i++) {
my $subfile = $dir[$i];
my $this_file_base_dir = $base_dir;
last if ($maxdocs != -1 && $count >= $maxdocs);
next if ($subfile =~ m/^\.\.?$/);
# Recursively read each $subfile
print $outhandle "RecPlug metadata recurring: $subfile\n" if ($verbosity > 2);
$count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
&util::filename_cat($file, $subfile),
$out_metadata, \@extrametakeys, \%extrametadata,
$processor, $maxdocs, $gli);
$additionalmetadata = 1;
}
# filter out any extrametakeys that mention subdirectories and store
# for later use (i.e. when that sub-directory is being processed)
foreach my $ek (@extrametakeys) {
my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
$extrakey_dir =~ s/\\\./\./g; # remove RE syntax
my $dirsep_re = &util::get_re_dirsep();
if ($ek =~ m/$dirsep_re/) { # specifies at least one directory
my $md = $extrametadata{$ek};
my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
push(@{$subdir_extrametakeys->{$extrakey_dir}},$subdir_rec);
}
}
# import each of the files in the directory
$count=0;
for (my $i = 0; $i <= scalar(@dir); $i++) {
# When every file in the directory has been done, pause for a moment (figuratively!)
# If the -recheck_directories argument hasn't been provided, stop now (default)
# Otherwise, re-read the contents of the directory to check for new files
# Any new files are added to the @dir list and are processed as normal
# This is necessary when documents to be indexed are specified in bibliographic DBs
# These files are copied/downloaded and stored in a new folder at import time
if ($i == $num_files) {
last unless $self->{'recheck_directories'};
# Re-read the files in the directory to see if there are any new files
last if (!opendir (DIR, $dirname));
my @dirnow = readdir (DIR);
closedir (DIR);
# We're only interested if there are more files than there were before
last if (scalar(@dirnow) <= scalar(@dir));
# Any new files are added to the end of @dir to get processed by the loop
my $j;
foreach my $subfilenow (@dirnow) {
for ($j = 0; $j < $num_files; $j++) {
last if ($subfilenow eq $dir[$j]);
}
if ($j == $num_files) {
# New file
push(@dir, $subfilenow);
}
}
# When the new files have been processed, check again
$num_files = scalar(@dir);
}
my $subfile = $dir[$i];
my $this_file_base_dir = $base_dir;
last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
next if ($subfile =~ /^\.\.?$/);
# Follow Windows shortcuts
if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
require Win32::Shortcut;
my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
if ($shortcut) {
# The file to be processed is now the target of the shortcut
$this_file_base_dir = "";
$file = "";
$subfile = $shortcut->Path;
}
}
# check for a symlink pointing back to a leading directory
if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
# readlink gives a "fatal error" on systems that don't implement
# symlinks. This assumes the the -l test above would fail on those.
my $linkdest=readlink "$dirname/$subfile";
if (!defined ($linkdest)) {
# system error - file not found?
warn "RecPlug: symlink problem - $!";
} else {
# see if link points to current or a parent directory
if ($linkdest =~ m@^[\./\\]+$@ ||
index($dirname, $linkdest) != -1) {
warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
next;
;
}
}
}
print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);
# Make a copy of $in_metadata to pass to $subfile
$out_metadata = {};
&metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
## encode the filename as perl5 doesn't handle unicode filenames
my $tmpfile = Encode::encode_utf8($subfile);
# Next add metadata read in XML files (if it is supplied)
if ($additionalmetadata == 1) {
my ($filespec, $mdref);
foreach $filespec (@extrametakeys) {
## use the utf8 encoded filename to do the filename comparison
if ($tmpfile =~ /^$filespec$/) {
print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
if ($verbosity > 2);
$mdref = $extrametadata{$filespec};
&metadatautil::combine_metadata_structures($out_metadata, $mdref);
}
}
}
my $file_subfile = &util::filename_cat($file, $subfile);
my $filename_subfile
= &util::filename_cat($this_file_base_dir,$file_subfile);
if (defined $self->{'inf_timestamp'}) {
my $inf_timestamp = $self->{'inf_timestamp'};
if (! -d $filename_subfile) {
my $filename_timestamp = -M $filename_subfile;
if ($filename_timestamp > $inf_timestamp) {
# filename has been around for longer than inf
##### print $outhandle "**** Skipping $subfile\n";
next;
}
}
}
# Recursively read each $subfile
print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
$count += &plugin::read ($pluginfo, $this_file_base_dir,
$file_subfile,
$out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
}
return $count;
}
1;