source: gsdl/trunk/perllib/plugins/DirectoryPlugin.pm@ 15870

Last change on this file since 15870 was 15870, checked in by kjdon, 16 years ago

plugin overhaul: ArchivesInf and Directory plugins are not true plugins as they don't process a file during import. I'd like to get rid of them all together and make them part of import/build scripts. In the meantime they are still here, and inherit from AbstractPlugin not BasePlugin as they don't need all the options that BasePlugin provides

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use AbstractPlugin;
33use plugin;
34use util;
35use metadatautil;
36
37use File::Basename;
38use strict;
39no strict 'refs';
40no strict 'subs';
41
42use Encode;
43
44BEGIN {
45 @DirectoryPlugin::ISA = ('AbstractPlugin');
46}
47
48my $arguments =
49 [ { 'name' => "block_exp",
50 'desc' => "{BasePlugin.block_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_block_exp(),
53 'reqd' => "no" },
54 # this option has been deprecated. leave it here for now so we can warn people not to use it
55 { 'name' => "use_metadata_files",
56 'desc' => "{DirectoryPlugin.use_metadata_files}",
57 'type' => "flag",
58 'reqd' => "no",
59 'hiddengli' => "yes" },
60 { 'name' => "recheck_directories",
61 'desc' => "{DirectoryPlugin.recheck_directories}",
62 'type' => "flag",
63 'reqd' => "no" } ];
64
65my $options = { 'name' => "DirectoryPlugin",
66 'desc' => "{DirectoryPlugin.desc}",
67 'abstract' => "no",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71sub new {
72 my ($class) = shift (@_);
73 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
74 push(@$pluginlist, $class);
75
76 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new AbstractPlugin($pluginlist, $inputargs, $hashArgOptLists);
80
81 if ($self->{'info_only'}) {
82 # don't worry about any options or initialisations etc
83 return bless $self, $class;
84 }
85
86 # we have left this option in so we can warn people who are still using it
87 if ($self->{'use_metadata_files'}) {
88 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
89 }
90
91 $self->{'subdir_extrametakeys'} = {};
92
93 return bless $self, $class;
94}
95
96sub begin {
97 my $self = shift (@_);
98 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
99
100 my $proc_package_name = ref $processor;
101
102 if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
103
104 # Only lookup timestamp info for import.pl, and only if incremental is set
105
106 my $output_dir = $processor->getoutputdir();
107 my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
108
109 if ( -e $archives_inf ) {
110 $self->{'inf_timestamp'} = -M $archives_inf;
111 }
112 }
113
114 #$self->SUPER::begin($pluginfo, $base_dir, $processor, $maxdocs);
115}
116
117sub end {
118
119}
120
121sub deinit {
122
123}
124# return 1 if this class might recurse using $pluginfo
125sub is_recursive {
126 my $self = shift (@_);
127
128 return 1;
129}
130
131sub get_default_block_exp {
132 my $self = shift (@_);
133
134 return '(CVS|\.svn)';
135}
136
137# return number of files processed, undef if can't process
138# Note that $base_dir might be "" and that $file might
139# include directories
140
141# This function passes around metadata hash structures. Metadata hash
142# structures are hashes that map from a (scalar) key (the metadata element
143# name) to either a scalar metadata value or a reference to an array of
144# such values.
145
146sub read {
147 my $self = shift (@_);
148 my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
149
150 my $outhandle = $self->{'outhandle'};
151 my $verbosity = $self->{'verbosity'};
152
153 # Calculate the directory name and ensure it is a directory and
154 # that it is not explicitly blocked.
155 my $dirname = $file;
156 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
157 return undef unless (-d $dirname);
158 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
159
160 # check to make sure we're not reading the archives or index directory
161 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
162 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
163 print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
164 return 0;
165 }
166
167 # check to see we haven't got a cyclic path...
168 if ($dirname =~ m%(/.*){,41}%) {
169 print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
170 return 0;
171 }
172
173 # check to see we haven't got a cyclic path...
174 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
175 print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
176 return 0;
177 }
178
179 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
180 print $outhandle "DirectoryPlugin: metadata passed in: ",
181 join(", ", keys %$in_metadata), "\n";
182 }
183
184 # Recur over directory contents.
185 my (@dir, $subfile);
186 my $count = 0;
187
188 print $outhandle "DirectoryPlugin: getting directory $dirname\n" if ($verbosity);
189
190 # find all the files in the directory
191 if (!opendir (DIR, $dirname)) {
192 if ($gli) {
193 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
194 }
195 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
196 return -1; # error in processing
197 }
198 @dir = readdir (DIR);
199 closedir (DIR);
200
201 # Re-order the files in the list so any directories ending with .all are moved to the end
202 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
203 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
204 push(@dir, splice(@dir, $i, 1));
205 }
206 }
207
208 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
209
210 my $additionalmetadata = 0; # is there extra metadata available?
211 my %extrametadata; # maps from filespec to extra metadata keys
212 my @extrametakeys; # keys of %extrametadata in order read
213
214 my $os_dirsep = &util::get_os_dirsep();
215 my $dirsep = &util::get_dirsep();
216 my $base_dir_regexp = $base_dir;
217 $base_dir_regexp =~ s/\//$os_dirsep/g;
218 my $local_dirname = $dirname;
219 $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
220 $local_dirname .= $dirsep;
221
222 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
223 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
224 foreach my $ek (@$extrakeys) {
225 my $extrakeys_re = $ek->{'re'};
226 my $extrakeys_md = $ek->{'md'};
227 push(@extrametakeys,$extrakeys_re);
228 $extrametadata{$extrakeys_re} = $extrakeys_md;
229 }
230 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
231 }
232
233 # apply metadata pass for each of the files in the directory
234 my $out_metadata;
235 my $num_files = scalar(@dir);
236 for (my $i = 0; $i < scalar(@dir); $i++) {
237 my $subfile = $dir[$i];
238 my $this_file_base_dir = $base_dir;
239 last if ($maxdocs != -1 && $count >= $maxdocs);
240 next if ($subfile =~ m/^\.\.?$/);
241
242 # Recursively read each $subfile
243 print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
244
245 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
246 &util::filename_cat($file, $subfile),
247 $out_metadata, \@extrametakeys, \%extrametadata,
248 $processor, $maxdocs, $gli);
249 $additionalmetadata = 1;
250 }
251
252 # filter out any extrametakeys that mention subdirectories and store
253 # for later use (i.e. when that sub-directory is being processed)
254
255 foreach my $ek (@extrametakeys) {
256 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
257 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax
258
259 my $dirsep_re = &util::get_re_dirsep();
260
261 my $ek_non_re = $ek;
262 $ek_non_re =~ s/\\\./\./g; # remove RE syntax
263
264 if ($ek_non_re =~ m/$dirsep_re/) { # specifies at least one directory
265 my $md = $extrametadata{$ek};
266
267 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
268
269 my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
270
271 push(@{$subdir_extrametakeys->{$extrakey_dir}},$subdir_rec);
272 }
273 }
274
275 # import each of the files in the directory
276 $count=0;
277 for (my $i = 0; $i <= scalar(@dir); $i++) {
278 # When every file in the directory has been done, pause for a moment (figuratively!)
279 # If the -recheck_directories argument hasn't been provided, stop now (default)
280 # Otherwise, re-read the contents of the directory to check for new files
281 # Any new files are added to the @dir list and are processed as normal
282 # This is necessary when documents to be indexed are specified in bibliographic DBs
283 # These files are copied/downloaded and stored in a new folder at import time
284 if ($i == $num_files) {
285 last unless $self->{'recheck_directories'};
286
287 # Re-read the files in the directory to see if there are any new files
288 last if (!opendir (DIR, $dirname));
289 my @dirnow = readdir (DIR);
290 closedir (DIR);
291
292 # We're only interested if there are more files than there were before
293 last if (scalar(@dirnow) <= scalar(@dir));
294
295 # Any new files are added to the end of @dir to get processed by the loop
296 my $j;
297 foreach my $subfilenow (@dirnow) {
298 for ($j = 0; $j < $num_files; $j++) {
299 last if ($subfilenow eq $dir[$j]);
300 }
301 if ($j == $num_files) {
302 # New file
303 push(@dir, $subfilenow);
304 }
305 }
306 # When the new files have been processed, check again
307 $num_files = scalar(@dir);
308 }
309
310 my $subfile = $dir[$i];
311 my $this_file_base_dir = $base_dir;
312 last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
313 next if ($subfile =~ /^\.\.?$/);
314
315 # Follow Windows shortcuts
316 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
317 require Win32::Shortcut;
318 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
319 if ($shortcut) {
320 # The file to be processed is now the target of the shortcut
321 $this_file_base_dir = "";
322 $file = "";
323 $subfile = $shortcut->Path;
324 }
325 }
326
327 # check for a symlink pointing back to a leading directory
328 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
329 # readlink gives a "fatal error" on systems that don't implement
330 # symlinks. This assumes the the -l test above would fail on those.
331 my $linkdest=readlink "$dirname/$subfile";
332 if (!defined ($linkdest)) {
333 # system error - file not found?
334 warn "DirectoryPlugin: symlink problem - $!";
335 } else {
336 # see if link points to current or a parent directory
337 if ($linkdest =~ m@^[\./\\]+$@ ||
338 index($dirname, $linkdest) != -1) {
339 warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
340 next;
341 ;
342 }
343 }
344 }
345
346 print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
347
348 # Make a copy of $in_metadata to pass to $subfile
349 $out_metadata = {};
350 &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
351
352 ## encode the filename as perl5 doesn't handle unicode filenames
353 my $tmpfile = Encode::encode_utf8($subfile);
354
355 # Next add metadata read in XML files (if it is supplied)
356 if ($additionalmetadata == 1) {
357
358 my ($filespec, $mdref);
359 foreach $filespec (@extrametakeys) {
360 ## use the utf8 encoded filename to do the filename comparison
361 if ($tmpfile =~ /^$filespec$/) {
362 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
363 if ($verbosity > 2);
364 $mdref = $extrametadata{$filespec};
365 &metadatautil::combine_metadata_structures($out_metadata, $mdref);
366 }
367 }
368 }
369
370
371 my $file_subfile = &util::filename_cat($file, $subfile);
372 my $filename_subfile
373 = &util::filename_cat($this_file_base_dir,$file_subfile);
374 if (defined $self->{'inf_timestamp'}) {
375 my $inf_timestamp = $self->{'inf_timestamp'};
376
377 if (! -d $filename_subfile) {
378 my $filename_timestamp = -M $filename_subfile;
379 if ($filename_timestamp > $inf_timestamp) {
380 # filename has been around for longer than inf
381##### print $outhandle "**** Skipping $subfile\n";
382 next;
383 }
384 }
385 }
386
387 # Recursively read each $subfile
388 print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
389
390 $count += &plugin::read ($pluginfo, $this_file_base_dir,
391 $file_subfile,
392 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
393 }
394
395 return $count;
396}
397
3981;
Note: See TracBrowser for help on using the repository browser.