source: gsdl/trunk/perllib/plugins/DirectoryPlugin.pm@ 17320

Last change on this file since 17320 was 17320, checked in by kjdon, 16 years ago

found and fixed what I think is a bug - in the metadata structures for metadata_read, file targets are local to the current subdirectory. But when we look for directory stuff, its using path local to the base dir (import). So when add the file target into the subdir_extrametakeys, need to add on the local dir

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use AbstractPlugin;
33use plugin;
34use util;
35use metadatautil;
36
37use File::Basename;
38use strict;
39no strict 'refs';
40no strict 'subs';
41
42use Encode;
43
44BEGIN {
45 @DirectoryPlugin::ISA = ('AbstractPlugin');
46}
47
48my $arguments =
49 [ { 'name' => "block_exp",
50 'desc' => "{BasePlugin.block_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_block_exp(),
53 'reqd' => "no" },
54 # this option has been deprecated. leave it here for now so we can warn people not to use it
55 { 'name' => "use_metadata_files",
56 'desc' => "{DirectoryPlugin.use_metadata_files}",
57 'type' => "flag",
58 'reqd' => "no",
59 'hiddengli' => "yes" },
60 { 'name' => "recheck_directories",
61 'desc' => "{DirectoryPlugin.recheck_directories}",
62 'type' => "flag",
63 'reqd' => "no" } ];
64
65my $options = { 'name' => "DirectoryPlugin",
66 'desc' => "{DirectoryPlugin.desc}",
67 'abstract' => "no",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71sub new {
72 my ($class) = shift (@_);
73 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
74 push(@$pluginlist, $class);
75
76 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new AbstractPlugin($pluginlist, $inputargs, $hashArgOptLists);
80
81 if ($self->{'info_only'}) {
82 # don't worry about any options or initialisations etc
83 return bless $self, $class;
84 }
85
86 # we have left this option in so we can warn people who are still using it
87 if ($self->{'use_metadata_files'}) {
88 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
89 }
90
91 $self->{'subdir_extrametakeys'} = {};
92
93 return bless $self, $class;
94}
95
96sub begin {
97 my $self = shift (@_);
98 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
99
100 my $proc_package_name = ref $processor;
101
102 if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
103
104 # Only lookup timestamp info for import.pl, and only if incremental is set
105
106 my $output_dir = $processor->getoutputdir();
107 my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
108
109 if ( -e $archives_inf ) {
110 $self->{'inf_timestamp'} = -M $archives_inf;
111 }
112 }
113
114 #$self->SUPER::begin($pluginfo, $base_dir, $processor, $maxdocs);
115}
116
117sub end {
118
119}
120
121sub deinit {
122
123}
124# return 1 if this class might recurse using $pluginfo
125sub is_recursive {
126 my $self = shift (@_);
127
128 return 1;
129}
130
131sub get_default_block_exp {
132 my $self = shift (@_);
133
134 return '(?i)(CVS|\.svn|Thumbs\.db|~$)';
135}
136
137sub check_directory_path {
138
139 my $self = shift(@_);
140 my ($dirname) = @_;
141
142 return undef unless (-d $dirname);
143
144 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
145
146 my $outhandle = $self->{'outhandle'};
147
148 # check to make sure we're not reading the archives or index directory
149 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
150 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
151 print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
152 return 0;
153 }
154
155 # check to see we haven't got a cyclic path...
156 if ($dirname =~ m%(/.*){,41}%) {
157 print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
158 return 0;
159 }
160
161 # check to see we haven't got a cyclic path...
162 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
163 print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
164 return 0;
165 }
166
167 return 1;
168}
169
170# this may be called more than once
171sub sort_out_associated_files {
172
173 my $self = shift (@_);
174 my ($block_hash) = @_;
175 if (!scalar (keys %{$block_hash->{'shared_fileroot'}})) {
176 return;
177 }
178
179 $self->{'assocfile_info'} = {} unless defined $self->{'assocfile_info'};
180 my $metadata = $self->{'assocfile_info'};
181 foreach my $prefix (keys %{$block_hash->{'shared_fileroot'}}) {
182 my $record = $block_hash->{'shared_fileroot'}->{$prefix};
183
184 my $tie_to = $record->{'tie_to'};
185 my $exts = $record->{'exts'};
186
187 if ((defined $tie_to) && (scalar (keys %$exts) > 0)) {
188 # set up fileblocks and assocfile_tobe
189 my $base_file = "$prefix$tie_to";
190 $metadata->{$base_file} = {} unless defined $metadata->{$base_file};
191 my $base_file_metadata = $metadata->{$base_file};
192
193 $base_file_metadata->{'gsdlassocfile_tobe'} = [] unless defined $base_file_metadata->{'gsdlassocfile_tobe'};
194 my $assoc_tobe = $base_file_metadata->{'gsdlassocfile_tobe'};
195 foreach my $e (keys %$exts) {
196 # block the file
197 $block_hash->{'file_blocks'}->{"$prefix$e"} = 1;
198 # set up as an associatd file
199 print STDERR " $self->{'plugin_type'}: Associating $prefix$e with $tie_to version\n";
200 my $mime_type = ""; # let system auto detect this
201 push(@$assoc_tobe,"$prefix$e:$mime_type:");
202
203 }
204 }
205 } # foreach record
206
207 $block_hash->{'shared_fileroot'} = undef;
208 $block_hash->{'shared_fileroot'} = {};
209
210}
211
212
213# do block exp OR special blocking ???
214
215sub file_is_blocked {
216 my $self = shift (@_);
217 my ($block_hash, $filename_full_path) = @_;
218
219 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
220 $self->{'num_blocked'} ++;
221 return 1;
222 }
223 # check Directory plugin's own block_exp
224 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
225 $self->{'num_blocked'} ++;
226 return 1; # blocked
227 }
228 return 0;
229}
230
231
232
233sub file_block_read {
234 my $self = shift (@_);
235 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
236
237 my $outhandle = $self->{'outhandle'};
238 my $verbosity = $self->{'verbosity'};
239
240 # Calculate the directory name and ensure it is a directory and
241 # that it is not explicitly blocked.
242 my $dirname = $file;
243 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
244
245 my $directory_ok = $self->check_directory_path($dirname);
246 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
247
248 $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'};
249 $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};
250
251 # Recur over directory contents.
252 my (@dir, $subfile);
253 #my $count = 0;
254
255 print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);
256
257 # find all the files in the directory
258 if (!opendir (DIR, $dirname)) {
259 if ($gli) {
260 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
261 }
262 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
263 return -1; # error in processing
264 }
265 @dir = readdir (DIR);
266 closedir (DIR);
267
268 for (my $i = 0; $i < scalar(@dir); $i++) {
269 my $subfile = $dir[$i];
270 my $this_file_base_dir = $base_dir;
271 next if ($subfile =~ m/^\.\.?$/);
272
273 # Recursively read each $subfile
274 print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
275
276 #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
277 &plugin::file_block_read ($pluginfo, $this_file_base_dir,
278 &util::filename_cat($file, $subfile),
279 $block_hash, $metadata, $gli);
280
281 }
282 $self->sort_out_associated_files($block_hash);
283 #return $count;
284
285}
286# return number of files processed, undef if can't process
287# Note that $base_dir might be "" and that $file might
288# include directories
289
290# This function passes around metadata hash structures. Metadata hash
291# structures are hashes that map from a (scalar) key (the metadata element
292# name) to either a scalar metadata value or a reference to an array of
293# such values.
294
295sub read {
296 my $self = shift (@_);
297 my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
298
299 my $outhandle = $self->{'outhandle'};
300 my $verbosity = $self->{'verbosity'};
301
302 # Calculate the directory name and ensure it is a directory and
303 # that it is not explicitly blocked.
304 my $dirname;
305 if ($file eq "") {
306 $dirname = $base_dir;
307 } else {
308 $dirname = $file;
309 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
310 }
311
312 my $directory_ok = $self->check_directory_path($dirname);
313 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
314
315 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
316 print $outhandle "DirectoryPlugin: metadata passed in: ",
317 join(", ", keys %$in_metadata), "\n";
318 }
319
320
321 # Recur over directory contents.
322 my (@dir, $subfile);
323 my $count = 0;
324
325 print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);
326
327 # find all the files in the directory
328 if (!opendir (DIR, $dirname)) {
329 if ($gli) {
330 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
331 }
332 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
333 return -1; # error in processing
334 }
335 @dir = readdir (DIR);
336 closedir (DIR);
337
338 # Re-order the files in the list so any directories ending with .all are moved to the end
339 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
340 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
341 push(@dir, splice(@dir, $i, 1));
342 }
343 }
344
345 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
346
347 my $additionalmetadata = 0; # is there extra metadata available?
348 my %extrametadata; # maps from filespec to extra metadata keys
349 my @extrametakeys; # keys of %extrametadata in order read
350
351
352 my $os_dirsep = &util::get_os_dirsep();
353 my $dirsep = &util::get_dirsep();
354 my $base_dir_regexp = $base_dir;
355 $base_dir_regexp =~ s/\//$os_dirsep/g;
356 my $local_dirname = $dirname;
357 $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
358 $local_dirname .= $dirsep;
359
360 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
361 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
362 foreach my $ek (@$extrakeys) {
363 my $extrakeys_re = $ek->{'re'};
364 my $extrakeys_md = $ek->{'md'};
365 push(@extrametakeys,$extrakeys_re);
366 $extrametadata{$extrakeys_re} = $extrakeys_md;
367 }
368 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
369 }
370
371 # apply metadata pass for each of the files in the directory
372 my $num_files = scalar(@dir);
373 for (my $i = 0; $i < scalar(@dir); $i++) {
374 my $subfile = $dir[$i];
375 my $this_file_base_dir = $base_dir;
376 last if ($maxdocs != -1 && $count >= $maxdocs);
377 next if ($subfile =~ m/^\.\.?$/);
378 my $file_subfile = &util::filename_cat($file, $subfile);
379 my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
380 if ($self->file_is_blocked($block_hash,$full_filename)) {
381 print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
382 next;
383 }
384
385 # Recursively read each $subfile
386 print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
387
388 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
389 $file_subfile,$block_hash,
390 \@extrametakeys, \%extrametadata,
391 $processor, $maxdocs, $gli);
392 $additionalmetadata = 1;
393 }
394
395 # filter out any extrametakeys that mention subdirectories and store
396 # for later use (i.e. when that sub-directory is being processed)
397
398 foreach my $ek (@extrametakeys) {
399 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
400 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax
401
402 my $dirsep_re = &util::get_re_dirsep();
403
404 my $ek_non_re = $ek;
405 $ek_non_re =~ s/\\\./\./g; # remove RE syntax
406
407 if ($ek_non_re =~ m/$dirsep_re/) { # specifies at least one directory
408 my $md = $extrametadata{$ek};
409
410 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
411
412 my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
413
414 # when its looked up, it must be relative to the base dir
415 push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
416 #push(@{$subdir_extrametakeys->{"$extrakey_dir"}},$subdir_rec);
417 }
418 }
419
420 # import each of the files in the directory
421 $count=0;
422 for (my $i = 0; $i <= scalar(@dir); $i++) {
423 # When every file in the directory has been done, pause for a moment (figuratively!)
424 # If the -recheck_directories argument hasn't been provided, stop now (default)
425 # Otherwise, re-read the contents of the directory to check for new files
426 # Any new files are added to the @dir list and are processed as normal
427 # This is necessary when documents to be indexed are specified in bibliographic DBs
428 # These files are copied/downloaded and stored in a new folder at import time
429 if ($i == $num_files) {
430 last unless $self->{'recheck_directories'};
431
432 # Re-read the files in the directory to see if there are any new files
433 last if (!opendir (DIR, $dirname));
434 my @dirnow = readdir (DIR);
435 closedir (DIR);
436
437 # We're only interested if there are more files than there were before
438 last if (scalar(@dirnow) <= scalar(@dir));
439
440 # Any new files are added to the end of @dir to get processed by the loop
441 my $j;
442 foreach my $subfilenow (@dirnow) {
443 for ($j = 0; $j < $num_files; $j++) {
444 last if ($subfilenow eq $dir[$j]);
445 }
446 if ($j == $num_files) {
447 # New file
448 push(@dir, $subfilenow);
449 }
450 }
451 # When the new files have been processed, check again
452 $num_files = scalar(@dir);
453 }
454
455 my $subfile = $dir[$i];
456 my $this_file_base_dir = $base_dir;
457 last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
458 next if ($subfile =~ /^\.\.?$/);
459
460 my $file_subfile = &util::filename_cat($file, $subfile);
461 my $full_filename
462 = &util::filename_cat($this_file_base_dir,$file_subfile);
463
464 if ($self->file_is_blocked($block_hash,$full_filename)) {
465 print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
466 next;
467 }
468 #print STDERR "processing $full_filename\n";
469 # Follow Windows shortcuts
470 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
471 require Win32::Shortcut;
472 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
473 if ($shortcut) {
474 # The file to be processed is now the target of the shortcut
475 $this_file_base_dir = "";
476 $file = "";
477 $subfile = $shortcut->Path;
478 }
479 }
480
481 # check for a symlink pointing back to a leading directory
482 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
483 # readlink gives a "fatal error" on systems that don't implement
484 # symlinks. This assumes the the -l test above would fail on those.
485 my $linkdest=readlink "$dirname/$subfile";
486 if (!defined ($linkdest)) {
487 # system error - file not found?
488 warn "DirectoryPlugin: symlink problem - $!";
489 } else {
490 # see if link points to current or a parent directory
491 if ($linkdest =~ m@^[\./\\]+$@ ||
492 index($dirname, $linkdest) != -1) {
493 warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
494 next;
495 ;
496 }
497 }
498 }
499
500 print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
501
502 # Make a copy of $in_metadata to pass to $subfile
503 my $out_metadata = {};
504 &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
505
506 # check the assocfile_info
507 if (defined $self->{'assocfile_info'}->{$full_filename}) {
508 &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
509 }
510 ## encode the filename as perl5 doesn't handle unicode filenames
511
512 my $tmpfile = Encode::encode_utf8($subfile);
513 #print STDERR "subfile = $subfile, tmpfile = $tmpfile\n";
514 # Next add metadata read in XML files (if it is supplied)
515 if ($additionalmetadata == 1) {
516 my ($filespec, $mdref);
517 foreach $filespec (@extrametakeys) {
518 ## use the utf8 encoded filename to do the filename comparison
519 if ($tmpfile =~ /^$filespec$/) {
520 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
521 if ($verbosity > 2);
522 $mdref = $extrametadata{$filespec};
523 &metadatautil::combine_metadata_structures($out_metadata, $mdref);
524 }
525 }
526 }
527
528
529 if (defined $self->{'inf_timestamp'}) {
530 my $inf_timestamp = $self->{'inf_timestamp'};
531
532 if (! -d $full_filename) {
533 my $filename_timestamp = -M $full_filename;
534 if ($filename_timestamp > $inf_timestamp) {
535 # filename has been around for longer than inf
536##### print $outhandle "**** Skipping $subfile\n";
537 next;
538 }
539 }
540 }
541
542 # Recursively read each $subfile
543 print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
544
545 $count += &plugin::read ($pluginfo, $this_file_base_dir,
546 $file_subfile, $block_hash,
547 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
548 }
549
550 return $count;
551}
552
5531;
Note: See TracBrowser for help on using the repository browser.