source: gsdl/trunk/perllib/plugins/DirectoryPlugin.pm@ 17738

Last change on this file since 17738 was 17738, checked in by kjdon, 15 years ago

AbstractPLugin has been removedd, so these now inherit from PrintInfo, and code from AbstractPlugin has been added to these two classes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use PrintInfo;
33use plugin;
34use util;
35use metadatautil;
36
37use File::Basename;
38use strict;
39no strict 'refs';
40no strict 'subs';
41
42use Encode;
43
44BEGIN {
45 @DirectoryPlugin::ISA = ('PrintInfo');
46}
47
48my $arguments =
49 [ { 'name' => "block_exp",
50 'desc' => "{BasePlugin.block_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_block_exp(),
53 'reqd' => "no" },
54 # this option has been deprecated. leave it here for now so we can warn people not to use it
55 { 'name' => "use_metadata_files",
56 'desc' => "{DirectoryPlugin.use_metadata_files}",
57 'type' => "flag",
58 'reqd' => "no",
59 'hiddengli' => "yes" },
60 { 'name' => "recheck_directories",
61 'desc' => "{DirectoryPlugin.recheck_directories}",
62 'type' => "flag",
63 'reqd' => "no" } ];
64
65my $options = { 'name' => "DirectoryPlugin",
66 'desc' => "{DirectoryPlugin.desc}",
67 'abstract' => "no",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71sub new {
72 my ($class) = shift (@_);
73 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
74 push(@$pluginlist, $class);
75
76 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
80
81 if ($self->{'info_only'}) {
82 # don't worry about any options or initialisations etc
83 return bless $self, $class;
84 }
85
86 # we have left this option in so we can warn people who are still using it
87 if ($self->{'use_metadata_files'}) {
88 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
89 }
90
91 $self->{'num_processed'} = 0;
92 $self->{'num_not_processed'} = 0;
93 $self->{'num_blocked'} = 0;
94 $self->{'num_archives'} = 0;
95
96 $self->{'subdir_extrametakeys'} = {};
97
98 return bless $self, $class;
99}
100
101# called once, at the start of processing
102sub init {
103 my $self = shift (@_);
104 my ($verbosity, $outhandle, $failhandle) = @_;
105
106 # verbosity is passed through from the processor
107 $self->{'verbosity'} = $verbosity;
108
109 # as are the outhandle and failhandle
110 $self->{'outhandle'} = $outhandle if defined $outhandle;
111 $self->{'failhandle'} = $failhandle;
112
113}
114
115# called once, after all passes have finished
116sub deinit {
117 my ($self) = @_;
118
119}
120
121# called at the beginning of each plugin pass (import has one, building has many)
122sub begin {
123 my $self = shift (@_);
124 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
125
126 my $proc_package_name = ref $processor;
127
128 if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
129
130 # Only lookup timestamp info for import.pl, and only if incremental is set
131
132 my $output_dir = $processor->getoutputdir();
133 my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
134
135 if ( -e $archives_inf ) {
136 $self->{'inf_timestamp'} = -M $archives_inf;
137 }
138 }
139}
140
141# called at the end of each plugin pass
142sub end {
143 my ($self) = shift (@_);
144
145}
146
147
148# called if we are doing incremental building
149sub set_incremental {
150 my $self = shift(@_);
151 my ($incremental) = @_;
152
153 $self->{'incremental'} = $incremental;
154}
155
156# return 1 if this class might recurse using $pluginfo
157sub is_recursive {
158 my $self = shift (@_);
159
160 return 1;
161}
162
163sub get_default_block_exp {
164 my $self = shift (@_);
165
166 return '(?i)(CVS|\.svn|Thumbs\.db|~)$';
167}
168
169sub check_directory_path {
170
171 my $self = shift(@_);
172 my ($dirname) = @_;
173
174 return undef unless (-d $dirname);
175
176 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
177
178 my $outhandle = $self->{'outhandle'};
179
180 # check to make sure we're not reading the archives or index directory
181 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
182 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
183 print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
184 return 0;
185 }
186
187 # check to see we haven't got a cyclic path...
188 if ($dirname =~ m%(/.*){,41}%) {
189 print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
190 return 0;
191 }
192
193 # check to see we haven't got a cyclic path...
194 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
195 print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
196 return 0;
197 }
198
199 return 1;
200}
201
202# this may be called more than once
203sub sort_out_associated_files {
204
205 my $self = shift (@_);
206 my ($block_hash) = @_;
207 if (!scalar (keys %{$block_hash->{'shared_fileroot'}})) {
208 return;
209 }
210
211 $self->{'assocfile_info'} = {} unless defined $self->{'assocfile_info'};
212 my $metadata = $self->{'assocfile_info'};
213 foreach my $prefix (keys %{$block_hash->{'shared_fileroot'}}) {
214 my $record = $block_hash->{'shared_fileroot'}->{$prefix};
215
216 my $tie_to = $record->{'tie_to'};
217 my $exts = $record->{'exts'};
218
219 if ((defined $tie_to) && (scalar (keys %$exts) > 0)) {
220 # set up fileblocks and assocfile_tobe
221 my $base_file = "$prefix$tie_to";
222 $metadata->{$base_file} = {} unless defined $metadata->{$base_file};
223 my $base_file_metadata = $metadata->{$base_file};
224
225 $base_file_metadata->{'gsdlassocfile_tobe'} = [] unless defined $base_file_metadata->{'gsdlassocfile_tobe'};
226 my $assoc_tobe = $base_file_metadata->{'gsdlassocfile_tobe'};
227 foreach my $e (keys %$exts) {
228 # block the file
229 $block_hash->{'file_blocks'}->{"$prefix$e"} = 1;
230 # set up as an associatd file
231 print STDERR " $self->{'plugin_type'}: Associating $prefix$e with $tie_to version\n";
232 my $mime_type = ""; # let system auto detect this
233 push(@$assoc_tobe,"$prefix$e:$mime_type:");
234
235 }
236 }
237 } # foreach record
238
239 $block_hash->{'shared_fileroot'} = undef;
240 $block_hash->{'shared_fileroot'} = {};
241
242}
243
244
245# do block exp OR special blocking ???
246
247sub file_is_blocked {
248 my $self = shift (@_);
249 my ($block_hash, $filename_full_path) = @_;
250
251 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
252 $self->{'num_blocked'} ++;
253 return 1;
254 }
255 # check Directory plugin's own block_exp
256 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
257 $self->{'num_blocked'} ++;
258 return 1; # blocked
259 }
260 return 0;
261}
262
263
264
265sub file_block_read {
266 my $self = shift (@_);
267 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
268
269 my $outhandle = $self->{'outhandle'};
270 my $verbosity = $self->{'verbosity'};
271
272 # Calculate the directory name and ensure it is a directory and
273 # that it is not explicitly blocked.
274 my $dirname = $file;
275 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
276
277 my $directory_ok = $self->check_directory_path($dirname);
278 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
279
280 $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'};
281 $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};
282
283 # Recur over directory contents.
284 my (@dir, $subfile);
285 #my $count = 0;
286
287 print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);
288
289 # find all the files in the directory
290 if (!opendir (DIR, $dirname)) {
291 if ($gli) {
292 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
293 }
294 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
295 return -1; # error in processing
296 }
297 @dir = readdir (DIR);
298 closedir (DIR);
299
300 for (my $i = 0; $i < scalar(@dir); $i++) {
301 my $subfile = $dir[$i];
302 my $this_file_base_dir = $base_dir;
303 next if ($subfile =~ m/^\.\.?$/);
304
305 # Recursively read each $subfile
306 print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
307
308 #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
309 &plugin::file_block_read ($pluginfo, $this_file_base_dir,
310 &util::filename_cat($file, $subfile),
311 $block_hash, $metadata, $gli);
312
313 }
314 $self->sort_out_associated_files($block_hash);
315 #return $count;
316
317}
318
319# We don't do metadata_read
320sub metadata_read {
321 my $self = shift (@_);
322 my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
323
324 return undef;
325}
326
327
328# return number of files processed, undef if can't process
329# Note that $base_dir might be "" and that $file might
330# include directories
331
332# This function passes around metadata hash structures. Metadata hash
333# structures are hashes that map from a (scalar) key (the metadata element
334# name) to either a scalar metadata value or a reference to an array of
335# such values.
336
337sub read {
338 my $self = shift (@_);
339 my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
340
341 my $outhandle = $self->{'outhandle'};
342 my $verbosity = $self->{'verbosity'};
343
344 # Calculate the directory name and ensure it is a directory and
345 # that it is not explicitly blocked.
346 my $dirname;
347 if ($file eq "") {
348 $dirname = $base_dir;
349 } else {
350 $dirname = $file;
351 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
352 }
353
354 my $directory_ok = $self->check_directory_path($dirname);
355 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
356
357 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
358 print $outhandle "DirectoryPlugin: metadata passed in: ",
359 join(", ", keys %$in_metadata), "\n";
360 }
361
362
363 # Recur over directory contents.
364 my (@dir, $subfile);
365 my $count = 0;
366
367 print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);
368
369 # find all the files in the directory
370 if (!opendir (DIR, $dirname)) {
371 if ($gli) {
372 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
373 }
374 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
375 return -1; # error in processing
376 }
377 @dir = readdir (DIR);
378 closedir (DIR);
379
380 # Re-order the files in the list so any directories ending with .all are moved to the end
381 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
382 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
383 push(@dir, splice(@dir, $i, 1));
384 }
385 }
386
387 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
388
389 my $additionalmetadata = 0; # is there extra metadata available?
390 my %extrametadata; # maps from filespec to extra metadata keys
391 my @extrametakeys; # keys of %extrametadata in order read
392
393
394 my $os_dirsep = &util::get_os_dirsep();
395 my $dirsep = &util::get_dirsep();
396 my $base_dir_regexp = $base_dir;
397 $base_dir_regexp =~ s/\//$os_dirsep/g;
398 my $local_dirname = $dirname;
399 $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
400 $local_dirname .= $dirsep;
401
402 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
403 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
404 foreach my $ek (@$extrakeys) {
405 my $extrakeys_re = $ek->{'re'};
406 my $extrakeys_md = $ek->{'md'};
407 push(@extrametakeys,$extrakeys_re);
408 $extrametadata{$extrakeys_re} = $extrakeys_md;
409 }
410 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
411 }
412
413 # apply metadata pass for each of the files in the directory
414 my $num_files = scalar(@dir);
415 for (my $i = 0; $i < scalar(@dir); $i++) {
416 my $subfile = $dir[$i];
417 my $this_file_base_dir = $base_dir;
418 last if ($maxdocs != -1 && $count >= $maxdocs);
419 next if ($subfile =~ m/^\.\.?$/);
420 my $file_subfile = &util::filename_cat($file, $subfile);
421 my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
422 if ($self->file_is_blocked($block_hash,$full_filename)) {
423 print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
424 next;
425 }
426
427 # Recursively read each $subfile
428 print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
429
430 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
431 $file_subfile,$block_hash,
432 \@extrametakeys, \%extrametadata,
433 $processor, $maxdocs, $gli);
434 $additionalmetadata = 1;
435 }
436
437 # filter out any extrametakeys that mention subdirectories and store
438 # for later use (i.e. when that sub-directory is being processed)
439
440 foreach my $ek (@extrametakeys) {
441 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
442 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax
443
444 my $dirsep_re = &util::get_re_dirsep();
445
446 my $ek_non_re = $ek;
447 $ek_non_re =~ s/\\\./\./g; # remove RE syntax
448
449 if ($ek_non_re =~ m/$dirsep_re/) { # specifies at least one directory
450 my $md = $extrametadata{$ek};
451
452 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
453
454 my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
455
456 # when its looked up, it must be relative to the base dir
457 push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
458 #push(@{$subdir_extrametakeys->{"$extrakey_dir"}},$subdir_rec);
459 }
460 }
461
462 # import each of the files in the directory
463 $count=0;
464 for (my $i = 0; $i <= scalar(@dir); $i++) {
465 # When every file in the directory has been done, pause for a moment (figuratively!)
466 # If the -recheck_directories argument hasn't been provided, stop now (default)
467 # Otherwise, re-read the contents of the directory to check for new files
468 # Any new files are added to the @dir list and are processed as normal
469 # This is necessary when documents to be indexed are specified in bibliographic DBs
470 # These files are copied/downloaded and stored in a new folder at import time
471 if ($i == $num_files) {
472 last unless $self->{'recheck_directories'};
473
474 # Re-read the files in the directory to see if there are any new files
475 last if (!opendir (DIR, $dirname));
476 my @dirnow = readdir (DIR);
477 closedir (DIR);
478
479 # We're only interested if there are more files than there were before
480 last if (scalar(@dirnow) <= scalar(@dir));
481
482 # Any new files are added to the end of @dir to get processed by the loop
483 my $j;
484 foreach my $subfilenow (@dirnow) {
485 for ($j = 0; $j < $num_files; $j++) {
486 last if ($subfilenow eq $dir[$j]);
487 }
488 if ($j == $num_files) {
489 # New file
490 push(@dir, $subfilenow);
491 }
492 }
493 # When the new files have been processed, check again
494 $num_files = scalar(@dir);
495 }
496
497 my $subfile = $dir[$i];
498 my $this_file_base_dir = $base_dir;
499 last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
500 next if ($subfile =~ /^\.\.?$/);
501
502 my $file_subfile = &util::filename_cat($file, $subfile);
503 my $full_filename
504 = &util::filename_cat($this_file_base_dir,$file_subfile);
505
506 if ($self->file_is_blocked($block_hash,$full_filename)) {
507 print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
508 next;
509 }
510 #print STDERR "processing $full_filename\n";
511 # Follow Windows shortcuts
512 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
513 require Win32::Shortcut;
514 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
515 if ($shortcut) {
516 # The file to be processed is now the target of the shortcut
517 $this_file_base_dir = "";
518 $file = "";
519 $subfile = $shortcut->Path;
520 }
521 }
522
523 # check for a symlink pointing back to a leading directory
524 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
525 # readlink gives a "fatal error" on systems that don't implement
526 # symlinks. This assumes the the -l test above would fail on those.
527 my $linkdest=readlink "$dirname/$subfile";
528 if (!defined ($linkdest)) {
529 # system error - file not found?
530 warn "DirectoryPlugin: symlink problem - $!";
531 } else {
532 # see if link points to current or a parent directory
533 if ($linkdest =~ m@^[\./\\]+$@ ||
534 index($dirname, $linkdest) != -1) {
535 warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
536 next;
537 ;
538 }
539 }
540 }
541
542 print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
543
544 # Make a copy of $in_metadata to pass to $subfile
545 my $out_metadata = {};
546 &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
547
548 # check the assocfile_info
549 if (defined $self->{'assocfile_info'}->{$full_filename}) {
550 &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
551 }
552 ## encode the filename as perl5 doesn't handle unicode filenames
553
554 my $tmpfile = Encode::encode_utf8($subfile);
555 #print STDERR "subfile = $subfile, tmpfile = $tmpfile\n";
556 # Next add metadata read in XML files (if it is supplied)
557 if ($additionalmetadata == 1) {
558 my ($filespec, $mdref);
559 foreach $filespec (@extrametakeys) {
560 ## use the utf8 encoded filename to do the filename comparison
561 if ($tmpfile =~ /^$filespec$/) {
562 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
563 if ($verbosity > 2);
564 $mdref = $extrametadata{$filespec};
565 &metadatautil::combine_metadata_structures($out_metadata, $mdref);
566 }
567 }
568 }
569
570
571 if (defined $self->{'inf_timestamp'}) {
572 my $inf_timestamp = $self->{'inf_timestamp'};
573
574 if (! -d $full_filename) {
575 my $filename_timestamp = -M $full_filename;
576 if ($filename_timestamp > $inf_timestamp) {
577 # filename has been around for longer than inf
578##### print $outhandle "**** Skipping $subfile\n";
579 next;
580 }
581 }
582 }
583
584 # Recursively read each $subfile
585 print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
586
587 $count += &plugin::read ($pluginfo, $this_file_base_dir,
588 $file_subfile, $block_hash,
589 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
590 }
591
592 return $count;
593}
594
595sub compile_stats {
596 my $self = shift(@_);
597 my ($stats) = @_;
598}
599
6001;
Note: See TracBrowser for help on using the repository browser.