source: gsdl/trunk/perllib/plugins/DirectoryPlugin.pm@ 18523

Last change on this file since 18523 was 18523, checked in by davidb, 15 years ago

File-block pass now prints out message for each directory processed. Might be over-kill in the amount of message printed to the command file, but at this stage it is considered better than the import.pl just sitting there seemingly doing nothing when run on a large collections. If output eventually considered too much, could shift to system of printint out a '.' for every 50 or so files/directories processed, as is done when converting video files by VideoPlugin

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use PrintInfo;
33use plugin;
34use util;
35use metadatautil;
36
37use File::Basename;
38use strict;
39no strict 'refs';
40no strict 'subs';
41
42use Encode;
43
44BEGIN {
45 @DirectoryPlugin::ISA = ('PrintInfo');
46}
47
48my $arguments =
49 [ { 'name' => "block_exp",
50 'desc' => "{BasePlugin.block_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_block_exp(),
53 'reqd' => "no" },
54 # this option has been deprecated. leave it here for now so we can warn people not to use it
55 { 'name' => "use_metadata_files",
56 'desc' => "{DirectoryPlugin.use_metadata_files}",
57 'type' => "flag",
58 'reqd' => "no",
59 'hiddengli' => "yes" },
60 { 'name' => "recheck_directories",
61 'desc' => "{DirectoryPlugin.recheck_directories}",
62 'type' => "flag",
63 'reqd' => "no" } ];
64
65my $options = { 'name' => "DirectoryPlugin",
66 'desc' => "{DirectoryPlugin.desc}",
67 'abstract' => "no",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71sub new {
72 my ($class) = shift (@_);
73 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
74 push(@$pluginlist, $class);
75
76 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
80
81 if ($self->{'info_only'}) {
82 # don't worry about any options or initialisations etc
83 return bless $self, $class;
84 }
85
86 # we have left this option in so we can warn people who are still using it
87 if ($self->{'use_metadata_files'}) {
88 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
89 }
90
91 $self->{'num_processed'} = 0;
92 $self->{'num_not_processed'} = 0;
93 $self->{'num_blocked'} = 0;
94 $self->{'num_archives'} = 0;
95
96 $self->{'subdir_extrametakeys'} = {};
97
98 return bless $self, $class;
99}
100
101# called once, at the start of processing
102sub init {
103 my $self = shift (@_);
104 my ($verbosity, $outhandle, $failhandle) = @_;
105
106 # verbosity is passed through from the processor
107 $self->{'verbosity'} = $verbosity;
108
109 # as are the outhandle and failhandle
110 $self->{'outhandle'} = $outhandle if defined $outhandle;
111 $self->{'failhandle'} = $failhandle;
112
113}
114
115# called once, after all passes have finished
116sub deinit {
117 my ($self) = @_;
118
119}
120
121# called at the beginning of each plugin pass (import has one, building has many)
122sub begin {
123 my $self = shift (@_);
124 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
125
126 my $proc_package_name = ref $processor;
127
128 if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
129
130 # Only lookup timestamp info for import.pl, and only if incremental is set
131
132 my $output_dir = $processor->getoutputdir();
133## my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
134 my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
135 my $doc_db = "archiveinf-doc$db_ext";
136 my $archives_inf = &util::filename_cat($output_dir,$doc_db);
137
138 if ( -e $archives_inf ) {
139 $self->{'inf_timestamp'} = -M $archives_inf;
140 }
141 }
142}
143
144# called at the end of each plugin pass
145sub end {
146 my ($self) = shift (@_);
147
148}
149
150
151# called if we are doing incremental building
152sub set_incremental {
153 my $self = shift(@_);
154 my ($incremental) = @_;
155
156 $self->{'incremental'} = $incremental;
157}
158
159# return 1 if this class might recurse using $pluginfo
160sub is_recursive {
161 my $self = shift (@_);
162
163 return 1;
164}
165
166sub get_default_block_exp {
167 my $self = shift (@_);
168
169 return '(?i)(CVS|\.svn|Thumbs\.db|~)$';
170}
171
172sub check_directory_path {
173
174 my $self = shift(@_);
175 my ($dirname) = @_;
176
177 return undef unless (-d $dirname);
178
179 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
180
181 my $outhandle = $self->{'outhandle'};
182
183 # check to make sure we're not reading the archives or index directory
184 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
185 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
186 print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
187 return 0;
188 }
189
190 # check to see we haven't got a cyclic path...
191 if ($dirname =~ m%(/.*){,41}%) {
192 print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
193 return 0;
194 }
195
196 # check to see we haven't got a cyclic path...
197 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
198 print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
199 return 0;
200 }
201
202 return 1;
203}
204
205# this may be called more than once
206sub sort_out_associated_files {
207
208 my $self = shift (@_);
209 my ($block_hash) = @_;
210 if (!scalar (keys %{$block_hash->{'shared_fileroot'}})) {
211 return;
212 }
213
214 $self->{'assocfile_info'} = {} unless defined $self->{'assocfile_info'};
215 my $metadata = $self->{'assocfile_info'};
216 foreach my $prefix (keys %{$block_hash->{'shared_fileroot'}}) {
217 my $record = $block_hash->{'shared_fileroot'}->{$prefix};
218
219 my $tie_to = $record->{'tie_to'};
220 my $exts = $record->{'exts'};
221
222 if ((defined $tie_to) && (scalar (keys %$exts) > 0)) {
223 # set up fileblocks and assocfile_tobe
224 my $base_file = "$prefix$tie_to";
225 $metadata->{$base_file} = {} unless defined $metadata->{$base_file};
226 my $base_file_metadata = $metadata->{$base_file};
227
228 $base_file_metadata->{'gsdlassocfile_tobe'} = [] unless defined $base_file_metadata->{'gsdlassocfile_tobe'};
229 my $assoc_tobe = $base_file_metadata->{'gsdlassocfile_tobe'};
230 foreach my $e (keys %$exts) {
231 # block the file
232 $block_hash->{'file_blocks'}->{"$prefix$e"} = 1;
233 # set up as an associatd file
234 print STDERR " $self->{'plugin_type'}: Associating $prefix$e with $tie_to version\n";
235 my $mime_type = ""; # let system auto detect this
236 push(@$assoc_tobe,"$prefix$e:$mime_type:");
237
238 }
239 }
240 } # foreach record
241
242 $block_hash->{'shared_fileroot'} = undef;
243 $block_hash->{'shared_fileroot'} = {};
244
245}
246
247
248# do block exp OR special blocking ???
249
250sub file_is_blocked {
251 my $self = shift (@_);
252 my ($block_hash, $filename_full_path) = @_;
253
254 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
255 $self->{'num_blocked'} ++;
256 return 1;
257 }
258 # check Directory plugin's own block_exp
259 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
260 $self->{'num_blocked'} ++;
261 return 1; # blocked
262 }
263 return 0;
264}
265
266
267
268sub file_block_read {
269 my $self = shift (@_);
270 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
271
272 my $outhandle = $self->{'outhandle'};
273 my $verbosity = $self->{'verbosity'};
274
275 # Calculate the directory name and ensure it is a directory and
276 # that it is not explicitly blocked.
277 my $dirname = $file;
278 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
279
280 my $directory_ok = $self->check_directory_path($dirname);
281 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
282
283 print $outhandle "Global file scan checking directory: $dirname\n";
284
285 $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'};
286
287 $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'};
288 $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};
289
290 # Recur over directory contents.
291 my (@dir, $subfile);
292 #my $count = 0;
293
294 print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);
295
296 # find all the files in the directory
297 if (!opendir (DIR, $dirname)) {
298 if ($gli) {
299 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
300 }
301 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
302 return -1; # error in processing
303 }
304 @dir = readdir (DIR);
305 closedir (DIR);
306
307 for (my $i = 0; $i < scalar(@dir); $i++) {
308 my $subfile = $dir[$i];
309 my $this_file_base_dir = $base_dir;
310 next if ($subfile =~ m/^\.\.?$/);
311
312 # Recursively read each $subfile
313 print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
314
315 #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
316 &plugin::file_block_read ($pluginfo, $this_file_base_dir,
317 &util::filename_cat($file, $subfile),
318 $block_hash, $metadata, $gli);
319
320 }
321 $self->sort_out_associated_files($block_hash);
322 #return $count;
323
324}
325
326# We don't do metadata_read
327sub metadata_read {
328 my $self = shift (@_);
329 my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
330
331 return undef;
332}
333
334
335# return number of files processed, undef if can't process
336# Note that $base_dir might be "" and that $file might
337# include directories
338
339# This function passes around metadata hash structures. Metadata hash
340# structures are hashes that map from a (scalar) key (the metadata element
341# name) to either a scalar metadata value or a reference to an array of
342# such values.
343
344sub read {
345 my $self = shift (@_);
346 my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
347
348 my $outhandle = $self->{'outhandle'};
349 my $verbosity = $self->{'verbosity'};
350
351 # Calculate the directory name and ensure it is a directory and
352 # that it is not explicitly blocked.
353 my $dirname;
354 if ($file eq "") {
355 $dirname = $base_dir;
356 } else {
357 $dirname = $file;
358 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
359 }
360
361 my $directory_ok = $self->check_directory_path($dirname);
362 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
363
364 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
365 print $outhandle "DirectoryPlugin: metadata passed in: ",
366 join(", ", keys %$in_metadata), "\n";
367 }
368
369
370 # Recur over directory contents.
371 my (@dir, $subfile);
372 my $count = 0;
373
374 print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);
375
376 # find all the files in the directory
377 if (!opendir (DIR, $dirname)) {
378 if ($gli) {
379 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
380 }
381 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
382 return -1; # error in processing
383 }
384 @dir = readdir (DIR);
385 closedir (DIR);
386
387 # Re-order the files in the list so any directories ending with .all are moved to the end
388 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
389 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
390 push(@dir, splice(@dir, $i, 1));
391 }
392 }
393
394 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
395
396 my $additionalmetadata = 0; # is there extra metadata available?
397 my %extrametadata; # maps from filespec to extra metadata keys
398 my @extrametakeys; # keys of %extrametadata in order read
399
400
401 my $os_dirsep = &util::get_os_dirsep();
402 my $dirsep = &util::get_dirsep();
403 my $base_dir_regexp = $base_dir;
404 $base_dir_regexp =~ s/\//$os_dirsep/g;
405 my $local_dirname = $dirname;
406 $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
407 $local_dirname .= $dirsep;
408
409 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
410 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
411 foreach my $ek (@$extrakeys) {
412 my $extrakeys_re = $ek->{'re'};
413 my $extrakeys_md = $ek->{'md'};
414 push(@extrametakeys,$extrakeys_re);
415 $extrametadata{$extrakeys_re} = $extrakeys_md;
416 }
417 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
418 }
419
420 # apply metadata pass for each of the files in the directory
421 my $num_files = scalar(@dir);
422 for (my $i = 0; $i < scalar(@dir); $i++) {
423 my $subfile = $dir[$i];
424 my $this_file_base_dir = $base_dir;
425 last if ($maxdocs != -1 && $count >= $maxdocs);
426 next if ($subfile =~ m/^\.\.?$/);
427 my $file_subfile = &util::filename_cat($file, $subfile);
428 my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
429 if ($self->file_is_blocked($block_hash,$full_filename)) {
430 print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
431 next;
432 }
433
434 # Recursively read each $subfile
435 print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
436
437 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
438 $file_subfile,$block_hash,
439 \@extrametakeys, \%extrametadata,
440 $processor, $maxdocs, $gli);
441 $additionalmetadata = 1;
442 }
443
444 # filter out any extrametakeys that mention subdirectories and store
445 # for later use (i.e. when that sub-directory is being processed)
446
447 foreach my $ek (@extrametakeys) {
448 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
449 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax
450
451 my $dirsep_re = &util::get_re_dirsep();
452
453 my $ek_non_re = $ek;
454 $ek_non_re =~ s/\\\./\./g; # remove RE syntax
455
456 if ($ek_non_re =~ m/$dirsep_re/) { # specifies at least one directory
457 my $md = $extrametadata{$ek};
458
459 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
460
461 my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
462
463 # when its looked up, it must be relative to the base dir
464 push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
465 #push(@{$subdir_extrametakeys->{"$extrakey_dir"}},$subdir_rec);
466 }
467 }
468
469 # import each of the files in the directory
470 $count=0;
471 for (my $i = 0; $i <= scalar(@dir); $i++) {
472 # When every file in the directory has been done, pause for a moment (figuratively!)
473 # If the -recheck_directories argument hasn't been provided, stop now (default)
474 # Otherwise, re-read the contents of the directory to check for new files
475 # Any new files are added to the @dir list and are processed as normal
476 # This is necessary when documents to be indexed are specified in bibliographic DBs
477 # These files are copied/downloaded and stored in a new folder at import time
478 if ($i == $num_files) {
479 last unless $self->{'recheck_directories'};
480
481 # Re-read the files in the directory to see if there are any new files
482 last if (!opendir (DIR, $dirname));
483 my @dirnow = readdir (DIR);
484 closedir (DIR);
485
486 # We're only interested if there are more files than there were before
487 last if (scalar(@dirnow) <= scalar(@dir));
488
489 # Any new files are added to the end of @dir to get processed by the loop
490 my $j;
491 foreach my $subfilenow (@dirnow) {
492 for ($j = 0; $j < $num_files; $j++) {
493 last if ($subfilenow eq $dir[$j]);
494 }
495 if ($j == $num_files) {
496 # New file
497 push(@dir, $subfilenow);
498 }
499 }
500 # When the new files have been processed, check again
501 $num_files = scalar(@dir);
502 }
503
504 my $subfile = $dir[$i];
505 my $this_file_base_dir = $base_dir;
506 last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
507 next if ($subfile =~ /^\.\.?$/);
508
509 my $file_subfile = &util::filename_cat($file, $subfile);
510 my $full_filename
511 = &util::filename_cat($this_file_base_dir,$file_subfile);
512
513 if ($self->file_is_blocked($block_hash,$full_filename)) {
514 print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
515 next;
516 }
517 #print STDERR "processing $full_filename\n";
518 # Follow Windows shortcuts
519 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
520 require Win32::Shortcut;
521 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
522 if ($shortcut) {
523 # The file to be processed is now the target of the shortcut
524 $this_file_base_dir = "";
525 $file = "";
526 $subfile = $shortcut->Path;
527 }
528 }
529
530 # check for a symlink pointing back to a leading directory
531 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
532 # readlink gives a "fatal error" on systems that don't implement
533 # symlinks. This assumes the the -l test above would fail on those.
534 my $linkdest=readlink "$dirname/$subfile";
535 if (!defined ($linkdest)) {
536 # system error - file not found?
537 warn "DirectoryPlugin: symlink problem - $!";
538 } else {
539 # see if link points to current or a parent directory
540 if ($linkdest =~ m@^[\./\\]+$@ ||
541 index($dirname, $linkdest) != -1) {
542 warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
543 next;
544 ;
545 }
546 }
547 }
548
549 print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
550
551 # Make a copy of $in_metadata to pass to $subfile
552 my $out_metadata = {};
553 &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
554
555 # check the assocfile_info
556 if (defined $self->{'assocfile_info'}->{$full_filename}) {
557 &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
558 }
559 ## encode the filename as perl5 doesn't handle unicode filenames
560
561 my $tmpfile = Encode::encode_utf8($subfile);
562 #print STDERR "subfile = $subfile, tmpfile = $tmpfile\n";
563 # Next add metadata read in XML files (if it is supplied)
564 if ($additionalmetadata == 1) {
565 my ($filespec, $mdref);
566 foreach $filespec (@extrametakeys) {
567 ## use the utf8 encoded filename to do the filename comparison
568 if ($tmpfile =~ /^$filespec$/) {
569 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
570 if ($verbosity > 2);
571 $mdref = $extrametadata{$filespec};
572 &metadatautil::combine_metadata_structures($out_metadata, $mdref);
573 }
574 }
575 }
576
577
578 if (defined $self->{'inf_timestamp'}) {
579 # Look to see if it's a completely new file
580
581 if (!$block_hash->{'new_files'}->{$full_filename}) {
582 # Not a new file, must be an existing file
583 # Let' see if it's newer than the last import.pl
584
585
586 my $inf_timestamp = $self->{'inf_timestamp'};
587
588 if (! -d $full_filename) {
589 my $filename_timestamp = -M $full_filename;
590 if ($filename_timestamp > $inf_timestamp) {
591 # filename has been around for longer than inf
592 print $outhandle "**** Skipping $subfile\n" if ($verbosity >3);
593 next;
594 }
595 else {
596 # Remove old folder in archives (might hash to something different)
597 # *** should be doing this on a Del one as well
598 # but leave folder name?? and ensure hashs to
599 # same again??
600
601 # Then let through as new doc??
602
603 # mark to doc-oids that rely on it for re-indexing
604 ## &GDBMUtils::gdbmDatabase();
605
606 }
607 }
608 }
609 }
610
611 # Recursively read each $subfile
612 print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
613
614 $count += &plugin::read ($pluginfo, $this_file_base_dir,
615 $file_subfile, $block_hash,
616 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
617 }
618
619 return $count;
620}
621
622sub compile_stats {
623 my $self = shift(@_);
624 my ($stats) = @_;
625}
626
6271;
Note: See TracBrowser for help on using the repository browser.