source: main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm@ 21563

Last change on this file since 21563 was 21563, checked in by mdewsnip, 14 years ago

Deleted GDBMUtils.pm. What was the point of me spending all that time creating dbutil if people are just going to ignore it and continue writing GDBM-only code?!? Not impressed...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.6 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use PrintInfo;
33use plugin;
34use util;
35use metadatautil;
36
37use File::Basename;
38use strict;
39no strict 'refs';
40no strict 'subs';
41
42use Encode;
43
44BEGIN {
45 @DirectoryPlugin::ISA = ('PrintInfo');
46}
47
48my $arguments =
49 [ { 'name' => "block_exp",
50 'desc' => "{BasePlugin.block_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_block_exp(),
53 'reqd' => "no" },
54 # this option has been deprecated. leave it here for now so we can warn people not to use it
55 { 'name' => "use_metadata_files",
56 'desc' => "{DirectoryPlugin.use_metadata_files}",
57 'type' => "flag",
58 'reqd' => "no",
59 'hiddengli' => "yes" },
60 { 'name' => "recheck_directories",
61 'desc' => "{DirectoryPlugin.recheck_directories}",
62 'type' => "flag",
63 'reqd' => "no" } ];
64
65my $options = { 'name' => "DirectoryPlugin",
66 'desc' => "{DirectoryPlugin.desc}",
67 'abstract' => "no",
68 'inherits' => "yes",
69 'args' => $arguments };
70
71sub new {
72 my ($class) = shift (@_);
73 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
74 push(@$pluginlist, $class);
75
76 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
77 push(@{$hashArgOptLists->{"OptList"}},$options);
78
79 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
80
81 if ($self->{'info_only'}) {
82 # don't worry about any options or initialisations etc
83 return bless $self, $class;
84 }
85
86 # we have left this option in so we can warn people who are still using it
87 if ($self->{'use_metadata_files'}) {
88 die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
89 }
90
91 $self->{'num_processed'} = 0;
92 $self->{'num_not_processed'} = 0;
93 $self->{'num_blocked'} = 0;
94 $self->{'num_archives'} = 0;
95
96 $self->{'subdir_extrametakeys'} = {};
97
98 return bless $self, $class;
99}
100
101# called once, at the start of processing
102sub init {
103 my $self = shift (@_);
104 my ($verbosity, $outhandle, $failhandle) = @_;
105
106 # verbosity is passed through from the processor
107 $self->{'verbosity'} = $verbosity;
108
109 # as are the outhandle and failhandle
110 $self->{'outhandle'} = $outhandle if defined $outhandle;
111 $self->{'failhandle'} = $failhandle;
112
113}
114
115# called once, after all passes have finished
116sub deinit {
117 my ($self) = @_;
118
119}
120
121# called at the beginning of each plugin pass (import has one, building has many)
122sub begin {
123 my $self = shift (@_);
124 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
125
126 my $proc_package_name = ref $processor;
127
128 if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {
129
130 # Only lookup timestamp info for import.pl, and only if incremental is set
131
132 my $output_dir = $processor->getoutputdir();
133## my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
134 my $doc_db = "archiveinf-doc.gdb";
135 my $archives_inf = &util::filename_cat($output_dir,$doc_db);
136
137 if ( -e $archives_inf ) {
138 $self->{'inf_timestamp'} = -M $archives_inf;
139 }
140 }
141}
142
143sub remove_all {
144 my $self = shift (@_);
145 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
146
147}
148
149
150sub remove_one {
151 my $self = shift (@_);
152 my ($file, $oids, $archivedir) = @_;
153 return undef; # this will never be called for directories (will it??)
154
155}
156
157
158# called at the end of each plugin pass
159sub end {
160 my ($self) = shift (@_);
161
162}
163
164
165
166# return 1 if this class might recurse using $pluginfo
167sub is_recursive {
168 my $self = shift (@_);
169
170 return 1;
171}
172
173sub get_default_block_exp {
174 my $self = shift (@_);
175
176 return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|~)$';
177}
178
179sub check_directory_path {
180
181 my $self = shift(@_);
182 my ($dirname) = @_;
183
184 return undef unless (-d $dirname);
185
186 return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
187
188 my $outhandle = $self->{'outhandle'};
189
190 # check to make sure we're not reading the archives or index directory
191 my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
192 if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
193 print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
194 return 0;
195 }
196
197 # check to see we haven't got a cyclic path...
198 if ($dirname =~ m%(/.*){,41}%) {
199 print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
200 return 0;
201 }
202
203 # check to see we haven't got a cyclic path...
204 if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
205 print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
206 return 0;
207 }
208
209 return 1;
210}
211
212# this may be called more than once
213sub sort_out_associated_files {
214
215 my $self = shift (@_);
216 my ($block_hash) = @_;
217 if (!scalar (keys %{$block_hash->{'shared_fileroot'}})) {
218 return;
219 }
220
221 $self->{'assocfile_info'} = {} unless defined $self->{'assocfile_info'};
222 my $metadata = $self->{'assocfile_info'};
223 foreach my $prefix (keys %{$block_hash->{'shared_fileroot'}}) {
224 my $record = $block_hash->{'shared_fileroot'}->{$prefix};
225
226 my $tie_to = $record->{'tie_to'};
227 my $exts = $record->{'exts'};
228
229 if ((defined $tie_to) && (scalar (keys %$exts) > 0)) {
230 # set up fileblocks and assocfile_tobe
231 my $base_file = "$prefix$tie_to";
232 $metadata->{$base_file} = {} unless defined $metadata->{$base_file};
233 my $base_file_metadata = $metadata->{$base_file};
234
235 $base_file_metadata->{'gsdlassocfile_tobe'} = [] unless defined $base_file_metadata->{'gsdlassocfile_tobe'};
236 my $assoc_tobe = $base_file_metadata->{'gsdlassocfile_tobe'};
237 foreach my $e (keys %$exts) {
238 # block the file
239 $block_hash->{'file_blocks'}->{"$prefix$e"} = 1;
240 # set up as an associatd file
241 print STDERR " $self->{'plugin_type'}: Associating $prefix$e with $tie_to version\n";
242 my $mime_type = ""; # let system auto detect this
243 push(@$assoc_tobe,"$prefix$e:$mime_type:");
244
245 }
246 }
247 } # foreach record
248
249 $block_hash->{'shared_fileroot'} = undef;
250 $block_hash->{'shared_fileroot'} = {};
251
252}
253
254
255# do block exp OR special blocking ???
256
257sub file_is_blocked {
258 my $self = shift (@_);
259 my ($block_hash, $filename_full_path) = @_;
260
261 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
262 $self->{'num_blocked'} ++;
263 return 1;
264 }
265 # check Directory plugin's own block_exp
266 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
267 $self->{'num_blocked'} ++;
268 return 1; # blocked
269 }
270 return 0;
271}
272
273
274
275sub file_block_read {
276 my $self = shift (@_);
277 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
278
279 my $outhandle = $self->{'outhandle'};
280 my $verbosity = $self->{'verbosity'};
281
282 # Calculate the directory name and ensure it is a directory and
283 # that it is not explicitly blocked.
284 my $dirname = $file;
285 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
286
287 my $directory_ok = $self->check_directory_path($dirname);
288 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
289
290 print $outhandle "Global file scan checking directory: $dirname\n";
291
292 $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'};
293 $block_hash->{'metadata_files'} = {} unless defined $block_hash->{'metadata_files'};
294
295 $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'};
296 $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};
297
298 # Recur over directory contents.
299 my (@dir, $subfile);
300 #my $count = 0;
301
302 print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);
303
304 # find all the files in the directory
305 if (!opendir (DIR, $dirname)) {
306 if ($gli) {
307 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
308 }
309 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
310 return -1; # error in processing
311 }
312 @dir = readdir (DIR);
313 closedir (DIR);
314
315 for (my $i = 0; $i < scalar(@dir); $i++) {
316 my $subfile = $dir[$i];
317 my $this_file_base_dir = $base_dir;
318 next if ($subfile =~ m/^\.\.?$/);
319
320 # Recursively read each $subfile
321 print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
322
323 #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
324 &plugin::file_block_read ($pluginfo, $this_file_base_dir,
325 &util::filename_cat($file, $subfile),
326 $block_hash, $metadata, $gli);
327
328 }
329 $self->sort_out_associated_files($block_hash);
330 #return $count;
331
332}
333
334# We don't do metadata_read
335sub metadata_read {
336 my $self = shift (@_);
337 my ($pluginfo, $base_dir, $file, $block_hash,
338 $extrametakeys, $extrametadata, $extrametafile,
339 $processor, $maxdocs, $gli) = @_;
340
341 return undef;
342}
343
344
345# return number of files processed, undef if can't process
346# Note that $base_dir might be "" and that $file might
347# include directories
348
349# This function passes around metadata hash structures. Metadata hash
350# structures are hashes that map from a (scalar) key (the metadata element
351# name) to either a scalar metadata value or a reference to an array of
352# such values.
353
354sub read {
355 my $self = shift (@_);
356 my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
357
358 my $outhandle = $self->{'outhandle'};
359 my $verbosity = $self->{'verbosity'};
360
361 # Calculate the directory name and ensure it is a directory and
362 # that it is not explicitly blocked.
363 my $dirname;
364 if ($file eq "") {
365 $dirname = $base_dir;
366 } else {
367 $dirname = $file;
368 $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
369 }
370
371 my $directory_ok = $self->check_directory_path($dirname);
372 return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
373
374 if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
375 print $outhandle "DirectoryPlugin: metadata passed in: ",
376 join(", ", keys %$in_metadata), "\n";
377 }
378
379
380 # Recur over directory contents.
381 my (@dir, $subfile);
382 my $count = 0;
383
384 print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);
385
386 # find all the files in the directory
387 if (!opendir (DIR, $dirname)) {
388 if ($gli) {
389 print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
390 }
391 print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
392 return -1; # error in processing
393 }
394 @dir = readdir (DIR);
395 closedir (DIR);
396
397 # Re-order the files in the list so any directories ending with .all are moved to the end
398 for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
399 if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
400 push(@dir, splice(@dir, $i, 1));
401 }
402 }
403
404 # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
405
406 my $additionalmetadata = 0; # is there extra metadata available?
407 my %extrametadata; # maps from filespec to extra metadata keys
408 my %extrametafile; # maps from filespec to the metadata.xml (or similar) file it came from
409 my @extrametakeys; # keys of %extrametadata in order read
410
411
412 my $os_dirsep = &util::get_os_dirsep();
413 my $dirsep = &util::get_dirsep();
414 my $base_dir_regexp = $base_dir;
415 $base_dir_regexp =~ s/\//$os_dirsep/g;
416 my $local_dirname = $dirname;
417 $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
418 $local_dirname .= $dirsep;
419
420 if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
421 my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
422 foreach my $ek (@$extrakeys) {
423 my $extrakeys_re = $ek->{'re'};
424 my $extrakeys_md = $ek->{'md'};
425 my $extrakeys_mf = $ek->{'mf'};
426 push(@extrametakeys,$extrakeys_re);
427 $extrametadata{$extrakeys_re} = $extrakeys_md;
428 $extrametafile{$extrakeys_re} = $extrakeys_mf;
429 }
430 delete($self->{'subdir_extrametakeys'}->{$local_dirname});
431 }
432
433 # apply metadata pass for each of the files in the directory
434 my $num_files = scalar(@dir);
435 for (my $i = 0; $i < scalar(@dir); $i++) {
436 my $subfile = $dir[$i];
437 my $this_file_base_dir = $base_dir;
438 last if ($maxdocs != -1 && $count >= $maxdocs);
439 next if ($subfile =~ m/^\.\.?$/);
440 my $file_subfile = &util::filename_cat($file, $subfile);
441 my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
442 if ($self->file_is_blocked($block_hash,$full_filename)) {
443 print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
444 next;
445 }
446
447 # Recursively read each $subfile
448 print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
449
450 $count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
451 $file_subfile,$block_hash,
452 \@extrametakeys, \%extrametadata,
453 \%extrametafile,
454 $processor, $maxdocs, $gli);
455 $additionalmetadata = 1;
456 }
457
458 # filter out any extrametakeys that mention subdirectories and store
459 # for later use (i.e. when that sub-directory is being processed)
460
461 foreach my $ek (@extrametakeys) {
462 my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
463
464 $extrakey_dir =~ s/\\\./\./g; # remove RE syntax for .
465 $extrakey_dir =~ s/\\\\/\\/g; # remove RE syntax for \
466
467 my $dirsep_re = &util::get_re_dirsep();
468
469 my $ek_non_re = $ek;
470 $ek_non_re =~ s/\\\./\./g; # remove RE syntax for .
471 $ek_non_re =~ s/\\\\/\\/g; # remove RE syntax for \
472 if ($ek_non_re =~ m/$dirsep_re/) { # specifies at least one directory
473 my $md = $extrametadata{$ek};
474 my $mf = $extrametafile{$ek};
475
476 my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
477
478 my $subdir_rec = { 're' => $subdir_re, 'md' => $md, 'mf' => $mf };
479
480 # when its looked up, it must be relative to the base dir
481 push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
482 #push(@{$subdir_extrametakeys->{"$extrakey_dir"}},$subdir_rec);
483 }
484 }
485
486 # import each of the files in the directory
487 $count=0;
488 for (my $i = 0; $i <= scalar(@dir); $i++) {
489 # When every file in the directory has been done, pause for a moment (figuratively!)
490 # If the -recheck_directories argument hasn't been provided, stop now (default)
491 # Otherwise, re-read the contents of the directory to check for new files
492 # Any new files are added to the @dir list and are processed as normal
493 # This is necessary when documents to be indexed are specified in bibliographic DBs
494 # These files are copied/downloaded and stored in a new folder at import time
495 if ($i == $num_files) {
496 last unless $self->{'recheck_directories'};
497
498 # Re-read the files in the directory to see if there are any new files
499 last if (!opendir (DIR, $dirname));
500 my @dirnow = readdir (DIR);
501 closedir (DIR);
502
503 # We're only interested if there are more files than there were before
504 last if (scalar(@dirnow) <= scalar(@dir));
505
506 # Any new files are added to the end of @dir to get processed by the loop
507 my $j;
508 foreach my $subfilenow (@dirnow) {
509 for ($j = 0; $j < $num_files; $j++) {
510 last if ($subfilenow eq $dir[$j]);
511 }
512 if ($j == $num_files) {
513 # New file
514 push(@dir, $subfilenow);
515 }
516 }
517 # When the new files have been processed, check again
518 $num_files = scalar(@dir);
519 }
520
521 my $subfile = $dir[$i];
522 my $this_file_base_dir = $base_dir;
523 last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
524 next if ($subfile =~ /^\.\.?$/);
525
526 my $file_subfile = &util::filename_cat($file, $subfile);
527 my $full_filename
528 = &util::filename_cat($this_file_base_dir,$file_subfile);
529
530 if ($self->file_is_blocked($block_hash,$full_filename)) {
531 print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
532 next;
533 }
534 #print STDERR "processing $full_filename\n";
535 # Follow Windows shortcuts
536 if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
537 require Win32::Shortcut;
538 my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
539 if ($shortcut) {
540 # The file to be processed is now the target of the shortcut
541 $this_file_base_dir = "";
542 $file = "";
543 $subfile = $shortcut->Path;
544 }
545 }
546
547 # check for a symlink pointing back to a leading directory
548 if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
549 # readlink gives a "fatal error" on systems that don't implement
550 # symlinks. This assumes the the -l test above would fail on those.
551 my $linkdest=readlink "$dirname/$subfile";
552 if (!defined ($linkdest)) {
553 # system error - file not found?
554 warn "DirectoryPlugin: symlink problem - $!";
555 } else {
556 # see if link points to current or a parent directory
557 if ($linkdest =~ m@^[\./\\]+$@ ||
558 index($dirname, $linkdest) != -1) {
559 warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
560 next;
561 ;
562 }
563 }
564 }
565
566 print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
567
568 # Make a copy of $in_metadata to pass to $subfile
569 my $out_metadata = {};
570 &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
571
572 # check the assocfile_info
573 if (defined $self->{'assocfile_info'}->{$full_filename}) {
574 &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
575 }
576 ## encode the filename as perl5 doesn't handle unicode filenames
577
578 my $tmpfile = Encode::encode_utf8($subfile);
579 # Next add metadata read in XML files (if it is supplied)
580 if ($additionalmetadata == 1) {
581 foreach my $filespec (@extrametakeys) {
582
583 ## use the utf8 encoded filename to do the filename comparison
584 if ($tmpfile =~ /^$filespec$/) {
585 print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
586 if ($verbosity > 2);
587 my $mdref = $extrametadata{$filespec};
588 my $mfref = $extrametafile{$filespec};
589
590 # Add the list files where the metadata came from
591 # into the metadata table so we can track this
592 # This mechanism is similar to how gsdlassocfile works
593
594 my @metafile_pair = ();
595 foreach my $l (keys %$mfref) {
596 my $f = $mfref->{$l};
597 push (@metafile_pair, "$f : $l");
598 }
599
600 $mdref->{'gsdlmetafile'} = \@metafile_pair;
601
602 &metadatautil::combine_metadata_structures($out_metadata, $mdref);
603 }
604 }
605 }
606
607 if (defined $self->{'inf_timestamp'}) {
608 # Look to see if it's a completely new file
609
610 if (!$block_hash->{'new_files'}->{$full_filename}) {
611 # Not a new file, must be an existing file
612 # Let' see if it's newer than the last import.pl
613
614
615 if (! -d $full_filename) {
616 if (!$block_hash->{'reindex_files'}->{$full_filename}) {
617 # filename has been around for longer than inf_timestamp
618 print $outhandle "**** Skipping $subfile\n" if ($verbosity >3);
619 next;
620 }
621 else {
622 # Remove old folder in archives (might hash to something different)
623 # *** should be doing this on a Del one as well
624 # but leave folder name?? and ensure hashs to
625 # same again??
626
627 # Then let through as new doc??
628
629 # mark to doc-oids that rely on it for re-indexing
630 }
631 }
632 }
633 }
634
635 # Recursively read each $subfile
636 print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
637
638 $count += &plugin::read ($pluginfo, $this_file_base_dir,
639 $file_subfile, $block_hash,
640 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
641 }
642
643 return $count;
644}
645
646sub compile_stats {
647 my $self = shift(@_);
648 my ($stats) = @_;
649}
650
6511;
Note: See TracBrowser for help on using the repository browser.