source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/DirectoryPlugin.pm@ 30289

Last change on this file since 30289 was 30289, checked in by jmt12, 9 years ago

Significant changes to read() function - essentially split in half with the first phase responsible for building up the list of files to process and the second for doing the actual processing. Allows us to shortcut the system by passing in a list of files to process (as in the case of manifest version 2).

  • Property svn:executable set to *
File size: 25.1 KB
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use extrametautil;
33use PrintInfo;
34use plugin;
35use util;
36use FileUtils;
37use metadatautil;
38
39use File::Basename;
40use strict;
41no strict 'refs';
42no strict 'subs';
43
44use Encode::Locale;
45use Encode;
46use Unicode::Normalize;
47
BEGIN {
    # DirectoryPlugin inherits from PrintInfo.
    @DirectoryPlugin::ISA = ('PrintInfo');
}

# Argument descriptors contributed by this plugin. The 'desc' values are
# keys of the form {Class.name} rather than literal text.
my $arguments = [
    {
        'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => "regexp",
        'deft' => &get_default_block_exp(),
        'reqd' => "no",
    },

    # this option has been deprecated. leave it here for now so we can
    # warn people not to use it
    {
        'name'      => "use_metadata_files",
        'desc'      => "{DirectoryPlugin.use_metadata_files}",
        'type'      => "flag",
        'reqd'      => "no",
        'hiddengli' => "yes",
    },

    {
        'name' => "recheck_directories",
        'desc' => "{DirectoryPlugin.recheck_directories}",
        'type' => "flag",
        'reqd' => "no",
    },
];

# Plugin descriptor; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => "DirectoryPlugin",
    'desc'     => "{DirectoryPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
74
# Constructor.
#
# Registers this class in $pluginlist, appends this plugin's argument and
# option descriptors to $hashArgOptLists, builds the object through the
# PrintInfo constructor and initialises the per-run counters.
#
# Dies if the deprecated -use_metadata_files option was supplied.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    print STDERR "INFO: This DirectoryPlugin supports version 2 manifest files\n";

    # When only plugin information is wanted, skip option checks and
    # initialisation entirely.
    return bless($self, $class) if ($self->{'info_only'});

    # we have left this option in so we can warn people who are still using it
    if ($self->{'use_metadata_files'}) {
        die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
    }

    # Statistics counters.
    foreach my $counter ('num_processed', 'num_not_processed', 'num_blocked', 'num_archives') {
        $self->{$counter} = 0;
    }

    # Extra metadata keys that refer to sub-directories, parked here by
    # read_phase2() until the sub-directory itself is processed.
    $self->{'subdir_extrametakeys'} = {};

    return bless($self, $class);
}
106
# Called once, at the start of processing.
#
# Stores the verbosity level and the output/failure handles passed through
# from the processor. An undefined $outhandle leaves any existing
# 'outhandle' entry untouched; 'failhandle' is always overwritten.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
120
# Called once, after all passes have finished.
# This plugin has nothing to tear down.
sub deinit {
    my ($self) = @_;
}
126
# Called at the beginning of each plugin pass (import has one, building has
# many).
#
# When the processor's class name does not end in "buildproc" and
# $self->{'incremental'} is 1, records the -M age of the archiveinf-doc
# database in $self->{'inf_timestamp'}, provided that file exists.
sub begin {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # Only lookup timestamp info for import.pl, and only if incremental is set
    my $processor_class = ref($processor);
    if ($processor_class !~ /buildproc$/ && $self->{'incremental'} == 1) {
        # Get the infodbtype value for this collection from the arcinfo object
        my $db_type = $processor->getoutputinfo()->{'infodbtype'};
        if ($db_type eq "gdbm-txtgz") {
            # in archives, cannot use txtgz version
            $db_type = "gdbm";
        }

        my $archives_dir = $processor->getoutputdir();
        my $archives_inf = &dbutil::get_infodb_file_path($db_type, "archiveinf-doc", $archives_dir);

        if (-e $archives_inf) {
            $self->{'inf_timestamp'} = -M $archives_inf;
        }
    }
}
146
# Part of the plugin interface; this plugin has nothing to remove.
sub remove_all {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
}
151
152
# Part of the plugin interface. Always returns undef.
sub remove_one {
    my ($self, $file, $oids, $archivedir) = @_;

    # this will never be called for directories (will it??)
    return undef;
}
159
160
# Called at the end of each plugin pass.
# Nothing needs to happen here for this plugin.
sub end {
    my ($self) = shift(@_);
}
166
167
168
# Return 1 if this class might recurse using $pluginfo.
# DirectoryPlugin always does.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
175
# Default value for the block_exp argument: a case-insensitive regular
# expression, anchored at the end of the path, for names this plugin
# skips by default.
# Also invoked with no arguments when the argument table is built, so
# $self may be undefined here.
sub get_default_block_exp {
    my ($self) = @_;

    my $default_block_exp = '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|\.DS_Store|~)$';
    return $default_block_exp;
}
181
# Decide whether $dirname is a directory this plugin should descend into.
#
# Returns:
#   undef - $dirname is not a directory
#   0     - it is a directory but must be skipped (matches block_exp, looks
#           like a collection's archives/index directory, is nested too
#           deeply, or looks like an import loop)
#   1     - safe to process
#
# Skip reasons other than block_exp are reported on $self->{'outhandle'}.
sub check_directory_path {
    my $self = shift(@_);
    my ($dirname) = @_;

    # Callers test this result with defined(), so an explicit undef is
    # part of the contract.
    return undef unless (-d $dirname);

    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);

    my $outhandle = $self->{'outhandle'};

    # check to make sure we're not reading the archives or index directory
    my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
    if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
        print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
        return 0;
    }

    # check to see we haven't got a cyclic path...
    # The previous test used the pattern (/.*){,41}. "{,41}" is not a
    # "41 or more" quantifier: older perls treat it as literal text (so
    # the check never fired) and perl 5.34+ reads it as {0,41} (so the
    # check fired for every directory). Count the separators instead.
    my $max_depth = 40;
    my $depth     = ($dirname =~ tr{/}{});
    if ($depth > $max_depth) {
        print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
        return 0;
    }

    # check to see we haven't got a cyclic path...
    # (an import directory reached again underneath itself)
    if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
        print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
        return 0;
    }

    return 1;
}
214
# Turn the 'shared_fileroot' records gathered in $block_hash into
# associated-file information. This may be called more than once.
#
# For every file root that has a 'tie_to' extension and at least one entry
# in 'exts', each "$root$ext" file is blocked via util::block_filename and
# queued in $self->{'assocfile_info'}{"$root$tie_to"}{'gsdlassocfile_tobe'}.
# 'shared_fileroot' is emptied afterwards.
sub sort_out_associated_files {
    my ($self, $block_hash) = @_;

    # Nothing has been collected, so there is nothing to sort out.
    return if (!scalar(keys %{ $block_hash->{'shared_fileroot'} }));

    $self->{'assocfile_info'} = {} unless defined $self->{'assocfile_info'};
    my $assoc_info = $self->{'assocfile_info'};

    foreach my $fileroot (keys %{ $block_hash->{'shared_fileroot'} }) {
        my $entry  = $block_hash->{'shared_fileroot'}->{$fileroot};
        my $tie_to = $entry->{'tie_to'};
        my $exts   = $entry->{'exts'};

        # Only roots with a primary file and something to attach qualify.
        next unless ((defined $tie_to) && (scalar(keys %$exts) > 0));

        # set up fileblocks and assocfile_tobe
        my $primary_file = "$fileroot$tie_to";
        $assoc_info->{$primary_file} = {} unless defined $assoc_info->{$primary_file};
        my $primary_metadata = $assoc_info->{$primary_file};

        $primary_metadata->{'gsdlassocfile_tobe'} = []
            unless defined $primary_metadata->{'gsdlassocfile_tobe'};
        my $pending = $primary_metadata->{'gsdlassocfile_tobe'};

        foreach my $ext (keys %$exts) {
            my $assoc_path = "$fileroot$ext";

            # block the file
            &util::block_filename($block_hash, $assoc_path);

            # set up as an associated file
            print STDERR " $self->{'plugin_type'}: Associating $assoc_path with $tie_to version\n";
            my $mime_type = ""; # let system auto detect this
            push(@$pending, "$assoc_path:$mime_type:");
        }
    }

    # Start the next collection round with an empty table.
    $block_hash->{'shared_fileroot'} = {};
}
256
257
# do block exp OR special blocking ???
#
# Returns 1 (and increments $self->{'num_blocked'}) when the file is listed
# in $block_hash->{'file_blocks'} or matches this plugin's own block_exp;
# returns 0 otherwise.
sub file_is_blocked {
    my ($self, $block_hash, $filename_full_path) = @_;

    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);

    # on windows (but not cygwin), all block paths are lowercased.
    my $lookup_key = $filename_full_path;
    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
        $lookup_key = lc($filename_full_path);
    }

    # Explicit blocks take precedence; the plugin's own block_exp is only
    # consulted when the file is not already listed.
    if ((defined $block_hash->{'file_blocks'}->{$lookup_key})
        || ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/)) {
        $self->{'num_blocked'}++;
        return 1; # blocked
    }

    return 0;
}
287
288
289
# Blocking pass over a directory: hands every entry of the directory to
# plugin::file_block_read, then resolves any shared file roots collected
# along the way via sort_out_associated_files().
#
# Returns:
#   undef / 0 - passed through from check_directory_path() when the
#               directory is missing or must be skipped
#   -1        - the directory could not be opened
#   1         - the directory was scanned
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Calculate the directory name and ensure it is a directory and
    # that it is not explicitly blocked.
    my $dirname = $file;
    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;

    my $directory_ok = $self->check_directory_path($dirname);
    return $directory_ok unless (defined $directory_ok && $directory_ok == 1);

    print $outhandle "Global file scan checking directory: $dirname\n";

    # Make sure every table this pass may touch exists.
    $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'};
    $block_hash->{'metadata_files'} = {} unless defined $block_hash->{'metadata_files'};

    $block_hash->{'file_blocks'} = {} unless defined $block_hash->{'file_blocks'};
    $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};

    print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);

    # find all the files in the directory
    # A lexical handle is used because this routine is re-entered for
    # sub-directories; a shared bareword handle would be one global.
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
        }
        print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @dir = sort readdir ($dir_handle);
    closedir ($dir_handle);

    # Recur over directory contents.
    foreach my $raw_subfile (@dir) {
        next if ($raw_subfile =~ m/^\.\.?$/);

        my $this_file_base_dir = $base_dir;
        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);

        # Recursively read each $raw_subfile
        # (the name is reported twice: decoded as utf8 and with the locale encoding)
        print $outhandle "DirectoryPlugin block recurring: ". Encode::decode("utf8", $raw_file_subfile) ."\n" if ($verbosity > 2);
        print $outhandle "DirectoryPlugin block recurring: ". Encode::decode(locale =>$raw_file_subfile) ."\n" if ($verbosity > 2);

        &plugin::file_block_read ($pluginfo, $this_file_base_dir,
                                  $raw_file_subfile,
                                  $block_hash, $metadata, $gli);
    }

    $self->sort_out_associated_files($block_hash);
    return 1;
}
353
# We don't do metadata_read: directories carry no metadata of their own,
# so this always returns undef.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    return undef;
}
363
364
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
# This function passes around metadata hash structures. Metadata hash
# structures are hashes that map from a (scalar) key (the metadata element
# name) to either a scalar metadata value or a reference to an array of
# such values.
#
# read() is phase 1 only: it validates the directory and builds the list of
# (url-encoded) entry names, then hands that list to read_phase2() which
# does the actual processing. Besides a count it can return 0 (directory
# skipped) or -1 (directory could not be opened).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Calculate the directory name and ensure it is a directory and
    # that it is not explicitly blocked.
    my $dirname;
    if ($file eq "") {
        $dirname = $base_dir;
    } else {
        $dirname = $file;
        $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;
    }

    my $directory_ok = $self->check_directory_path($dirname);
    return $directory_ok unless (defined $directory_ok && $directory_ok == 1);

    if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
        print $outhandle "DirectoryPlugin: metadata passed in: ",
            join(", ", keys %$in_metadata), "\n";
    }

    print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);

    # find all the files in the directory
    # (lexical handle: this routine is re-entered while recursing)
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
        }
        print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @dir = map { &unicode::raw_filename_to_url_encoded($_) } sort readdir ($dir_handle);
    closedir ($dir_handle);

    # Re-order the files in the list so any directories ending with .all are moved to the end
    # (walking backwards keeps the indices still to be visited valid while
    # entries are spliced out)
    for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
        if (-d &FileUtils::filenameConcatenate($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
            push(@dir, splice(@dir, $i, 1));
        }
    }

    # Chain through to the rest of the read function (now split off and named
    # read_phase2)
    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);

    return $count;
}
428
# Second phase of read(): process an already-built directory listing.
#
# Parameters (after $self):
#   $pluginfo    - plugin list handed on to plugin::metadata_read / plugin::read
#   $dirname     - full path of the directory being processed
#   $dir_ref     - reference to the list of entry names; each entry is
#                  passed through unicode::url_encoded_to_raw_filename
#                  before use
#   $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs,
#   $total_count, $gli - as for read()
#
# Steps:
#   1. pick up extra metadata parked for this directory by a parent
#   2. run plugin::metadata_read over every unblocked entry
#   3. park extra metadata keys that refer to sub-directories
#   4. run plugin::read over every unblocked entry, optionally re-listing
#      the directory afterwards (-recheck_directories)
#
# Returns the sum of the values returned by plugin::read.
sub read_phase2
{
    my $self = shift (@_);
    my ($pluginfo, $dirname, $dir_ref, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # Work on a copy: the recheck step below may append entries.
    my @dir = @{$dir_ref};

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read

    my $additionalmetadata = 0; # is there extra metadata available?
    my %extrametadata; # maps from filespec to extra metadata keys
    my %extrametafile; # maps from filespec to the metadata.xml (or similar) file it came from
    my @extrametakeys; # keys of %extrametadata in order read

    # Want to get relative path of local_dirname within the base_directory
    # but with URL style slashes.
    my $local_dirname = &util::filename_within_directory_url_format($dirname, $base_dir);

    # if we are in import folder, then local_dirname will be empty
    if ($local_dirname ne "") {
        # look for extra metadata passed down from higher folders
        $local_dirname .= "/"; # closing slash must be URL type slash also and not $dirsep;
        if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
            my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
            foreach my $ek (@$extrakeys) {
                my $extrakeys_re = $ek->{'re'};
                my $extrakeys_md = $ek->{'md'};
                my $extrakeys_mf = $ek->{'mf'};
                &extrametautil::addmetakey(\@extrametakeys, $extrakeys_re);
                &extrametautil::setmetadata(\%extrametadata, $extrakeys_re, $extrakeys_md);
                &extrametautil::setmetafile(\%extrametafile, $extrakeys_re, $extrakeys_mf);
            }
            # The parked keys have been consumed.
            delete($self->{'subdir_extrametakeys'}->{$local_dirname});
        }
    }

    # apply metadata pass for each of the files in the directory -- ignore
    # maxdocs here
    my $num_files = scalar(@dir);
    foreach my $subfile (@dir) {
        next if ($subfile =~ m/^\.\.?$/);

        my $this_file_base_dir = $base_dir;
        my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);

        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
        my $raw_full_filename = &FileUtils::filenameConcatenate($this_file_base_dir, $raw_file_subfile);

        if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
            print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for metadata_read\n" if ($verbosity > 2);
            next;
        }

        # Recursively read each $raw_subfile
        print $outhandle "DirectoryPlugin metadata recurring: $raw_subfile\n" if ($verbosity > 2);

        &plugin::metadata_read ($pluginfo, $this_file_base_dir,
                                $raw_file_subfile,$block_hash,
                                \@extrametakeys, \%extrametadata,
                                \%extrametafile,
                                $processor, $gli);
        $additionalmetadata = 1;
    }

    # filter out any extrametakeys that mention subdirectories and store
    # for later use (i.e. when that sub-directory is being processed)
    foreach my $ek (@extrametakeys) { # where each Extrametakey (which is a filename) is stored as a url-style regex

        my ($subdir_re,$extrakey_dir) = &util::url_fileparse($ek);

        if ($extrakey_dir ne "") {
            # a subdir was specified
            my $md = &extrametautil::getmetadata(\%extrametadata, $ek);
            my $mf = &extrametautil::getmetafile(\%extrametafile, $ek);

            my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
            my $subdir_rec = { 're' => $subdir_re, 'md' => $md, 'mf' => $mf };

            # when it's looked up, it must be relative to the base dir
            push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
        }
    }

    # import each of the files in the directory
    # The loop deliberately runs one step past the end of @dir so that the
    # recheck branch below gets a chance to extend the list.
    my $count=0;
    for (my $i = 0; $i <= scalar(@dir); $i++) {
        # When every file in the directory has been done, pause for a moment (figuratively!)
        # If the -recheck_directories argument hasn't been provided, stop now (default)
        # Otherwise, re-read the contents of the directory to check for new files
        # Any new files are added to the @dir list and are processed as normal
        # This is necessary when documents to be indexed are specified in bibliographic DBs
        # These files are copied/downloaded and stored in a new folder at import time
        if ($i == $num_files) {
            last unless $self->{'recheck_directories'};

            # Re-read the files in the directory to see if there are any new files
            my $recheck_handle;
            last if (!opendir ($recheck_handle, $dirname));
            my @dirnow = map { &unicode::raw_filename_to_url_encoded($_) } sort readdir ($recheck_handle);
            closedir ($recheck_handle);

            # We're only interested if there are more files than there were before
            last if (scalar(@dirnow) <= scalar(@dir));

            # Any new files are added to the end of @dir to get processed by the loop
            my %already_listed = map { $_ => 1 } @dir;
            foreach my $subfilenow (@dirnow) {
                next if ($already_listed{$subfilenow});
                # New file
                $already_listed{$subfilenow} = 1;
                push(@dir, $subfilenow);
            }
            # When the new files have been processed, check again
            $num_files = scalar(@dir);
        }

        my $subfile = $dir[$i];
        last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
        next if ($subfile =~ /^\.\.?$/);

        my $this_file_base_dir = $base_dir;
        my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
        # get the canonical unicode version of the filename. This may not match
        # the filename on the file system. We will use it to compare to regex
        # in the metadata table.
        my $unicode_subfile = &util::raw_filename_to_unicode($dirname, $raw_subfile);
        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
        my $raw_full_filename
            = &FileUtils::filenameConcatenate($this_file_base_dir,$raw_file_subfile);

        if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
            print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for read\n" if ($verbosity > 2);
            next;
        }
        print STDERR "** DirectoryPlugin processing $raw_full_filename\n";

        # Follow Windows shortcuts
        if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) {
            require Win32::Shortcut;
            my $shortcut = Win32::Shortcut->new(&FileUtils::filenameConcatenate($dirname, $raw_subfile));
            if ($shortcut) {
                # The file to be processed is now the target of the shortcut.
                # Only the per-entry variables are redirected: $file must stay
                # intact for the remaining entries of this directory, and the
                # path handed to plugin::read has to be refreshed because it
                # was computed before the shortcut was resolved.
                # $raw_full_filename deliberately keeps naming the .lnk entry
                # itself for the block/incremental lookups below.
                $this_file_base_dir = "";
                $raw_subfile = $shortcut->Path;
                $raw_file_subfile = $raw_subfile;
            }
        }

        # check for a symlink pointing back to a leading directory
        if (-d "$dirname/$raw_subfile" && -l "$dirname/$raw_subfile") {
            # readlink gives a "fatal error" on systems that don't implement
            # symlinks. This assumes the the -l test above would fail on those.
            my $linkdest=readlink "$dirname/$raw_subfile";
            if (!defined ($linkdest)) {
                # system error - file not found?
                warn "DirectoryPlugin: symlink problem - $!";
            } else {
                # see if link points to current or a parent directory
                if ($linkdest =~ m@^[\./\\]+$@ ||
                    index($dirname, $linkdest) != -1) {
                    warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$raw_subfile -> $linkdest)\n";
                    next;
                }
            }
        }

        print $outhandle "DirectoryPlugin: preparing metadata for $raw_subfile\n" if ($verbosity > 2);

        # Make a copy of $in_metadata to pass to $raw_subfile
        my $out_metadata = {};
        &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);

        # check the assocfile_info
        if (defined $self->{'assocfile_info'}->{$raw_full_filename}) {
            &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$raw_full_filename});
        }

        ### Now we need to look up the metadata table to see if there is any
        # extra metadata for us. We need the canonical unicode version here.
        if ($additionalmetadata == 1) {
            foreach my $filespec (@extrametakeys) {
                if ($unicode_subfile =~ /^$filespec$/) {
                    print $outhandle "File \"$unicode_subfile\" matches filespec \"$filespec\"\n"
                        if ($verbosity > 2);
                    my $mdref = &extrametautil::getmetadata(\%extrametadata, $filespec);
                    my $mfref = &extrametautil::getmetafile(\%extrametafile, $filespec);

                    # Add the list files where the metadata came from
                    # into the metadata table so we can track this
                    # This mechanism is similar to how gsdlassocfile works

                    my @metafile_pair = ();
                    foreach my $mf_key (keys %$mfref) {
                        my $mf_value = $mfref->{$mf_key};
                        push (@metafile_pair, "$mf_value : $mf_key");
                    }

                    $mdref->{'gsdlmetafile'} = \@metafile_pair;

                    &metadatautil::combine_metadata_structures($out_metadata, $mdref);
                }
            }
        }

        if (defined $self->{'inf_timestamp'}) {
            # Look to see if it's a completely new file

            if (!$block_hash->{'new_files'}->{$raw_full_filename}) {
                # Not a new file, must be an existing file
                # Let' see if it's newer than the last import.pl

                if (! -d $raw_full_filename) {
                    if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) {
                        # filename has been around for longer than inf_timestamp
                        print $outhandle "**** Skipping $unicode_subfile\n" if ($verbosity >3);
                        next;
                    }
                    else {
                        # Remove old folder in archives (might hash to something different)
                        # *** should be doing this on a Del one as well
                        # but leave folder name?? and ensure hashs to
                        # same again??

                        # Then let through as new doc??

                        # mark to doc-oids that rely on it for re-indexing
                    }
                }
            }
        }

        # Recursively read each $subfile
        print $outhandle "DirectoryPlugin recurring: $unicode_subfile\n" if ($verbosity > 2);

        $count += &plugin::read ($pluginfo, $this_file_base_dir,
                                 $raw_file_subfile, $block_hash,
                                 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    }

    return $count;
}
684
# Part of the plugin interface; this plugin adds nothing to $stats.
sub compile_stats {
    my $self = shift(@_);
    my ($stats) = @_;
}
689
# Manifest files, version 2, provide an explicit listing of the documents to be
# processed by Greenstone. This allows a user to avoid expensive file tree
# searches - a crucial requirement for very-large scale collections and
# parallel processing. However, we still want to leverage the metadata parsing
# functionality found here in DirectoryPlugin. Thus we have this special call
# to read that expects a single file. The normal read function starts by
# listing the files in a given directory and then performs a number of actions
# over them (including recursing down into any further directories found). We
# circumvent that behaviour by 'pretending' to already have a directory listing
# containing at most two files - the file passed in, and an accompanying
# metadata.xml file if one exists.
#
# Returns 0 when $file is not a plain file. Otherwise the count is not
# returned; an error is printed to STDERR unless exactly one document was
# processed.
sub read_for_manifest_v2
{
    my $self = shift (@_);
    my ($pluginfo, $file, $block_hash, $processor, $gli) = @_;
    my $base_dir = '';
    my $in_metadata = {};
    my $maxdocs = -1;
    my $total_count = 0;
    # Ensure we have the full path of the file to process
    my $full_path = $file;
    if ($base_dir =~ /\w/)
    {
        $full_path = &FileUtils::filenameConcatenate($base_dir, $file);
    }
    # Unlike the vanilla read(), directories are unacceptable
    if (!-f $full_path)
    {
        return 0;
    }
    # Now split the full path into a directory and a filename
    my ($dirname, $the_file) = $full_path =~ /^(.*)\/([^\/]+)$/;
    if (!defined $the_file)
    {
        # No directory component at all: the file lives in the current
        # directory. Without this fallback $dirname would be undefined,
        # and the metadata.xml lookup below would probe the filesystem
        # root instead.
        $dirname = '.';
        $the_file = $full_path;
    }
    # We will prepopulate a 'directory listing' with this file
    my @dir = ($the_file);
    # See if there is an accompanying metadata.xml (do not list it twice
    # when it is itself the file named in the manifest)
    my $metadata_xml_path = $dirname . '/metadata.xml';
    if ($the_file ne 'metadata.xml' && -f $metadata_xml_path)
    {
        unshift(@dir, 'metadata.xml');
    }
    # Chain through to the normal read process, but with our 'forged' directory
    # listing so as to avoid all the costs of actually listing / recursing.
    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $dirname, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);
    # We don't return count, but test that it is 1 exactly.
    if ($count != 1)
    {
        print STDERR "ERROR! The count of documents processed from a single call to DirectoryPlugin::read_for_manifest_v2() is not 1.\n";
    }
}
739
7401;
Note: See TracBrowser for help on using the repository browser.