root/gs2-extensions/parallel-building/trunk/src/perllib/plugins/DirectoryPlugin.pm @ 30289

Revision 30289, 25.1 KB (checked in by jmt12, 4 years ago)

Significant changes to read() function - essentially split in half with the first phase responsible for building up the list of files to process and the second for doing the actual processing. Allows us to shortcut the system by passing in a list of files to process (as in the case of manifest version 2).

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# DirectoryPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# DirectoryPlugin is a plugin which recurses through directories processing
27# each file it finds - which basically means passing it down the plugin
28# pipeline
29
30package DirectoryPlugin;
31
32use extrametautil;
33use PrintInfo;
34use plugin;
35use util;
36use FileUtils;
37use metadatautil;
38
39use File::Basename;
40use strict;
41no strict 'refs';
42no strict 'subs';
43
44use Encode::Locale;
45use Encode;
46use Unicode::Normalize;
47
# DirectoryPlugin inherits from PrintInfo; the parent list is installed at
# compile time.
BEGIN {
    @DirectoryPlugin::ISA = ('PrintInfo');
}

# Argument descriptions that new() appends to the shared "ArgList".
my $arguments = [
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => &get_default_block_exp(),
        'reqd' => 'no',
    },

    # this option has been deprecated. leave it here for now so we can warn
    # people not to use it
    {
        'name'      => 'use_metadata_files',
        'desc'      => '{DirectoryPlugin.use_metadata_files}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {
        'name' => 'recheck_directories',
        'desc' => '{DirectoryPlugin.recheck_directories}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description that new() appends to the shared "OptList".
my $options = {
    'name'     => 'DirectoryPlugin',
    'desc'     => '{DirectoryPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
74
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to the PrintInfo constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the PrintInfo-constructed object re-blessed into $class. Dies if
# the deprecated -use_metadata_files option is set.
sub new {
    my ($class, @ctor_args) = @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @ctor_args;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{'ArgList'} }, @$arguments);
    push(@{ $hashArgOptLists->{'OptList'} }, $options);

    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists);

    print STDERR "INFO: This DirectoryPlugin supports version 2 manifest files\n";

    # When only plugin information is wanted, skip all further option
    # checks and initialisation.
    return bless($self, $class) if ($self->{'info_only'});

    # we have left this option in so we can warn people who are still using it
    die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n"
        if ($self->{'use_metadata_files'});

    # Processing counters all start at zero.
    foreach my $counter ('num_processed', 'num_not_processed', 'num_blocked', 'num_archives') {
        $self->{$counter} = 0;
    }

    # Extra-metadata records waiting for a sub-directory to be read
    # (filled and consumed by read_phase2).
    $self->{'subdir_extrametakeys'} = {};

    return bless($self, $class);
}
106
# called once, at the start of processing
#
# Stores the verbosity and the fail handle passed in by the processor. The
# out handle is only replaced when one is actually supplied.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;

    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    $self->{'failhandle'} = $failhandle;
}
120
# called once, after all passes have finished
#
# This plugin has nothing to tear down.
sub deinit {
    my ($self) = (@_);
}
126
# called at the beginning of each plugin pass (import has one, building has many)
#
# For an incremental run whose processor class name does not end in
# "buildproc", records the -M age of the "archiveinf-doc" info database in
# $self->{'inf_timestamp'}, provided that file exists.
sub begin {
    my ($self, @pass_args) = @_;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @pass_args;

    # Only lookup timestamp info for import.pl, and only if incremental is set
    my $processor_class = ref($processor);
    if ($processor_class !~ /buildproc$/ && $self->{'incremental'} == 1) {
        # Get the infodbtype value for this collection from the arcinfo object
        my $infodbtype = $processor->getoutputinfo()->{'infodbtype'};
        if ($infodbtype eq "gdbm-txtgz") {
            # in archives, cannot use txtgz version
            $infodbtype = "gdbm";
        }

        my $archives_dir  = $processor->getoutputdir();
        my $archives_info = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archives_dir);

        if (-e $archives_info) {
            $self->{'inf_timestamp'} = -M $archives_info;
        }
    }
}
146
# Plugin hook taking the same arguments as begin(). This plugin only
# unpacks them and does nothing else.
sub remove_all {
    my ($self, @hook_args) = @_;
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @hook_args;
}
151
152
# Plugin hook taking a file, its OIDs and the archive directory. This
# plugin always answers undef.
sub remove_one {
    my ($self, $file, $oids, $archivedir) = @_;

    # this will never be called for directories (will it??)
    return undef;
}
159
160
# called at the end of each plugin pass
#
# This plugin has no per-pass clean-up to do.
sub end {
    my ($self) = shift;
}
166
167
168
# return 1 if this class might recurse using $pluginfo
#
# This plugin always does.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
175
# Returns the default value of the block_exp option: a case-insensitive
# pattern anchored at the end of the name.
sub get_default_block_exp {
    my ($self) = @_;

    my $default_block_exp = '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|\.DS_Store|~)$';
    return $default_block_exp;
}
181
# Decides whether $dirname is a directory this plugin should descend into.
#
# Returns:
#   undef - $dirname is not a directory
#   0     - the directory must be skipped: it matches the block_exp option,
#           is an archives/index directory below an import directory under
#           $ENV{GSDLHOME}, is nested too deeply, or looks like an
#           import/X/import/X loop (a message is printed to the outhandle
#           for the last three cases)
#   1     - the directory may be read
sub check_directory_path {

    my $self = shift(@_);
    my ($dirname) = @_;

    return undef unless (-d $dirname);

    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);

    my $outhandle = $self->{'outhandle'};

    # check to make sure we're not reading the archives or index directory
    my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
    if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
        print $outhandle "DirectoryPlugin: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
        return 0;
    }

    # check to see we haven't got a cyclic path...
    # The separators are counted directly. The previous pattern used the
    # quantifier {,41}, which is literal text before Perl 5.34 (so the check
    # never fired) and means "0 to 41 times" from 5.34 on (so it always
    # fired and every directory was rejected).
    my $max_depth = 40;
    my $depth = ($dirname =~ tr{/}{});
    if ($depth > $max_depth) {
        print $outhandle "DirectoryPlugin: $dirname is 40 directories deep, is this a recursive path? if not increase constant in DirectoryPlugin.pm.\n";
        return 0;
    }

    # check to see we haven't got a cyclic path...
    if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
        print $outhandle "DirectoryPlugin: $dirname appears to be in a recursive loop...\n";
        return 0;
    }

    return 1;
}
214
# this may be called more than once
#
# Converts the 'shared_fileroot' records collected in $block_hash into
# associated-file information. For each file root that has a 'tie_to'
# extension and at least one entry in 'exts', every "$root$ext" file is
# blocked and queued under 'gsdlassocfile_tobe' for the "$root$tie_to" file
# in $self->{'assocfile_info'}. The shared_fileroot table is emptied
# afterwards.
sub sort_out_associated_files {
    my ($self, $block_hash) = @_;

    # Nothing has been collected, so there is nothing to do.
    return if (scalar(keys %{$block_hash->{'shared_fileroot'}}) == 0);

    if (!defined $self->{'assocfile_info'}) {
        $self->{'assocfile_info'} = {};
    }
    my $assoc_info = $self->{'assocfile_info'};

    foreach my $fileroot (keys %{$block_hash->{'shared_fileroot'}}) {
        my $entry       = $block_hash->{'shared_fileroot'}->{$fileroot};
        my $primary_ext = $entry->{'tie_to'};
        my $other_exts  = $entry->{'exts'};

        # Only roots with a primary file and something to attach matter.
        next unless (defined $primary_ext);
        next unless (scalar(keys %$other_exts) > 0);

        # set up fileblocks and assocfile_tobe
        my $primary_file = $fileroot . $primary_ext;
        if (!defined $assoc_info->{$primary_file}) {
            $assoc_info->{$primary_file} = {};
        }
        my $primary_info = $assoc_info->{$primary_file};

        if (!defined $primary_info->{'gsdlassocfile_tobe'}) {
            $primary_info->{'gsdlassocfile_tobe'} = [];
        }
        my $pending = $primary_info->{'gsdlassocfile_tobe'};

        foreach my $ext (keys %$other_exts) {
            my $assoc_file = $fileroot . $ext;

            # block the file
            &util::block_filename($block_hash, $assoc_file);

            # set up as an associated file
            print STDERR "  $self->{'plugin_type'}: Associating $assoc_file with $primary_ext version\n";
            my $mime_type = ""; # let system auto detect this
            push(@$pending, $assoc_file . ":" . $mime_type . ":");
        }
    } # foreach record

    $block_hash->{'shared_fileroot'} = {};
}
256
257
# do block exp OR special blocking ???
#
# Returns 1 (and increments $self->{'num_blocked'}) when $filename_full_path
# is either listed in $block_hash->{'file_blocks'} or matches this plugin's
# own block_exp option; returns 0 otherwise.
sub file_is_blocked {
    my ($self, $block_hash, $filename_full_path) = @_;

    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);

    # on windows (but not cygwin), all block paths are lowercased.
    my $native_windows = (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin"));
    my $lookup_key = $native_windows ? lc($filename_full_path) : $filename_full_path;

    my $is_blocked = 0;
    if (defined $block_hash->{'file_blocks'}->{$lookup_key}) {
        $is_blocked = 1;
    }
    elsif ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
        # check Directory plugin's own block_exp
        $is_blocked = 1;
    }

    return 0 unless ($is_blocked);

    $self->{'num_blocked'} ++;
    return 1; # blocked
}
287
288
289
# Blocking pass over a directory: hands every entry of the directory to
# &plugin::file_block_read and then turns the collected shared file roots
# into associated-file information.
#
# Arguments: $pluginfo, $base_dir, $file (the directory; joined to $base_dir
# when that contains a word character), $block_hash, $metadata, $gli.
#
# Returns:
#   undef / 0 - whatever check_directory_path() said when it rejected the
#               directory
#   -1        - the directory could not be opened
#   1         - the directory was scanned
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Calculate the directory name and ensure it is a directory and
    # that it is not explicitly blocked.
    my $dirname = $file;
    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;

    my $directory_ok = $self->check_directory_path($dirname);
    return $directory_ok unless (defined $directory_ok && $directory_ok == 1);

    print $outhandle "Global file scan checking directory: $dirname\n";

    # Make sure every table the blocking pass fills in exists.
    foreach my $table ('all_files', 'metadata_files', 'file_blocks', 'shared_fileroot') {
        $block_hash->{$table} = {} unless defined $block_hash->{$table};
    }

    print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);

    # find all the files in the directory
    # A lexical handle is used so that nothing else can disturb it.
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
        }
        print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @dir = sort readdir ($dir_handle);
    closedir ($dir_handle);

    # Recur over directory contents.
    foreach my $raw_subfile (@dir) {
        next if ($raw_subfile =~ m/^\.\.?$/);

        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);

        # Recursively read each $raw_subfile
        # The name is reported once, decoded with the locale encoding (it
        # used to be printed a second time decoded as utf8).
        print $outhandle "DirectoryPlugin block recurring: ". Encode::decode(locale =>$raw_file_subfile) ."\n" if ($verbosity > 2);

        &plugin::file_block_read ($pluginfo, $base_dir,
                                  $raw_file_subfile,
                                  $block_hash, $metadata, $gli);
    }

    $self->sort_out_associated_files($block_hash);
    return 1;
}
353
# We don't do metadata_read
#
# The arguments are unpacked only to document the hook's signature; the
# answer is always undef.
sub metadata_read {
    my ($self, @hook_args) = @_;
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @hook_args;

    return undef;
}
363
364
# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might
# include directories
#
# This function passes around metadata hash structures.  Metadata hash
# structures are hashes that map from a (scalar) key (the metadata element
# name) to either a scalar metadata value or a reference to an array of
# such values.
#
# read() is phase 1: it works out the directory name, lists the directory
# and orders the listing. The actual processing is done by read_phase2().
#
# Besides the count from read_phase2(), this returns undef or 0 when
# check_directory_path() rejects the directory, and -1 when the directory
# cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Calculate the directory name and ensure it is a directory and
    # that it is not explicitly blocked.
    my $dirname;
    if ($file eq "") {
        $dirname = $base_dir;
    } else {
        $dirname = $file;
        $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;
    }

    my $directory_ok = $self->check_directory_path($dirname);
    return $directory_ok unless (defined $directory_ok && $directory_ok == 1);

    if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
        print $outhandle "DirectoryPlugin: metadata passed in: ",
            join(", ", keys %$in_metadata), "\n";
    }

    print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);

    # find all the files in the directory
    # A lexical handle is used so that nothing else can disturb it.
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
        }
        print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @raw_entries = sort readdir ($dir_handle);
    closedir ($dir_handle);

    # Re-order the files in the list so any directories ending with .all are
    # moved to the end. The directory test is made on the raw on-disk name;
    # it used to be made on the URL-encoded name, which read_phase2() has to
    # decode again before it can touch the file system. As before, the moved
    # directories end up in reverse listing order.
    my @ordinary_entries;
    my @all_directories;
    foreach my $raw_entry (@raw_entries) {
        if ($raw_entry =~ /\.all$/ && -d &FileUtils::filenameConcatenate($dirname, $raw_entry)) {
            unshift(@all_directories, $raw_entry);
        }
        else {
            push(@ordinary_entries, $raw_entry);
        }
    }

    # read_phase2() expects URL-encoded names.
    my @dir;
    foreach my $raw_entry (@ordinary_entries, @all_directories) {
        push(@dir, &unicode::raw_filename_to_url_encoded($raw_entry));
    }

    # Chain through to the rest of the read function (now split off and named
    # read_phase2)
    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);

    return $count;
}
428
# Second half of read(): processes an already prepared directory listing.
#
# Arguments:
#   $pluginfo, $block_hash, $processor, $gli - handed on to the plugin
#                    pipeline unchanged
#   $dirname       - full path of the directory being processed
#   $dir_ref       - array ref of URL-encoded entry names in that directory
#   $base_dir      - base directory ("" is allowed)
#   $file          - the directory relative to $base_dir
#   $in_metadata   - metadata hash inherited from the parent directory
#   $maxdocs       - stop once this many documents are reached (-1: no limit)
#   $total_count   - number of documents processed before this call
#
# Works in three steps: a metadata_read pass over every entry, storing
# extra metadata that addresses sub-directories for later, and a read pass
# that sends each entry together with its metadata down the pipeline.
#
# Returns the number of documents processed by this call.
sub read_phase2
{
    my $self = shift (@_);
    my ($pluginfo, $dirname, $dir_ref, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    # These were defined in read (phase 1)
    my @dir = @{$dir_ref};

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read

    my $additionalmetadata = 0;      # is there extra metadata available?
    my %extrametadata;               # maps from filespec to extra metadata keys
    my %extrametafile;               # maps from filespec to the metadata.xml (or similar) file it came from
    my @extrametakeys;               # keys of %extrametadata in order read

    # Want to get relative path of local_dirname within the base_directory
    # but with URL style slashes.
    my $local_dirname = &util::filename_within_directory_url_format($dirname, $base_dir);

    # if we are in import folder, then local_dirname will be empty
    if ($local_dirname ne "") {
        # look for extra metadata passed down from higher folders
        $local_dirname .= "/"; # closing slash must be URL type slash also and not $dirsep;
        if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
            my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
            foreach my $ek (@$extrakeys) {
                my $extrakeys_re  = $ek->{'re'};
                my $extrakeys_md  = $ek->{'md'};
                my $extrakeys_mf  = $ek->{'mf'};
                &extrametautil::addmetakey(\@extrametakeys, $extrakeys_re);
                &extrametautil::setmetadata(\%extrametadata, $extrakeys_re, $extrakeys_md);
                &extrametautil::setmetafile(\%extrametafile, $extrakeys_re, $extrakeys_mf);
            }
            delete($self->{'subdir_extrametakeys'}->{$local_dirname});
        }
    }

    # apply metadata pass for each of the files in the directory -- ignore
    # maxdocs here
    my $num_files = scalar(@dir);
    for (my $i = 0; $i < scalar(@dir); $i++) {
        my $subfile = $dir[$i];
        next if ($subfile =~ m/^\.\.?$/);

        my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);

        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
        my $raw_full_filename = &FileUtils::filenameConcatenate($base_dir, $raw_file_subfile);

        if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
            print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for metadata_read\n" if ($verbosity > 2);
            next;
        }

        # Recursively read each $raw_subfile
        print $outhandle "DirectoryPlugin metadata recurring: $raw_subfile\n" if ($verbosity > 2);

        &plugin::metadata_read ($pluginfo, $base_dir,
                                $raw_file_subfile,$block_hash,
                                \@extrametakeys, \%extrametadata,
                                \%extrametafile,
                                $processor, $gli);
        $additionalmetadata = 1;
    }

    # filter out any extrametakeys that mention subdirectories and store
    # for later use (i.e. when that sub-directory is being processed)
    foreach my $ek (@extrametakeys) { # where each Extrametakey (which is a filename) is stored as a  url-style regex

        my ($subdir_re,$extrakey_dir) = &util::url_fileparse($ek);

        if ($extrakey_dir ne "") {
            # a subdir was specified
            my $md = &extrametautil::getmetadata(\%extrametadata, $ek);
            my $mf = &extrametautil::getmetafile(\%extrametafile, $ek);

            my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};
            my $subdir_rec = { 're' => $subdir_re, 'md' => $md, 'mf' => $mf };

            # when it's looked up, it must be relative to the base dir
            push(@{$subdir_extrametakeys->{"$local_dirname$extrakey_dir"}},$subdir_rec);
        }
    }

    # import each of the files in the directory
    my $count=0;
    for (my $i = 0; $i <= scalar(@dir); $i++) {
        # When every file in the directory has been done, pause for a moment (figuratively!)
        # If the -recheck_directories argument hasn't been provided, stop now (default)
        # Otherwise, re-read the contents of the directory to check for new files
        #   Any new files are added to the @dir list and are processed as normal
        #   This is necessary when documents to be indexed are specified in bibliographic DBs
        #   These files are copied/downloaded and stored in a new folder at import time
        if ($i == $num_files) {
            last unless $self->{'recheck_directories'};

            # Re-read the files in the directory to see if there are any new files
            my $recheck_handle;
            last if (!opendir ($recheck_handle, $dirname));
            my @dirnow = sort readdir ($recheck_handle);
            closedir ($recheck_handle);
            foreach my $entry_now (@dirnow) {
                $entry_now = &unicode::raw_filename_to_url_encoded($entry_now);
            }

            # We're only interested if there are more files than there were before
            last if (scalar(@dirnow) <= scalar(@dir));

            # Any new files are added to the end of @dir to get processed by the loop
            # (a lookup table replaces the former scan of @dir per entry)
            my %already_listed = map { $_ => 1 } @dir;
            push(@dir, grep { !$already_listed{$_} } @dirnow);

            # When the new files have been processed, check again
            $num_files = scalar(@dir);
        }

        my $subfile = $dir[$i];
        last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
        next if ($subfile =~ /^\.\.?$/);

        my $this_file_base_dir = $base_dir;
        my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
        # get the canonical unicode version of the filename. This may not match
        # the filename on the file system. We will use it to compare to regex
        # in the metadata table.
        my $unicode_subfile = &util::raw_filename_to_unicode($dirname, $raw_subfile);
        my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
        my $raw_full_filename
            = &FileUtils::filenameConcatenate($this_file_base_dir,$raw_file_subfile);

        if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
            print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for read\n" if ($verbosity > 2);
            next;
        }
        print STDERR "** DirectoryPlugin processing $raw_full_filename\n";

        # Follow Windows shortcuts
        if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) {
            require Win32::Shortcut;
            my $shortcut = Win32::Shortcut->new(&FileUtils::filenameConcatenate($dirname, $raw_subfile));
            if ($shortcut) {
                # The file to be processed is now the target of the shortcut.
                # Only this iteration's variables are redirected: $file itself
                # is left alone so the entries that follow keep their
                # directory prefix, and the names handed to plugin::read are
                # recomputed so that the target really is what gets read.
                $this_file_base_dir = "";
                $raw_subfile = $shortcut->Path;
                $raw_file_subfile = $raw_subfile;
                $raw_full_filename = $raw_subfile;
            }
        }

        # check for a symlink pointing back to a leading directory
        if (-d "$dirname/$raw_subfile" && -l "$dirname/$raw_subfile") {
            # readlink gives a "fatal error" on systems that don't implement
            # symlinks. This assumes the -l test above would fail on those.
            my $linkdest=readlink "$dirname/$raw_subfile";
            if (!defined ($linkdest)) {
                # system error - file not found?
                warn "DirectoryPlugin: symlink problem - $!";
            } else {
                # see if link points to current or a parent directory
                if ($linkdest =~ m@^[\./\\]+$@ ||
                    index($dirname, $linkdest) != -1) {
                    warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$raw_subfile -> $linkdest)\n";
                    next;
                }
            }
        }

        print $outhandle "DirectoryPlugin: preparing metadata for $raw_subfile\n" if ($verbosity > 2);

        # Make a copy of $in_metadata to pass to $raw_subfile
        my $out_metadata = {};
        &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);

        # check the assocfile_info
        if (defined $self->{'assocfile_info'}->{$raw_full_filename}) {
            &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$raw_full_filename});
        }

        ### Now we need to look up the metadata table to see if there is any
        # extra metadata for us. We need the canonical unicode version here.
        if ($additionalmetadata == 1) {
            foreach my $filespec (@extrametakeys) {
                if ($unicode_subfile =~ /^$filespec$/) {
                    print $outhandle "File \"$unicode_subfile\" matches filespec \"$filespec\"\n"
                        if ($verbosity > 2);
                    my $mdref = &extrametautil::getmetadata(\%extrametadata, $filespec);
                    my $mfref = &extrametautil::getmetafile(\%extrametafile, $filespec);

                    # Add the list files where the metadata came from
                    # into the metadata table so we can track this
                    # This mechanism is similar to how gsdlassocfile works

                    my @metafile_pair = ();
                    foreach my $mf_key (keys %$mfref) {
                        my $mf_value = $mfref->{$mf_key};
                        push (@metafile_pair, "$mf_value : $mf_key");
                    }

                    $mdref->{'gsdlmetafile'} = \@metafile_pair;

                    &metadatautil::combine_metadata_structures($out_metadata, $mdref);
                }
            }
        }

        if (defined $self->{'inf_timestamp'}) {
            # Look to see if it's a completely new file

            if (!$block_hash->{'new_files'}->{$raw_full_filename}) {
                # Not a new file, must be an existing file
                # Let's see if it's newer than the last import.pl

                if (! -d $raw_full_filename) {
                    if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) {
                        # filename has been around for longer than inf_timestamp
                        print $outhandle "**** Skipping $unicode_subfile\n" if ($verbosity >3);
                        next;
                    }
                    else {
                        # Remove old folder in archives (might hash to something different)
                        # *** should be doing this on a Del one as well
                        # but leave folder name?? and ensure hashs to
                        # same again??

                        # Then let through as new doc??

                        # mark to doc-oids that rely on it for re-indexing
                    }
                }
            }
        }

        # Recursively read each $subfile
        print $outhandle "DirectoryPlugin recurring: $unicode_subfile\n" if ($verbosity > 2);

        $count += &plugin::read ($pluginfo, $this_file_base_dir,
                                 $raw_file_subfile, $block_hash,
                                 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    }

    return $count;
}
684
# Plugin hook handed a statistics structure. This plugin only unpacks its
# arguments and adds nothing to $stats.
sub compile_stats {
    my ($self, @hook_args) = @_;
    my ($stats) = @hook_args;
}
689
# Manifest files, version 2, provide an explicit listing of the documents to be
# processed by Greenstone.  This allows a user to avoid expensive file tree
# searches - a crucial requirement for very-large scale collections and
# parallel processing. However, we still want to leverage the metadata parsing
# functionality found here in DirectoryPlugin. Thus we have this special call
# to read that expects a single file. The normal read function starts by
# listing the files in a given directory and then performs a number of actions
# over them (including recursing down into any further directories found). We
# circumvent that behaviour by 'pretending' to already have a directory listing
# containing at most two files - the file passed in, and an accompanying
# metadata.xml file if one exists.
#
# Arguments: $pluginfo, $file (path of the single file to process),
# $block_hash, $processor, $gli.
#
# Returns 0 when $file is not a plain file. Otherwise no meaningful value is
# returned: the document count is only checked, not handed back.
sub read_for_manifest_v2
{
    my $self = shift (@_);
    my ($pluginfo, $file, $block_hash, $processor, $gli) = @_;
    my $base_dir = '';
    my $in_metadata = {};
    my $maxdocs = -1;
    my $total_count = 0;

    # The base directory is always empty here, so $file already is the full
    # path of the file to process.
    my $full_path = $file;

    # Unlike the vanilla read(), directories are unacceptable
    if (!-f $full_path)
    {
        return 0;
    }

    # Now split the full path into a directory and a filename. File::Basename
    # also copes with a path that has no directory part (dirname is then
    # "."), which the former regular expression left undefined.
    my $the_file = basename($full_path);
    my $dirname  = dirname($full_path);

    # We will prepopulate a 'directory listing' with this file
    my @raw_listing = ($the_file);

    # See if there is an accompanying metadata.xml (unless that is the very
    # file we were asked to process, which must not be listed twice)
    my $metadata_xml_path = &FileUtils::filenameConcatenate($dirname, 'metadata.xml');
    if ($the_file ne 'metadata.xml' && -f $metadata_xml_path)
    {
        unshift(@raw_listing, 'metadata.xml');
    }

    # read_phase2() URL-decodes every entry of the listing, so encode the
    # names here exactly as read() does.
    my @dir;
    foreach my $raw_entry (@raw_listing)
    {
        push(@dir, &unicode::raw_filename_to_url_encoded($raw_entry));
    }

    # Chain through to the normal read process, but with our 'forged' directory
    # listing so as to avoid all the costs of actually listing / recursing.
    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $dirname, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);

    # We don't return count, but test that it is 1 exactly.
    if ($count != 1)
    {
        print STDERR "ERROR! The count of documents processed from a single call to DirectoryPlugin::read_for_manifest_v2() is not 1.\n";
    }
}
739
7401;
Note: See TracBrowser for help on using the browser.