Ignore:
Timestamp:
2015-10-09T13:29:06+13:00 (9 years ago)
Author:
jmt12
Message:

Significant changes to read() function - essentially split in half with the first phase responsible for building up the list of files to process and the second for doing the actual processing. Allows us to shortcut the system by passing in a list of files to process (as in the case of manifest version 2).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/plugins/DirectoryPlugin.pm

    r29260 r30289  
    3434use plugin;
    3535use util;
     36use FileUtils;
    3637use metadatautil;
    3738
     
    4142no strict 'subs';
    4243
     44use Encode::Locale;
    4345use Encode;
     46use Unicode::Normalize;
    4447
    4548BEGIN {
     
    7982
    8083    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists);
    81    
     84
     85    print STDERR "INFO: This DirectoryPlugin supports version 2 manifest files\n";
     86
    8287    if ($self->{'info_only'}) {
    8388    # don't worry about any options or initialisations etc
     
    8994    die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
    9095    }
    91    
     96
    9297    $self->{'num_processed'} = 0;
    9398    $self->{'num_not_processed'} = 0;
     
    134139        my $archives_inf = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);
    135140
    136     if ( &FileUtils::fileExists($archives_inf) ) {
    137         $self->{'inf_timestamp'} = &FileUtils::file_lastmodified($archives_inf);
     141    if ( -e $archives_inf ) {
     142        $self->{'inf_timestamp'} = -M $archives_inf;
    138143    }
    139144    }
     
    172177    my $self = shift (@_);
    173178   
    174     return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|~)$';
     179    return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|\.DS_Store|~)$';
    175180}
    176181
     
    179184    my $self = shift(@_);
    180185    my ($dirname) = @_;
    181 
    182     # replace -d with function in util library
    183     return undef unless (&FileUtils::directoryExists($dirname));
     186   
     187    return undef unless (-d $dirname);
    184188
    185189    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);
     
    259263
    260264    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
    261 ###    print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n";
    262 
    263     if ($ENV{'GSDLOS'} =~ m/^windows$/) {
     265
     266    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
    264267    # on windows, all block paths are lowercased.
    265268    my $lower_filename = lc ($filename_full_path);
     
    295298    # that it is not explicitly blocked.
    296299    my $dirname = $file;
    297     $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     300    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;
    298301
    299302    my $directory_ok = $self->check_directory_path($dirname);
    300303    return $directory_ok unless (defined $directory_ok && $directory_ok == 1);
    301304
    302     print $outhandle "File scan checking directory: $dirname\n";
     305    print $outhandle "Global file scan checking directory: $dirname\n";
    303306
    304307    $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'};
     
    308311    $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'};
    309312
    310     # Recur over directory contents.
     313     # Recur over directory contents.
     314    my (@dir, $subfile);
     315    #my $count = 0;
     316   
    311317    print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2);
    312 
     318   
    313319    # find all the files in the directory
    314     my @dir = @{&FileUtils::readDirectory($dirname)};
    315     if (scalar(@dir) == 0)
    316     {
    317       if ($gli)
    318       {
    319         print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
    320       }
    321       print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
    322       return -1; # error in processing
    323     }
    324 
     320    if (!opendir (DIR, $dirname)) {
     321    if ($gli) {
     322        print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
     323    }
     324    print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
     325    return -1; # error in processing
     326    }
     327    @dir = sort readdir (DIR);
     328    closedir (DIR);
     329   
    325330    for (my $i = 0; $i < scalar(@dir); $i++) {
    326331    my $raw_subfile = $dir[$i];
     
    328333
    329334    my $this_file_base_dir = $base_dir;
    330     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     335    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
    331336
    332337    # Recursively read each $raw_subfile
    333     print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2);
     338    print $outhandle "DirectoryPlugin block recurring: ". Encode::decode("utf8", $raw_file_subfile) ."\n" if ($verbosity > 2);
     339    print $outhandle "DirectoryPlugin block recurring: ". Encode::decode(locale =>$raw_file_subfile) ."\n" if ($verbosity > 2);
    334340   
    335341    #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
     
    369375    my $self = shift (@_);
    370376    my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    371 
    372377    my $outhandle = $self->{'outhandle'};
    373378    my $verbosity = $self->{'verbosity'};
     
    380385    } else {
    381386    $dirname = $file;
    382     $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     387    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/;
    383388    }
    384389   
     
    393398
    394399    # Recur over directory contents.
     400    my @dir;
     401   
    395402    print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2);
    396 
     403   
    397404    # find all the files in the directory
    398     my @dir = @{&FileUtils::readDirectory($dirname)};
    399     if (scalar(@dir) == 0)
    400     {
    401       if ($gli)
    402       {
    403         print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
    404       }
    405       print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
    406       return -1; # error in processing
    407     }
    408     map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir;
    409 
    410     # Re-order the files in the list so any directories ending with .all are
    411     # moved to the end
     405    if (!opendir (DIR, $dirname)) {
     406    if ($gli) {
     407        print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
     408    }
     409    print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n";
     410    return -1; # error in processing
     411    }
     412    @dir = sort readdir (DIR);
     413    map {  $_ = &unicode::raw_filename_to_url_encoded($_);  } @dir;
     414    closedir (DIR);
     415    # Re-order the files in the list so any directories ending with .all are moved to the end
    412416    for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
    413     if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
     417    if (-d &FileUtils::filenameConcatenate($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
    414418        push(@dir, splice(@dir, $i, 1));
    415419    }
    416420    }
    417421
     422    # Chain through to the rest of the read function (now split off and named
     423    # read_phase2)
     424    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);
     425
     426    return $count;
     427}
     428
     429sub read_phase2
     430{
     431    my $self = shift (@_);
     432    my ($pluginfo, $dirname, $dir_ref, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     433    # These were defined in read (phase 1)
     434    my @dir = @{$dir_ref};
     435    my $subfile;
     436
     437    my $outhandle = $self->{'outhandle'};
     438    my $verbosity = $self->{'verbosity'};
     439   
    418440    # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
    419441   
     
    428450    my $base_dir_regexp = $base_dir;
    429451    $base_dir_regexp =~ s/\//$os_dirsep/g;
    430 
     452       
    431453    # Want to get relative path of local_dirname within the base_directory
    432     # but with URL style slashes.
     454    # but with URL style slashes. 
    433455    my $local_dirname = &util::filename_within_directory_url_format($dirname, $base_dir);
    434456
     
    460482    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
    461483
    462     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
    463     my $raw_full_filename = &util::filename_cat($this_file_base_dir, $raw_file_subfile);
     484    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
     485    my $raw_full_filename = &FileUtils::filenameConcatenate($this_file_base_dir, $raw_file_subfile);
    464486
    465487    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     
    512534        # Re-read the files in the directory to see if there are any new files
    513535        last if (!opendir (DIR, $dirname));
    514         my @dirnow = readdir (DIR);
     536        my @dirnow = sort readdir (DIR);
    515537        map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow;
    516538        closedir (DIR);
     
    540562    my $this_file_base_dir = $base_dir;
    541563    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
    542 
    543     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     564    # get the canonical unicode version of the filename. This may not match
     565    # the filename on the file system. We will use it to compare to regex
     566    # in the metadata table.
     567    my $unicode_subfile = &util::raw_filename_to_unicode($dirname, $raw_subfile);
     568    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile);
    544569    my $raw_full_filename
    545         = &util::filename_cat($this_file_base_dir,$raw_file_subfile);
     570        = &FileUtils::filenameConcatenate($this_file_base_dir,$raw_file_subfile);
    546571
    547572    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     
    549574        next;
    550575    }
    551     #print STDERR "processing $raw_full_filename\n";
     576    print STDERR "** DirectoryPlugin processing $raw_full_filename\n";
    552577    # Follow Windows shortcuts
    553     if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
     578    if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) {
    554579        require Win32::Shortcut;
    555         my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $raw_subfile));
     580        my $shortcut = new Win32::Shortcut(&FileUtils::filenameConcatenate($dirname, $raw_subfile));
    556581        if ($shortcut) {
    557582        # The file to be processed is now the target of the shortcut
     
    592617    }
    593618
    594     # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8
    595 
    596     # Next add metadata read in XML files (if it is supplied)
     619    ### Now we need to look up the metadata table to see if there is any
     620    # extra metadata for us. We need the canonical unicode version here.
    597621    if ($additionalmetadata == 1) {
    598622        foreach my $filespec (@extrametakeys) {
    599         ## use the url-encoded filename to do the filename comparison
    600 
    601         if ($subfile =~ /^$filespec$/) {
    602             print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
     623        if ($unicode_subfile =~ /^$filespec$/) {
     624            print $outhandle "File \"$unicode_subfile\" matches filespec \"$filespec\"\n"
    603625            if ($verbosity > 2);
    604626            my $mdref = &extrametautil::getmetadata(\%extrametadata, $filespec);
     
    624646    if (defined $self->{'inf_timestamp'}) {
    625647        # Look to see if it's a completely new file
     648
    626649        if (!$block_hash->{'new_files'}->{$raw_full_filename}) {
    627650        # Not a new file, must be an existing file
    628651        # Let's see if it's newer than the last import.pl
    629         if (! &util::dir_exists($raw_full_filename)) {
     652
     653
     654        if (! -d $raw_full_filename) {
    630655            if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) {
    631656            # filename has been around for longer than inf_timestamp
    632             print $outhandle "**** Skipping $subfile\n" if ($verbosity >3);
     657            print $outhandle "**** Skipping $unicode_subfile\n" if ($verbosity >3);
    633658            next;
    634659            }
     
    648673
    649674    # Recursively read each $subfile
    650     print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2);
     675    print $outhandle "DirectoryPlugin recurring: $unicode_subfile\n" if ($verbosity > 2);
    651676   
    652677    $count += &plugin::read ($pluginfo, $this_file_base_dir,
     
    663688}
    664689
     690# Manifest files, version 2, provide an explicit listing of the documents to be
     691# processed by Greenstone.  This allows a user to avoid expensive file tree
     692# searches - a crucial requirement for very-large scale collections and
     693# parallel processing. However, we still want to leverage the metadata parsing
     694# functionality found here in DirectoryPlugin. Thus we have this special call
     695# to read that expects a single file. The normal read function starts by
     696# listing the files in a given directory and then performs a number of actions
     697# over them (including recursing down into any further directories found). We
     698# circumvent that behaviour by 'pretending' to already have a directory listing
     699# containing at most two files - the file passed in, and an accompanying
     700# metadata.xml file if one exists.
     701sub read_for_manifest_v2
     702{
     703    my $self = shift (@_);
     704    my ($pluginfo, $file, $block_hash, $processor, $gli) = @_;
     705    my $base_dir = '';
     706    my $in_metadata = {};
     707    my $maxdocs = -1;
     708    my $total_count = 0;
     709    # Ensure we have the full path of the file to process. NOTE(review): $base_dir is always '' here, so the concatenation below is never taken and $full_path stays equal to $file.
     710    my $full_path = $file;
     711    if ($base_dir =~ /\w/)
     712    {
     713    $full_path = &FileUtils::filenameConcatenate($base_dir, $file);
     714    }
     715    # Unlike the vanilla read(), directories are unacceptable: anything that is not a plain file returns 0 immediately
     716    if (!-f $full_path)
     717    {
     718    return 0;
     719    }
     720    # Now split the full path into a directory and a filename at the last '/'. NOTE(review): if the path contains no '/', the match fails and both variables are left undefined - confirm callers always pass a '/'-separated path.
     721    my ($dirname, $the_file) = $full_path =~ /^(.*)\/([^\/]+)$/;
     722    # We will prepopulate a 'directory listing' with just this file
     723    my @dir = ($the_file);
     724    # See if there is an accompanying metadata.xml in the same directory; if so, put it at the front of the listing
     725    my $metadata_xml_path = $dirname . '/metadata.xml';
     726    if (-f $metadata_xml_path)
     727    {
     728    unshift(@dir, 'metadata.xml');
     729    }
     730    # Chain through to the normal read process, but with our 'forged' directory
     731    # listing so as to avoid all the costs of actually listing / recursing. Note that $dirname is passed both as the directory name and in the position read_phase2 calls $file.
     732    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $dirname, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli);
     733    # We don't return count explicitly; we only report an error on STDERR when it is not exactly 1.
     734    if ($count != 1)
     735    {
     736    print STDERR "ERROR! The count of documents processed from a single call to DirectoryPlugin::read_for_manifest_v2() is not 1.\n";
     737    }
     738}
     739
    6657401;
Note: See TracChangeset for help on using the changeset viewer.