Changeset 30289 for gs2-extensions

Show
Ignore:
Timestamp:
09.10.2015 13:29:06 (4 years ago)
Author:
jmt12
Message:

Significant changes to read() function - essentially split in half with the first phase responsible for building up the list of files to process and the second for doing the actual processing. Allows us to shortcut the system by passing in a list of files to process (as in the case of manifest version 2).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/plugins/DirectoryPlugin.pm

    r29260 r30289  
    3434use plugin; 
    3535use util; 
     36use FileUtils; 
    3637use metadatautil; 
    3738 
     
    4142no strict 'subs'; 
    4243 
     44use Encode::Locale; 
    4345use Encode; 
     46use Unicode::Normalize; 
    4447 
    4548BEGIN { 
     
    7982 
    8083    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists); 
    81      
     84 
     85    print STDERR "INFO: This DirectoryPlugin supports version 2 manifest files\n"; 
     86 
    8287    if ($self->{'info_only'}) { 
    8388    # don't worry about any options or initialisations etc 
     
    8994    die "ERROR: DirectoryPlugin -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n"; 
    9095    } 
    91      
     96 
    9297    $self->{'num_processed'} = 0; 
    9398    $self->{'num_not_processed'} = 0; 
     
    134139        my $archives_inf = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir); 
    135140 
    136     if ( &FileUtils::fileExists($archives_inf) ) { 
    137         $self->{'inf_timestamp'} = &FileUtils::file_lastmodified($archives_inf); 
     141    if ( -e $archives_inf ) { 
     142        $self->{'inf_timestamp'} = -M $archives_inf; 
    138143    } 
    139144    } 
     
    172177    my $self = shift (@_); 
    173178     
    174     return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|~)$'; 
     179    return '(?i)(CVS|\.svn|Thumbs\.db|OIDcount|\.DS_Store|~)$'; 
    175180} 
    176181 
     
    179184    my $self = shift(@_); 
    180185    my ($dirname) = @_; 
    181  
    182     # replace -d with function in util library 
    183     return undef unless (&FileUtils::directoryExists($dirname)); 
     186     
     187    return undef unless (-d $dirname); 
    184188 
    185189    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/); 
     
    259263 
    260264    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 
    261 ###    print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n"; 
    262  
    263     if ($ENV{'GSDLOS'} =~ m/^windows$/) { 
     265 
     266    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) { 
    264267    # on windows, all block paths are lowercased. 
    265268    my $lower_filename = lc ($filename_full_path); 
     
    295298    # that it is not explicitly blocked. 
    296299    my $dirname = $file; 
    297     $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
     300    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/; 
    298301 
    299302    my $directory_ok = $self->check_directory_path($dirname); 
    300303    return $directory_ok unless (defined $directory_ok && $directory_ok == 1); 
    301304 
    302     print $outhandle "File scan checking directory: $dirname\n"; 
     305    print $outhandle "Global file scan checking directory: $dirname\n"; 
    303306 
    304307    $block_hash->{'all_files'} = {} unless defined $block_hash->{'all_files'}; 
     
    308311    $block_hash->{'shared_fileroot'} = {} unless defined $block_hash->{'shared_fileroot'}; 
    309312 
    310     # Recur over directory contents. 
     313     # Recur over directory contents. 
     314    my (@dir, $subfile); 
     315    #my $count = 0; 
     316     
    311317    print $outhandle "DirectoryPlugin block: getting directory $dirname\n" if ($verbosity > 2); 
    312  
     318     
    313319    # find all the files in the directory 
    314     my @dir = @{&FileUtils::readDirectory($dirname)}; 
    315     if (scalar(@dir) == 0) 
    316     { 
    317       if ($gli) 
    318       { 
    319         print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 
    320       } 
    321       print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 
    322       return -1; # error in processing 
    323     } 
    324  
     320    if (!opendir (DIR, $dirname)) { 
     321    if ($gli) { 
     322        print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 
     323    } 
     324    print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 
     325    return -1; # error in processing 
     326    } 
     327    @dir = sort readdir (DIR); 
     328    closedir (DIR); 
     329     
    325330    for (my $i = 0; $i < scalar(@dir); $i++) { 
    326331    my $raw_subfile = $dir[$i]; 
     
    328333 
    329334    my $this_file_base_dir = $base_dir; 
    330     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
     335    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 
    331336 
    332337    # Recursively read each $raw_subfile 
    333     print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2); 
     338    print $outhandle "DirectoryPlugin block recurring: ". Encode::decode("utf8", $raw_file_subfile) ."\n" if ($verbosity > 2); 
     339    print $outhandle "DirectoryPlugin block recurring: ". Encode::decode(locale =>$raw_file_subfile) ."\n" if ($verbosity > 2); 
    334340     
    335341    #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir, 
     
    369375    my $self = shift (@_); 
    370376    my ($pluginfo, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    371  
    372377    my $outhandle = $self->{'outhandle'}; 
    373378    my $verbosity = $self->{'verbosity'}; 
     
    380385    } else { 
    381386    $dirname = $file; 
    382     $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 
     387    $dirname = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\w/; 
    383388    } 
    384389     
     
    393398 
    394399    # Recur over directory contents. 
     400    my @dir; 
     401     
    395402    print $outhandle "DirectoryPlugin read: getting directory $dirname\n" if ($verbosity > 2); 
    396  
     403     
    397404    # find all the files in the directory 
    398     my @dir = @{&FileUtils::readDirectory($dirname)}; 
    399     if (scalar(@dir) == 0) 
    400     { 
    401       if ($gli) 
    402       { 
    403         print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 
    404       } 
    405       print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 
    406       return -1; # error in processing 
    407     } 
    408     map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir; 
    409  
    410     # Re-order the files in the list so any directories ending with .all are 
    411     # moved to the end 
     405    if (!opendir (DIR, $dirname)) { 
     406    if ($gli) { 
     407        print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n"; 
     408    } 
     409    print $outhandle "DirectoryPlugin: WARNING - couldn't read directory $dirname\n"; 
     410    return -1; # error in processing 
     411    } 
     412    @dir = sort readdir (DIR); 
     413    map {  $_ = &unicode::raw_filename_to_url_encoded($_);  } @dir; 
     414    closedir (DIR); 
     415    # Re-order the files in the list so any directories ending with .all are moved to the end 
    412416    for (my $i = scalar(@dir) - 1; $i >= 0; $i--) { 
    413     if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) { 
     417    if (-d &FileUtils::filenameConcatenate($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) { 
    414418        push(@dir, splice(@dir, $i, 1)); 
    415419    } 
    416420    } 
    417421 
     422    # Chain through to the rest of the read function (now split off and named 
     423    # read_phase2) 
     424    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli); 
     425 
     426    return $count; 
     427} 
     428 
     429sub read_phase2 
     430{ 
     431    my $self = shift (@_); 
     432    my ($pluginfo, $dirname, $dir_ref, $base_dir, $file, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
     433    # These were defined in read (phase 1) 
     434    my @dir = @{$dir_ref}; 
     435    my $subfile; 
     436 
     437    my $outhandle = $self->{'outhandle'}; 
     438    my $verbosity = $self->{'verbosity'}; 
     439     
    418440    # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read 
    419441     
     
    428450    my $base_dir_regexp = $base_dir; 
    429451    $base_dir_regexp =~ s/\//$os_dirsep/g; 
    430  
     452        
    431453    # Want to get relative path of local_dirname within the base_directory 
    432     # but with URL style slashes. 
     454    # but with URL style slashes.  
    433455    my $local_dirname = &util::filename_within_directory_url_format($dirname, $base_dir); 
    434456 
     
    460482    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 
    461483 
    462     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
    463     my $raw_full_filename = &util::filename_cat($this_file_base_dir, $raw_file_subfile); 
     484    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 
     485    my $raw_full_filename = &FileUtils::filenameConcatenate($this_file_base_dir, $raw_file_subfile); 
    464486 
    465487    if ($self->file_is_blocked($block_hash,$raw_full_filename)) { 
     
    512534        # Re-read the files in the directory to see if there are any new files 
    513535        last if (!opendir (DIR, $dirname)); 
    514         my @dirnow = readdir (DIR); 
     536        my @dirnow = sort readdir (DIR); 
    515537        map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow; 
    516538        closedir (DIR); 
     
    540562    my $this_file_base_dir = $base_dir; 
    541563    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 
    542  
    543     my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
     564    # get the canonical unicode version of the filename. This may not match 
     565    # the filename on the file system. We will use it to compare to regex 
     566    # in the metadata table. 
     567    my $unicode_subfile = &util::raw_filename_to_unicode($dirname, $raw_subfile); 
     568    my $raw_file_subfile = &FileUtils::filenameConcatenate($file, $raw_subfile); 
    544569    my $raw_full_filename  
    545         = &util::filename_cat($this_file_base_dir,$raw_file_subfile); 
     570        = &FileUtils::filenameConcatenate($this_file_base_dir,$raw_file_subfile); 
    546571 
    547572    if ($self->file_is_blocked($block_hash,$raw_full_filename)) { 
     
    549574        next; 
    550575    } 
    551     #print STDERR "processing $raw_full_filename\n"; 
     576    print STDERR "** DirectoryPlugin processing $raw_full_filename\n"; 
    552577    # Follow Windows shortcuts 
    553     if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) { 
     578    if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) { 
    554579        require Win32::Shortcut; 
    555         my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $raw_subfile)); 
     580        my $shortcut = new Win32::Shortcut(&FileUtils::filenameConcatenate($dirname, $raw_subfile)); 
    556581        if ($shortcut) { 
    557582        # The file to be processed is now the target of the shortcut 
     
    592617    } 
    593618 
    594     # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8 
    595  
    596     # Next add metadata read in XML files (if it is supplied) 
     619    ### Now we need to look up the metadata table to see if there is any  
     620    # extra metadata for us. We need the canonical unicode version here. 
    597621    if ($additionalmetadata == 1) { 
    598622        foreach my $filespec (@extrametakeys) { 
    599         ## use the url-encoded filename to do the filename comparison 
    600  
    601         if ($subfile =~ /^$filespec$/) { 
    602             print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"  
     623        if ($unicode_subfile =~ /^$filespec$/) { 
     624            print $outhandle "File \"$unicode_subfile\" matches filespec \"$filespec\"\n"  
    603625            if ($verbosity > 2); 
    604626            my $mdref = &extrametautil::getmetadata(\%extrametadata, $filespec); 
     
    624646    if (defined $self->{'inf_timestamp'}) { 
    625647        # Look to see if it's a completely new file 
     648 
    626649        if (!$block_hash->{'new_files'}->{$raw_full_filename}) { 
    627650        # Not a new file, must be an existing file 
    628651        # Let's see if it's newer than the last import.pl 
    629         if (! &util::dir_exists($raw_full_filename)) { 
     652 
     653 
     654        if (! -d $raw_full_filename) { 
    630655            if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) { 
    631656            # filename has been around for longer than inf_timestamp 
    632             print $outhandle "**** Skipping $subfile\n" if ($verbosity >3); 
     657            print $outhandle "**** Skipping $unicode_subfile\n" if ($verbosity >3); 
    633658            next; 
    634659            } 
     
    648673 
    649674    # Recursively read each $subfile 
    650     print $outhandle "DirectoryPlugin recurring: $subfile\n" if ($verbosity > 2); 
     675    print $outhandle "DirectoryPlugin recurring: $unicode_subfile\n" if ($verbosity > 2); 
    651676     
    652677    $count += &plugin::read ($pluginfo, $this_file_base_dir, 
     
    663688} 
    664689 
     690# Manifest files, version 2, provide an explicit listing of the documents to be 
     691# processed by Greenstone.  This allows a user to avoid expensive file tree 
     692# searches - a crucial requirement for very-large scale collections and 
     693# parallel processing. However, we still want to leverage the metadata parsing 
     694# functionality found here in DirectoryPlugin. Thus we have this special call 
     695# to read that expects a single file. The normal read function starts by 
     696# listing the files in a given directory and then performs a number of actions 
     697# over them (including recursing down into any further directories found). We 
     698# circumvent that behaviour by 'pretending' to already have a directory listing 
     699# containing at most two files - the file passed in, and an accompanying 
     700# metadata.xml file if one exists. 
     701sub read_for_manifest_v2 
     702{ 
     703    my $self = shift (@_); 
     704    my ($pluginfo, $file, $block_hash, $processor, $gli) = @_; 
     705    my $base_dir = ''; 
     706    my $in_metadata = {}; 
     707    my $maxdocs = -1; 
     708    my $total_count = 0; 
     709    # Ensure we have the full path of the file to process ($base_dir is always '' here, so $full_path stays equal to $file) 
     710    my $full_path = $file; 
     711    if ($base_dir =~ /\w/) 
     712    { 
     713    $full_path = &FileUtils::filenameConcatenate($base_dir, $file); 
     714    } 
     715    # Unlike the vanilla read(), directories are unacceptable: anything that is not a plain file makes us return 0 
     716    if (!-f $full_path) 
     717    { 
     718    return 0; 
     719    } 
     720    # Now split the full path into a directory and a filename at the last '/'. NOTE(review): a path containing no '/' fails the match and leaves both undefined - confirm callers always pass one 
     721    my ($dirname, $the_file) = $full_path =~ /^(.*)\/([^\/]+)$/; 
     722    # We will prepopulate a 'directory listing' with this file 
     723    my @dir = ($the_file); 
     724    # See if there is an accompanying metadata.xml file in the same directory; if so it is placed first in the listing 
     725    my $metadata_xml_path = $dirname . '/metadata.xml'; 
     726    if (-f $metadata_xml_path) 
     727    { 
     728    unshift(@dir, 'metadata.xml'); 
     729    } 
     730    # Chain through to the normal read process, but with our 'forged' directory 
     731    # listing so as to avoid all the costs of actually listing / recursing. 
     732    my $count = $self->read_phase2($pluginfo, $dirname, \@dir, $base_dir, $dirname, $block_hash, $in_metadata, $processor, $maxdocs, $total_count, $gli); 
     733    # We don't return count, but test that it is 1 exactly; any other value is reported on STDERR. 
     734    if ($count != 1) 
     735    { 
     736    print STDERR "ERROR! The count of documents processed from a single call to DirectoryPlugin::read_for_manifest_v2() is not 1.\n"; 
     737    } 
     738} 
     739 
    6657401;