Changeset 27302


Ignore:
Timestamp:
05/06/13 15:21:07 (8 years ago)
Author:
jmt12
Message:

Removed parallel processing stuff as that now lives in an extension. Restructured to better support overriding by extensions. Now checks for the manifest version and processes files accordingly. Conditional addition to INC and PATH environment variables (explained elsewhere). Replaced deprecated util.pm calls with FileUtils.pm ones.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r26567 r27302  
    119119    $self->{'collection'} = shift @$argv;
    120120
    121     if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
    122     require ParallelInexport;
    123     }
     121    # Unless otherwise stated all manifests are considered version 1---where
     122    # they act more like an advanced process expression---as compared to newer
     123    # manifest files that act as an explicit (and exhaustive) list of files to
     124    # process [jmt12]
     125    $self->{'manifest_version'} = 1;
    124126
    125127    return bless $self, $class;
     
    143145    else { 
    144146        $self->{'site'} = "";
    145         $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
     147        $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    146148    }
    147149    $self->{'faillog'} = "";
     
    179181    $self->{'gs_version'} = "3";
    180182    }
    181     # add collection's perllib dir  into include path in
     183
     184    # add collection's perllib dir into include path in
    182185    # case we have collection specific modules
    183     unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
     186    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));
    184187
    185188    # check that we can open the faillog
    186189    my $faillog = $self->{'faillog'};
    187190    if ($faillog eq "") {
    188     $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
     191    $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    189192    }
    190193    open (FAILLOG, ">$faillog") ||
     
    197200    $self->{'faillog'} = $faillog;
    198201    $self->{'faillogname'} = $faillogname;
     202    $self->{'close_faillog'} = 1;
    199203
    200204    # Read in the collection configuration file.
     
    237241    # fill in the default import and archives directories if none
    238242    # were supplied, turn all \ into / and remove trailing /
    239     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
     243    $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
     244    # @todo &FileUtils::sanitizePath($importdir) [jmt12]
    240245    $importdir =~ s/[\\\/]+/\//g;
    241246    $importdir =~ s/\/$//;
     
    248253    if ($archivedir eq "") {
    249254    if ($inexport_mode eq "import") {
    250         $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
     255        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    251256    }
    252257    elsif ($inexport_mode eq "export") {
    253         $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
     258        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
    254259    }
    255260    else {
    256261        print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
    257262        print STDERR "         Defaulting to 'archives' for file output\n";
    258         $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    259     }
    260     }
    261 
     263        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
     264    }
     265    }
     266
     267    # @todo &FileUtils::sanitizePath($archivedir) [jmt12]
    262268    $archivedir =~ s/[\\\/]+/\//g;
    263269    $archivedir =~ s/\/$//;
     
    355361    $self->{'incremental'}      = $incremental;
    356362    $self->{'incremental_mode'} = $incremental_mode;
     363
     364    # Since this wasted my morning, let's at least warn a user that manifest
     365    # files now *only* work if keepold is set [jmt12]
     366    if ($self->{'manifest'} && !$self->{'keepold'})
     367    {
     368      print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
     369    }
    357370}
    358371
     
    394407
    395408    my $gli          = $self->{'gli'};
    396 
    397     my $jobs         = $self->{'jobs'};
    398     my $epoch        = $self->{'epoch'};
    399409
    400410    # related to export
     
    418428    my $manifest_filename = $self->{'manifest'};
    419429
    420     if (!&util::filename_is_absolute($manifest_filename)) {
    421         $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
     430    if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
     431        $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
    422432    }
    423433
     
    426436
    427437    $manifest_lookup->parse($manifest_filename);
     438
     439        # manifests may now include a version number [jmt12]
     440        $self->{'manifest_version'} = $manifest_lookup->get_version();
    428441    }
    429442
     
    454467
    455468    if ($removeold) {
    456     if (-e $archivedir) {
     469    if (&FileUtils::directoryExists($archivedir)) {
    457470        &gsprintf($out, "{import.removing_archives}\n");
    458         &util::rm_r ($archivedir);
    459     }
    460     my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
     471        &FileUtils::removeFilesRecursive($archivedir);
     472    }
     473    my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
    461474    $tmpdir =~ s/[\\\/]+/\//g;
    462475    $tmpdir =~ s/\/$//;
    463     if (-e $tmpdir) {
     476    if (&FileUtils::directoryExists($tmpdir)) {
    464477        &gsprintf($out, "{import.removing_tmpdir}\n");
    465         &util::rm_r ($tmpdir);
     478        &FileUtils::removeFileRecursive($tmpdir);
    466479    }
    467480    }
    468481
    469482    # create the archives dir if needed
    470     &util::mk_all_dir($archivedir);
     483    &FileUtils::makeAllDirectories($archivedir);
    471484
    472485    # read the archive information file
    473486
    474487    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    475     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    476     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
    477 
    478     my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    479     my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
    480                            
     488    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
     489    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
     490
     491    # When we make these initial calls to determine the archive information doc
     492    # and src databases we pass through a '1' to indicate this is the first
     493    # time we are referring to these databases. When using dynamic dbutils
     494    # (available in extensions) this indicates to some database types (for
     495    # example, persistent servers) that this is a good time to perform any
     496    # one time initialization. The argument has no effect on vanilla dbutils
     497    # [jmt12]
     498    my $perform_firsttime_init = 1;
     499    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
     500    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
     501
    481502    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    482503    $archive_info->load_info ($arcinfo_doc_filename);
     
    547568    }
    548569
    549     my $processor = &plugout::load_plugout($plugout);                       
     570    my $processor = &plugout::load_plugout($plugout);
    550571    $processor->setoutputdir ($archivedir);
    551572    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
     
    565586    $block_hash->{'new_files'} = {};
    566587    $block_hash->{'reindex_files'} = {};
     588    # all of these are set somewhere else, so it's more readable to define them
     589    # here [jmt12]
     590    $block_hash->{'all_files'} = {};
     591    $block_hash->{'deleted_files'} = {};
     592    $block_hash->{'file_blocks'} = {};
     593    $block_hash->{'metadata_files'} = {};
     594    $block_hash->{'shared_fileroot'} = '';
     595    # a new flag so we can tell we had a manifest way down in the plugins
     596    # [jmt12]
     597    $block_hash->{'manifest'} = 'false';
    567598    my $metadata = {};
    568599   
    569600    # global blocking pass may set up some metadata
    570     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    571    
     601    # - when we have a newer manifest file we don't do this -unless- the
     602    #   collection configuration indicates this collection contains complex
     603    #   (inherited) metadata [jmt12]
     604    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
     605    {
     606      &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     607    }
     608    else
     609    {
     610      print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
     611    }
     612
    572613    if ($manifest ne "") {
     614
     615      # mark that we are using a manifest - information that might be needed
     616      # down in plugins (for instance DirectoryPlugin)
     617      $block_hash->{'manifest'} = $self->{'manifest_version'};
     618
    573619    #
    574620    # 1. Process delete files first
     
    580626    foreach my $df (@deleted_files) {
    581627        my $full_df =
    582         (&util::filename_is_absolute($df))
     628        (&FileUtils::isFilenameAbsolute($df))
    583629        ? $df
    584         : &util::filename_cat($importdir,$df);
     630        : &FileUtils::filenameConcatenate($importdir,$df);
    585631
    586632        if (-d $full_df) {
     
    606652    foreach my $rf (@reindex_files) {       
    607653        my $full_rf =
    608         (&util::filename_is_absolute($rf))
     654        (&FileUtils::isFilenameAbsolute($rf))
    609655        ? $rf
    610         : &util::filename_cat($importdir,$rf);
     656        : &FileUtils::filenameConcatenate($importdir,$rf);
    611657
    612658        if (-d $full_rf) {
     
    637683        # ensure filename is absolute
    638684        my $full_nf =
    639         (&util::filename_is_absolute($nf))
     685        (&FileUtils::isFilenameAbsolute($nf))
    640686        ? $nf
    641         : &util::filename_cat($importdir,$nf);
     687        : &FileUtils::filenameConcatenate($importdir,$nf);
    642688
    643689        if (-d $full_nf) {
     
    649695
    650696    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
     697      # need to check this file exists before trying to read it - in the past
     698      # it wasn't possible to have a manifest unless keepold was also set so
     699      # you were pretty much guaranteed arcinfo existed
     700      # [jmt12]
     701      # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
     702      if (-e $arcinfo_src_filename)
     703      {
    651704    my $arcinfodb_map = {};
    652705    &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
     
    662715
    663716    undef $arcinfodb_map;
     717      }
     718      # no existing files - so we can just add all the files [jmt12]
     719      else
     720      {
     721        foreach my $f (@full_new_files)
     722        {
     723          $block_hash->{'new_files'}->{$f} = 1;
     724        }
     725      }
     726
     727      # If we are not using complex inherited metadata (and thus have skipped
     728      # the global file scan) we need to at least check for a matching
     729      # metadata.xml for the files being indexed/reindexed
     730      # - unless we are using the newer version of Manifests, which are treated
     731      #   verbatim, and should have a metadata element for metadata files (so
     732      #   we can explicitly process metadata files other than metadata.xml)
     733      # [jmt12]
     734      if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
     735      {
     736        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
     737        foreach my $file_to_import (@all_files_to_import)
     738        {
     739          my $metadata_xml_path = $file_to_import;
     740          $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
     741          if (&FileUtils::fileExists($metadata_xml_path))
     742          {
     743            &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
     744          }
     745        }
     746      }
     747
     748      # new version manifest files explicitly list metadata files to be
     749      # processed (ignoring complexmeta if set)
     750      # [jmt12]
     751      if ($self->{'manifest_version'} > 1)
     752      {
     753        # Process metadata files
     754        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
     755        {
     756          $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
     757        }
     758      }
    664759    }
    665760    else {
     
    686781        # Filter out any in gsdl/tmp area
    687782        my @filtered_deleted_files = ();
    688         my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    689         my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
     783        my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
     784        my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
    690785        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
    691786        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
     
    730825    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    731826    # oailastmodified and oailastmodifieddate
    732     my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
     827    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    733828    if (!-f $earliestDatestampFile && -d $archivedir) {
    734829    my $current_time_in_seconds = time; # in seconds
     
    745840    }
    746841
    747     # now, whichever mode we are in, we can process the entire import folder
    748     if ((defined $jobs) && ($jobs > 1))
    749     {
    750     # if jobs are set to >1, run in parallel using MPI helper
    751     # [hs, 1 july 2010]
    752     &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
    753                           $self->{'collection'}, $self->{'site'});
    754     }
    755     else
    756     {
    757     &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    758     }
    759    
    760    
     842   
     843    $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
     844
    761845    if ($saveas eq "FedoraMETS") {
    762846    # create collection "doc obj" for Fedora that contains
     
    782866    # Store the value of OIDCount (used in doc.pm) so it can be
    783867    # restored correctly to this value on an incremental build
    784     store_doc_oid_count($archivedir);
     868    # - this OIDcount file should only be generated for numerical oids [jmt12]
     869    if ($self->{'OIDtype'} eq 'incremental')
     870    {
     871      store_doc_oid_count($archivedir);
     872    }
    785873
    786874    # write out the archive information file
     
    799887}
    800888
    801 
     889# @function perform_process_files()
     890# while process_files() above prepares the system to import files this is the
     891# function that actually initiates the plugin pipeline to process the files.
     892# This function can therefore be overridden in subclasses of inexport.pm should
     893# they wish to do different or further processing
     894# @author jmt12
     895sub perform_process_files
     896{
     897  my $self = shift(@_);
     898  my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
     899  my $gli = $self->{'gli'};
     900  # specific file to process - via manifest version 2+
     901  if ($file_to_import ne '')
     902  {
     903    &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     904  }
     905  # global file scan - if we are using a new version manifest, files would have
     906  # been read above. Older manifests use extra settings in the $block_hash to
     907  # control what is imported, while non-manifest imports use a regular
     908  # $block_hash (so obeying process_exp and block_exp) [jmt12]
     909  elsif ($manifest eq '' || $self->{'manifest_version'} < 1)
     910  {
     911    &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     912  }
     913  else
     914  {
     915    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
     916  }
     917}
     918# perform_process_files()
     919
     920# @function generate_statistics()
    802921sub generate_statistics
    803922{
    804     my $self = shift @_;
    805     my ($pluginfo) = @_;
    806 
    807     my $inexport_mode = $self->{'mode'};
    808 
    809     my $statsfile   = $self->{'statsfile'};
    810     my $out         = $self->{'out'};
    811     my $faillogname = $self->{'faillogname'};
    812     my $gli         = $self->{'gli'};
    813     my $jobs        = $self->{'jobs'};
    814 
    815     # write out import stats
    816 
    817     if ((!defined $jobs) || ($jobs == 1))
    818     {
    819     # only output statistics if there are multiple jobs
    820     # [hs, 1 july 2010]
    821 
    822     my $close_stats = 0;
    823     if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
    824         if (open (STATS, ">$statsfile")) {
    825         $statsfile = 'inexport::STATS';
    826         $close_stats = 1;
    827         } else {
    828         &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
    829         &gsprintf($out, "{import.stats_backup}\n");
    830         $statsfile = 'STDERR';
    831         }
    832     }
    833    
    834     &gsprintf($out, "\n");
    835     &gsprintf($out, "*********************************************\n");
    836     &gsprintf($out, "{$inexport_mode.complete}\n");
    837     &gsprintf($out, "*********************************************\n");
    838    
    839     &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    840     if ($close_stats) {
    841         close STATS;
    842     }
    843     }
    844 
    845     close OUT if $self->{'close_out'};
    846     close FAILLOG;
    847 }
     923  my $self = shift @_;
     924  my ($pluginfo) = @_;
     925
     926  my $inexport_mode = $self->{'mode'};
     927  my $out           = $self->{'out'};
     928  my $faillogname   = $self->{'faillogname'};
     929  my $gli           = $self->{'gli'};
     930
     931  &gsprintf($out, "\n");
     932  &gsprintf($out, "*********************************************\n");
     933  &gsprintf($out, "{$inexport_mode.complete}\n");
     934  &gsprintf($out, "*********************************************\n");
     935
     936  &plugin::write_stats($pluginfo, 'STDERR', $faillogname, $gli);
     937}
     938# generate_statistics()
     939
     940
     941# @function deinit()
     942# Close down any file handles that we opened (and hence are responsible for
     943# closing)
     944sub deinit
     945{
     946  my $self = shift(@_);
     947  close OUT if $self->{'close_out'};
     948  close FAILLOG if $self->{'close_faillog'};
     949}
     950# deinit()
    848951
    849952
     
    889992sub oid_count_file {
    890993    my ($archivedir) = @_;
    891     return &util::filename_cat ($archivedir, "OIDcount");
     994    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
    892995}
    893996
     
    9211024    my $oid_count_filename = &oid_count_file($archivedir);
    9221025
    923 
     1026    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    9241027    if (open(OIDOUT,">$oid_count_filename")) {
    9251028    print OIDOUT $doc::OIDcount, "\n";
     
    9551058    foreach my $prev_file (keys %$prev_all_files) {
    9561059
    957     if (!&util::filename_is_absolute($prev_file)) {
    958         my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
     1060    if (!&FileUtils::isFilenameAbsolute($prev_file)) {
     1061        my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
    9591062        $full_prev_all_files->{$full_prev_file} = $prev_file;
    9601063    }
     
    9751078    # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
    9761079
    977     if (!&util::filename_is_absolute($curr_file)) {
     1080    if (!&FileUtils::isFilenameAbsolute($curr_file)) {
    9781081        # add in import dir to make absolute
    979         $full_curr_file = &util::filename_cat($importdir,$curr_file);
     1082        $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
    9801083    }
    9811084
     
    10841187       
    10851188        my $existing_file = $existing_filename;
    1086         #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
     1189        #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});
    10871190
    10881191        #my $collectdir_resafe = &util::filename_to_regex($collectdir);
     
    11161219      my $full_curr_file = $curr_file;
    11171220
    1118       if (!&util::filename_is_absolute($curr_file)) {
     1221      if (!&FileUtils::isFilenameAbsolute($curr_file)) {
    11191222          # add in collect dir to make absolute
    11201223
    1121           $full_curr_file = &util::filename_cat($collectdir,$curr_file);
     1224          $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
    11221225      }
    11231226
     
    11721275        my $doc_source_file = $doc_rec->{'src-file'}->[0];
    11731276        if (!&util::filename_is_absolute($doc_source_file)) {
    1174         $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
     1277        $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
    11751278        }
    11761279
     
    12371340    next if ($subfile =~ m/^\.\.?$/);
    12381341    next if ($subfile =~ /^\.svn$/);
    1239     my $full_file = &util::filename_cat($dirname, $subfile);
     1342    my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
    12401343    if (-d $full_file) {
    12411344        &add_dir_contents_to_list($full_file, $list);
Note: See TracChangeset for help on using the changeset viewer.