Ignore:
Timestamp:
2011-09-28T13:22:52+13:00 (13 years ago)
Author:
jmt12
Message:

Several changes to ensure parallel importing plays nicely with manifest files and (simple) accompanying metadata.xml files. Also made it so that initial calls to get_infodb_file_path run the GDBMServer (if necessary), ensuring that it persists throughout parallel importing.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm

    r24626 r24686  
    351351    $self->{'incremental'}      = $incremental;
    352352    $self->{'incremental_mode'} = $incremental_mode;
     353
     354    # Since this wasted my morning, let's at least warn a user that manifest
     355    # files now *only* work if keepold is set. [jmt12]
     356    if ($self->{'manifest'} && !$self->{'keepold'})
     357    {
     358      print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental. Ignoring.\n";
     359    }
    353360}
    354361
     
    470477    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
    471478
    472     my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    473     my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
    474                            
     479    # Warning! Black magic follows. When the following functions are called on
     480    # the GDBMServer class they will actually prompt the running of the Server
     481    # and attach themselves as a listener (even though they don't do anything)
     482    # This is done so that, in parallel importing, the server will persist
     483    # until the top level import.pl (which will be the first thing that calls
     484    # this function) completes.
     485    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, 1);
     486    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, 1);
     487
    475488    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    476489    $archive_info->load_info ($arcinfo_doc_filename);
     
    558571    $block_hash->{'new_files'} = {};
    559572    $block_hash->{'reindex_files'} = {};
     573    # All of these are set somewhere else, so it's kinda nice to define them
     574    # here. [jmt12]
     575    $block_hash->{'all_files'} = {};
     576    $block_hash->{'deleted_files'} = {};
     577    $block_hash->{'file_blocks'} = {};
     578    $block_hash->{'metadata_files'} = {};
     579    $block_hash->{'shared_fileroot'} = '';
     580    # My new flag so we can tell we had a manifest way down in the plugins
     581    # [jmt12]
     582    $block_hash->{'manifest'} = 'false';
    560583    my $metadata = {};
    561584   
    562585    # global blocking pass may set up some metadata
    563     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     586    # - when we have a manifest file we don't do this -unless- the collection
     587    #   configuration indicates this collection contains complex (inherited)
     588    #   metadata. [jmt12]
     589    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
     590    {
     591      &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     592    }
     593    else
     594    {
     595      print "Skipping global file scan due to manifest and complexmeta configuration\n";
     596    }
    564597   
    565598    if ($manifest ne "") {
     599
     600      $block_hash->{'manifest'} = 'true';
     601
    566602    #
    567603    # 1. Process delete files first
     
    642678
    643679    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
     680      # need to check this file exists before trying to read it. [jmt12]
     681      if (-e $arcinfo_src_filename)
     682      {
    644683    my $arcinfodb_map = {};
    645684    &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
     
    653692        }
    654693    }
    655 
    656694    undef $arcinfodb_map;
     695      }
     696      # no existing files - so we can just add all the ones that need adding.
     697      # [jmt12]
     698      else
     699      {
     700        foreach my $f (@full_new_files)
     701        {
     702          $block_hash->{'new_files'}->{$f} = 1;
     703        }
     704      }
     705
     706      # If we are not using complex inherited metadata (and thus have skipped
     707      # the global file scan) we need to at least scan the directory of the
     708      # files being indexed/reindexed. [jmt12]
     709      if ($collectcfg->{'complexmeta'} ne 'true')
     710      {
     711        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
     712        foreach my $file_to_import (@all_files_to_import)
     713        {
     714          my $dir_to_import = $file_to_import;
     715          $dir_to_import =~ s/[^\\\/]*$//;
     716          # - one day we may need to manually scan this directory for child
     717          #   directories and somehow explicitly block them from being
     718          #   recursed.
     719          if (-d $dir_to_import)
     720          {
     721            &plugin::file_block_read($pluginfo, $dir_to_import, '', $block_hash, $metadata, $gli);
     722          }
     723        }
     724      }
    657725    }
    658726    else {
Note: See TracChangeset for help on using the changeset viewer.