Changeset 23053


Ignore:
Timestamp:
2010-10-06T15:39:33+13:00 (14 years ago)
Author:
kjdon
Message:

Reworking of the manifest handling. Now, instead of calling plugin::read on each file in the reindex/new_files lists, we add all the filenames to the standard block_hash. This is the equivalent of what the code for incremental build does, except that instead of inspecting the import folder, we just use what is listed in the manifest file. If a directory is listed, then we look through the directory and add each file in there to the appropriate list. Once block_hash is set up, we tell the plugins we are incremental, and process the import folder as usual. Only those docs in the new_files and reindex lists in block_hash will be processed. But this gets a full metadata_read done, which we need in case there is metadata specified at a higher level for the files that are changed/new.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r23042 r23053  
    375375   
    376376    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    377     if ($self->{'manifest'} ne "") {   
     377    if ($self->{'manifest'} ne "") {
     378    print STDERR "parsing manifest\n";
    378379    my $manifest_filename = $self->{'manifest'};
    379380
     
    396397    }
    397398
     399    my $plugin_incr_mode = $incremental_mode;
     400    if ($manifest ne "") {
     401    # if we have a manifest file, then we pretend we are fully incremental for plugins
     402    $plugin_incr_mode = "all";
     403    }
    398404    #some global options for the plugins
    399405    my @global_opts = ();
    400406
    401     my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
     407    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    402408    if (scalar(@$pluginfo) == 0) {
    403409    &gsprintf($out, "{import.no_plugins_loaded}\n");
     
    424430
    425431    # read the archive information file
    426 ##  my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
    427432
    428433    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
     
    509514    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    510515    }
    511     if ($manifest eq "") {
    512     # process the import directory
    513     my $block_hash = {};
    514     my $metadata = {};
    515     # gobal blocking pass may set up some metadata
    516     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     516
     517    # process the import directory
     518    my $block_hash = {};
     519    $block_hash->{'new_files'} = {};
     520    $block_hash->{'reindex_files'} = {};
     521    my $metadata = {};
     522   
     523    # global blocking pass may set up some metadata
     524    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     525   
     526    if ($manifest ne "") {
     527    #
     528    # 1. Process delete files first
     529    #
     530
     531    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
     532    my @full_deleted_files = ();
     533
     534    # ensure all filenames are absolute
     535    foreach my $df (@deleted_files) {
     536        #print STDERR "**delete file $df\n";
     537        my $full_df =
     538        (&util::filename_is_absolute($df))
     539        ? $df
     540        : &util::filename_cat($importdir,$df);
     541
     542        if (-d $full_df) {
     543        &add_dir_contents_to_list($full_df, \@full_deleted_files);
     544        } else {
     545        push(@full_deleted_files,$full_df);
     546        }
     547    }
     548   
     549    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
     550    mark_docs_for_deletion($archive_info,{},
     551                   \@full_deleted_files,
     552                   $archivedir, $verbosity, "delete");
     553
     554
     555    #
     556    # 2. Now files for reindexing
     557    #
     558
     559    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
     560    my @full_reindex_files = ();
     561
     562    # ensure all filenames are absolute
     563    foreach my $rf (@reindex_files) {       
     564        my $full_rf =
     565        (&util::filename_is_absolute($rf))
     566        ? $rf
     567        : &util::filename_cat($importdir,$rf);
     568
     569        if (-d $full_rf) {
     570        &add_dir_contents_to_list($full_rf, \@full_reindex_files);
     571        } else {
     572        push(@full_reindex_files,$full_rf);
     573        }
     574    }
     575   
     576    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
     577    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
     578
     579    # And now, to ensure the new version of the file is processed by the
     580    # appropriate plugin, we need to add it to the block_hash reindex list
     581    foreach my $full_rf (@full_reindex_files) {
     582        #print STDERR "***reindex file $full_rf\n";
     583        $block_hash->{'reindex_files'}->{$full_rf} = 1;
     584    }
     585
     586
     587    #
     588    # 3. Now finally any new files - add to block_hash new_files list
     589    #
     590
     591    my @new_files = keys %{$manifest_lookup->{'index'}};
     592    my @full_new_files = ();
     593
     594    foreach my $nf (@new_files) {
     595        #print STDERR "***index file $nf\n";
     596        # ensure filename is absolute
     597        my $full_nf =
     598        (&util::filename_is_absolute($nf))
     599        ? $nf
     600        : &util::filename_cat($importdir,$nf);
     601
     602        if (-d $full_nf) {
     603        &add_dir_contents_to_list($full_nf, \@full_new_files);
     604        } else {
     605        push(@full_new_files,$full_nf);
     606        }
     607    }
     608
     609    foreach my $f (@full_new_files) {
     610        $block_hash->{'new_files'}->{$f} = 1;
     611    }
     612    }
     613    else {
     614    # if incremental, we read through the import folder to see what's changed.
    517615
    518616    if ($incremental || $incremental_mode eq "onlyadd") {
    519 
    520617        prime_doc_oid_count($archivedir);
    521618
     
    571668        }
    572669               
    573         }
    574        
    575         # Play it safe, and run through the entire folder, only processing new or edited files
    576        
    577             if ((defined $jobs) && ($jobs > 1))
    578             {
    579         # if jobs are set to >1, run in parallel using MPI helper
    580         # [hs, 1 july 2010]
    581         &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
    582                               $self->{'collection'}, $self->{'site'});
    583             }
    584             else
    585             {
    586            &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    587             }
    588     }
    589     else {
    590             if ((defined $jobs) && ($jobs > 1))
    591             {
    592         # if jobs are set to >1, run in parallel using MPI helper
    593         # [hs, 1 july 2010]
    594         &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
    595                               $self->{'collection'}, $self->{'site'});
    596             }
    597             else
    598             {
    599         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    600             }   
    601     }
    602 
     670        }       
     671    }
     672    }
     673   
     674    # now, whichever mode we are in, we can process the entire import folder
     675    if ((defined $jobs) && ($jobs > 1))
     676    {
     677    # if jobs are set to >1, run in parallel using MPI helper
     678    # [hs, 1 july 2010]
     679    &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
     680                          $self->{'collection'}, $self->{'site'});
    603681    }
    604682    else
    605683    {
    606     #
    607     # 1. Process delete files first
    608     #
    609 
    610     my @deleted_files = keys %{$manifest_lookup->{'delete'}};
    611     my @full_deleted_files = ();
    612 
    613     # ensure all filenames are absolute
    614     foreach my $df (@deleted_files) {       
    615         my $full_df =
    616         (&util::filename_is_absolute($df))
    617         ? $df
    618         : &util::filename_cat($importdir,$df);
    619 
    620         push(@full_deleted_files,$full_df);
    621     }
    622    
    623     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
    624     mark_docs_for_deletion($archive_info,{},
    625                    \@full_deleted_files,
    626                    $archivedir, $verbosity, "delete");
    627 
    628 
    629     #
    630     # 2. Now files for reindexing
    631     #
    632 
    633     my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
    634     my @full_reindex_files = ();
    635 
    636     # ensure all filenames are absolute
    637     foreach my $rf (@reindex_files) {       
    638         my $full_rf =
    639         (&util::filename_is_absolute($rf))
    640         ? $rf
    641         : &util::filename_cat($importdir,$rf);
    642 
    643         push(@full_reindex_files,$full_rf);
    644     }
    645    
    646     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
    647     mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
    648 
    649     # And now ensure the new version of the file processed by appropriate
    650     # plugin
    651     foreach my $full_rf (@full_reindex_files) {
    652         &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
    653     }
    654 
    655 
    656     #
    657     # 3. Now finally any new files
    658     #
    659 
    660     foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    661         my $block_hash = {};
    662         my $metadata = {};
    663         &plugin::file_block_read($pluginfo, $importdir, $file, $block_hash, $metadata, $gli);
    664         &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    665     }
    666 
    667 
    668     }
    669 
     684    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     685    }
     686   
     687   
    670688    if ($saveas eq "FedoraMETS") {
    671689    # create collection "doc obj" for Fedora that contains
     
    11231141}
    11241142
     1143sub add_dir_contents_to_list {
     1144
     1145    my ($dirname, $list) = @_;
     1146 
     1147    # Recur over directory contents.
     1148    my (@dir, $subfile);
     1149   
     1150    # find all the files in the directory
     1151    if (!opendir (DIR, $dirname)) {
     1152    print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
     1153    return -1; # error in processing
     1154    }
     1155    @dir = readdir (DIR);
     1156    closedir (DIR);
     1157   
     1158    for (my $i = 0; $i < scalar(@dir); $i++) {
     1159    my $subfile = $dir[$i];
     1160    next if ($subfile =~ m/^\.\.?$/);
     1161    my $full_file = &util::filename_cat($dirname, $subfile);
     1162    if (-d $full_file) {
     1163        &add_dir_contents_to_list($full_file, $list);
     1164    } else {
     1165        push (@$list, $full_file);
     1166    }
     1167    }
     1168   
     1169}
    11251170
    11261171
Note: See TracChangeset for help on using the changeset viewer.