Changeset 23053

Show
Ignore:
Timestamp:
06.10.2010 15:39:33 (9 years ago)
Author:
kjdon
Message:

Reworked the manifest handling. Instead of calling plugin::read on each file in the reindex and new-file lists, we now add all the filenames to the standard block_hash. This is equivalent to what the incremental build code does, except that instead of inspecting the import folder we use only what is listed in the manifest file. If a directory is listed, we look through that directory (including its subdirectories) and add each file in it to the appropriate list. Once block_hash is set up, we tell the plugins that we are running incrementally and process the import folder as usual. Only those documents in the new_files and reindex_files lists of block_hash will be processed, but this approach still performs a full metadata_read, which we need in case metadata for the changed or new files is specified at a higher level of the folder hierarchy.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r23042 r23053  
    375375     
    376376    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir); 
    377     if ($self->{'manifest'} ne "") {     
     377    if ($self->{'manifest'} ne "") { 
     378    print STDERR "parsing manifest\n"; 
    378379    my $manifest_filename = $self->{'manifest'}; 
    379380 
     
    396397    } 
    397398 
     399    my $plugin_incr_mode = $incremental_mode; 
     400    if ($manifest ne "") { 
     401    # if we have a manifest file, then we pretend we are fully incremental for plugins 
     402    $plugin_incr_mode = "all"; 
     403    } 
    398404    #some global options for the plugins 
    399405    my @global_opts = (); 
    400406 
    401     my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode); 
     407    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode); 
    402408    if (scalar(@$pluginfo) == 0) { 
    403409    &gsprintf($out, "{import.no_plugins_loaded}\n"); 
     
    424430 
    425431    # read the archive information file 
    426 ##  my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf"); 
    427432 
    428433    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes) 
     
    509514    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli); 
    510515    } 
    511     if ($manifest eq "") { 
    512     # process the import directory 
    513     my $block_hash = {}; 
    514     my $metadata = {}; 
    515     # gobal blocking pass may set up some metadata 
    516     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 
     516 
     517    # process the import directory 
     518    my $block_hash = {}; 
     519    $block_hash->{'new_files'} = {}; 
     520    $block_hash->{'reindex_files'} = {}; 
     521    my $metadata = {}; 
     522     
     523    # gobal blocking pass may set up some metadata 
     524    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 
     525     
     526    if ($manifest ne "") { 
     527    #  
     528    # 1. Process delete files first 
     529    #  
     530 
     531    my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 
     532    my @full_deleted_files = (); 
     533 
     534    # ensure all filenames are absolute 
     535    foreach my $df (@deleted_files) { 
     536        #print STDERR "**delete file $df\n"; 
     537        my $full_df = 
     538        (&util::filename_is_absolute($df))  
     539        ? $df 
     540        : &util::filename_cat($importdir,$df); 
     541 
     542        if (-d $full_df) { 
     543        &add_dir_contents_to_list($full_df, \@full_deleted_files); 
     544        } else { 
     545        push(@full_deleted_files,$full_df); 
     546        } 
     547    } 
     548     
     549    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files); 
     550    mark_docs_for_deletion($archive_info,{}, 
     551                   \@full_deleted_files, 
     552                   $archivedir, $verbosity, "delete"); 
     553 
     554 
     555    #  
     556    # 2. Now files for reindexing 
     557    #  
     558 
     559    my @reindex_files = keys %{$manifest_lookup->{'reindex'}}; 
     560    my @full_reindex_files = (); 
     561 
     562    # ensure all filenames are absolute 
     563    foreach my $rf (@reindex_files) {        
     564        my $full_rf = 
     565        (&util::filename_is_absolute($rf))  
     566        ? $rf 
     567        : &util::filename_cat($importdir,$rf); 
     568 
     569        if (-d $full_rf) { 
     570        &add_dir_contents_to_list($full_rf, \@full_reindex_files); 
     571        } else { 
     572        push(@full_reindex_files,$full_rf); 
     573        } 
     574    } 
     575     
     576    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files); 
     577    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex"); 
     578 
     579    # And now to ensure the new version of the file processed by  
     580    # appropriate plugin, we need to add it to block_hash reindex list 
     581    foreach my $full_rf (@full_reindex_files) { 
     582        #print STDERR "***reindex file $full_rf\n"; 
     583        $block_hash->{'reindex_files'}->{$full_rf} = 1; 
     584    } 
     585 
     586 
     587    #  
     588    # 3. Now finally any new files - add to block_hash new_files list 
     589    #  
     590 
     591    my @new_files = keys %{$manifest_lookup->{'index'}}; 
     592    my @full_new_files = (); 
     593 
     594    foreach my $nf (@new_files) { 
     595        #print STDERR "***index file $nf\n"; 
     596        # ensure filename is absolute 
     597        my $full_nf = 
     598        (&util::filename_is_absolute($nf))  
     599        ? $nf 
     600        : &util::filename_cat($importdir,$nf); 
     601 
     602        if (-d $full_nf) { 
     603        &add_dir_contents_to_list($full_nf, \@full_new_files); 
     604        } else { 
     605        push(@full_new_files,$full_nf); 
     606        } 
     607    } 
     608 
     609    foreach my $f (@full_new_files) { 
     610        $block_hash->{'new_files'}->{$f} = 1; 
     611    } 
     612    } 
     613    else { 
     614    # if incremental, we read through the import folder to see whats changed. 
    517615 
    518616    if ($incremental || $incremental_mode eq "onlyadd") { 
    519  
    520617        prime_doc_oid_count($archivedir); 
    521618 
     
    571668        } 
    572669                 
    573         } 
    574          
    575         # Play it safe, and run through the entire folder, only processing new or edited files 
    576          
    577             if ((defined $jobs) && ($jobs > 1)) 
    578             {  
    579         # if jobs are set to >1, run in parallel using MPI helper 
    580         # [hs, 1 july 2010] 
    581         &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,  
    582                               $self->{'collection'}, $self->{'site'});  
    583             } 
    584             else 
    585             { 
    586            &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    587             } 
    588     } 
    589     else { 
    590             if ((defined $jobs) && ($jobs > 1)) 
    591             {  
    592         # if jobs are set to >1, run in parallel using MPI helper 
    593         # [hs, 1 july 2010] 
    594         &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,  
    595                               $self->{'collection'}, $self->{'site'});  
    596             } 
    597             else 
    598             { 
    599         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    600             }    
    601     } 
    602  
     670        }        
     671    } 
     672    } 
     673     
     674    # now, whichever mode we are in, we can process the entire import folder 
     675    if ((defined $jobs) && ($jobs > 1)) 
     676    {  
     677    # if jobs are set to >1, run in parallel using MPI helper 
     678    # [hs, 1 july 2010] 
     679    &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,  
     680                          $self->{'collection'}, $self->{'site'});  
    603681    } 
    604682    else 
    605683    { 
    606     #  
    607     # 1. Process delete files first 
    608     #  
    609  
    610     my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 
    611     my @full_deleted_files = (); 
    612  
    613     # ensure all filenames are absolute 
    614     foreach my $df (@deleted_files) {        
    615         my $full_df = 
    616         (&util::filename_is_absolute($df))  
    617         ? $df 
    618         : &util::filename_cat($importdir,$df); 
    619  
    620         push(@full_deleted_files,$full_df); 
    621     } 
    622      
    623     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files); 
    624     mark_docs_for_deletion($archive_info,{}, 
    625                    \@full_deleted_files, 
    626                    $archivedir, $verbosity, "delete"); 
    627  
    628  
    629     #  
    630     # 2. Now files for reindexing 
    631     #  
    632  
    633     my @reindex_files = keys %{$manifest_lookup->{'reindex'}}; 
    634     my @full_reindex_files = (); 
    635  
    636     # ensure all filenames are absolute 
    637     foreach my $rf (@reindex_files) {        
    638         my $full_rf = 
    639         (&util::filename_is_absolute($rf))  
    640         ? $rf 
    641         : &util::filename_cat($importdir,$rf); 
    642  
    643         push(@full_reindex_files,$full_rf); 
    644     } 
    645      
    646     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files); 
    647     mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex"); 
    648  
    649     # And now ensure the new version of the file processed by appropriate 
    650     # plugin 
    651     foreach my $full_rf (@full_reindex_files) { 
    652         &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli); 
    653     } 
    654  
    655  
    656     #  
    657     # 3. Now finally any new files 
    658     #  
    659  
    660     foreach my $file (keys %{$manifest_lookup->{'index'}}) { 
    661         my $block_hash = {}; 
    662         my $metadata = {}; 
    663         &plugin::file_block_read($pluginfo, $importdir, $file, $block_hash, $metadata, $gli); 
    664         &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    665     } 
    666  
    667  
    668     } 
    669  
     684    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
     685    } 
     686     
     687     
    670688    if ($saveas eq "FedoraMETS") { 
    671689    # create collection "doc obj" for Fedora that contains 
     
    11231141} 
    11241142 
     1143sub add_dir_contents_to_list { 
     1144 
     1145    my ($dirname, $list) = @_; 
     1146  
     1147    # Recur over directory contents. 
     1148    my (@dir, $subfile); 
     1149     
     1150    # find all the files in the directory 
     1151    if (!opendir (DIR, $dirname)) { 
     1152    print STDERR "inexport: WARNING - couldn't read directory $dirname\n"; 
     1153    return -1; # error in processing 
     1154    } 
     1155    @dir = readdir (DIR); 
     1156    closedir (DIR); 
     1157     
     1158    for (my $i = 0; $i < scalar(@dir); $i++) { 
     1159    my $subfile = $dir[$i]; 
     1160    next if ($subfile =~ m/^\.\.?$/); 
     1161    my $full_file = &util::filename_cat($dirname, $subfile); 
     1162    if (-d $full_file) { 
     1163        &add_dir_contents_to_list($full_file, $list); 
     1164    } else { 
     1165        push (@$list, $full_file); 
     1166    } 
     1167    } 
     1168     
     1169} 
    11251170 
    11261171