Changeset 21306

Show
Ignore:
Timestamp:
09.12.2009 13:18:41 (10 years ago)
Author:
kjdon
Message:

The mark_docs_for_reindex code has moved into new_vs_old_import_diff, which now generates the new_files, deleted_files and reindex_files lists. Both deleted_files and reindex_files need to be marked for deletion, which is now done by import.pl. Separate mark_docs_for_deletion/mark_docs_for_reindex routines are no longer needed: mark_docs_for_deletion now takes a mode argument ('delete' or 'reindex'). Also removed a method that is no longer used (is_assoc_file).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r20788 r21306  
    3333use GDBMUtils; 
    3434 
     35sub src_db_file { 
     36    my ($archivedir) = @_; 
     37    return &util::filename_cat ($archivedir, "archiveinf-src.gdb"); 
     38} 
     39 
     40sub doc_db_file { 
     41    my ($archivedir) = @_; 
     42    return &util::filename_cat ($archivedir, "archiveinf-doc.gdb"); 
     43} 
     44 
     45sub oid_count_file { 
     46    my ($archivedir) = @_; 
     47    return &util::filename_cat ($archivedir, "OIDcount"); 
     48} 
     49 
    3550 
    3651sub prime_doc_oid_count 
    3752{ 
    3853    my ($archivedir) = @_; 
    39     my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount"); 
     54    my $oid_count_filename = &oid_count_file($archivedir); 
    4055 
    4156    if (-e $oid_count_filename) { 
     
    6277 
    6378    my ($archivedir) = @_; 
    64     my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount"); 
     79    my $oid_count_filename = &oid_count_file($archivedir); 
    6580 
    6681 
     
    8297 
    8398    # in this method, we want to know if metadata files are modified or not. 
    84     my $doc_db = "archiveinf-doc.gdb"; 
    85     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db); 
     99    my $arcinfo_doc_filename = &doc_db_file($archivedir); 
    86100 
    87101    my $archiveinf_timestamp = -M $arcinfo_doc_filename; 
     
    174188 
    175189 
     190 
     191 
    176192    # Deal with complication of new or modified metadata files by forcing 
    177193    # everything from this point down in the file hierarchy to 
     
    199215        push(@$reindex_files,$existing_f); 
    200216        $block_hash->{'reindex_files'}->{$existing_f} = 1; 
    201  
    202         } 
    203     } 
    204  
    205     # Reindexing is accomplished by putting them in the list for reindexing (line above) 
    206     # and then tagging the arcinfo version as to be deleted. 
    207  
    208     _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex"); 
     217        delete $block_hash->{'existing_files'}->{$existing_f}; 
     218 
     219        } 
     220    } 
    209221     
    210222    # metadata file needs to be in new_files list so parsed by MetadataXMLPlug 
    211223    # (or equivalent) 
    212224    $block_hash->{'new_files'}->{$new_mdf} = 1;  
     225 
     226    } 
     227 
     228    # go through remaining existing files and work out what has changed and needs to be reindexed. 
     229    my @existing_files = sort keys %{$block_hash->{'existing_files'}}; 
     230 
     231    my $reindex_files = []; 
     232 
     233    foreach my $existing_filename (@existing_files) { 
     234    if (-M $existing_filename < $archiveinf_timestamp) { 
     235        # file is newer than last build 
     236         
     237        my $existing_file = $existing_filename; 
     238        #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}); 
     239 
     240        #my $collectdir_resafe = &util::filename_to_regex($collectdir); 
     241        #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//; 
     242         
     243        print STDERR "**** Reindexing existing file: $existing_file\n"; 
     244 
     245        push(@$reindex_files,$existing_file); 
     246        $block_hash->{'reindex_files'}->{$existing_filename} = 1; 
     247    } 
    213248 
    214249    } 
     
    249284} 
    250285 
    251 # not used anymore 
    252 sub is_assoc_file 
     286 
     287# this is used to delete "deleted" docs, and to remove old versions of "changed" docs 
     288# $mode is 'delete' or 'reindex' 
     289sub mark_docs_for_deletion 
    253290{ 
    254     my ($file,$doc_rec) = @_; 
    255  
    256     my ($file_root,$dirname,$suffix) = fileparse($file, "\\.[^\\.]+\$"); 
    257  
    258     foreach my $af (@{$doc_rec->{'assoc-file'}}) { 
    259     my $full_af = &util::filename_cat($dirname,$af); 
    260  
    261     return 1 if ($full_af eq $file); 
    262     } 
    263  
    264     return 0; 
    265 } 
    266  
    267  
    268  
    269 # this is used to delete "deleted" docs, and to remove old versions of "changed" docs 
    270 sub _mark_docs_for_deletion 
    271 { 
    272     my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode_text) = @_; 
    273  
    274     my $doc_db = "archiveinf-doc.gdb"; 
    275     my $src_db = "archiveinf-src.gdb"; 
    276     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db); 
    277     my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db); 
     291    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_; 
     292 
     293    my $mode_text = "deleted from index"; 
     294    if ($mode eq "reindex") { 
     295    $mode_text = "reindexed"; 
     296    } 
     297    my $arcinfo_doc_filename = &doc_db_file($archivedir); 
     298    my $arcinfo_src_filename = &src_db_file($archivedir); 
    278299 
    279300 
     
    321342} 
    322343 
    323 sub mark_docs_for_deletion 
    324 { 
    325     _mark_docs_for_deletion(@_,"deleted from index"); 
    326 } 
    327  
    328  
    329 sub mark_docs_for_reindex 
    330 { 
    331     my ($archive_info,$block_hash,$archivedir,$verbosity) = @_; 
    332  
    333     # Reindexing is accomplished by deleting the previously indexed 
    334     # version of the document, and then allowing the new version to 
    335     # be indexed (as would a new document be indexed).  
    336     #  
    337     # The first step (marking for deletion) is implemented by this routine. 
    338     #  
    339     # By default in Greenstone a new version of an index will hash to 
    340     # a new unique OID, and the above strategy of reindex=delete+add 
    341     # works fine.  A special case arises when a persistent OID is  
    342     # allocated to a document (for instance through a metadata field), 
    343     # and the second step to reindexing (see XXXX) detects this and 
    344     # deals with it appropriately. 
    345  
    346     my @existing_files = sort keys %{$block_hash->{'existing_files'}}; 
    347  
    348     my $doc_db = "archiveinf-doc.gdb"; 
    349     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db); 
    350  
    351     my $archiveinf_timestamp = -M $arcinfo_doc_filename; 
    352  
    353     my $reindex_files = []; 
    354  
    355     foreach my $existing_filename (@existing_files) { 
    356     if (-M $existing_filename < $archiveinf_timestamp) { 
    357         # file is newer than last build 
    358          
    359         my $existing_file = $existing_filename; 
    360         #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}); 
    361  
    362         #my $collectdir_resafe = &util::filename_to_regex($collectdir); 
    363         #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//; 
    364          
    365         print STDERR "**** Reindexing existing file: $existing_file\n"; 
    366  
    367         push(@$reindex_files,$existing_file); 
    368         $block_hash->{'reindex_files'}->{$existing_filename} = 1; 
    369     } 
    370  
    371     } 
    372      
    373     _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex"); 
    374  
    375 } 
    376  
    377344 
    378345