Changeset 21306


Ignore:
Timestamp:
2009-12-09T13:18:41+13:00 (14 years ago)
Author:
kjdon
Message:

The mark_docs_for_reindex code has moved into new_vs_old_import_diff, so this now generates the new_files, deleted_files and reindex_files lists. Both deleted_files and reindex_files need to be marked for deletion, which is now done by import.pl. Separate mark_docs_for_deletion/mark_docs_for_reindex subs are no longer needed; just pass in a mode arg ('delete' or 'reindex') to mark_docs_for_deletion. Removed a method that is no longer used (is_assoc_file).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r20788 r21306  
    3333use GDBMUtils;
    3434
     35sub src_db_file {
     36    my ($archivedir) = @_;
     37    return &util::filename_cat ($archivedir, "archiveinf-src.gdb");
     38}
     39
     40sub doc_db_file {
     41    my ($archivedir) = @_;
     42    return &util::filename_cat ($archivedir, "archiveinf-doc.gdb");
     43}
     44
     45sub oid_count_file {
     46    my ($archivedir) = @_;
     47    return &util::filename_cat ($archivedir, "OIDcount");
     48}
     49
    3550
    3651sub prime_doc_oid_count
    3752{
    3853    my ($archivedir) = @_;
    39     my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
     54    my $oid_count_filename = &oid_count_file($archivedir);
    4055
    4156    if (-e $oid_count_filename) {
     
    6277
    6378    my ($archivedir) = @_;
    64     my $oid_count_filename = &util::filename_cat ($archivedir, "OIDcount");
     79    my $oid_count_filename = &oid_count_file($archivedir);
    6580
    6681
     
    8297
    8398    # in this method, we want to know if metadata files are modified or not.
    84     my $doc_db = "archiveinf-doc.gdb";
    85     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
     99    my $arcinfo_doc_filename = &doc_db_file($archivedir);
    86100
    87101    my $archiveinf_timestamp = -M $arcinfo_doc_filename;
     
    174188
    175189
     190
     191
    176192    # Deal with complication of new or modified metadata files by forcing
    177193    # everything from this point down in the file hierarchy to
     
    199215        push(@$reindex_files,$existing_f);
    200216        $block_hash->{'reindex_files'}->{$existing_f} = 1;
    201 
    202         }
    203     }
    204 
    205     # Reindexing is accomplished by putting them in the list for reindexing (line above)
    206     # and then tagging the arcinfo version as to be deleted.
    207 
    208     _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex");
     217        delete $block_hash->{'existing_files'}->{$existing_f};
     218
     219        }
     220    }
    209221   
    210222    # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
    211223    # (or equivalent)
    212224    $block_hash->{'new_files'}->{$new_mdf} = 1;
     225
     226    }
     227
     228    # go through remaining existing files and work out what has changed and needs to be reindexed.
     229    my @existing_files = sort keys %{$block_hash->{'existing_files'}};
     230
     231    my $reindex_files = [];
     232
     233    foreach my $existing_filename (@existing_files) {
     234    if (-M $existing_filename < $archiveinf_timestamp) {
     235        # file is newer than last build
     236       
     237        my $existing_file = $existing_filename;
     238        #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
     239
     240        #my $collectdir_resafe = &util::filename_to_regex($collectdir);
     241        #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
     242       
     243        print STDERR "**** Reindexing existing file: $existing_file\n";
     244
     245        push(@$reindex_files,$existing_file);
     246        $block_hash->{'reindex_files'}->{$existing_filename} = 1;
     247    }
    213248
    214249    }
     
    249284}
    250285
    251 # not used anymore
    252 sub is_assoc_file
     286
     287# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
     288# $mode is 'delete' or 'reindex'
     289sub mark_docs_for_deletion
    253290{
    254     my ($file,$doc_rec) = @_;
    255 
    256     my ($file_root,$dirname,$suffix) = fileparse($file, "\\.[^\\.]+\$");
    257 
    258     foreach my $af (@{$doc_rec->{'assoc-file'}}) {
    259     my $full_af = &util::filename_cat($dirname,$af);
    260 
    261     return 1 if ($full_af eq $file);
    262     }
    263 
    264     return 0;
    265 }
    266 
    267 
    268 
    269 # this is used to delete "deleted" docs, and to remove old versions of "changed" docs
    270 sub _mark_docs_for_deletion
    271 {
    272     my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode_text) = @_;
    273 
    274     my $doc_db = "archiveinf-doc.gdb";
    275     my $src_db = "archiveinf-src.gdb";
    276     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
    277     my $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
     291    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
     292
     293    my $mode_text = "deleted from index";
     294    if ($mode eq "reindex") {
     295    $mode_text = "reindexed";
     296    }
     297    my $arcinfo_doc_filename = &doc_db_file($archivedir);
     298    my $arcinfo_src_filename = &src_db_file($archivedir);
    278299
    279300
     
    321342}
    322343
    323 sub mark_docs_for_deletion
    324 {
    325     _mark_docs_for_deletion(@_,"deleted from index");
    326 }
    327 
    328 
    329 sub mark_docs_for_reindex
    330 {
    331     my ($archive_info,$block_hash,$archivedir,$verbosity) = @_;
    332 
    333     # Reindexing is accomplished by deleting the previously indexed
    334     # version of the document, and then allowing the new version to
    335     # be indexed (as would a new document be indexed).
    336     #
    337     # The first step (marking for deletion) is implemented by this routine.
    338     #
    339     # By default in Greenstone a new version of an index will hash to
    340     # a new unique OID, and the above strategy of reindex=delete+add
    341     # works fine.  A special case arises when a persistent OID is
    342     # allocated to a document (for instance through a metadata field),
    343     # and the second step to reindexing (see XXXX) detects this and
    344     # deals with it appropriately.
    345 
    346     my @existing_files = sort keys %{$block_hash->{'existing_files'}};
    347 
    348     my $doc_db = "archiveinf-doc.gdb";
    349     my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
    350 
    351     my $archiveinf_timestamp = -M $arcinfo_doc_filename;
    352 
    353     my $reindex_files = [];
    354 
    355     foreach my $existing_filename (@existing_files) {
    356     if (-M $existing_filename < $archiveinf_timestamp) {
    357         # file is newer than last build
    358        
    359         my $existing_file = $existing_filename;
    360         #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
    361 
    362         #my $collectdir_resafe = &util::filename_to_regex($collectdir);
    363         #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
    364        
    365         print STDERR "**** Reindexing existing file: $existing_file\n";
    366 
    367         push(@$reindex_files,$existing_file);
    368         $block_hash->{'reindex_files'}->{$existing_filename} = 1;
    369     }
    370 
    371     }
    372    
    373     _mark_docs_for_deletion($archive_info,$block_hash,$reindex_files,$archivedir,$verbosity, "reindex");
    374 
    375 }
    376 
    377344
    378345
Note: See TracChangeset for help on using the changeset viewer.