Changeset 31900

Show
Ignore:
Timestamp:
17.08.2017 20:29:16 (3 months ago)
Author:
ak19
Message:

When the oai-inf.db for a collection is first created, oaiinfo.pm now writes out a new special record 'earliesttimestamp' containing the timestamp of the moment at which the oai-inf.db was created. If the oai-inf.db already exists but does not yet contain this record, as with our demo collection on svn, the earliesttimestamp record is added with its timestamp set to the earliest lastmodified date found in the oai-inf.db.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/oaiinfo.pm

    r31723 r31900  
    66use constant INFO_DATESTAMP_INDEX => 2; 
    77 
     8my $OID_EARLIEST_TIMESTAMP = "earliesttimestamp"; 
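     9  # Declared as a lexical (my $OID_EARLIEST_TIMESTAMP) rather than with 'use constant', because a constant 
     10  # is awkward to use as a hash key: a bare NAME inside {} is auto-quoted as the string 'NAME' unless it is written as NAME() or concatenated with an empty string. 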
     11  # http://perldoc.perl.org/constant.html 
     12  # But beware of using perl 'constant' as hash key: 
     13  # https://stackoverflow.com/questions/96848/is-there-any-way-to-use-a-constant-as-hash-key-in-perl 
     14  # http://forums.devshed.com/perl-programming-6/massive-using-constants-hash-keys-603600.html 
     15  # https://perlmaven.com/constants-and-read-only-variables-in-perl 
     16  # http://neilb.org/reviews/constants.html - compares different ways to declare constants in perl 
     17 
    818use strict; 
    919 
     
    1626 
    1727# File format read in: OID <tab> (Deletion-)Status <tab> Timestamp <tab> Datestamp 
     28 
     29# A special record of the db contains the timestamp of the creation of the oai-inf.db for 
     30# the collection, representing the collection's earliest datetimestamp. 
     31# This record has $OID_EARLIEST_TIMESTAMP for OID. 
     32# Its deletion status is maintained at NA, not applicable. 
     33# For older oai-inf.db files that do not yet contain an earliesttimestamp record, the record 
     34# is still created, but with its timestamp set to the oldest lastmodified date in oai-inf.db. 
    1835 
    1936# Deletion status can be: 
     
    2138#  D = Doc with OID has been deleted. Timestamp indicates time of deletion 
    2239#  PD = Provisionally Deleted. The associated timestamps are momentarily unaltered. 
     40#  NA = Not Applicable. Only for the special record with $OID_EARLIEST_TIMESTAMP as OID. 
    2341 
    2442# oaidb is "always incremental": always reflects the I/B/R/D status of archive info db, 
     
    187205 
    188206#    print STDERR "@@@@@ oaidb: $self->{'oaidb_file_path'}\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
    189      
    190     return $do_pd_step; 
     207 
     208    return ($do_pd_step, $initdb); 
    191209} 
    192210 
     
    200218    my ($removeold, $have_manifest) = @_; 
    201219     
    202     my $do_pd_step = $self->init_tmpdb($removeold, $have_manifest); 
    203       # returns 1 if the step to mark oaidb entries as PD is required 
     220    my ($do_pd_step, $is_new_db) = $self->init_tmpdb($removeold, $have_manifest); 
     221      # returns 1 for $do_pd_step if the step to mark oaidb entries as PD is required 
    204222      # if we're doing full rebuilding and it's NOT the first time creating the oai_inf db,  
    205223      # then the tasks to do with PD (provisionally deleted) OAI OIDs should be carried out 
     224      # Returns 1 for is_new_db to allow further one time initialisation of the new oai-inf.db 
    206225 
    207226    $self->load_info(); 
    208227    $self->print_info(); # DEBUGGING 
    209228 
     229    # A special record of the oai-inf.db will contain the timestamp when the oai-inf.db was created. 
     230    # This represents the collection's "earliest datetimestamp". It should remain unaltered 
     231    # for as long as oai-inf db exists. This record has the special OID of $OID_EARLIEST_TIMESTAMP. 
     232    # This record should not be marked as PD, but keep its status of NA (not applicable), as it can't ever be deleted. 
     233    # The status field for the $OID_EARLIEST_TIMESTAMP record is otherwise meaningless.     
     234    my $save_to_db = $self->insert_coll_earliest_timestamp($is_new_db);     
     235     
    210236    if ($do_pd_step) { 
    211237    $self->mark_all_existing_as_provisionallydeleted(); 
    212238    $self->print_info(); # DEBUGGING 
    213  
    214     # save to db file now that we're done 
    215     $self->save_info(); 
     239     
     240    $save_to_db = 1;     
     241    } 
     242 
     243    if($save_to_db) { 
     244    # save changes to $self->{'info'} out to db file, now that we're done 
     245    $self->save_info();  
    216246    } 
    217247 
     
    238268    # all the while ensuring all PDs are changed back to E for OIDs that exist in both arcinfo and oaiinfo db.   
    239269 
    240     my $arcinfo_map = $archive_info->{'info'}; 
     270    my $arcinfo_map = $archive_info->{'info'}; 
    241271 
    242272    foreach my $OID (keys %$arcinfo_map) { 
     
    259289        # an oaicollection. But what if we always maintain an oaidb? Still call $self->index() here. 
    260290    } else { 
    261         if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) { 
    262             print STDERR "### oaiinfo::building_stage_before_indexing(): Unrecognised indexing status $indexing_status\n"; 
    263         } 
     291        if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) { 
     292        print STDERR "### oaiinfo::building_stage_before_indexing(): Unrecognised indexing status $indexing_status\n"; 
     293        } 
    264294    } 
    265295    } 
     
    297327     
    298328    } else { 
    299         if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) { 
    300             print STDERR "@@@@@ In oaiinfo::activate_collection():\n"; 
    301             print STDERR "@@@@@   No tmpdb at $self->{'oaidb_tmp_filepath'}\n"; 
    302             print STDERR "@@@@@   to make 'live' by moving to $self->{'oaidb_live_filepath'}.\n"; 
    303         } 
     329    if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) { 
     330        print STDERR "@@@@@ In oaiinfo::activate_collection():\n"; 
     331        print STDERR "@@@@@   No tmpdb at $self->{'oaidb_tmp_filepath'}\n"; 
     332        print STDERR "@@@@@   to make 'live' by moving to $self->{'oaidb_live_filepath'}.\n"; 
     333    } 
    304334    } 
    305335} 
     
    316346    print STDERR "@@@@@ oaiinfo::mark_all_E_as_PD(): Marking the E entries as PD\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
    317347 
    318     my $infomap = $self->{'info'}; 
     348    my $infomap = $self->{'info'}; 
    319349 
    320350    foreach my $OID (keys %$infomap) { # Mac Mountain Lion wants %$map, won't accept %$self->{'info'} 
     
    334364    print STDERR "@@@@@ oaiinfo::mark_all_PD_as_D(): Marking the PD entries as D\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
    335365 
    336     my $infomap = $self->{'info'}; 
     366    my $infomap = $self->{'info'}; 
    337367 
    338368    foreach my $OID (keys %$infomap) { 
     
    351381    my $self = shift (@_); 
    352382 
    353     if ($self->{'verbosity'} < $self->{'verbosity_threshold'}) { 
    354         return; 
    355     }    
     383    if ($self->{'verbosity'} < $self->{'verbosity_threshold'}) { 
     384    return; 
     385    } 
    356386     
    357387    print STDERR "###########################################################\n"; 
    358388    print STDERR "@@@@@ oaiinfo::print_info(): oaidb in memory contains: \n"; 
    359389     
    360     my $infomap = $self->{'info'}; 
     390    my $infomap = $self->{'info'}; 
    361391 
    362392    foreach my $OID (keys %$infomap) { 
     
    369399 
    370400    print STDERR "###########################################################\n"; 
     401} 
     402 
     403 
     404# When a fresh oai-inf.db is created, this method is called to add the db's special 
     405# record representing the collection's earliest timestamp. 
     406# OID=$OID_EARLIEST_TIMESTAMP, deletion_status=NA for not applicable, and current timestamp/date. 
     407# For older oai-inf.db's that don't yet have this record, a record will be added too, 
     408# but with the timestamp set to the oldest last modified date for the collection's docs. 
     409sub insert_coll_earliest_timestamp { 
     410    my $self = shift (@_); 
     411    my ($is_new_db) = @_; 
     412 
     413    my $current_time = $self->get_current_time(); 
     414    my $save_to_db = 0; 
     415 
     416     
     417    print STDERR "@@@@@ oaiinfo::insert_coll_earliest_timestamp(): " if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
     418     
     419    if($is_new_db) { 
     420     
     421    print STDERR "New db. Setting timestamp of oai-inf.db creation.\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
     422     
     423    $self->set_info($OID_EARLIEST_TIMESTAMP, "NA", $current_time); 
     424    $save_to_db = 1; 
     425    } 
     426     
     427    else { # oai-inf.db already exists, ensure it has an [$OID_EARLIEST_TIMESTAMP] set 
     428 
     429    my $earliesttimestamp_record = $self->{'info'}->{$OID_EARLIEST_TIMESTAMP}; 
     430     
     431    if (!defined $earliesttimestamp_record) { 
     432        # oai-inf.db exists, but doesn't contain an [$OID_EARLIEST_TIMESTAMP] record yet. 
     433        # Let's create one for it: 
     434        # Work out the earliest lastmodified datetime in the collection, by inspecting 
     435        # the last modified timestamp for each doc in the collection 
     436         
     437        my $earliest_timestamp = $current_time; 
     438         
     439        my $infomap = $self->{'info'}; # Mac Mountain Lion wants %$map, won't accept %$self->{'info'}        
     440        foreach my $OID (keys %$infomap) { 
     441        my $OID_info = $self->{'info'}->{$OID}; 
     442        my $lastmodified = $OID_info->[INFO_TIMESTAMP_INDEX]; 
     443        if($lastmodified < $earliest_timestamp) { 
     444            $earliest_timestamp = $lastmodified; 
     445        } 
     446        } 
     447         
     448        print STDERR "Collection timestamp not yet set for $OID_EARLIEST_TIMESTAMP. Setting to earliest found: $earliest_timestamp\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
     449         
     450        $self->set_info($OID_EARLIEST_TIMESTAMP, "NA", $earliest_timestamp); 
     451        $save_to_db = 1; 
     452    } else { 
     453        print STDERR "Collection timestamp was already set\n" if $self->{'verbosity'} >= $self->{'verbosity_threshold'}; 
     454    } 
     455    } 
     456 
     457    return $save_to_db; 
    371458} 
    372459 
     
    422509 
    423510    # the following method will set to current time if no timestamp provided, 
    424     # But by explicit here, the code is easier to follow 
     511    # But by being explicit here, the code is easier to follow 
    425512    $self->set_info($OID, "D", $self->get_current_time()); 
    426513 
     
    540627 
    541628    # write out again. Open file for overwriting, not appending. 
    542     # Then write out data structure $self->{'info'} that's been maintaining the data in-memory.  
     629    # Then write out data structure $self->{'info'} that has been maintaining the data in-memory.  
    543630    my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename); 
    544631 
    545     my $infomap = $self->{'info'}; 
     632    my $infomap = $self->{'info'}; 
    546633    foreach my $oid ( keys %$infomap ) { 
    547634    my $OID_info = $self->{'info'}->{$oid}; 
     
    599686    my @list = (); 
    600687     
    601     my $infomap = $self->{'info'}; 
     688    my $infomap = $self->{'info'}; 
    602689    foreach my $OID (keys %$infomap) {   
    603690    my $OID_info = $self->{'info'}->{$OID}; 
     
    617704sub size { 
    618705    my $self = shift (@_); 
    619     my $infomap = $self->{'info'}; 
     706    my $infomap = $self->{'info'}; 
    620707    return (scalar keys %$infomap); 
    621708}