# This class is based on arcinfo.pm
package oaiinfo;

# Positions of the two fields inside each per-OID array stored in $self->{'info'}
use constant INFO_STATUS_INDEX    => 0;
use constant INFO_TIMESTAMP_INDEX => 1;

use strict;

use arcinfo;
use dbutil;

# QUESTIONS:
# Should we use time or localtime(time) for timestamp? Just timestamp.
# What format should the timestamp be in, or is the basic format used by perl sufficient? Basic.

# Text file format read in (tab-separated, in the order set_info() takes them):
#   OID  Deletion-Status  Date-timestamp
# Deletion status can be:
#   E  = Doc with OID exists (has not been deleted from collection).
#        Timestamp indicates last time of build
#   D  = Doc with OID has been deleted. Timestamp indicates time of deletion
#   PD = Provisionally Deleted. Timestamp momentarily unaltered.

# oaidb is "always incremental": always reflects the I/B/R/D status of archive info db,
# before the indexing step of the build phase that alters the I/B/R/D contents of archive info db.
# (I=index, B=been indexed, R=reindex; D=delete)

# Constructor.
#   $config_filename - path to a config file; its parent folder is used as the etc dir
#                      in which the oai-inf db files are located
#   $infodbtype      - optional infodb type; falls back to dbutil's default when undefined.
#                      "gdbm-txtgz" is treated as plain "gdbm".
# The object starts out pointing at the tmp db as the db file to work with.
sub new {
    my ($class, $config_filename, $infodbtype) = @_;

    unless (defined $infodbtype) {
        $infodbtype = &dbutil::get_default_infodb_type();
    }
    if ($infodbtype eq "gdbm-txtgz") {
        $infodbtype = "gdbm";
    }

    # Work out the db filenames we'll be working with (tmp and livedb)
    my $etc_dir = &util::get_parent_folder($config_filename);
    my $perform_firsttime_init = 0;
    my $live_db_path = &dbutil::get_infodb_file_path($infodbtype, "oai-inf", $etc_dir, $perform_firsttime_init);
    my $tmp_db_path  = &dbutil::get_infodb_file_path($infodbtype, "oai-inf-tmp", $etc_dir, $perform_firsttime_init);

    # print STDERR "############ LIVE DB: $live_db_path\n";
    # print STDERR "############ TMP DB: $tmp_db_path\n";

    my $self = {
        'info'                => {},            # map of {OID, array[deletion-status,timestamp]} pairs
        'infodbtype'          => $infodbtype,
        'oaidb_live_filepath' => $live_db_path,
        'oaidb_tmp_filepath'  => $tmp_db_path,
        'etc_dir'             => $etc_dir,
        'oaidb_file_path'     => $tmp_db_path,  # db file we're working with
    };

    return bless $self, $class;
}

# this subroutine will work out the starting contents of
# the tmp-db (temporary oai db):
# whether it should start off empty, or with the contents of any existing live-db,
# or with the contents of any existing tmp-db.
#
#   $removeold     - true for a proper full rebuild; false for keepold/incremental
#   $have_manifest - true if a manifest file is in use (forces incremental behaviour)
# Returns 1 if the caller must carry out the PD step (marking all E entries as PD), else 0.
# Side effects: may delete the tmpdb, may copy livedb over tmpdb, and sets
# $self->{'oaidb_file_path'} to the tmpdb path.
sub init_tmpdb {
    my $self = shift(@_);
    my ($removeold, $have_manifest) = @_;

    # if we have a manifest file, then we pretend we are fully incremental for oaiinfo db.
    # removeold implies proper full-rebuild, whereas keepold or incremental means incremental
    if ($have_manifest) { # if we have a manifest file, we're not doing removeold/full-rebuild either
        $removeold = 0;
    }

    # if $removeold, then proper full rebuild, will carry out step where all E will be marked as PD
    # else some kind of incremental build, won't do the extra PD pass
    # which is the step marking existing OIDs (E) as PD (provisionally deleted)
    my $do_pd_step = ($removeold) ? 1 : 0;

    my $oaidb_live_filepath = $self->{'oaidb_live_filepath'};
    my $oaidb_tmp_filepath  = $self->{'oaidb_tmp_filepath'};
    my $infodbtype          = $self->{'infodbtype'};

    # Note: the live db can only exist if the collection has been activated at least once before
    my $livedb_exists = &FileUtils::fileExists($oaidb_live_filepath);
    my $tmpdb_exists  = &FileUtils::fileExists($oaidb_tmp_filepath);

    my $initdb = 0;

    # We're going to prepare the starting state of tmpdb next.
    # It can start off empty, start off with the contents of livedb, or it can start off with the contents
    # of the existing tmp db. Which of these three it is depends on the 3 factors: whether livedb exists,
    # whether tmpdb exists and whether or not removeold is true.
    # i.o.w. which of the 3 outcomes it is depends on the truth table built on the following 3 variables:
    #   LDB = LiveDB exists
    #   TDB = TmpDB exists
    #   RO  = Removeold
    # OUTCOMES:
    #   clean slate (create an empty tmpdb/make tmpdb empty)
    #   top up tmpDB (work with existing tmpdb)
    #   copy LiveDB to TmpDB (liveDB's contents become the contents of TmpDB, and we'll work with that)
    #
    # TRUTH TABLE:
    # ---------------------------------------
    # LDB   TDB   RO   |  Outcome
    # ---------------------------------------
    #  0     0     0   |  clean-slate
    #  0     0     1   |  clean-slate
    #  0     1     0   |  top-up-tmpdb
    #  0     1     1   |  erase tmpdb, clean-slate
    #  1     0     0   |  copy livedb to tmpdb
    #  1     0     1   |  copy livedb to tmpdb
    #  1     1     0   |  top-up-tmpdb
    #  1     1     1   |  copy livedb to tmpdb
    # ---------------------------------------
    #
    # Dr Bainbridge worked out using Karnaugh maps that, from the above truth table:
    # => clean-slate/empty-tmpdb = !LDB && (RO || !TDB)
    # => top-up-tmpdb/work-with-existing-tmpdb = !RO && TDB
    # => copy-livedb-to-tmpdb = LDB && (!TDB || RO)
    #
    # The clean-slate test merges the two cases of first-build-ever (!LDB && !TDB)
    # and make-contents-of-tmpdb-empty (!LDB && TDB && RO).
    my $work_with_empty_tmpdb                 = (!$livedb_exists && (!$tmpdb_exists || $removeold));
    my $make_contents_of_tmpdb_that_of_livedb = ($livedb_exists && (!$tmpdb_exists || $removeold));
    # The remaining case, top-up-tmpdb ($tmpdb_exists && !$removeold), is handled by the final else.

    if ($work_with_empty_tmpdb) {
        # we'll use an empty tmpdb

        # If importing the collection for the very first time, neither db exists,
        # so create an empty tmpdb.
        #
        # We also create an empty tmpdb when livedb doesn't exist and $removeold is true.
        # This can happen if we've never run activate (so no livedb),
        # yet had done some import (and perhaps building) followed by a full re-import now.
        # Since there was no activate and we're doing a removeold/full-rebuild now, can just
        # work with a new tmpdb, even though one already existed, its contents can be wiped out.
        # In such a scenario, we'll be deleting tmpdb. Then there will be no livedb nor any tmpdb
        # any more, so same situation as if importing the very first time when no oaidb exists either.

        # remove the db file and any assoc files
        &dbutil::remove_db_file($infodbtype, $oaidb_tmp_filepath) if $tmpdb_exists;
        $initdb = 1; # new tmpdb

        # if the oai db is created the first time, it's like incremental and
        # "keepold" (keepold means "only add, don't reprocess existing"). So
        # no need to do the special passes dealing with "provisional deletes".
        $do_pd_step = 0;
    }
    elsif ($make_contents_of_tmpdb_that_of_livedb) {
        # If the livedb exists and we're doing a full rebuild ($removeold is true),
        # copy livedb to tmp regardless of if tmpdb already exists.
        # Or if the livedb exists and tmpdb doesn't exist, it doesn't matter
        # if we're incremental or not: also copy live to tmp and work with tmp.

        # remove the db file and any assoc files, then copy livedb to tmpdb
        &dbutil::remove_db_file($infodbtype, $oaidb_tmp_filepath) if $tmpdb_exists;
        &FileUtils::copyFiles($oaidb_live_filepath, $oaidb_tmp_filepath);

        $initdb = 0; # tmpdb exists, since we just copied livedb to tmpdb, so no need to init it
    }
    else {
        # work with existing tmpdb, so we'll build on top of what's presently already in tmpdb
        # (we'll be topping up the current tmpdb)
        # !$removeold, meaning incremental

        # If incremental and have a tmpdb already, regardless of whether livedb exists,
        # then work with the existing tmpdb file, as this means we've been
        # importing (perhaps followed by building) repeatedly without activating the
        # last time but want to maintain the (incremental) changes in tmpdb.
        $initdb = 0;
    }

    # Dr Bainbridge drew up Karnaugh maps on the truth table, which proved that all cases
    # are indeed covered above, so don't need any other catch-all else here

    # final param follows jmt's $perform_firsttime_init in inexport.pm
    $self->{'oaidb_file_path'} = &dbutil::get_infodb_file_path($infodbtype, "oai-inf-tmp", $self->{'etc_dir'}, $initdb);

    # print STDERR "@@@@@ oaidb: $self->{'oaidb_file_path'}\n";

    return $do_pd_step;
}

# Returns the path of the db file currently being worked with.
sub get_filepath {
    my $self = shift (@_);
    return $self->{'oaidb_file_path'};
}

# Import-phase entry point: prepares the tmpdb, loads it into memory and, when a
# full rebuild of an already existing oai db is under way, marks every E entry as PD
# and saves the result.
sub import_stage {
    my $self = shift (@_);
    my ($removeold, $have_manifest) = @_;

    # returns 1 if the step to mark oaidb entries as PD is required
    my $do_pd_step = $self->init_tmpdb($removeold, $have_manifest);

    # if we're doing full rebuilding and it's NOT the first time creating the oai_inf db,
    # then the tasks to do with PD (provisionally deleted) OAI OIDs should be carried out
    $self->load_info();
    $self->print_info(); # DEBUGGING

    if ($do_pd_step) {
        $self->mark_all_existing_as_provisionallydeleted();
        $self->print_info(); # DEBUGGING

        # save to db file now that we're done
        $self->save_info();
    }
}

# Build-phase entry point, to be run before indexing: applies the I/R/D/B status of
# every OID in the archive info db (found under $archivedir) to the in-memory oai info,
# turns any leftover PD entries into D, and saves to the db file.
sub building_stage_before_indexing {
    my $self = shift (@_);
    my ($archivedir) = @_;

    # load archive info db into memory
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($self->{'infodbtype'}, "archiveinf-doc", $archivedir);
    # NOTE(review): the src filename is never used below; the call is kept only because
    # dbutil::get_infodb_file_path is not visible from here - confirm it is side-effect free and drop it.
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($self->{'infodbtype'}, "archiveinf-src", $archivedir);
    my $archive_info = arcinfo->new($self->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    # load the oaidb file's contents into memory.
    $self->load_info();
    $self->print_info(); # DEBUGGING

    # process all the index, reindex and delete operations as indicated in arcinfo,
    # all the while ensuring all PDs are changed back to E for OIDs that exist in both arcinfo and oaiinfo db.
    foreach my $OID (keys %{$archive_info->{'info'}}) {
        my $arcinf_tuple = $archive_info->{'info'}->{$OID};
        # use packageName::constant to refer to constants declared in another package,
        # see http://perldoc.perl.org/constant.html
        my $indexing_status = $arcinf_tuple->[arcinfo::INFO_STATUS_INDEX];

        print STDERR "######## OID: $OID - status: $indexing_status\n";

        if ($indexing_status eq "I") {
            $self->index($OID);   # add new as E with current timestamp/or set existing as E with orig timestamp
        }
        elsif ($indexing_status eq "R") {
            $self->reindex($OID); # update timestamp and ensure marked as E (if oid doesn't exist, add new)
        }
        elsif ($indexing_status eq "D") {
            $self->delete($OID);  # set as D with current timestamp
        }
        elsif ($indexing_status eq "B") { # B for "been indexed"
            # will flip any PD to E if oid exists, else will add new entry for oid.
            # A new entry may be required if the collection had been built prior to turning this into
            # an oaicollection. But what if we always maintain an oaidb? Still call $self->index() here.
            $self->been_indexed($OID);
        }
        else {
            print STDERR "### oaiinfo::building_stage_before_indexing(): Unrecognised indexing status $indexing_status\n";
        }
    }

    # once all docs processed, go through oaiinfo db changing any PDs to D along with current timestamp
    # to indicate that they're deleted
    $self->mark_all_provisionallydeleted_as_deleted();
    $self->print_info();

    # let's save to db file now that we're done
    $self->save_info();
}

# Moves the tmp db to the live db. Any existing live db is first renamed to <livedb>.bak.
# Only prints a diagnostic to STDERR if there is no tmp db to activate.
sub activate_collection {
    my $self = shift (@_);

    my $oaidb_live_filepath = $self->{'oaidb_live_filepath'};
    my $oaidb_tmp_filepath  = $self->{'oaidb_tmp_filepath'};

    my $livedb_exists = &FileUtils::fileExists($oaidb_live_filepath);
    my $tmpdb_exists  = &FileUtils::fileExists($oaidb_tmp_filepath);

    if ($tmpdb_exists) {
        if ($livedb_exists) {
            # keep the old live db around as a backup rather than removing it:
            # rename the db file and any assoc files
            &dbutil::rename_db_file_to($self->{'infodbtype'}, $oaidb_live_filepath, $oaidb_live_filepath.".bak");
        }
        # rename the db file and any assoc files (rather than a plain FileUtils::moveFiles)
        &dbutil::rename_db_file_to($self->{'infodbtype'}, $oaidb_tmp_filepath, $oaidb_live_filepath);

        print STDERR "#### Should now have MOVED $self->{'oaidb_tmp_filepath'} to $self->{'oaidb_live_filepath'}\n";
    }
    else {
        print STDERR "@@@@@ In oaiinfo::activate_collection():\n";
        print STDERR "@@@@@ No tmpdb at $self->{'oaidb_tmp_filepath'}\n";
        print STDERR "@@@@@ to make 'live' by moving to $self->{'oaidb_live_filepath'}.\n";
    }
}

##################### SPECIFIC TO PD-STEP ####################

# mark all existing, E (non-deleted) OIDs as Provisionally Deleted (PD)
# this subroutine doesn't save to oai-inf.DB
# the caller should call save_info when they want to save to the db
sub mark_all_existing_as_provisionallydeleted {
    my $self = shift (@_);

    print STDERR "@@@@@ oaiinfo::mark_all_E_as_PD(): Marking the E entries as PD\n";

    # keys/values need a real hash, not a hash reference: deref explicitly
    foreach my $OID_info (values %{$self->{'info'}}) {
        if ($OID_info->[INFO_STATUS_INDEX] eq "E") {
            $OID_info->[INFO_STATUS_INDEX] = "PD";
        }
    }
}

# mark all OIDs that are Provisionally Deleted (PD) as deleted, and set to current timestamp
# To be called at end of build. Again, the caller should save to DB by calling save_info.
sub mark_all_provisionallydeleted_as_deleted {
    my $self = shift (@_);

    print STDERR "@@@@@ oaiinfo::mark_all_PD_as_D(): Marking the PD entries as D\n";

    foreach my $OID (keys %{$self->{'info'}}) {
        my $OID_info = $self->{'info'}->{$OID};
        my $curr_status = $OID_info->[INFO_STATUS_INDEX];
        if ($curr_status eq "PD") {
            $self->set_info($OID, "D", $self->get_current_time());
        }
    }
}

##################### GENERAL, NOT SPECIFIC TO PD-STEP ####################

# Debugging aid: dumps every in-memory entry (OID, status, timestamp) to STDERR.
sub print_info {
    my $self = shift (@_);

    print STDERR "###########################################################\n";
    print STDERR "@@@@@ oaiinfo::print_info(): oaidb in memory contains: \n";

    foreach my $OID (keys %{$self->{'info'}}) {
        print STDERR "OID: $OID";
        print STDERR " status: " . $self->{'info'}->{$OID}->[INFO_STATUS_INDEX];
        print STDERR " time: " . $self->{'info'}->{$OID}->[INFO_TIMESTAMP_INDEX];
        print STDERR "\n";
    }

    print STDERR "###########################################################\n";
}

# Find the OID, if it exists, make its status=E for existing. Leave its timestamp alone.
# If the OID doesn't yet exist, add it as a new entry with status=E and with current timestamp.
sub index {
    my $self = shift (@_);
    my ($OID) = @_;

    my $OID_info = $self->{'info'}->{$OID};

    if (defined $OID_info) {
        # if OID is present, this will change status back to E, timestamp unchanged
        $OID_info->[INFO_STATUS_INDEX] = "E";
    }
    else {
        # if OID is not present, then it's now added as existing from current time on
        $self->set_info($OID, "E", $self->get_current_time());
    }
}

# Upon reindexing a document with identifier OID, change its timestamp to current time
# if a new OID (unlikely), then add as new entry with status=E and current timestamp
sub reindex {
    my $self = shift (@_);
    my ($OID) = @_;

    # Takes care of 3 things:
    # if OID exists, updates modified time to indicate the doc has been reindexed
    # if OID exists, ensures any status=PD is flipped back to E for this OID doc (as we know it exists);
    # if the OID doesn't yet exist, adds a new OID entry with status=E and current timestamp.
    $self->set_info($OID, "E", $self->get_current_time());
}

# Does the same as index():
# OIDs that have been indexed upon rebuild may still be new to the oaidb: GS2 collections
# are not OAI collections by default, unlike GS3 collections. Imagine rebuilding a (GS2) collection
# 5 times and then setting them to be an OAI collection. In that case, the doc OIDs in the collection
# may not be in the oaidb yet. Unless, we decide (as is the present case) to always maintain an oaidb
# (always creating an oaidb regardless of whether the collection has OAI support turned on or not).
sub been_indexed {
    my $self = shift (@_);
    my ($OID) = @_;

    $self->index($OID);
}

# Upon deleting a document with identifier OID,
# set status to deleted and change its timestamp to current time
sub delete {
    my $self = shift (@_);
    my ($OID) = @_;

    # set_info() would fall back to the current time if no timestamp were provided,
    # but by being explicit here, the code is easier to follow
    $self->set_info($OID, "D", $self->get_current_time());
}

#############################################################

# Returns the current time as given by perl's time().
sub get_current_time {
    my $self = shift (@_);

    # localtime(time) would return an array of values (day, month, year, hour, min, seconds)
    # or a singular string, see http://perldoc.perl.org/functions/localtime.html
    # We just use the plain timestamp.
    return time;
}

# Loads a tab-separated text file into memory. Each line holds the arguments that
# set_info() takes, in that order: OID, deletion status and (optionally) timestamp.
# Lines with fewer than 2 fields are skipped. Does nothing if $filename is undefined
# or the file doesn't exist; dies if an existing file can't be opened.
sub _load_info_txt {
    my $self = shift (@_);
    my ($filename) = @_;

    return unless (defined $filename && &FileUtils::fileExists($filename));

    # 3-arg open with a lexical handle, so that odd characters in the filename
    # can't be mistaken for an open mode
    open (my $infile, '<', $filename) || die "oaiinfo::load_info couldn't read $filename\n";

    while (defined (my $line = <$infile>)) {
        $line =~ s/\cM|\cJ//g; # remove end-of-line characters
        my @lineparts = split ("\t", $line);
        if (scalar(@lineparts) >= 2) {
            $self->set_info (@lineparts);
        }
    }

    close ($infile);
    return;
}

# Loads the infodb file $filename into memory via dbutil::read_infodb_file().
# Each value is expected in the layout written by _save_info_db():
# deletion status on the first line, timestamp on the second.
sub _load_info_db {
    my $self = shift (@_);
    my ($filename) = @_;

    my $infodb_map = {};

    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);

    foreach my $oid (keys %{$infodb_map}) {
        my $vals = $infodb_map->{$oid};
        # interested in oid, timestamp, deletion status
        # NOTE(review): assumes $vals is the raw "status\ntimestamp\n" string - confirm
        # against dbutil::read_infodb_file. If the timestamp line is missing,
        # set_info() substitutes the current time.
        my ($deletion_status, $timestamp) = split (/\n/, $vals);

        $self->set_info ($oid, $deletion_status, $timestamp);
    }
}

# if no filename is passed in (and you don't generally want to), then
# it tries to load in the etc folder's oai-inf db file ($self->{'oaidb_file_path'}),
# if it exists
# Clears the in-memory info first. A filename ending in .inf is read as a
# tab-separated text file, anything else as an infodb file.
sub load_info {
    my $self = shift (@_);
    my ($filename) = @_;

    $self->{'info'} = {};

    $filename = $self->{'oaidb_file_path'} unless defined $filename;

    if (&FileUtils::fileExists($filename)) {
        if ($filename =~ m/\.inf$/) {
            $self->_load_info_txt($filename);
        }
        else {
            $self->_load_info_db($filename);
        }
    }
}

# Writes the in-memory info to the text file $filename (overwriting it), one line per
# OID: OID, deletion status and timestamp, separated by tabs.
# Dies if the file can't be opened for writing or can't be written out completely.
sub _save_info_txt {
    my $self = shift (@_);
    my ($filename) = @_;

    # 3-arg open with a lexical handle, so that odd characters in the filename
    # can't be mistaken for an open mode
    open (my $outfile, '>', $filename) || die "oaiinfo::save_info couldn't write $filename\n";

    foreach my $info (@{$self->get_OID_list()}) {
        if (defined $info) {
            print {$outfile} join("\t", @$info), "\n";
        }
    }

    # buffered write errors only surface at close, so check it
    close ($outfile) || die "oaiinfo::save_info couldn't finish writing $filename\n";
}

# if no filename is passed in (and you don't generally want to), then
# this subroutine writes to the db file we're working with ($self->{'oaidb_file_path'}).
sub _save_info_db {
    my $self = shift (@_);
    my ($filename) = @_;

    $filename = $self->{'oaidb_file_path'} unless defined $filename;
    my $infodbtype = $self->{'infodbtype'};

    # write out again. Open file for overwriting, not appending.
    # Then write out data structure $self->{'info'} that's been maintaining the data in-memory.
    my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);

    foreach my $oid (keys %{$self->{'info'}}) {
        my $OID_info = $self->{'info'}->{$oid};

        # raw entry layout: status on the first line, timestamp on the second
        my $val = "".$OID_info->[INFO_STATUS_INDEX]."\n".$OID_info->[INFO_TIMESTAMP_INDEX]."\n";

        &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$val);
    }

    &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
}

# Saves the in-memory info. With no filename, writes to the db file we're working with.
# With a filename, the name decides between the text format and the infodb format.
sub save_info {
    my $self = shift (@_);
    my ($filename) = @_;

    if (defined $filename) {
        # NOTE(review): as written, this matches "contents" anywhere in the name OR ".inf"
        # at its end; if "(contents|\.inf)$" was intended, confirm with callers before changing.
        if ($filename =~ m/(contents)|(\.inf)$/) {
            $self->_save_info_txt($filename);
        }
        else {
            $self->_save_info_db($filename);
        }
    }
    else {
        $self->_save_info_db();
    }
}

# Sets the in-memory entry for $OID to [$del_status, $timestamp], replacing any existing
# entry or adding a new one. An undefined $timestamp defaults to the current time.
sub set_info {
    my $self = shift (@_);
    my ($OID, $del_status, $timestamp) = @_;

    if (!defined $timestamp) { # get current date timestamp
        $timestamp = $self->get_current_time();
    }

    $self->{'info'}->{$OID} = [$del_status, $timestamp];
}

# returns a reference to a list of the form [[OID, deletion_status, timestamp], ...]
# (in no particular order)
sub get_OID_list {
    my $self = shift (@_);

    my @list = ();

    # keys needs a real hash, not a hash reference: deref explicitly
    foreach my $OID (keys %{$self->{'info'}}) {
        my $OID_info = $self->{'info'}->{$OID};

        push (@list, [$OID, $OID_info->[INFO_STATUS_INDEX],
                      $OID_info->[INFO_TIMESTAMP_INDEX]]);
    }

    return \@list;
}

# returns the number of entries so far, including deleted ones
# http://stackoverflow.com/questions/1109095/how-can-i-find-the-number-of-keys-in-a-hash-in-perl
sub size {
    my $self = shift (@_);
    return (scalar keys %{$self->{'info'}});
}

1;