root/main/trunk/greenstone2/perllib/oaiinfo.pm @ 31900

Revision 31900, 27.6 KB (checked in by ak19, 22 months ago)

When the oai-inf.db for a collection is first created, oaiinfo.pm now writes out a new special record 'earliesttimestamp' containing the current timestamp as at the time that oai-inf.db is created. If the oai-inf.db already exists, as with our demo collection on svn, the earliesttimestamp record will have a timestamp that is the earliest lastmodified date in the oai-inf.db.

Line 
1# This class based on arcinfo.pm
2package oaiinfo;
3
4use constant INFO_STATUS_INDEX  => 0;
5use constant INFO_TIMESTAMP_INDEX => 1;
6use constant INFO_DATESTAMP_INDEX => 2;
7
8my $OID_EARLIEST_TIMESTAMP = "earliesttimestamp";
9  # Declaring as my $OID_EARLIEST_TIMESTAMP rather than constant, because it's not straightforward
10  # to use string constant as hash key (need to concat with empty str).
11  # http://perldoc.perl.org/constant.html
12  # But beware of using perl 'constant' as hash key:
13  # https://stackoverflow.com/questions/96848/is-there-any-way-to-use-a-constant-as-hash-key-in-perl
14  # http://forums.devshed.com/perl-programming-6/massive-using-constants-hash-keys-603600.html
15  # https://perlmaven.com/constants-and-read-only-variables-in-perl
16  # http://neilb.org/reviews/constants.html - compares different ways to declare constants in perl
17
18use strict;
19
20use arcinfo;
21use dbutil;
22
23# Store timestamp in 2 formats: internal and external (same as oailastmodified and oailastmodifieddate)
24# These times indicate the last modified date for that document. In the case of the doc being deleted,
25# it's the time the doc was deleted.
26
27# File format read in: OID <tab> (Deletion-)Status <tab> Timestamp <tab> Datestamp
28
29# A special record of the db contains the timestamp of the creation of the oai-inf.db for
30# the collection, representing the collection's earliest datetimestamp.
31# This record has $OID_EARLIEST_TIMESTAMP for OID.
32# Its deletion status is maintained at NA, not applicable.
33# In cases of older oai-inf.db files where there's no earliesttimestamp field, this record
34# is also created but with timestamp set to the oldest lastmodified date in oai-inf.db.
35
36# Deletion status can be:
37#  E = Doc with OID exists (has not been deleted from collection). Timestamp indicates last time of build
38#  D = Doc with OID has been deleted. Timestamp indicates time of deletion
39#  PD = Provisionally Deleted. The associated timestamps are momentarily unaltered.
40#  NA = Not Applicable. Only for the special record with $OID_EARLIEST_TIMESTAMP as OID.
41
42# oaidb is "always incremental": always reflects the I/B/R/D status of archive info db,
43# before the indexing step of the build phase that alters the I/B/R/D contents of archive info db.
44# (I=index, B=been indexed, R=reindex; D=delete)
45
# Constructor.
#   $config_filename - path of a config file; its parent folder (as reported by
#                      util::get_parent_folder) is where the oai-inf db files are looked up
#   $infodbtype      - infodb type; when undefined, dbutil's default type is used.
#                      "gdbm-txtgz" is treated as "gdbm"
#   $verbosity       - debugging output is printed once this reaches 'verbosity_threshold' (5)
# The new object starts out pointing at the tmp db (oai-inf-tmp), not the live db (oai-inf).
sub new {
    my $class = shift(@_);
    my ($config_filename, $infodbtype, $verbosity) = @_;

    $infodbtype = &dbutil::get_default_infodb_type() unless defined $infodbtype;
    if ($infodbtype eq "gdbm-txtgz") {
        $infodbtype = "gdbm";
    }

    # Work out the db filenames we'll be working with (tmp and livedb);
    # both are located relative to the folder containing the config file
    my $etc_dir = &util::get_parent_folder($config_filename);
    my $perform_firsttime_init = 0;
    my $live_filepath = &dbutil::get_infodb_file_path($infodbtype, "oai-inf", $etc_dir, $perform_firsttime_init);
    my $tmp_filepath = &dbutil::get_infodb_file_path($infodbtype, "oai-inf-tmp", $etc_dir, $perform_firsttime_init);

    my $self = {
        'verbosity' => $verbosity || 0,
        'verbosity_threshold' => 5, # start printing debugging info from verbosity >= threshold
        'info' => {}, # map of {OID, array[deletion-status,timestamp,datestamp]} pairs
        'infodbtype' => $infodbtype,
        'oaidb_live_filepath' => $live_filepath,
        'oaidb_tmp_filepath' => $tmp_filepath,
        'etc_dir' => $etc_dir,
        'oaidb_file_path' => $tmp_filepath # db file we're working with
    };

    return bless $self, $class;
}
76
# Works out the starting contents of the tmp db (oai-inf-tmp), which is the db the import
# and build stages operate on: it either starts off empty, or as a copy of the live db,
# or as whatever the existing tmp db already contains.
#
#   $removeold     - true for a proper full rebuild; false for keepold/incremental
#   $have_manifest - true if a manifest file is in use; this is always treated as
#                    incremental, so it overrides $removeold
#
# Returns the list ($do_pd_step, $initdb):
#   $do_pd_step - 1 if the pass marking existing (E) entries as provisionally deleted (PD)
#                 should be carried out, else 0
#   $initdb     - 1 if a new, empty tmp db is being started, else 0
#
# Which of the 3 starting states applies depends on 3 variables:
#   LDB = LiveDB exists (only possible if the collection has been activated at least once)
#   TDB = TmpDB exists
#   RO  = Removeold
#
# TRUTH TABLE:
# ---------------------------------------
# LDB TDB  RO | Outcome
# ---------------------------------------
#  0   0   0  | clean-slate
#  0   0   1  | clean-slate
#  0   1   0  | top-up-tmpdb
#  0   1   1  | erase tmpdb, clean-slate
#  1   0   0  | copy livedb to tmpdb
#  1   0   1  | copy livedb to tmpdb
#  1   1   0  | top-up-tmpdb
#  1   1   1  | copy livedb to tmpdb
# ---------------------------------------
#
# Dr Bainbridge worked out using Karnaugh maps that the table reduces to:
# => clean-slate/empty-tmpdb = !LDB && (RO || !TDB)
# => copy-livedb-to-tmpdb = LDB && (!TDB || RO)
# => top-up-tmpdb/work-with-existing-tmpdb = !RO && TDB
# and that these 3 cases cover every row, so no further catch-all is needed.
sub init_tmpdb {
    my $self = shift(@_);
    my ($removeold, $have_manifest) = @_;

    # with a manifest file we pretend to be fully incremental for the oaiinfo db,
    # so we're not doing removeold/full-rebuild either
    $removeold = 0 if $have_manifest;

    # only a proper full rebuild does the extra pass that marks existing OIDs (E) as PD
    my $do_pd_step = $removeold ? 1 : 0;
    my $initdb = 0;

    my $infodbtype = $self->{'infodbtype'};
    my $live_filepath = $self->{'oaidb_live_filepath'};
    my $tmp_filepath = $self->{'oaidb_tmp_filepath'};
    my $have_livedb = &FileUtils::fileExists($live_filepath);
    my $have_tmpdb = &FileUtils::fileExists($tmp_filepath);

    # (!TDB || RO): whatever is in the tmpdb (if anything) is not to be kept
    my $discard_tmpdb_contents = (!$have_tmpdb || $removeold);

    if ($discard_tmpdb_contents && !$have_livedb) {
        # Clean slate. Either this is the first import ever (neither db exists), or we
        # never activated (no livedb) and are now doing a full re-import, in which case
        # the contents of the existing tmpdb can be wiped out.
        if ($have_tmpdb) {
            &dbutil::remove_db_file($infodbtype, $tmp_filepath); # remove the db file and any assoc files
        }
        $initdb = 1; # new tmpdb

        # A freshly created oai db is like incremental and "keepold" (only add, don't
        # reprocess existing), so the passes dealing with provisional deletes aren't needed.
        $do_pd_step = 0;
    }
    elsif ($discard_tmpdb_contents) {
        # The livedb exists, and either we're doing a full rebuild or there's no tmpdb:
        # the livedb's contents become the contents of the tmpdb.
        if ($have_tmpdb) {
            &dbutil::remove_db_file($infodbtype, $tmp_filepath); # remove the db file and any assoc files
        }
        &FileUtils::copyFiles($live_filepath, $tmp_filepath);

        $initdb = 0; # tmpdb was just copied from livedb, so no need to init it
    }
    else {
        # Incremental and a tmpdb already exists (regardless of whether livedb exists):
        # we've been importing/building repeatedly without activating, so keep topping up
        # the existing tmpdb to maintain the (incremental) changes it holds.
        $initdb = 0;
    }

    # final param follows jmt's $perform_firsttime_init in inexport.pm
    $self->{'oaidb_file_path'} = &dbutil::get_infodb_file_path($infodbtype, "oai-inf-tmp", $self->{'etc_dir'}, $initdb);

    return ($do_pd_step, $initdb);
}
210
# Returns the path of the db file this object is currently working with
# (the 'oaidb_file_path' field).
sub get_filepath {
    my ($self) = @_;
    my $current_db_path = $self->{'oaidb_file_path'};
    return $current_db_path;
}
215
# Import-time processing of the oai db: prepares the tmp db, loads it into memory,
# makes sure the special earliest-timestamp record is present and, for a full rebuild
# of an already existing db, marks all existing (E) entries as provisionally deleted (PD).
# The in-memory data is written back to the db only if one of those steps changed it.
#   $removeold, $have_manifest - passed straight on to init_tmpdb()
sub import_stage {
    my ($self, $removeold, $have_manifest) = @_;

    # $do_pd_step is 1 if the step marking oaidb entries as PD is required, i.e. we're doing
    # a full rebuild and it's NOT the first time the oai-inf db is created.
    # $is_new_db is 1 for a new db, allowing one time initialisation of the new oai-inf.db.
    my ($do_pd_step, $is_new_db) = $self->init_tmpdb($removeold, $have_manifest);

    $self->load_info();
    $self->print_info(); # DEBUGGING

    # A special record holds the collection's "earliest datetimestamp". It is stored with
    # status NA, so the PD pass below (which only changes E entries) leaves it untouched.
    my $save_to_db = $self->insert_coll_earliest_timestamp($is_new_db);

    if ($do_pd_step) {
        $self->mark_all_existing_as_provisionallydeleted();
        $self->print_info(); # DEBUGGING

        $save_to_db = 1;
    }

    # save changes to $self->{'info'} out to db file, now that we're done
    if ($save_to_db) {
        $self->save_info();
    }
}
249
# Brings the oai db in line with the archive info db, before the indexing step of the
# build phase alters the archive info db's I/B/R/D statuses.
#   $archivedir - directory in which the archiveinf-doc db is looked up
#
# For every OID in the archive info db the oai db entry is updated according to the
# archive info status (I=index, B=been indexed, R=reindex, D=delete). Afterwards any
# entry still marked PD (provisionally deleted) is turned into D, and the result is
# saved back to the oai db file.
#
# Note: this sub used to be declared with an empty prototype "()". Prototypes are ignored
# on method calls and an empty one wrongly advertises "takes no arguments", so it was dropped.
sub building_stage_before_indexing {
    my $self = shift (@_);
    my ($archivedir) = @_;

    # load archive info db into memory
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($self->{'infodbtype'}, "archiveinf-doc", $archivedir);
    # NOTE(review): the src filename is not used below; the call is kept as in the original
    # because get_infodb_file_path() is not visible here to confirm it has no side effects.
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($self->{'infodbtype'}, "archiveinf-src", $archivedir);
    my $archive_info = arcinfo->new($self->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    # load the oaidb file's contents into memory.
    $self->load_info();
    $self->print_info(); # DEBUGGING

    # process all the index, reindex and delete operations as indicated in arcinfo, all the
    # while ensuring all PDs are changed back to E for OIDs that exist in both arcinfo and oaiinfo db.

    my $arcinfo_map = $archive_info->{'info'};
    my $debugging = ($self->{'verbosity'} >= $self->{'verbosity_threshold'});

    foreach my $OID (keys %$arcinfo_map) {
        my $arcinf_tuple = $arcinfo_map->{$OID};
        # use packageName::constant to refer to constants declared in another package,
        # see http://perldoc.perl.org/constant.html
        my $indexing_status = $arcinf_tuple->[arcinfo::INFO_STATUS_INDEX];

        print STDERR "######## OID: $OID - status: $indexing_status\n" if $debugging;

        if ($indexing_status eq "I") {
            $self->index($OID); # add new as E with current timestamp/or set existing as E with orig timestamp
        } elsif ($indexing_status eq "R") {
            $self->reindex($OID); # update timestamp and ensure marked as E (if oid doesn't exist, add new)
        } elsif ($indexing_status eq "D") {
            $self->delete($OID); # set as D with current timestamp
        } elsif ($indexing_status eq "B") { # B for "been indexed"
            # will flip any PD to E if oid exists, else will add new entry for oid. A new entry
            # may be required if the collection had been built prior to turning this into an oaicollection.
            $self->been_indexed($OID);
        } elsif ($debugging) {
            print STDERR "### oaiinfo::building_stage_before_indexing(): Unrecognised indexing status $indexing_status\n";
        }
    }

    # once all docs processed, go through oaiinfo db changing any PDs to D along with
    # current timestamp to indicate that they're deleted
    $self->mark_all_provisionallydeleted_as_deleted();
    $self->print_info();

    # let's save to db file now that we're done
    $self->save_info();
}
306
# Moves the tmp db to the live db. If a live db already exists, it is first renamed to
# the live db's path with ".bak" appended. When there is no tmp db nothing is moved,
# and only a debugging message is printed (at sufficient verbosity).
sub activate_collection {
    my $self = shift (@_);

    my $infodbtype = $self->{'infodbtype'};
    my $live_filepath = $self->{'oaidb_live_filepath'};
    my $tmp_filepath = $self->{'oaidb_tmp_filepath'};
    my $debugging = ($self->{'verbosity'} >= $self->{'verbosity_threshold'});

    my $have_livedb = &FileUtils::fileExists($live_filepath);
    my $have_tmpdb = &FileUtils::fileExists($tmp_filepath);

    if (!$have_tmpdb) {
        if ($debugging) {
            print STDERR "@@@@@ In oaiinfo::activate_collection():\n";
            print STDERR "@@@@@   No tmpdb at $self->{'oaidb_tmp_filepath'}\n";
            print STDERR "@@@@@   to make 'live' by moving to $self->{'oaidb_live_filepath'}.\n";
        }
    }
    else {
        # keep the previous live db around as a backup rather than removing it
        if ($have_livedb) {
            &dbutil::rename_db_file_to($infodbtype, $live_filepath, $live_filepath.".bak"); # rename the db file and any assoc files
        }
        &dbutil::rename_db_file_to($infodbtype, $tmp_filepath, $live_filepath); # rename the db file and any assoc files

        if ($debugging) {
            print STDERR "#### Should now have MOVED $self->{'oaidb_tmp_filepath'} to $self->{'oaidb_live_filepath'}\n";
        }
    }
}
336
337##################### SPECIFIC TO PD-STEP ####################
338
339
# Changes the status of every in-memory entry that is E (existing, non-deleted) to
# PD (provisionally deleted). Entries with any other status are left alone, and
# timestamps are not touched.
# Only the in-memory data changes: the caller should call save_info() when they
# want the changes written to the oai-inf db.
sub mark_all_existing_as_provisionallydeleted {
    my $self = shift (@_);

    if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) {
        print STDERR "@@@@@ oaiinfo::mark_all_E_as_PD(): Marking the E entries as PD\n";
    }

    my $infomap = $self->{'info'}; # Mac Mountain Lion wants %$map, won't accept %$self->{'info'}

    foreach my $OID (keys %$infomap) {
        my $record = $infomap->{$OID};
        next unless ($record->[INFO_STATUS_INDEX] eq "E");
        $record->[INFO_STATUS_INDEX] = "PD";
    }
}
358
# Turns every in-memory entry that is still PD (provisionally deleted) into D (deleted),
# stamping it with the current time. To be called at the end of a build.
# Only the in-memory data changes: the caller should save to the db by calling save_info().
sub mark_all_provisionallydeleted_as_deleted {
    my $self = shift (@_);

    if ($self->{'verbosity'} >= $self->{'verbosity_threshold'}) {
        print STDERR "@@@@@ oaiinfo::mark_all_PD_as_D(): Marking the PD entries as D\n";
    }

    my $infomap = $self->{'info'};

    foreach my $OID (keys %$infomap) {
        my $record = $infomap->{$OID};
        next unless ($record->[INFO_STATUS_INDEX] eq "PD");
        # set_info() replaces the whole record, recomputing the datestamp from the new timestamp
        $self->set_info($OID, "D", $self->get_current_time());
    }
}
376
377
378##################### GENERAL, NOT SPECIFIC TO PD-STEP ####################
379
# Debugging aid: prints every in-memory oai db entry (OID, status, timestamp, datestamp)
# to STDERR. Prints nothing unless verbosity has reached 'verbosity_threshold'.
sub print_info {
    my $self = shift (@_);

    return if ($self->{'verbosity'} < $self->{'verbosity_threshold'});

    my $separator = "###########################################################\n";

    print STDERR $separator;
    print STDERR "@@@@@ oaiinfo::print_info(): oaidb in memory contains: \n";

    my $infomap = $self->{'info'};

    foreach my $OID (keys %$infomap) {
        my $record = $infomap->{$OID};
        print STDERR "OID: $OID"
            . " status: " . $record->[INFO_STATUS_INDEX]
            . " time: " . $record->[INFO_TIMESTAMP_INDEX]
            . " date: " . $record->[INFO_DATESTAMP_INDEX]
            . "\n";
    }

    print STDERR $separator;
}
402
403
# Makes sure the in-memory oai db data contains the special record representing the
# collection's earliest timestamp: OID=$OID_EARLIEST_TIMESTAMP, deletion status NA.
#   $is_new_db - true if the oai-inf db has only just been created
#
# - New db: the record is (re)set to the current time.
# - Existing db already containing the record: nothing is changed.
# - Existing (older) db without the record: the record is added with the earliest
#   timestamp found among the entries in memory, or the current time if none is earlier.
#   Entries whose timestamp is missing or not a plain non-negative integer are ignored
#   in that search: compared numerically they would count as 0 and wrongly become the
#   "earliest" value.
#
# Returns 1 if the in-memory data was changed and so needs saving to the db, else 0.
# This sub does not itself save to the db.
sub insert_coll_earliest_timestamp {
    my $self = shift (@_);
    my ($is_new_db) = @_;

    my $current_time = $self->get_current_time();
    my $debugging = ($self->{'verbosity'} >= $self->{'verbosity_threshold'});

    # start of the debug line; each branch below completes it
    print STDERR "@@@@@ oaiinfo::insert_coll_earliest_timestamp(): " if $debugging;

    if ($is_new_db) {
        print STDERR "New db. Setting timestamp of oai-inf.db creation.\n" if $debugging;

        $self->set_info($OID_EARLIEST_TIMESTAMP, "NA", $current_time);
        return 1;
    }

    # oai-inf.db already exists, ensure it has an [$OID_EARLIEST_TIMESTAMP] set
    if (defined $self->{'info'}->{$OID_EARLIEST_TIMESTAMP}) {
        print STDERR "Collection timestamp was already set\n" if $debugging;
        return 0;
    }

    # oai-inf.db exists, but doesn't contain an [$OID_EARLIEST_TIMESTAMP] record yet.
    # Work out the earliest lastmodified datetime in the collection, by inspecting
    # the last modified timestamp for each doc in the collection
    my $earliest_timestamp = $current_time;

    my $infomap = $self->{'info'}; # Mac Mountain Lion wants %$map, won't accept %$self->{'info'}
    foreach my $OID (keys %$infomap) {
        my $lastmodified = $infomap->{$OID}->[INFO_TIMESTAMP_INDEX];

        # skip records without a usable timestamp (see header comment)
        next unless (defined $lastmodified && $lastmodified =~ m/^\d+$/);

        if ($lastmodified < $earliest_timestamp) {
            $earliest_timestamp = $lastmodified;
        }
    }

    print STDERR "Collection timestamp not yet set for $OID_EARLIEST_TIMESTAMP. Setting to earliest found: $earliest_timestamp\n" if $debugging;

    $self->set_info($OID_EARLIEST_TIMESTAMP, "NA", $earliest_timestamp);
    return 1;
}
459
460
# Records that the doc with identifier OID exists.
# - OID not yet in memory: a new entry is added with status=E and the current timestamp.
# - OID already present: only its status is set (back) to E; its timestamp is left alone.
sub index {
    my ($self, $OID) = @_;

    my $existing_record = $self->{'info'}->{$OID};

    if (!defined $existing_record) {
        # new to the oai db: exists as from the current time
        $self->set_info($OID, "E", $self->get_current_time());
    } else {
        # already known: just make sure it's marked as existing
        $existing_record->[INFO_STATUS_INDEX] = "E";
    }
}
476
# Records that the doc with identifier OID has been reindexed: its entry is (re)written
# with status=E and the current timestamp. A single set_info() call covers 3 cases:
# - OID exists: its modified time is updated to indicate the doc has been reindexed
# - OID exists with status=PD: it's flipped back to E (as we know the doc exists)
# - OID doesn't exist yet (unlikely): a new entry is added
sub reindex {
    my ($self, $OID) = @_;

    my $reindexed_at = $self->get_current_time();
    $self->set_info($OID, "E", $reindexed_at);
}
490
# Handles an OID whose archive info status is "been indexed". Simply delegates to index().
# OIDs that have already been indexed may still be new to the oaidb: GS2 collections are
# not OAI collections by default (unlike GS3 collections), so a collection could have been
# rebuilt several times before OAI support was turned on. That can't happen if an oaidb is
# always maintained regardless of OAI support (the present case), but index() copes either way.
sub been_indexed {
    my ($self, $OID) = @_;

    $self->index($OID);
}
503
# Records that the doc with identifier OID has been deleted:
# its entry is set to status D with the current time as timestamp.
sub delete {
    my ($self, $OID) = @_;

    # set_info() would default to the current time if no timestamp were provided,
    # but passing it explicitly makes the code easier to follow
    my $deleted_at = $self->get_current_time();
    $self->set_info($OID, "D", $deleted_at);
}
515
516#############################################################
# Returns the current time as given by perl's time() function.
# (Not localtime: that returns an array of values or a formatted string,
# see http://perldoc.perl.org/functions/localtime.html)
sub get_current_time {
    my ($self) = @_;

    my $now = time();
    return $now;
}
525
# Converts a timestamp (as accepted by perl's localtime) into a datestamp string
# of the form YYYYMMDD, in local time.
sub get_datestamp {
    my ($self, $timestamp) = @_;

    # localtime's list is (sec, min, hour, mday, mon, year, ...), where
    # mon counts from 0 and year is the number of years since 1900
    my ($day_of_month, $month, $year) = (localtime($timestamp))[3, 4, 5];

    return sprintf("%d%02d%02d", 1900 + $year, $month + 1, $day_of_month);
}
537
# Loads oai info records from a tab-separated text file into memory.
# File format read in: OID <tab> (Deletion-)Status <tab> Timestamp <tab> Datestamp
#   $filename - the text file to read. Nothing is loaded if this is undefined
#               or the file doesn't exist.
# Lines with fewer than 2 fields are skipped. Each remaining line is handed to set_info(),
# which takes (OID, status, timestamp): a missing timestamp becomes the current time and
# the datestamp is recomputed from the timestamp rather than read from the file.
# Dies if the file exists but can't be opened for reading.
sub _load_info_txt
{
    my $self = shift (@_);
    my ($filename) = @_;

    return unless (defined $filename && &FileUtils::fileExists($filename));

    # 3-arg open with a lexical handle: the filename can't be misread as an open mode,
    # and the handle isn't a package global shared with other code
    open (my $in_fh, '<', $filename) ||
        die "oaiinfo::load_info couldn't read $filename: $!\n";

    while (defined (my $line = <$in_fh>)) {
        $line =~ s/\cM|\cJ//g; # remove end-of-line characters
        my @lineparts = split ("\t", $line);
        if (scalar(@lineparts) >= 2) {
            $self->set_info (@lineparts);
        }
    }
    close ($in_fh);
}
559
# Loads oai info records from an infodb file into memory.
#   $filename - the db file, read with dbutil::read_infodb_file() using this object's infodbtype
# Each db value is expected to hold lines of the form <status>..., <timestamp>... and
# <datestamp>...; a field whose line is not found is stored as undef.
sub _load_info_db
{
    my $self = shift (@_);
    my ($filename) = @_;

    my $infodb_map = {};
    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);

    foreach my $oid ( keys %$infodb_map ) {
        my $raw_record = $infodb_map->{$oid};

        # interested in oid, timestamp, deletion status
        my ($deletion_status) = ($raw_record=~/^<status>(.*)$/m);
        my ($timestamp) = ($raw_record=~/^<timestamp>(.*)$/m);
        my ($datestamp) = ($raw_record=~/^<datestamp>(.*)$/m);

        $self->add_info ($oid, $deletion_status, $timestamp, $datestamp);
    }
}
580
# (Re)loads the oai info records from file into memory, discarding whatever
# was in memory before.
#   $filename - optional (you don't generally want to pass one in); defaults to the db file
#               this object is currently working with, $self->{'oaidb_file_path'}
# A file whose name ends in .inf is read as a text file, anything else as an infodb file.
# If the file doesn't exist, the in-memory data is simply left empty.
sub load_info {
    my $self = shift (@_);
    my ($filename) = @_;

    $self->{'info'} = {};

    if (!defined $filename) {
        $filename = $self->{'oaidb_file_path'};
    }

    if (&FileUtils::fileExists($filename)) {
        my $is_text_file = ($filename =~ m/\.inf$/);
        if ($is_text_file) {
            $self->_load_info_txt($filename);
        }
        else {
            $self->_load_info_db($filename);
        }
    }
}
601
# Writes the in-memory oai info records out to a text file, overwriting it.
# One line per record: OID <tab> (Deletion-)Status <tab> Timestamp <tab> Datestamp
#   $filename - the text file to write
# Dies if the file can't be opened for writing, or if closing it fails
# (buffered write errors only surface at close).
sub _save_info_txt {
    my $self = shift (@_);
    my ($filename) = @_;

    # 3-arg open with a lexical handle: a filename with leading/trailing whitespace or
    # mode characters is taken literally, and the handle isn't a package global
    open (my $out_fh, '>', $filename) ||
        die "oaiinfo::save_info couldn't write $filename: $!\n";

    foreach my $info (@{$self->get_OID_list()}) {
        next unless defined $info;
        print $out_fh join("\t", @$info), "\n";
    }

    close ($out_fh) ||
        die "oaiinfo::save_info couldn't finish writing $filename: $!\n";
}
618
# Writes the in-memory oai info records out to an infodb file.
#   $filename - optional (you don't generally want to pass one in); defaults to the db file
#               this object is currently working with, $self->{'oaidb_file_path'}
# Each record is written as a raw entry keyed on its OID, with the value holding
# <status>, <timestamp> and <datestamp> lines.
sub _save_info_db {
    my $self = shift (@_);
    my ($filename) = @_;

    if (!defined $filename) {
        $filename = $self->{'oaidb_file_path'};
    }
    my $infodbtype = $self->{'infodbtype'};

    # Open the file for writing out again, then write out the data structure
    # $self->{'info'} that has been maintaining the data in-memory.
    my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);

    my $infomap = $self->{'info'};
    foreach my $oid ( keys %$infomap ) {
        my $record = $infomap->{$oid};
        my $val = join("",
                       "<status>", $record->[INFO_STATUS_INDEX],
                       "\n<timestamp>", $record->[INFO_TIMESTAMP_INDEX],
                       "\n<datestamp>", $record->[INFO_DATESTAMP_INDEX], "\n");
        &dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$val);
    }

    &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);
}
642
# Saves the in-memory oai info records to file.
#   $filename - optional. If given and its name ends in "contents" or ".inf", the records
#               are written as a text file; any other name is written as an infodb file.
#               If not given, the records are written to the infodb file this object is
#               currently working with.
#
# The file type test used to be m/(contents)|(\.inf)$/, in which the $ anchor applies to
# the .inf alternative only: "contents" anywhere in the path (e.g. in a folder name) would
# then force a db file to be written out as text. Both alternatives are now anchored.
sub save_info {
    my $self = shift (@_);
    my ($filename) = @_;

    if (!defined $filename) {
        $self->_save_info_db();
    }
    elsif ($filename =~ m/(?:contents|\.inf)$/) {
        $self->_save_info_txt($filename);
    }
    else {
        $self->_save_info_db($filename);
    }
}
658
659
# Sets the in-memory record for OID, replacing any existing record or adding a new one.
#   $OID        - identifier of the record
#   $del_status - deletion status to store
#   $timestamp  - optional; the current time is used if undefined
# The record's datestamp is always derived from the timestamp via get_datestamp().
sub set_info { # sets existing or appends
    my ($self, $OID, $del_status, $timestamp) = @_;

    $timestamp = $self->get_current_time() unless defined $timestamp;
    my $derived_datestamp = $self->get_datestamp($timestamp);

    $self->{'info'}->{$OID} = [$del_status, $timestamp, $derived_datestamp];
}
672
# Stores a single record, as loaded from file, in memory. All 4 fields should be provided:
# unlike set_info(), nothing is defaulted or derived, the values are stored exactly as given
# (fields not passed in are stored as undef).
sub add_info {
    my ($self, $OID, @fields) = @_;

    # always a 3-element record: [deletion status, timestamp, datestamp]
    $self->{'info'}->{$OID} = [ @fields[0 .. 2] ];
}
679
680
# Returns a reference to a list holding one entry per in-memory record, of the form
# [[OID, deletion_status, timestamp, datestamp], ...]
sub get_OID_list
{
    my $self = shift (@_);

    my $infomap = $self->{'info'};

    my @list = map {
        my $record = $infomap->{$_};
        [$_,
         $record->[INFO_STATUS_INDEX],
         $record->[INFO_TIMESTAMP_INDEX],
         $record->[INFO_DATESTAMP_INDEX]];
    } keys %$infomap;

    return \@list;
}
700
701
# Returns the number of records held in memory so far, whatever their status
# (so including deleted ones).
# http://stackoverflow.com/questions/1109095/how-can-i-find-the-number-of-keys-in-a-hash-in-perl
sub size {
    my ($self) = @_;

    my $infomap = $self->{'info'};
    my $num_records = scalar(keys %$infomap);
    return $num_records;
}
709
7101;
Note: See TracBrowser for help on using the browser.