Changeset 37152 for main


Ignore:
Timestamp:
2023-01-18T15:22:41+13:00 (15 months ago)
Author:
davidb
Message:

The commit adds the newly developed 'File-Level Document-Version History' feature (fldv-history for short in the code). The work targets the 'archives' directory area. The core idea is to have older versions of an archives document folder contained inside the latest generated version. To achieve this, inside an archives document folder there can now be a '_fldv_history' folder, and inside that there are sub-folders conforming to the pattern 'nminus-1', 'nminus-2', ... These are the older versions of the archives document. The folder names are literally as just typed: nminus-1 contains the most recent stored version of the document; nminus-2 (if it exists) is the second most recent version, and so on. When import.pl is run with -incremental and -keepold, any existing documents that need to be re-processed will trigger the formation of _fldv_history/nminus-1, storing the previous version in it. If import.pl -incremental -keepold is run again and the doc has changed again, then nminus-1 is moved to nminus-2, and a new nminus-1 is generated. With the addition of this new feature, there is now a use-case for running -keepold without -incremental (i.e. in the mode the code calls 'onlyadd'). To be clear, the 'onlyadd' functionality still works; however, the code now does extra work to ensure that any existing documents in archives get the file-level document-version history treatment as well. This allows a collection builder to manually add content into import (and even choose to leave previously processed content there if they want), and when import.pl -keepold is next run, a 'collection-wide' file-level document-version history of existing documents is triggered in 'archives'. The idea of a 'collection-wide' (global) document history mechanism could be a useful way for a user to manage their collection.
Hardlinking is used throughout the new code, so what occurs on the file system is not particularly expensive, although the overall collection build takes longer than 'onlyadd' as it does reprocess the existing documents again. In the case of a user running import.pl -keepold and realizing this was in fact a mistake, there is a new minus option '-replaceold'. This works in much the same way as 'keepold', except that when it comes to the file-level document-version history feature, it does not add yet another stored document version; rather, it replaces the one previously stored at 'nminus-1' with this one, effectively undoing the 'mistake' of the previous build.

Location:
main/trunk/greenstone2/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/inexport.pm

    r36471 r37152  
    4747use parse2;
    4848
     49use DocHistoryFileUtils;
     50use FileUtils;
     51
    4952use File::Basename;
    5053
     
    121124       'reqd' => "no",
    122125       'hiddengli' => "yes" },
     126     { 'name' => "replaceold",
     127       'desc' => "{import.replaceold}",
     128       'type' => "flag",
     129       'reqd' => "no",
     130       'hiddengli' => "yes" },
    123131     { 'name' => "removeold",
    124132       'desc' => "{import.removeold}",
     
    428436    $archivedir = &FileUtils::sanitizePath($archivedir);
    429437    }
     438
     439    my $archivedir_keepold = "${archivedir}_keepold"; # used when file-level document-version history is in play
    430440    $self->{'archivedir'} = $archivedir;
    431 
     441    $self->{'archivedir_keepold'} = $archivedir_keepold;
     442   
    432443    if (defined $self->{'default_verbosity'}) {
    433444    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
     
    478489    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";
    479490
    480     my ($removeold, $keepold, $incremental, $incremental_mode)
    481     = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
    482                            $self->{'incremental'}, $checkdir,
    483                            $collectcfg);
     491    my ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
     492    = &scriptutil::check_removeold_keepold_replaceold($self->{'removeold'}, $self->{'keepold'}, $self->{'replaceold'},
     493                              $self->{'incremental'}, $checkdir,
     494                              $collectcfg);
    484495
    485496    $self->{'removeold'}        = $removeold;
    486497    $self->{'keepold'}          = $keepold;
     498    $self->{'replaceold'}       = $replaceold;
    487499    $self->{'incremental'}      = $incremental;
    488500    $self->{'incremental_mode'} = $incremental_mode;
     
    507519
    508520    my $importdir   = $self->{'importdir'};
    509     my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
     521    my $archivedir  = $self->{'archivedir'} || $self->{'exportdir'};
     522    # 'archivedir' is a tad abused, and is sometimes set to the 'exportdir' value,
     523    # meaining 'archivedir_keepold' is actually the export dir name with '_keepold' appended
     524    my $archivedir_keepold  = $self->{'archivedir_keepold'};
    510525
    511526    my $incremental = $self->{'incremental'};
     
    515530
    516531    my $removeold   = $self->{'removeold'};
     532    my $replaceold  = $self->{'replaceold'};
    517533    my $keepold     = $self->{'keepold'};
    518534
     
    551567    }
    552568
    553     my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
     569    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir); 
    554570    if ($self->{'manifest'} ne "") {
    555571    my $manifest_filename = $self->{'manifest'};
     
    590606    }
    591607
    592     # remove the old contents of the archives directory (and tmp
    593     # directory) if needed
    594 
    595     if ($removeold) {
     608    # Whether -removeold, -keepold or -replaceold there should never be an existing archivedir_keepold
     609    # => Taken to be a sign of a previous import/export that has gone wrong
     610    # => Print out error message and stop!
     611   
     612    if (&FileUtils::directoryExists($archivedir_keepold)) {
     613    my $rkr_old_minus_option = undef; # rkr = remove, keep, replace (whichever one is being used)
     614    if ($removeold) {
     615        $rkr_old_minus_option = "-removeold";
     616    }
     617    elsif ($keepold) {
     618        $rkr_old_minus_option = "-keepold";
     619    }
     620    elsif ($replaceold) {
     621        $rkr_old_minus_option = "-replaceold";
     622    }
     623   
     624    &gsprintf(STDERR, "\n");
     625    &gsprintf(STDERR, "Detected existing directory:\n\n");
     626    &gsprintf(STDERR, "    $archivedir_keepold\n\n");
     627    &gsprintf(STDERR, "Stopping $inexport_mode.\n\n");
     628   
     629    &gsprintf(STDERR, "**** When building with $rkr_old_minus_option, there cannot be a pre-existing 'archives_keepold' directory\n");
     630    &gsprintf(STDERR, "****\n");
     631    &gsprintf(STDERR, "**** Review your collection directory folder, and determine whether to:\n");
     632    &gsprintf(STDERR, "****   (a) move your 'archives_keepold' back to being 'archives'; or\n");
     633    &gsprintf(STDERR, "****   (b) remove your 'archives_keepold'\n");
     634    &gsprintf(STDERR, "**** before running your $inexport_mode command again\n\n");
     635   
     636    exit 1; # c errno for 'operation not permitted'
     637    }
     638
     639
     640    # remove the old contents of the archives directory (and tmp directory) if needed
     641
     642    if ($removeold) {   
    596643    if (&FileUtils::directoryExists($archivedir)) {
    597644        &gsprintf($out, "{import.removing_archives}\n");
     
    605652        &FileUtils::removeFilesRecursive($tmpdir);
    606653    }
     654    }
     655    else { 
     656    # If not $removeold, then must be $keepold or $replaceold
     657    # => for either case want to "hard-link"/copy 'archives' to 'archives_keepold'
     658
     659    # Want to be super careful about doing this, so as not to accidentally
     660    # wipe out any previous file-level document-version history
     661
     662    # If got to here, then there is no pre-existing $archivedir_keepold
     663    # => Hard-link copy the contents of 'archives' to 'archives_keepold'
     664    # => Stop if there is any issue with creating the hard-link copy
     665   
     666    if (!&FileUtils::hardlinkFilesRefRecursive([$archivedir],$archivedir_keepold, { 'strict' => 1 } )) {
     667       
     668        &gsprintf(STDERR, "\nError message: $!\n\n");
     669       
     670        &gsprintf(STDERR, "**** Failed to make a hard-link copy of:\n");
     671        &gsprintf(STDERR, "****     $archivedir\n");
     672        &gsprintf(STDERR, "**** to:\n");
     673        &gsprintf(STDERR, "****     $archivedir_keepold\n");
     674        &gsprintf(STDERR, "****\n");
     675        &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
     676
     677        exit $!;
     678    }   
    607679    }
    608680
     
    627699    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
    628700
     701
    629702    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    630     $archive_info->load_info ($arcinfo_doc_filename);
    631     # load in rev info so we don't overwrite existing info when we do incremental import
    632     # from here on, make all changes to this object, then write out the file at the end.
     703    $archive_info->load_info($arcinfo_doc_filename);
     704    # Load in reverse-lookup info (used to determine the docs that a file in import are used in),
     705    #   so we don't overwrite existing info when we do incremental import
     706    # From here on, make all changes to this object, then write out the file at the end.
    633707    $archive_info->load_rev_info($arcinfo_src_filename);
    634708
     
    710784    $block_hash->{'new_files'} = {};
    711785    $block_hash->{'reindex_files'} = {};
    712     # all of these are set somewhere else, so it's more readable to define them
    713     # here [jmt12]
     786
     787    # all of these are set somewhere else, so it's more readable to define them here [jmt12]
    714788    $block_hash->{'all_files'} = {};
    715789    $block_hash->{'deleted_files'} = {};
     
    829903
    830904    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
     905   
    831906    # need to check this file exists before trying to read it - in the past
    832907    # it wasn't possible to have a manifest unless keepold was also set so
     
    902977        # Can now work out which files were new, already existed, and have
    903978        # been deleted
    904        
     979
    905980        new_vs_old_import_diff($archive_info,$block_hash,$importdir,
    906981                   $archivedir,$verbosity,$incremental_mode);
     
    10071082    if ($self->{'OIDtype'} eq 'incremental')
    10081083    {
    1009     store_doc_oid_count($archivedir);
     1084    store_doc_oid_count($archivedir); 
    10101085    }
    10111086
     
    10141089    $processor->end();
    10151090   
    1016     #    if ($inexport_mode eq "import") {
    10171091    if ($self->{'generate_auxiliary_files'}) {
     1092       
    10181093    # write out the archive information file
    10191094    # for backwards compatability with archvies.inf file
    10201095    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
     1096        # In the days of this being a text file, this all we had to do
     1097        # Note, if still using this form of archive-inf, then neither
     1098        # incremental building nor files-level document-version history
     1099        # is suported
    10211100        $archive_info->save_info($arcinfo_doc_filename);
    10221101    }
     
    10251104    }
    10261105    }
     1106
     1107
     1108    #
     1109    # Now deal with any file-level document-version history (fldv-history)
     1110    #
     1111
     1112    if ($keepold || $removeold) {
     1113
     1114    &DocHistoryFileUtils::archivedir_keepold_to_archivedir($collectcfg, $keepold, $replaceold, $incremental_mode, $archive_info, $archivedir,$archivedir_keepold);
     1115   
     1116    }
     1117   
     1118   
    10271119    return $pluginfo;
    10281120}
     
    10501142    elsif ($manifest eq '' || $self->{'manifest_version'} == 1)
    10511143    {
     1144    #print STDERR "**** perform_process_files(): importdir=$importdir\n";
     1145    #print STDERR "**** block_hash:\n  ", join("\n  ", keys %{$block_hash}), "\n\n";
     1146    #print STDERR "**** block_hash->all_files:\n  ", join("\n  ", keys %{$block_hash->{'all_files'}}), "\n\n";
     1147    #print STDERR "**** block_hash->reindex_files:\n  ", join("\n  ", keys %{$block_hash->{'reindex_files'}}), "\n\n";
     1148
     1149    #print STDERR "**** block_hash->existing_files:\n  ", join("\n  ", keys %{$block_hash->{'existing_files'}}), "\n\n";
     1150    #print STDERR "**** block_hash->file_blocks:\n  ", join("\n  ", keys %{$block_hash->{'file_blocks'}}), "\n\n";
     1151
    10521152    &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    10531153    }
     
    11971297   
    11981298    my $prev_all_files = $archive_info->{'prev_import_filelist'};
     1299   
    11991300    my $full_prev_all_files = {};
    12001301
     
    12281329    }
    12291330
     1331    ###print STDERR "*** new vs old: look to see if full_curr_file=$full_curr_file in full_prev_all_files hashmap\n";
     1332   
    12301333    # figure out if new file or not
    12311334    if (defined $full_prev_all_files->{$full_curr_file}) {
     
    13351438        #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
    13361439       
    1337         print STDERR "**** Reindexing existing file: $existing_file\n";
     1440        # print STDERR "**** Reindexing existing file: $existing_file\n";
    13381441
    13391442        push(@$reindex_files,$existing_file);
  • main/trunk/greenstone2/perllib/scriptutil.pm

    r20646 r37152  
    3232use gsprintf 'gsprintf';
    3333
    34 # returns $removeold, $keepold
     34# returns ($removeold, $keepold, $incremental, $incremental_mode)
    3535sub check_removeold_and_keepold {
    3636
    3737    my ($removeold, $keepold, $incremental, $dir, $collectcfg) = @_;   
    3838
    39     if (($keepold && $removeold) || ($incremental && $removeold) ) {
    40     gsprintf(STDERR, "{scripts.both_old_options}\n", $dir);
     39    if ($keepold && $removeold) {
     40    gsprintf(STDERR, "{scripts.only_one_old_option}\n");
     41    exit(2)
     42    }
     43
     44    if ($incremental && $removeold) {
     45    gsprintf(STDERR, "{scripts.inc_remove_conflict}\n", $dir);
    4146    sleep(3); #just in case
    4247    return (1,0,0,"none");
     
    4449    }
    4550
     51   
    4652    # Incremental mode may be set to "none", "onlyadd" or "all"
    4753    # depending on status of -keepold and -incremental flags
     
    8086}
    8187
     88
     89# returns ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
     90sub check_removeold_keepold_replaceold {
     91
     92    my ($removeold, $keepold, $replaceold, $incremental, $dir, $collectcfg) = @_;   
     93
     94    my $old_count = 0;
     95    $old_count++ if $removeold;
     96    $old_count++ if $keepold;
     97    $old_count++ if $replaceold;
     98
     99    if ($old_count>1) {
     100    gsprintf(STDERR, "{scripts.only_one_old_option}\n");
     101    exit(2);
     102    }
     103
     104    if (($incremental && $removeold) ) {
     105    gsprintf(STDERR, "{scripts.inc_remove_conflict}\n", $dir);
     106    sleep(5); #just in case
     107    return (1,0,0,0,"none");   
     108    }
     109
     110    # Determine what the internal 'incremental_mode' is:
     111    # => May be set to "none", "onlyadd" or "all"
     112    # Based on status of (-keepold|-removeold) and -incremental flags
     113    #
     114    # With the introduction of file-level document-version (fldv) history, the chosen name 'onlyadd'
     115    # for when '-keepold' is on is a bit misleading. However, it does still get us "over the line"
     116    # in terms of how it functionally operates.  In the case where pre-existing content is
     117    # still in the 'import' folder, then when everything in 'archives_keepold' gets copied
     118    # back, any pre-existing documents from import (which will have resulted in a doc folder
     119    # in 'archives') will trigger a file-level document-version history folder inside it.  For any
     120    # content that was new in 'import', it won't have a pre-existing folder inside 'archives'
     121    # and so will appear as a new folder with *no* file-level document-version history folder
     122    # inside it (effectively why the keepold incremental mode was originally called 'onlyadd').
     123
     124    my $incremental_mode = "none";
     125    if ($incremental) {
     126    $incremental_mode = "all";
     127    } elsif ($keepold || $replaceold) {
     128    $incremental_mode = "onlyadd";
     129    }
     130
     131    if (!$keepold && !$removeold && !$replaceold && !$incremental && defined $collectcfg) {
     132    # we only look at config file options if we don't have these on the command line
     133    if (defined $collectcfg->{'removeold'} && $collectcfg->{'removeold'} =~ /^true$/i ) {
     134        $removeold = 1;
     135    } elsif (defined $collectcfg->{'keepold'} && $collectcfg->{'keepold'} =~ /^true$/i) {
     136        $keepold = 1;
     137        $incremental_mode = "onlyadd";
     138    } elsif (defined $collectcfg->{'replaceold'} && $collectcfg->{'replaceold'} =~ /^true$/i) {
     139        $replaceold = 1;
     140        $incremental_mode = "onlyadd";
     141    } elsif (defined $collectcfg->{'incremental'} && $collectcfg->{'incremental'} =~ /^true$/i) {
     142        $incremental = 1;
     143        $incremental_mode = "all";
     144    }
     145
     146    # Go through the same checks as before
     147    my $cfg_old_count = 0;
     148    $cfg_old_count++ if $removeold;
     149    $cfg_old_count++ if $keepold;
     150    $cfg_old_count++ if $replaceold;
     151
     152    if ($cfg_old_count>1) {
     153        gsprintf(STDERR, "{scripts.only_one_old_option}\n");
     154        exit(2);
     155    }
     156
     157    if (($incremental && $removeold) ) {
     158        gsprintf(STDERR, "{scripts.inc_remove_conflict}\n", $dir);
     159        sleep(5); #just in case
     160        return (1,0,0,0,"none");   
     161    }
     162    }
     163
     164    # default to -removeold if nothing specified
     165    if (!$keepold && !$removeold && !$replaceold && !$incremental) {
     166    gsprintf(STDERR, "{scripts.no_old_options} \n", $dir);
     167    sleep(5); #just in case
     168    return (1,0,0,0,"none");
     169    }
     170   
     171    # incremental implies keepold
     172    if ($incremental) {
     173    $keepold = 1;
     174    }
     175    return ($removeold, $keepold, $replaceold, $incremental, $incremental_mode);
     176
     177}
     178
     179
     180
    821811;
  • main/trunk/greenstone2/perllib/strings.properties

    r37047 r37152  
    6262
    6363scripts.both_old_options:WARNING: -removeold was specified with -keepold or -incremental, defaulting to -removeold. Current contents of %s directory will be deleted.
     64
     65scripts.inc_remove_conflict:WARNING: -incremental and -removeold were specified. Defaulting to -removeold. Current contents of %s directory will be deleted.
     66
     67scripts.only_one_old_option:Error: conflicting 'old' options: can only specify one of -removeold, -keepold, -replaceold. Exiting.
    6468
    6569scripts.no_old_options:WARNING: None of -removeold, -keepold or -incremental were specified, defaulting to -removeold. Current contents of %s directory will be deleted.
Note: See TracChangeset for help on using the changeset viewer.