Ignore:
Timestamp:
2009-02-06T18:19:44+13:00 (15 years ago)
Author:
davidb
Message:

Support for reindexing a document added

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/inexport.pm

    r18457 r18469  
    2727
    2828use strict;
     29
     30use File::Basename;
    2931
    3032use util;
     
    5355    }
    5456
     57
    5558    # Figure out which are the new files, existing files and so
    5659    # by implication the files from the previous import that are not
     
    6871    }
    6972
     73    if (defined $block_hash->{'file_blocks'}->{$full_curr_file}) {
     74        # If in block list, we want to ignore it
     75        delete $block_hash->{'all_files'}->{$curr_file};
     76
     77        if (defined $full_prev_all_files->{$full_curr_file}) {
     78        # also make sure it is gone from 'previous' list so
     79        # not mistaken for a file that needs to be deleted
     80        delete $full_prev_all_files->{$full_curr_file};
     81        }
     82        next;
     83    }
     84
    7085    # figure out if new file or not
    7186    if (defined $full_prev_all_files->{$full_curr_file}) {
     87       
    7288        # had it before
    73         $block_hash->{'existing_files'}->{$curr_file} = 1;
     89        $block_hash->{'existing_files'}->{$full_curr_file} = 1;
     90
    7491        # Now remove it, so by end of loop only the files
    7592        # that need deleting are left
     
    7895        }
    7996    else {
    80         $block_hash->{'new_files'}->{$curr_file} = 1;
     97        $block_hash->{'new_files'}->{$full_curr_file} = 1;
    8198    }
    8299   
     
    84101    }
    85102
    86     # By this point full_prev_all_files contains only the files
    87     # that are not in the current import folder => i.e. files
    88     # to be deleted
     103    # By this point full_prev_all_files contains the files
     104    # mentioned in archiveinf-src.db but are not in the 'import'
     105    # folder (or whatever was specified through -importdir ...)
     106
     107    # This list can contain files that were created in the 'tmp' or
     108    # 'cache' areas (such as screen-size and thumbnail images).
    89109    #
    90     # The value in each key is its "local" import file name, which is what
    91     # we want to use
     110    # In building the final list of files to delete, we test to see if
     111    # it exists on the filesystem and if it does (unusual for a file
     112    # that's allegedly deleted!), suppress it from going into the final
     113    # list
     114
     115    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
     116
    92117    my @deleted_files = values %$full_prev_all_files;
    93     map { $block_hash->{'deleted_files'}->{$_} = 1 } @deleted_files;
     118    map { my $curr_file = $_;
     119      my $full_curr_file = $curr_file;
     120
     121      if (!&util::filename_is_absolute($curr_file)) {
     122          # add in import dir to make absolute
     123
     124          $full_curr_file = &util::filename_cat($collectdir,$curr_file);
     125      }
     126
     127
     128      if (!-e $full_curr_file) {
     129          $block_hash->{'deleted_files'}->{$curr_file} = 1;
     130      }
     131      } @deleted_files;
    94132}
    95133
     
    110148    # this file is used in (note in most cases, it's just one OID)
    111149   
    112     # An improvement would be to have the record read
    113     # into a hash array
    114150    my $src_rec = GDBMUtils::gdbmRecordToHash($arcinfo_src_filename,$file);
    115151    my $oids = $src_rec->{'oid'};
    116152    foreach my $oid (@$oids) {
    117153
    118         # find out if it's an assoc file or main doc
     154        # Find out if it's an assoc file or main doc
    119155
    120156        my $doc_rec = GDBMUtils::gdbmRecordToHash($arcinfo_doc_filename,$oid);
    121 ##      print STDERR "file = $file\n";
    122 
    123157        if ($doc_rec->{'src-file'}->[0] eq $file) {
    124         # mark it for deletion
     158        # It's the main doc
     159        # => mark it for deletion
     160   
    125161        if ($verbosity>1) {
    126             print STDERR "$oid marked to be deleted\n";
     162            print STDERR "$oid marked to be deleted from index on next buildcol.pl\n";
    127163        }
    128164        $archive_info->set_status_info($oid,"D");
    129165
    130166        my $val = &GDBMUtils::gdbmDatabaseGet($arcinfo_doc_filename,$oid);
    131         $val =~ s/^<index-status>(.*)$/<index-status>D/m;
    132         &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
     167        my ($index_status) = ($val =~ m/^<index-status>(.*)$/m);
     168        if ($index_status ne "D") {
     169            $val =~ s/^<index-status>(.*)$/<index-status>D/m;
     170            &GDBMUtils::gdbmDatabaseSet($arcinfo_doc_filename,$oid,$val);
     171            my $doc_file = $doc_rec->{'doc-file'}->[0];
     172
     173            my $doc_filename = &util::filename_cat($archivedir,$doc_file);
     174
     175
     176            my ($doc_tailname, $doc_dirname, $suffix)
     177            = File::Basename::fileparse($doc_filename, "\\.[^\\.]+\$");
     178
     179            print STDERR "Removing $doc_dirname\n" if ($verbosity>2);
     180
     181            &util::rm_r($doc_dirname);
     182           
     183        }
    133184        }
    134185        else {
     
    149200        }
    150201        }
     202
     203        GDBMUtils::gdbmDatabaseRemove($arcinfo_src_filename,$file);
    151204    }
    152205    }
     
    155208
    156209
     210sub mark_docs_for_reindex
     211{
     212    my ($archive_info,$existing_files_ref,$archivedir,$verbosity) = @_;
     213
     214    # Reindexing is accomplished by deleting the previously indexed
     215    # version of the document, and then allowing the new version to
     216    # be indexed (as would a new document be indexed).
     217    #
     218    # The first step (marking for deletion) is implemented by this routine.
     219    #
     220    # By default in Greenstone a new version of an index will hash to
     221    # a new unique OID, and the above strategy of reindex=delete+add
     222    # works fine.  A special case arises when a persistent OID is
     223    # allocated to a document (for instance through a metadata field),
     224    # and the second step to reindexing (see XXXX) detects this and
     225    # deals with it appropriately.
     226
     227    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
     228    my $doc_db = "archiveinf-doc$db_ext";
     229    my $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
     230
     231
     232    my $archiveinf_timestamp = -M $arcinfo_doc_filename;
     233
     234    my $reindex_files_ref = [];
     235
     236    foreach my $existing_filename (@$existing_files_ref) {
     237   
     238    if (-M $existing_filename < $archiveinf_timestamp) {
     239        # file is newer than last build
     240       
     241        my $existing_file = $existing_filename;
     242        my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
     243
     244        $existing_file =~ s/^$collectdir(\\|\/)?//;
     245       
     246        print STDERR "**** Deleting existing file: $existing_file\n";
     247
     248        push(@$reindex_files_ref,$existing_file);
     249    }
     250
     251    }
     252   
     253    mark_docs_for_deletion($archive_info,$reindex_files_ref,$archivedir,$verbosity);
     254
     255    return @$reindex_files_ref;
     256}
     257
     258
     259
    1572601;
Note: See TracChangeset for help on using the changeset viewer.