Ignore:
Timestamp:
2009-02-03T09:48:19+13:00 (15 years ago)
Author:
davidb
Message:

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. Lucene

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/import.pl

    r18440 r18456  
    6363use plugout;
    6464use manifest;
     65use inexport;
    6566use util;
    6667use scriptutil;
     
    534535    if ($manifest eq "") {
    535536    # Load in list of files in import folder from last import (if present)
    536     $archive_info->load_import_filelist ($arcinfo_src_filename);
     537    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    537538    }
    538539
     
    571572    # global blocking pass may set up some metadata
    572573    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    573     # Can now work out which files were deleted
    574 
    575     # First convert all files to absolute form
    576     # This is to support the situation where the import folder is not
    577     # the default
    578 
    579     my $prev_all_files = $archive_info->{'import_filelist'};
    580     foreach my $prev_file (keys %$prev_all_files) {
    581 
    582         if (!&util::filename_is_absolute($prev_file)) {
    583         my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
    584         delete $prev_all_files->{$prev_file};
    585         $prev_all_files->{$full_prev_file} = 1;
    586         }
    587     }
    588 
    589     # Figure out which are the new files, existing files and so
    590     # by implication the files from the previous import that are not
    591     # there any more => mark them for deletion
    592     foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
    593 
    594         my $full_curr_file = $curr_file;
    595 
    596         if (!&util::filename_is_absolute($curr_file)) {
    597         # add in import dir to make absolute
    598         $full_curr_file = &util::filename_cat($importdir,$curr_file);
    599         }
    600 
    601 ##      print STDERR "**** Checking $curr_file\n";
    602 
    603         # figure out if new file or not
    604         if (defined $prev_all_files->{$full_curr_file}) {
    605         # had it before
    606         $block_hash->{'existing_files'}->{$curr_file} = 1;
    607         # Now remove it, so by end of loop only the files
    608         # that need deleting are left
    609 
    610         delete $prev_all_files->{$full_curr_file}
    611         }
    612         else {
    613         $block_hash->{'new_files'}->{$curr_file} = 1;
    614         }
    615 
    616         delete $block_hash->{'all_files'}->{$curr_file};
    617     }
    618 
    619     print STDERR "Delete files:\n  ";
    620 
    621     my @delete_files = keys %$prev_all_files;
    622     print STDERR join("\n  ",@delete_files), "\n";
     574    # Can now work out which files were new, already existed, and have
     575    # been deleted
     576
     577    &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir);
     578   
     579    my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
     580    if (scalar(@deleted_files>0)) {
     581        print STDERR "Delete files:\n  ";
     582        print STDERR join("\n  ",@deleted_files), "\n";
     583    }
     584
     585    my @new_files = sort keys %{$block_hash->{'new_files'}};
     586    if (scalar(@new_files>0)) {
     587        print STDERR "New files:\n  ";
     588        print STDERR join("\n  ",@new_files), "\n";
     589    }
     590
     591    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir,
     592        $verbosity);
    623593
    624594    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    625595    }
    626     else
     596                   else
    627597    {
    628598    # process any files marked for importing
     
    631601    }
    632602
    633     # record files marked for deletion in arcinfo
    634     foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
    635         # use 'archiveinf-src' GDBM file to look up all the OIDs
    636         # this file is used in (note in most cases, it's just one OID)
    637 
    638         # An improvement would be to have the record read
    639         # into a hash array
    640         # gdbmRecordToHash
    641 
    642         my $gdbm_val
    643         = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file);
    644 
    645         my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm);
    646         foreach my $oid (@oids) {
    647 
    648         # find out if it's an assoc file or main doc
    649 
    650         # archiveinf-doc, lookup $oid
    651         # if "doc-file"
    652         #   mark it for deletion
    653 
    654         # else (assoc file)
    655         #  mark all for re-indexing
    656 
    657         # Now delete file
    658         }
    659 
    660         # $archive_info->add_info($OID,$doc_xml_file,"D");
    661     }
     603    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
     604
     605    &inexport::mark_docs_for_deletion($archive_info,\@deleted_files,$archivedir);
    662606    }
    663607
Note: See TracChangeset for help on using the changeset viewer.