Changeset 18440 for gsdl


Ignore:
Timestamp:
02/01/09 14:35:22 (12 years ago)
Author:
davidb
Message:

Modifications for incremental building to support files that need to be deleted

Location:
gsdl/trunk/bin/script
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/export.pl

    r17142 r18440  
    288288    # other vars
    289289    my ($configfilename, $collection, $collectcfg,
    290     $export_info_filename, $export_info,
     290    $expinfo_doc_filename, $export_info,
    291291    $gs_mode,
    292292    $processor, $pluginfo);
     
    498498    # the plugouts should be doing this!!
    499499    if ($saveas eq "DSpace"){
    500     $export_info_filename = &util::filename_cat ($exportdir, "contents");
     500    $expinfo_doc_filename = &util::filename_cat ($exportdir, "contents");
    501501    } elsif ($saveas =~ m/^.*METS$/ || $saveas eq "MARC" ) {
    502     $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
     502##  $expinfo_doc_filename = &util::filename_cat ($exportdir, "export.inf");
     503    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
     504    my $doc_db = "archiveinf-doc$db_ext";
     505    $expinfo_doc_filename = &util::filename_cat ($exportdir, $doc_db);
    503506    }
    504507       
    505508    $export_info = new arcinfo();
    506     $export_info -> load_info ($export_info_filename); 
     509    $export_info -> load_info ($expinfo_doc_filename); 
    507510       
    508511    my ($plugout);
     
    550553    else {
    551554    # process any files marked for exporting
    552     foreach my $file (keys %{$manifest_lookup->{'export'}}) {
     555    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    553556        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    554557    }
     
    581584    #$processor->close_file_output() if $groupsize > 1;
    582585    $processor->close_group_output() if $processor->is_group();
    583     # why do we need this??
    584     if ($saveas =~ m/^.*METS$/) {
    585     $export_info->save_info($export_info_filename);
     586
     587    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
     588    # Not all export types need this (e.g. DSpace)
     589    $export_info->save_info($expinfo_doc_filename);
    586590    }
    587591       
  • gsdl/trunk/bin/script/import.pl

    r17751 r18440  
    287287    # other vars
    288288    my ($configfilename, $collection, $collectcfg,
    289     $archive_info_filename, $archive_info,
     289    $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
    290290    $gs_mode,
    291291    $processor, $pluginfo);
     
    518518
    519519    # read the archive information file
    520     $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
     520##  $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
     521
     522    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb";
     523    my $doc_db = "archiveinf-doc$db_ext";
     524    my $src_db = "archiveinf-src$db_ext";
     525    $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db);
     526    $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db);
    521527   
    522528    $archive_info = new arcinfo ();
    523     $archive_info->load_info ($archive_info_filename);
     529    $archive_info->load_info ($arcinfo_doc_filename);
    524530    if ($reversesort) {
    525531    $archive_info->reverse_sort();
     532    }
     533
     534    if ($manifest eq "") {
     535    # Load in list of files in import folder from last import (if present)
     536    $archive_info->load_import_filelist ($arcinfo_src_filename);
    526537    }
    527538
     
    560571    # global blocking pass may set up some metadata
    561572    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     573    # Can now work out which files were deleted
     574
     575    # First convert all files to absolute form
     576    # This is to support the situation where the import folder is not
     577    # the default
     578
     579    my $prev_all_files = $archive_info->{'import_filelist'};
     580    foreach my $prev_file (keys %$prev_all_files) {
     581
     582        if (!&util::filename_is_absolute($prev_file)) {
     583        my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
     584        delete $prev_all_files->{$prev_file};
     585        $prev_all_files->{$full_prev_file} = 1;
     586        }
     587    }
     588
     589    # Figure out which are the new files, existing files and so
     590    # by implication the files from the previous import that are not
     591    # there any more => mark them for deletion
     592    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
     593
     594        my $full_curr_file = $curr_file;
     595
     596        if (!&util::filename_is_absolute($curr_file)) {
     597        # add in import dir to make absolute
     598        $full_curr_file = &util::filename_cat($importdir,$curr_file);
     599        }
     600
     601##      print STDERR "**** Checking $curr_file\n";
     602
     603        # figure out if new file or not
     604        if (defined $prev_all_files->{$full_curr_file}) {
     605        # had it before
     606        $block_hash->{'existing_files'}->{$curr_file} = 1;
     607        # Now remove it, so by end of loop only the files
     608        # that need deleting are left
     609
     610        delete $prev_all_files->{$full_curr_file}
     611        }
     612        else {
     613        $block_hash->{'new_files'}->{$curr_file} = 1;
     614        }
     615
     616        delete $block_hash->{'all_files'}->{$curr_file};
     617    }
     618
     619    print STDERR "Delete files:\n  ";
     620
     621    my @delete_files = keys %$prev_all_files;
     622    print STDERR join("\n  ",@delete_files), "\n";
     623
    562624    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    563625    }
     
    565627    {
    566628    # process any files marked for importing
    567     foreach my $file (keys %{$manifest_lookup->{'import'}}) {
     629    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    568630        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    569631    }
     
    571633    # record files marked for deletion in arcinfo
    572634    foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
    573         # consider finding it?
     635        # use 'archiveinf-src' GDBM file to look up all the OIDs
     636        # this file is used in (note in most cases, it's just one OID)
     637
     638        # An improvement would be to have the record read
     639        # into a hash array
     640        # gdbmRecordToHash
     641
     642        my $gdbm_val
     643        = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file);
     644
     645        my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm);
     646        foreach my $oid (@oids) {
     647
     648        # find out if it's an assoc file or main doc
     649
     650        # archiveinf-doc, lookup $oid
     651        # if "doc-file"
     652        #   mark it for deletion
     653
     654        # else (assoc file)
     655        #  mark all for re-indexing
     656
     657        # Now delete file
     658        }
     659
    574660        # $archive_info->add_info($OID,$doc_xml_file,"D");
    575661    }
     
    585671
    586672# The following 'if' statement is in the export.pl version of the script,
    587 # but not (so far) the import.pl version.  Why is this?
    588 ##    if ($saveas =~ m/^.*METS$/) {
     673# The reason for the 'if' statement is now given in export.pl
     674# Unclear at this point if the same should be done here
     675##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
     676    # Not all export types need this (e.g. DSpace)
     677
    589678    # should we still do this in debug mode??
    590679
    591     $archive_info->save_info($archive_info_filename);
     680    # for backwards compatibility with archives.inf file
     681    if ($arcinfo_doc_filename =~ m/\.inf$/) {
     682    $archive_info->save_info($arcinfo_doc_filename);
     683    }
     684
    592685##    }
    593686   
  • gsdl/trunk/bin/script/lucene_passes.pl

    r16264 r18440  
    131131        $doc_xml = "";
    132132        $output_filename = "";
     133    }
     134    elsif ($line =~ m/<\/Delete>\s*$/) {
     135        if ($mode eq "index") {
     136        $doc_xml =~ s/\n+/\n/g;
     137
     138        # notify lucene indexer
     139        print PIPEOUT "$doc_xml";
     140        }
     141        $doc_xml = "";
    133142    }
    134143    }
Note: See TracChangeset for help on using the changeset viewer.