Changeset 18440 for gsdl

Show
Ignore:
Timestamp:
01.02.2009 14:35:22 (11 years ago)
Author:
davidb
Message:

Modifications for incremental building to support files that need to be deleted

Location:
gsdl/trunk/bin/script
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/export.pl

    r17142 r18440  
    288288    # other vars 
    289289    my ($configfilename, $collection, $collectcfg, 
    290     $export_info_filename, $export_info,  
     290    $expinfo_doc_filename, $export_info,  
    291291    $gs_mode,  
    292292    $processor, $pluginfo); 
     
    498498    # the plugouts should be doing this!! 
    499499    if ($saveas eq "DSpace"){ 
    500     $export_info_filename = &util::filename_cat ($exportdir, "contents"); 
     500    $expinfo_doc_filename = &util::filename_cat ($exportdir, "contents"); 
    501501    } elsif ($saveas =~ m/^.*METS$/ || $saveas eq "MARC" ) { 
    502     $export_info_filename = &util::filename_cat ($exportdir, "export.inf"); 
     502##  $expinfo_doc_filename = &util::filename_cat ($exportdir, "export.inf"); 
     503    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb"; 
     504    my $doc_db = "archiveinf-doc$db_ext"; 
     505    $expinfo_doc_filename = &util::filename_cat ($exportdir, $doc_db); 
    503506    } 
    504507         
    505508    $export_info = new arcinfo(); 
    506     $export_info -> load_info ($export_info_filename);   
     509    $export_info -> load_info ($expinfo_doc_filename);   
    507510         
    508511    my ($plugout);  
     
    550553    else { 
    551554    # process any files marked for exporting 
    552     foreach my $file (keys %{$manifest_lookup->{'export'}}) { 
     555    foreach my $file (keys %{$manifest_lookup->{'index'}}) { 
    553556        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 
    554557    } 
     
    581584    #$processor->close_file_output() if $groupsize > 1; 
    582585    $processor->close_group_output() if $processor->is_group(); 
    583     # why do we need this?? 
    584     if ($saveas =~ m/^.*METS$/) { 
    585     $export_info->save_info($export_info_filename); 
     586 
     587    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) { 
     588    # Not all export types need this (e.g. DSpace) 
     589    $export_info->save_info($expinfo_doc_filename); 
    586590    } 
    587591         
  • gsdl/trunk/bin/script/import.pl

    r17751 r18440  
    287287    # other vars 
    288288    my ($configfilename, $collection, $collectcfg,  
    289     $archive_info_filename, $archive_info,  
     289    $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,  
    290290    $gs_mode, 
    291291    $processor, $pluginfo); 
     
    518518 
    519519    # read the archive information file 
    520     $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf"); 
     520##  $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf"); 
     521 
     522    my $db_ext = &util::is_little_endian() ? ".ldb" : ".bdb"; 
     523    my $doc_db = "archiveinf-doc$db_ext"; 
     524    my $src_db = "archiveinf-src$db_ext"; 
     525    $arcinfo_doc_filename = &util::filename_cat ($archivedir, $doc_db); 
     526    $arcinfo_src_filename = &util::filename_cat ($archivedir, $src_db); 
    521527     
    522528    $archive_info = new arcinfo (); 
    523     $archive_info->load_info ($archive_info_filename); 
     529    $archive_info->load_info ($arcinfo_doc_filename); 
    524530    if ($reversesort) { 
    525531    $archive_info->reverse_sort(); 
     532    } 
     533 
     534    if ($manifest eq "") { 
     535    # Load in list of files in import folder from last import (if present) 
     536    $archive_info->load_import_filelist ($arcinfo_src_filename); 
    526537    } 
    527538 
     
    560571    # global blocking pass may set up some metadata 
    561572    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 
     573    # Can now work out which files were deleted 
     574 
     575    # First convert all files to absolute form 
     576    # This is to support the situation where the import folder is not 
     577    # the default 
     578 
     579    my $prev_all_files = $archive_info->{'import_filelist'}; 
     580    foreach my $prev_file (keys %$prev_all_files) { 
     581 
     582        if (!&util::filename_is_absolute($prev_file)) { 
     583        my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file); 
     584        delete $prev_all_files->{$prev_file}; 
     585        $prev_all_files->{$full_prev_file} = 1; 
     586        } 
     587    } 
     588 
     589    # Figure out which are the new files, existing files and so 
     590    # by implication the files from the previous import that are not 
     591    # there any more => mark them for deletion 
     592    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) { 
     593 
     594        my $full_curr_file = $curr_file; 
     595 
     596        if (!&util::filename_is_absolute($curr_file)) { 
     597        # add in import dir to make absolute 
     598        $full_curr_file = &util::filename_cat($importdir,$curr_file); 
     599        } 
     600 
     601##      print STDERR "**** Checking $curr_file\n"; 
     602 
     603        # figure out if new file or not 
     604        if (defined $prev_all_files->{$full_curr_file}) { 
     605        # had it before 
     606        $block_hash->{'existing_files'}->{$curr_file} = 1; 
     607        # Now remove it, so by end of loop only the files 
     608        # that need deleting are left 
     609 
     610        delete $prev_all_files->{$full_curr_file} 
     611        } 
     612        else { 
     613        $block_hash->{'new_files'}->{$curr_file} = 1; 
     614        } 
     615 
     616        delete $block_hash->{'all_files'}->{$curr_file}; 
     617    } 
     618 
     619    print STDERR "Delete files:\n  "; 
     620 
     621    my @delete_files = keys %$prev_all_files; 
     622    print STDERR join("\n  ",@delete_files), "\n"; 
     623 
    562624    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    563625    } 
     
    565627    { 
    566628    # process any files marked for importing 
    567     foreach my $file (keys %{$manifest_lookup->{'import'}}) { 
     629    foreach my $file (keys %{$manifest_lookup->{'index'}}) { 
    568630        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 
    569631    } 
     
    571633    # record files marked for deletion in arcinfo 
    572634    foreach my $file (keys %{$manifest_lookup->{'delete'}}) { 
    573         # consider finding it? 
     635        # use 'archiveinf-src' GDBM file to look up all the OIDs 
     636        # this file is used in (note in most cases, it's just one OID) 
     637 
     638        # An improvement would be to have the record read 
     639        # into a hash array 
     640        # gdbmRecordToHash 
     641 
     642        my $gdbm_val  
     643        = &GDBMUtil::gdbmDatabaseGet($arcinfo_src_filename,$file); 
     644 
     645        my @oids = ($gdbm_val =~ m/^<oid>(.*)$/gm); 
     646        foreach my $oid (@oids) { 
     647 
     648        # find out if it's an assoc file or main doc 
     649 
     650        # archiveinf-doc, lookup $oid 
     651        # if "doc-file"  
     652        #   mark it for deletion 
     653 
     654        # else (assoc file) 
     655        #  mark all for re-indexing 
     656 
     657        # Now delete file  
     658        } 
     659 
    574660        # $archive_info->add_info($OID,$doc_xml_file,"D"); 
    575661    } 
     
    585671 
    586672# The following 'if' statement is in the export.pl version of the script, 
    587 # but not (so far) the import.pl version.  Why is this? 
    588 ##    if ($saveas =~ m/^.*METS$/) { 
     673# The reason for the 'if' statement is now given in export.pl 
     674# Unclear at this point if the same should be done here 
     675##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) { 
     676    # Not all export types need this (e.g. DSpace) 
     677 
    589678    # should we still do this in debug mode?? 
    590679 
    591     $archive_info->save_info($archive_info_filename); 
     680    # for backwards compatibility with archives.inf file 
     681    if ($arcinfo_doc_filename =~ m/\.inf$/) { 
     682    $archive_info->save_info($arcinfo_doc_filename); 
     683    } 
     684 
    592685##    } 
    593686     
  • gsdl/trunk/bin/script/lucene_passes.pl

    r16264 r18440  
    131131        $doc_xml = ""; 
    132132        $output_filename = ""; 
     133    } 
     134    elsif ($line =~ m/<\/Delete>\s*$/) { 
     135        if ($mode eq "index") { 
     136        $doc_xml =~ s/\n+/\n/g; 
     137 
     138        # notify lucene indexer 
     139        print PIPEOUT "$doc_xml"; 
     140        } 
     141        $doc_xml = ""; 
    133142    } 
    134143    }