Changeset 20616 for gsdl/trunk


Ignore:
Timestamp:
2009-09-16T15:55:57+12:00 (15 years ago)
Author:
kjdon
Message:

First step at bringing export into line with import regarding incremental export

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/export.pl

    r20571 r20616  
    2727
    2828
    29 # This program will export a particular collection into a specific Format (e.g. METS or DSpace)
     29# This program will export a particular collection into a specific Format (e.g. METS or DSpace) by importing then saving as a different format.
    3030
    3131package export;
     
    3636    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    3737    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
     38    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan/perl-5.8");
    3839    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    3940    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugouts");
     
    6162use plugout;
    6263use manifest;
     64use inexport;
    6365use util;
    6466use scriptutil;
     
    143145    'reqd' => "no",
    144146        'hiddengli' => "yes" },
    145       { 'name' => "listall",
    146     'desc' => "{export.listall}",
    147     'type' => "flag",
    148     'reqd' => "no" },
    149147      { 'name' => "debug",
    150148    'desc' => "{export.debug}",
     
    159157        'modegli' => "3" },
    160158      # does this make sense?
    161 #      { 'name' => "incremental",
    162 #   'desc' => "{import.incremental}",
    163 #   'type' => "flag",
    164 #   'hiddengli' => "yes" },
     159      { 'name' => "incremental",
     160    'desc' => "{import.incremental}",
     161    'type' => "flag",
     162    'hiddengli' => "yes" },
    165163      { 'name' => "keepold",
    166164    'desc' => "{export.keepold}",
     
    172170    'type' => "flag",
    173171    'reqd' => "no",
    174     'modegli' => "3" },
     172    'hiddengli' => "yes" },
    175173      { 'name' => "language",
    176174    'desc' => "{scripts.language}",
    177175    'type' => "string",
    178176    'reqd' => "no",
    179     'modegli' => "3" },
     177    'hiddengli' => "yes" },
    180178      { 'name' => "maxdocs",
    181179    'desc' => "{export.maxdocs}",
     
    254252    'reqd' => "no",
    255253    'hiddengli' => "yes" },
     254      { 'name' => "listall",
     255    'desc' => "{export.listall}",
     256    'type' => "flag",
     257    'reqd' => "no" },
    256258      { 'name' => "xml",
    257259    'desc' => "{scripts.xml}",
     
    280282    $OIDtype, $OIDmetadata,
    281283    $maxdocs, $statsfile,
     284    $gzip,
    282285    $out, $faillog, $gli, $listall,
    283286    # plugout specific ones
     
    289292    # other vars
    290293    my ($configfilename, $collection, $collectcfg,
    291     $expinfo_doc_filename, $export_info,
     294    $expinfo_doc_filename, $expinfo_src_filename, $export_info,
    292295    $gs_mode,
    293296    $processor, $pluginfo);
     
    311314    }
    312315
    313    
    314     # these are options used by other things - we just set default values
    315     # undef means will be set from config file if there
    316     my $gzip = undef;
    317 
     316   
    318317    # If $language has been specified, load the appropriate resource bundle
    319318    # (Otherwise, the default resource bundle will be loaded automatically)
     
    420419    }
    421420   
     421    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
     422    $manifest = $collectcfg->{'manifest'};
     423    }
    422424    if (defined $collectcfg->{'gzip'} && !$gzip) {
    423425    if ($collectcfg->{'gzip'} =~ /^true$/i) {
     
    433435    }
    434436   
     437    # groupsize is in import - does it make sense here??
     438
    435439    if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
    436440    if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
     
    441445    }
    442446
     447    if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
     448    if (defined $collectcfg->{'OIDmetadata'}) {
     449        $OIDmetadata = $collectcfg->{'OIDmetadata'};
     450    } else {
     451        $OIDmetadata = "dc.Identifier"; # the default
     452    }
     453    }
     454
    443455    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
    444456    $debug = 1;
     
    496508    } elsif ($saveas =~ m/^.*METS$/ || $saveas eq "MARCXML" ) {
    497509##  $expinfo_doc_filename = &util::filename_cat ($exportdir, "export.inf");
    498     my $doc_db = "archiveinf-doc";
    499     $expinfo_doc_filename = &util::filename_cat ($exportdir, $doc_db);
     510    $expinfo_doc_filename = &util::filename_cat ($exportdir,"archiveinf-doc" );
    500511    &util::rename_gdbm_file($expinfo_doc_filename); # ensures gdb in case we have an existing legacy ldb one - can this happen?
    501512    $expinfo_doc_filename .= ".gdb";
     513   
     514    $expinfo_src_filename = &util::filename_cat ($exportdir,"archiveinf-src" );
     515    &util::rename_gdbm_file($expinfo_src_filename); # ensures gdb in case we have an existing legacy ldb one - can this happen?
     516    $expinfo_src_filename .= ".gdb";
    502517
    503518    }
     
    506521    $export_info -> load_info ($expinfo_doc_filename); 
    507522       
     523    if ($manifest eq "") {
     524    # Load in list of files in export folder from last export (if present)
     525    $export_info->load_prev_import_filelist ($expinfo_src_filename);
     526    }
     527   
    508528    my ($plugout);
    509529    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
     
    511531    }
    512532    else{
    513     if ($saveas !~ /^(.*METS|DSpace|MARCXML)$/) {
     533    if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
    514534        push @$plugout,"GreenstoneMETSPlugout";
    515535    }
     
    524544    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    525545    push @$plugout,("-debug") if ($debug);
    526     push @$plugout,("-gzip_output",$gzip) if (defined $gzip);
     546    push @$plugout,("-gzip_output") if ($gzip);
    527547    push @$plugout,("-output_handle",$out) if (defined $out);
    528548    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
     
    546566    # global blocking pass may set up some metadata
    547567    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    548     &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     568    #&plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     569    ### section below copied from import.pl
     570    if ($incremental) {
     571        # equivalent to saying ($keepold && ($incremental_mode eq "all"))
     572
     573        &inexport::prime_doc_oid_count($exportdir);
     574
     575
     576        # Can now work out which files were new, already existed, and have
     577        # been deleted
     578       
     579        &inexport::new_vs_old_import_diff($export_info,$block_hash,$importdir,
     580                          $exportdir,$verbosity,$incremental_mode);
     581       
     582        my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
     583        # Filter out any in gsdl/tmp area
     584        my @filtered_deleted_files = ();
     585        my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
     586        my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
     587        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
     588        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
     589
     590
     591        foreach my $df (@deleted_files) {
     592        next if ($df =~ m/^$gsdl_tmp_area/);
     593        next if ($df =~ m/^$collect_tmp_area/);
     594       
     595        push(@filtered_deleted_files,$df);
     596        }
     597
     598       
     599        @deleted_files = @filtered_deleted_files;
     600
     601        if (scalar(@deleted_files>0)) {
     602        print STDERR "Files deleted since last import:\n  ";
     603        print STDERR join("\n  ",@deleted_files), "\n";
     604        }
     605       
     606        my @new_files = sort keys %{$block_hash->{'new_files'}};
     607        if (scalar(@new_files>0)) {
     608        print STDERR "New files since last import:\n  ";
     609        print STDERR join("\n  ",@new_files), "\n";
     610        }
     611       
     612        &inexport::mark_docs_for_deletion($export_info,$block_hash,\@deleted_files,
     613                          $exportdir,$verbosity);
     614
     615        &inexport::mark_docs_for_reindex($export_info,$block_hash,
     616                         $exportdir,$verbosity);
     617
     618        my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
     619
     620        if (scalar(@reindex_files>0)) {
     621        print STDERR "Files to reindex since last import:\n  ";
     622        print STDERR join("\n  ",@reindex_files), "\n";
     623        }
     624
     625
     626        # not sure if the following will work -- will the metadata data-structure be correctly initialized
     627        # in the right order?
     628#       foreach my $file (@new_files, @reindex_files) {
     629#       &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     630#       }
     631
     632
     633        # Play it safe, and run through the entire folder, only processing new or edited files
     634        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     635
     636    }
     637    else {
     638        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     639    }
     640
     641    ### end copy
    549642    }
    550643    else {
     
    553646        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    554647    }
     648
     649    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
     650
     651    &inexport::mark_docs_for_deletion($export_info,{},\@deleted_files,$exportdir);
     652
    555653    }
    556654
     
    578676    &plugin::deinit($pluginfo, $processor);
    579677       
     678    # Store the value of OIDCount (used in doc.pm) so it can be
     679    # restored correctly to this value on an incremental build
     680    &inexport::store_doc_oid_count($exportdir);
     681
    580682    # write out the export information file
    581683    #$processor->close_file_output() if $groupsize > 1;
     
    587689##  $export_info->save_info($expinfo_doc_filename);
    588690#    }
    589 
    590     my $expinfo_src_filename = &util::filename_cat ($exportdir, "archiveinf-src");   
    591     &util::rename_gdbm_file($expinfo_src_filename); # ensures gdb
    592     $expinfo_src_filename .= ".gdb";
    593691
    594692
Note: See TracChangeset for help on using the changeset viewer.