Changeset 22413


Ignore:
Timestamp:
2010-07-16T14:13:01+12:00 (12 years ago)
Author:
davidb
Message:

Initial pass at restructuring the main code of import.pl (and the very similar export.pl) into a shared module

Location:
main/trunk/greenstone2
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/import.pl

    r22037 r22413  
    6464
    6565use strict;
    66 no strict 'refs'; # allow filehandles to be variables and vice versa
    67 no strict 'subs'; # allow barewords (eg STDERR) as function arguments
    68 
    69 use arcinfo;
    70 use colcfg;
    71 use dbutil;
    72 use plugin;
    73 use plugout;
    74 use manifest;
    7566use inexport;
    76 use util;
    77 use scriptutil;
    78 use FileHandle;
    79 use gsprintf 'gsprintf';
    80 use printusage;
    81 use parse2;
    82 
    83 
    8467
    8568my $oidtype_list =
     
    258241    'range' => "0,",
    259242    # parsearg left "" as default
    260     #'deft' => "2",
     243    'deft' => "2",
    261244    'reqd' => "no",
    262245    'modegli' => "3" },
     
    277260
    278261
     262
     263sub main
     264{
     265    my $inexport = new inexport(\@ARGV,$options);
     266   
     267    my $collection = $inexport->get_collection();
     268    my $collect_cfg = $inexport->read_collection_cfg($collection,$options);   
     269    $inexport->set_collection_options("import",$collect_cfg);
     270   
     271    my $pluginfo = $inexport->process_files("import",$collect_cfg);
     272
     273    $inexport->generate_statistics("import",$pluginfo);
     274}
     275
     276
    279277&main();
    280 
    281 sub main {
    282     # params
    283     my ($language, $verbosity, $debug,
    284     $collectdir, $importdir, $archivedir, $site, $manifest,
    285     $incremental, $incremental_mode, $keepold, $removeold,
    286     $saveas,
    287     $OIDtype, $OIDmetadata,
    288     $maxdocs, $statsfile,
    289     $out, $faillog, $gli,
    290     $gzip, $groupsize,
    291     $sortmeta, $removeprefix, $removesuffix
    292     );
    293 
    294     my $xml = 0;
    295 
    296     # other vars
    297     my ($configfilename, $collection, $collectcfg,
    298     $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
    299     $gs_mode,
    300     $processor, $pluginfo);
    301 
    302     my $service = "import";
    303 
    304     my $hashParsingResult = {};
    305     # general options available to all plugins
    306     my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    307     # Parse returns -1 if something has gone wrong
    308     if ($intArgLeftinAfterParsing == -1)
    309     {
    310     &PrintUsage::print_txt_usage($options, "{import.params}");
    311     die "\n";
    312     }
    313    
    314     foreach my $strVariable (keys %$hashParsingResult)
    315     {
    316     eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    317     }
    318 
    319     # If $language has been specified, load the appropriate resource bundle
    320     # (Otherwise, the default resource bundle will be loaded automatically)
    321     if ($language && $language =~ /\S/) {
    322     &gsprintf::load_language_specific_resource_bundle($language);
    323     }
    324 
    325     if ($xml) {
    326         &PrintUsage::print_xml_usage($options);
    327     print "\n";
    328     return;
    329     }
    330 
    331     if ($gli) { # the gli wants strings to be in UTF-8
    332     &gsprintf::output_strings_in_UTF8;
    333     }
    334    
    335     # now check that we had exactly one leftover arg, which should be
    336     # the collection name. We don't want to do this earlier, cos
    337     # -xml arg doesn't need a collection name
    338     # Or if the user specified -h, then we output the usage also
    339     if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    340     {
    341     &PrintUsage::print_txt_usage($options, "{import.params}");
    342     die "\n";
    343     }
    344 
    345     my $close_out = 0;
    346     if ($out !~ /^(STDERR|STDOUT)$/i) {
    347     open (OUT, ">$out") ||
    348         (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
    349     $out = 'import::OUT';
    350     $close_out = 1;
    351     }
    352     $out->autoflush(1);
    353 
    354     # get and check the collection name
    355     if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
    356     &PrintUsage::print_txt_usage($options, "{import.params}");
    357     die "\n";
    358     }
    359 
    360     # add collection's perllib dir  into include path in
    361     # case we have collection specific modules
    362     unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    363 
    364     # check that we can open the faillog
    365     if ($faillog eq "") {
    366     $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    367     }
    368     open (FAILLOG, ">$faillog") ||
    369     (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
    370 
    371    
    372     my $faillogname = $faillog;
    373     $faillog = 'import::FAILLOG';
    374     $faillog->autoflush(1);
    375    
    376     # Read in the collection configuration file.
    377     ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    378     $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
    379    
    380     # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    381     if (!defined($collectcfg->{'infodbtype'}))
    382     {
    383       $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    384     }
    385 
    386     if (defined $collectcfg->{'importdir'} && $importdir eq "") {
    387     $importdir = $collectcfg->{'importdir'};
    388     }
    389     if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
    390     $archivedir = $collectcfg->{'archivedir'};
    391     }
    392     # fill in the default import and archives directories if none
    393     # were supplied, turn all \ into / and remove trailing /
    394     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    395     $importdir =~ s/[\\\/]+/\//g;
    396     $importdir =~ s/\/$//;
    397     if (!-e $importdir) {
    398     &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
    399     die "\n";
    400     }
    401 
    402     $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    403     $archivedir =~ s/[\\\/]+/\//g;
    404     $archivedir =~ s/\/$//;
    405 
    406     my $plugins = [];
    407     if (defined $collectcfg->{'plugin'}) {
    408     $plugins = $collectcfg->{'plugin'};
    409     }
    410     #some global options for the plugins
    411     my @global_opts = ();
    412 
    413     if ($verbosity !~ /\d+/) {
    414     if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
    415         $verbosity = $collectcfg->{'verbosity'};
    416     } else {
    417         $verbosity = 2; # the default
    418     }
    419     }
    420     if (defined $collectcfg->{'manifest'} && $manifest eq "") {
    421     $manifest = $collectcfg->{'manifest'};
    422     }
    423 
    424     if (defined $collectcfg->{'gzip'} && !$gzip) {
    425     if ($collectcfg->{'gzip'} =~ /^true$/i) {
    426         $gzip = 1;
    427     }
    428     }
    429 
    430     if ($maxdocs !~ /\-?\d+/) {
    431     if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
    432         $maxdocs = $collectcfg->{'maxdocs'};
    433     } else {
    434         $maxdocs = -1; # the default
    435     }
    436     }
    437     if ($groupsize == 1) {
    438     if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
    439         $groupsize = $collectcfg->{'groupsize'};
    440     }
    441     }
    442 
    443     if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {
    444     if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
    445         $OIDtype = $collectcfg->{'OIDtype'};
    446     } else {
    447         $OIDtype = "hash"; # the default
    448     }
    449     }
    450 
    451     if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
    452     if (defined $collectcfg->{'OIDmetadata'}) {
    453         $OIDmetadata = $collectcfg->{'OIDmetadata'};
    454     } else {
    455         $OIDmetadata = "dc.Identifier"; # the default
    456     }
    457     }
    458 
    459     if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
    460     $sortmeta = $collectcfg->{'sortmeta'};
    461     }
    462     # sortmeta cannot be used with group size
    463     $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    464     if (defined $sortmeta && $groupsize > 1) {
    465     &gsprintf($out, "{import.cannot_sort}\n\n");
    466     $sortmeta = undef;
    467     }
    468    
    469     if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
    470     $removeprefix = $collectcfg->{'removeprefix'};
    471     }
    472    
    473     if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
    474     $removesuffix = $collectcfg->{'removesuffix'};
    475     }
    476     if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
    477     $debug = 1;
    478     }
    479     if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
    480     $gli = 1;
    481     }
    482     $gli = 0 unless defined $gli;
    483        
    484     # check keepold and removeold
    485     ($removeold, $keepold, $incremental, $incremental_mode)
    486     = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
    487                            $incremental, "archives",
    488                            $collectcfg);
    489  
    490 
    491     print STDERR "<Import>\n" if $gli;
    492    
    493     my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    494     if ($manifest ne "") { 
    495     my $manifest_filename = $manifest;
    496 
    497     if ($manifest_filename !~ m/^[\\\/]/) {
    498         $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
    499     }
    500 
    501     $manifest =~ s/[\\\/]+/\//g;
    502     $manifest =~ s/\/$//;
    503 
    504     $manifest_lookup->parse($manifest_filename);
    505     }
    506 
    507 
    508     # load all the plugins
    509     $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
    510     if (scalar(@$pluginfo) == 0) {
    511     &gsprintf($out, "{import.no_plugins_loaded}\n");
    512     die "\n";
    513     }
    514 
    515     # remove the old contents of the archives directory (and tmp directory) if needed
    516     if ($removeold) {
    517     if (-e $archivedir) {
    518         &gsprintf($out, "{import.removing_archives}\n");
    519         &util::rm_r ($archivedir);
    520     }
    521     my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
    522     $tmpdir =~ s/[\\\/]+/\//g;
    523     $tmpdir =~ s/\/$//;
    524     if (-e $tmpdir) {
    525         &gsprintf($out, "{import.removing_tmpdir}\n");
    526         &util::rm_r ($tmpdir);
    527     }
    528     }
    529 
    530     # create the archives dir if needed
    531     &util::mk_all_dir($archivedir);
    532 
    533     # read the archive information file
    534 ##  $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
    535 
    536     # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    537     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    538     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
    539 
    540     $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    541     $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
    542                            
    543     $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    544     $archive_info->load_info ($arcinfo_doc_filename);
    545 
    546     if ($manifest eq "") {
    547     # Load in list of files in import folder from last import (if present)
    548     $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    549     }
    550 
    551     ####Use Plugout####
    552     my ($plugout);
    553     if (defined $collectcfg->{'plugout'}) {
    554     # If a plugout was specified in the collect.cfg file, assume it is sensible
    555     # We can't check the name because it could be anything, if it is a custom plugout
    556     $plugout = $collectcfg->{'plugout'};
    557     }
    558     else{
    559     if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
    560         push @$plugout,"GreenstoneXMLPlugout";
    561     }
    562     else{
    563         push @$plugout,$saveas."Plugout";
    564     }
    565     }
    566 
    567     push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    568     push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    569     push @$plugout,("-gzip_output") if ($gzip);
    570     push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    571     push @$plugout,("-output_handle",$out) if (defined $out);
    572     push @$plugout,("-debug") if ($debug);
    573    
    574     $processor = &plugout::load_plugout($plugout);                       
    575     $processor->setoutputdir ($archivedir);
    576     $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    577     $processor->set_OIDtype ($OIDtype, $OIDmetadata);
    578    
    579     &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
    580    
    581     if ($removeold) {
    582         # occasionally, plugins may want to do something on remove old, eg pharos image indexing
    583     &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    584     }
    585     if ($manifest eq "") {
    586     # process the import directory
    587     my $block_hash = {};
    588     my $metadata = {};
    589     # gobal blocking pass may set up some metadata
    590     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    591 
    592 
    593     if ($incremental || $incremental_mode eq "onlyadd") {
    594 
    595         &inexport::prime_doc_oid_count($archivedir);
    596 
    597 
    598         # Can now work out which files were new, already existed, and have
    599         # been deleted
    600        
    601         &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir,
    602                           $archivedir,$verbosity,$incremental_mode);
    603        
    604         my @new_files = sort keys %{$block_hash->{'new_files'}};
    605         if (scalar(@new_files>0)) {
    606         print STDERR "New files and modified metadata files since last import:\n  ";
    607         print STDERR join("\n  ",@new_files), "\n";
    608         }
    609 
    610         if ($incremental) {
    611                # only look for deletions if we are truely incremental
    612         my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
    613         # Filter out any in gsdl/tmp area
    614         my @filtered_deleted_files = ();
    615         my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    616         my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    617         $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
    618         $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
    619                  
    620         foreach my $df (@deleted_files) {
    621             next if ($df =~ m/^$gsdl_tmp_area/);
    622             next if ($df =~ m/^$collect_tmp_area/);
    623            
    624             push(@filtered_deleted_files,$df);
    625         }       
    626        
    627 
    628         @deleted_files = @filtered_deleted_files;
    629        
    630         if (scalar(@deleted_files)>0) {
    631             print STDERR "Files deleted since last import:\n  ";
    632             print STDERR join("\n  ",@deleted_files), "\n";
    633        
    634        
    635             &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
    636            
    637             &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
    638         }
    639        
    640         my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
    641        
    642         if (scalar(@reindex_files)>0) {
    643             print STDERR "Files to reindex since last import:\n  ";
    644             print STDERR join("\n  ",@reindex_files), "\n";
    645             &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
    646             &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
    647         }
    648                
    649         }
    650        
    651         # Play it safe, and run through the entire folder, only processing new or edited files
    652         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    653 
    654     }
    655     else {
    656         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    657     }
    658 
    659     }
    660     else
    661     {
    662     #
    663     # 1. Process delete files first
    664     #
    665 
    666     my @deleted_files = keys %{$manifest_lookup->{'delete'}};
    667     my @full_deleted_files = ();
    668 
    669     # ensure all filenames are absolute
    670     foreach my $df (@deleted_files) {       
    671         my $full_df =
    672         (&util::filename_is_absolute($df))
    673         ? $df
    674         : &util::filename_cat($importdir,$df);
    675 
    676         push(@full_deleted_files,$full_df);
    677     }
    678    
    679     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
    680     &inexport::mark_docs_for_deletion($archive_info,{},
    681                       \@full_deleted_files,
    682                       $archivedir, $verbosity, "delete");
    683 
    684 
    685     #
    686     # 2. Now files for reindexing
    687     #
    688 
    689     my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
    690     my @full_reindex_files = ();
    691 
    692     # ensure all filenames are absolute
    693     foreach my $rf (@reindex_files) {       
    694         my $full_rf =
    695         (&util::filename_is_absolute($rf))
    696         ? $rf
    697         : &util::filename_cat($importdir,$rf);
    698 
    699         push(@full_reindex_files,$full_rf);
    700     }
    701    
    702     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
    703     &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
    704 
    705     # And now ensure the new version of the file processed by appropriate
    706     # plugin
    707     foreach my $full_rf (@full_reindex_files) {
    708         &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
    709     }
    710 
    711 
    712     #
    713     # 3. Now finally any new files
    714     #
    715 
    716     foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    717         &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    718     }
    719 
    720 
    721     }
    722 
    723     &plugin::end($pluginfo, $processor);
    724 
    725     &plugin::deinit($pluginfo, $processor);
    726 
    727     # Store the value of OIDCount (used in doc.pm) so it can be
    728     # restored correctly to this value on an incremental build
    729     &inexport::store_doc_oid_count($archivedir);
    730 
    731     # write out the archive information file
    732     $processor->close_file_output() if $groupsize > 1;
    733     $processor->close_group_output() if $processor->is_group();
    734 
    735 # The following 'if' statement is in the export.pl version of the script,
    736 # The reason for the 'if' statement is now given in export.pl
    737 # Unclear at this point if the same should be done here
    738 ##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
    739     # Not all export types need this (e.g. DSpace)
    740 
    741     # should we still do this in debug mode??
    742 
    743     # for backwards compatability with archvies.inf file
    744     if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
    745     $archive_info->save_info($arcinfo_doc_filename);
    746     }
    747     else {
    748     $archive_info->save_revinfo_db($arcinfo_src_filename);
    749     }
    750 
    751 
    752 ##    }
    753    
    754     # write out import stats
    755     my $close_stats = 0;
    756     if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
    757     if (open (STATS, ">$statsfile")) {
    758         $statsfile = 'import::STATS';
    759         $close_stats = 1;
    760     } else {
    761         &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
    762         &gsprintf($out, "{import.stats_backup}\n");
    763         $statsfile = 'STDERR';
    764     }
    765     }
    766 
    767     &gsprintf($out, "\n");
    768     &gsprintf($out, "*********************************************\n");
    769     &gsprintf($out, "{import.complete}\n");
    770     &gsprintf($out, "*********************************************\n");
    771 
    772     &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    773     if ($close_stats) {
    774     close STATS;
    775     }
    776 
    777     close OUT if $close_out;
    778     close FAILLOG;
    779 }
  • main/trunk/greenstone2/perllib/inexport.pm

    r22327 r22413  
    11###########################################################################
    22#
    3 # inexport.pm -- useful utilities to support import.pl and export.pl
     3# inexport.pm -- useful class to support import.pl and export.pl
    44# A component of the Greenstone digital library software
    55# from the New Zealand Digital Library Project at the
     
    2828use strict;
    2929
    30 use File::Basename;
    31 
     30no strict 'refs'; # allow filehandles to be variables and vice versa
     31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
     32
     33use arcinfo;
     34use colcfg;
     35use dbutil;
     36use plugin;
     37use plugout;
     38use manifest;
     39use inexport;
    3240use dbutil;
    3341use util;
     42use scriptutil;
     43use FileHandle;
     44use gsprintf 'gsprintf';
     45use printusage;
     46use parse2;
     47
     48use File::Basename;
     49
     50sub new
     51{
     52    my $class = shift (@_);
     53    my ($argv,$options) = @_;
     54
     55    my $self = { 'xml' => 0 };
     56
     57    # general options available to all plugins
     58    my $arguments = $options->{'args'};
     59    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
     60    # Parse returns -1 if something has gone wrong
     61    if ($intArgLeftinAfterParsing == -1)
     62    {
     63    &PrintUsage::print_txt_usage($options, "{import.params}");
     64    die "\n";
     65    }
     66   
     67    my $language = $self->{'language'};
     68    # If $language has been specified, load the appropriate resource bundle
     69    # (Otherwise, the default resource bundle will be loaded automatically)
     70    if ($language && $language =~ /\S/) {
     71    &gsprintf::load_language_specific_resource_bundle($language);
     72    }
     73
     74    if ($self->{'xml'}) {
     75        &PrintUsage::print_xml_usage($options);
     76    print "\n";
     77    return;
     78    }
     79
     80    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
     81    &gsprintf::output_strings_in_UTF8;
     82    }
     83   
     84    # now check that we had exactly one leftover arg, which should be
     85    # the collection name. We don't want to do this earlier, cos
     86    # -xml arg doesn't need a collection name
     87    # Or if the user specified -h, then we output the usage also
     88    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
     89    {
     90    &PrintUsage::print_txt_usage($options, "{import.params}");
     91    die "\n";
     92    }
     93
     94    $self->{'close_out'} = 0;
     95    my $out = $self->{'out'};
     96    if ($out !~ /^(STDERR|STDOUT)$/i) {
     97    open (OUT, ">$out") ||
     98        (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
     99    $out = 'import::OUT';
     100    $self->{'close_out'} = 1;
     101    }
     102    $out->autoflush(1);
     103    $self->{'out'} = $out;
     104
     105    # @ARGV should be only one item, the name of the collection
     106    $self->{'collection'} = shift @$argv;
     107
     108    return bless $self, $class;
     109}
     110
     111sub get_collection
     112{
     113    my $self = shift @_;
     114   
     115    return $self->{'collection'};
     116}
     117
     118
     119sub read_collection_cfg
     120{
     121    my $self = shift @_;
     122    my ($collection,$options) = @_;
     123
     124    my $collectdir = $self->{'collectdir'};
     125    my $site       = $self->{'site'};
     126    my $out        = $self->{'out'};
     127
     128    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
     129    &PrintUsage::print_txt_usage($options, "{import.params}");
     130    die "\n";
     131    }
     132
     133    # add collection's perllib dir  into include path in
     134    # case we have collection specific modules
     135    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
     136
     137    # check that we can open the faillog
     138    my $faillog = $self->{'faillog'};
     139    if ($faillog eq "") {
     140    $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
     141    }
     142    open (FAILLOG, ">$faillog") ||
     143    (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
     144
     145   
     146    my $faillogname = $faillog;
     147    $faillog = 'inexport::FAILLOG';
     148    $faillog->autoflush(1);
     149    $self->{'faillog'} = $faillog;
     150    $self->{'faillogname'} = $faillogname;
     151
     152    # Read in the collection configuration file.
     153    my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
     154    my $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
     155
     156    return $collectcfg;
     157}
     158
     159sub set_collection_options
     160{
     161    my $self = shift @_;
     162    my ($inexport_mode,$collectcfg) = @_;
     163
     164    my $verbosity  = $self->{'verbosity'};
     165    print STDERR "**** verbosity = $verbosity\n\n\n";
     166
     167    my $debug      = $self->{'debug'};
     168    my $importdir  = $self->{'importdir'};
     169    my $archivedir = $self->{'archivedir'};
     170    my $out        = $self->{'out'};
     171
     172    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
     173    if (!defined($collectcfg->{'infodbtype'}))
     174    {
     175      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
     176    }
     177
     178    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
     179    $importdir = $collectcfg->{'importdir'};
     180    }
     181    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
     182    $archivedir = $collectcfg->{'archivedir'};
     183    }
     184    # fill in the default import and archives directories if none
     185    # were supplied, turn all \ into / and remove trailing /
     186    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
     187    $importdir =~ s/[\\\/]+/\//g;
     188    $importdir =~ s/\/$//;
     189    if (!-e $importdir) {
     190    &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
     191    die "\n";
     192    }
     193    $self->{'importdir'} = $importdir;
     194
     195    if ($archivedir eq "") {
     196    if ($inexport_mode eq "import") {
     197        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
     198    }
     199    elsif ($inexport_mode eq "export") {
     200        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
     201    }
     202    else {
     203        print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
     204        print STDERR "         Defaulting to 'archives' for file output\n";
     205        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
     206    }
     207    }
     208
     209    $archivedir =~ s/[\\\/]+/\//g;
     210    $archivedir =~ s/\/$//;
     211    $self->{'archivedir'} = $archivedir;
     212
     213    if ($verbosity !~ /\d+/) {
     214    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
     215        $verbosity = $collectcfg->{'verbosity'};
     216    } else {
     217        $verbosity = 2; # the default
     218    }
     219    }
     220    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
     221    $self->{'manifest'} = $collectcfg->{'manifest'};
     222    }
     223
     224    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
     225    if ($collectcfg->{'gzip'} =~ /^true$/i) {
     226        $self->{'gzip'} = 1;
     227    }
     228    }
     229
     230    if ($self->{'maxdocs'} !~ /\-?\d+/) {
     231    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
     232        $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
     233    } else {
     234        $self->{'maxdocs'} = -1; # the default
     235    }
     236    }
     237    if ($self->{'groupsize'} == 1) {
     238    if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
     239        $self->{'groupsize'} = $collectcfg->{'groupsize'};
     240    }
     241    }
     242
     243    if (!defined $self->{'OIDtype'}
     244    || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
     245    if (defined $collectcfg->{'OIDtype'}
     246        && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
     247        $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
     248    } else {
     249        $self->{'OIDtype'} = "hash"; # the default
     250    }
     251    }
     252
     253    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
     254    if (defined $collectcfg->{'OIDmetadata'}) {
     255        $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
     256    } else {
     257        $self->{'OIDmetadata'} = "dc.Identifier"; # the default
     258    }
     259    }
     260
     261    my $sortmeta = $self->{'sortmeta'};
     262    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
     263    $sortmeta = $collectcfg->{'sortmeta'};
     264    }
     265    # sortmeta cannot be used with group size
     266    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
     267    if (defined $sortmeta && $self->{'groupsize'} > 1) {
     268    &gsprintf($out, "{import.cannot_sort}\n\n");
     269    $sortmeta = undef;
     270    }
     271    $self->{'sortmeta'} = $sortmeta;
     272
     273    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
     274    $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
     275    }
     276   
     277    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
     278    $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
     279    }
     280    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
     281    $self->{'debug'} = 1;
     282    }
     283    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
     284    $self->{'gli'} = 1;
     285    }
     286    $self->{'gli'} = 0 unless defined $self->{'gli'};
     287       
     288    # check keepold and removeold
     289    my ($removeold, $keepold, $incremental, $incremental_mode)
     290    = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
     291                           $self->{'incremental'}, "archives",
     292                           $collectcfg);
     293
     294    $self->{'removeold'}        = $removeold;
     295    $self->{'keepold'}          = $keepold;
     296    $self->{'incremental'}      = $incremental;
     297    $self->{'incremental_mode'} = $incremental_mode;
     298}
     299
     300sub process_files
     301{
     302    my $self = shift @_;
     303    my ($inexport_mode,$collectcfg) = @_;
     304
     305    my $verbosity   = $self->{'verbosity'};
     306    my $debug       = $self->{'debug'};
     307
     308    my $importdir   = $self->{'importdir'};
     309    my $archivedir  = $self->{'archivedir'};
     310
     311    my $incremental = $self->{'incremental'};
     312    my $incremental_mode = $self->{'incremental_mode'};
     313
     314    my $removeold   = $self->{'removeold'};
     315    my $keepold     = $self->{'keepold'};
     316
     317    my $saveas      = $self->{'saveas'};
     318    my $OIDtype     = $self->{'OIDtype'};
     319    my $OIDmetadata = $self->{'OIDmetadata'};
     320
     321    my $out         = $self->{'out'};
     322    my $faillog     = $self->{'faillog'};
     323
     324    my $maxdocs     = $self->{'maxdocs'};
     325    my $gzip        = $self->{'gzip'};
     326    my $groupsize   = $self->{'groupsize'};
     327    my $sortmeta    = $self->{'sortmeta'};
     328
     329    my $removeprefix = $self->{'removeprefix'};
     330    my $removesuffix = $self->{'removesuffix'};
     331
     332    my $gli         = $self->{'gli'};
     333
     334    print STDERR "<Import>\n" if $gli;
     335   
     336    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
     337    if ($self->{'manifest'} ne "") {   
     338    my $manifest_filename = $self->{'manifest'};
     339
     340    if (!&util::filename_is_absolute($manifest_filename)) {
     341        $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
     342    }
     343
     344    $self->{'manifest'} =~ s/[\\\/]+/\//g;
     345    $self->{'manifest'} =~ s/\/$//;
     346
     347    $manifest_lookup->parse($manifest_filename);
     348    }
     349
     350    my $manifest = $self->{'manifest'};
     351
     352    # load all the plugins
     353    my $plugins = [];
     354    if (defined $collectcfg->{'plugin'}) {
     355    $plugins = $collectcfg->{'plugin'};
     356    }
     357
     358    #some global options for the plugins
     359    my @global_opts = ();
     360
     361
     362    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
     363    if (scalar(@$pluginfo) == 0) {
     364    &gsprintf($out, "{import.no_plugins_loaded}\n");
     365    die "\n";
     366    }
     367
     368    # remove the old contents of the archives directory (and tmp directory) if needed
     369    if ($removeold) {
     370    if (-e $archivedir) {
     371        &gsprintf($out, "{import.removing_archives}\n");
     372        &util::rm_r ($archivedir);
     373    }
     374    my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
     375    $tmpdir =~ s/[\\\/]+/\//g;
     376    $tmpdir =~ s/\/$//;
     377    if (-e $tmpdir) {
     378        &gsprintf($out, "{import.removing_tmpdir}\n");
     379        &util::rm_r ($tmpdir);
     380    }
     381    }
     382
     383    # create the archives dir if needed
     384    &util::mk_all_dir($archivedir);
     385
     386    # read the archive information file
     387##  my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
     388
     389    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
     390    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
     391    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
     392
     393    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
     394    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
     395                           
     396    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
     397    $archive_info->load_info ($arcinfo_doc_filename);
     398
     399    if ($manifest eq "") {
     400    # Load in list of files in import folder from last import (if present)
     401    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
     402    }
     403
     404    ####Use Plugout####
     405    my ($plugout);
     406    if (defined $collectcfg->{'plugout'}) {
     407    # If a plugout was specified in the collect.cfg file, assume it is sensible
     408    # We can't check the name because it could be anything, if it is a custom plugout
     409    $plugout = $collectcfg->{'plugout'};
     410    }
     411    else{
     412    if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
     413        push @$plugout,"GreenstoneXMLPlugout";
     414    }
     415    else{
     416        push @$plugout,$saveas."Plugout";
     417    }
     418    }
     419
     420    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
     421    push @$plugout,("-verbosity",$verbosity)      if (defined $verbosity);
     422    push @$plugout,("-gzip_output")               if ($gzip);
     423    push @$plugout,("-group_size",$groupsize)     if (defined $groupsize);
     424    push @$plugout,("-output_handle",$out)        if (defined);
     425    push @$plugout,("-debug")                     if ($debug);
     426   
     427    my $processor = &plugout::load_plugout($plugout);                       
     428    $processor->setoutputdir ($archivedir);
     429    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
     430    $processor->set_OIDtype ($OIDtype, $OIDmetadata);
     431   
     432    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
     433   
     434    if ($removeold) {
     435        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
     436    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
     437    }
     438    if ($manifest eq "") {
     439    # process the import directory
     440    my $block_hash = {};
     441    my $metadata = {};
     442    # gobal blocking pass may set up some metadata
     443    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
     444
     445
     446    if ($incremental || $incremental_mode eq "onlyadd") {
     447
     448        prime_doc_oid_count($archivedir);
     449
     450
     451        # Can now work out which files were new, already existed, and have
     452        # been deleted
     453       
     454        new_vs_old_import_diff($archive_info,$block_hash,$importdir,
     455                   $archivedir,$verbosity,$incremental_mode);
     456       
     457        my @new_files = sort keys %{$block_hash->{'new_files'}};
     458        if (scalar(@new_files>0)) {
     459        print STDERR "New files and modified metadata files since last import:\n  ";
     460        print STDERR join("\n  ",@new_files), "\n";
     461        }
     462
     463        if ($incremental) {
     464               # only look for deletions if we are truely incremental
     465        my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
     466        # Filter out any in gsdl/tmp area
     467        my @filtered_deleted_files = ();
     468        my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
     469        my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
     470        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
     471        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
     472                 
     473        foreach my $df (@deleted_files) {
     474            next if ($df =~ m/^$gsdl_tmp_area/);
     475            next if ($df =~ m/^$collect_tmp_area/);
     476           
     477            push(@filtered_deleted_files,$df);
     478        }       
     479       
     480
     481        @deleted_files = @filtered_deleted_files;
     482       
     483        if (scalar(@deleted_files)>0) {
     484            print STDERR "Files deleted since last import:\n  ";
     485            print STDERR join("\n  ",@deleted_files), "\n";
     486       
     487       
     488            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
     489           
     490            mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
     491        }
     492       
     493        my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
     494       
     495        if (scalar(@reindex_files)>0) {
     496            print STDERR "Files to reindex since last import:\n  ";
     497            print STDERR join("\n  ",@reindex_files), "\n";
     498            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
     499            mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
     500        }
     501               
     502        }
     503       
     504        # Play it safe, and run through the entire folder, only processing new or edited files
     505        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     506
     507    }
     508    else {
     509        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     510    }
     511
     512    }
     513    else
     514    {
     515    #
     516    # 1. Process delete files first
     517    #
     518
     519    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
     520    my @full_deleted_files = ();
     521
     522    # ensure all filenames are absolute
     523    foreach my $df (@deleted_files) {       
     524        my $full_df =
     525        (&util::filename_is_absolute($df))
     526        ? $df
     527        : &util::filename_cat($importdir,$df);
     528
     529        push(@full_deleted_files,$full_df);
     530    }
     531   
     532    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
     533    mark_docs_for_deletion($archive_info,{},
     534                      \@full_deleted_files,
     535                      $archivedir, $verbosity, "delete");
     536
     537
     538    #
     539    # 2. Now files for reindexing
     540    #
     541
     542    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
     543    my @full_reindex_files = ();
     544
     545    # ensure all filenames are absolute
     546    foreach my $rf (@reindex_files) {       
     547        my $full_rf =
     548        (&util::filename_is_absolute($rf))
     549        ? $rf
     550        : &util::filename_cat($importdir,$rf);
     551
     552        push(@full_reindex_files,$full_rf);
     553    }
     554   
     555    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
     556    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
     557
     558    # And now ensure the new version of the file processed by appropriate
     559    # plugin
     560    foreach my $full_rf (@full_reindex_files) {
     561        &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
     562    }
     563
     564
     565    #
     566    # 3. Now finally any new files
     567    #
     568
     569    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
     570        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
     571    }
     572
     573
     574    }
     575
     576    &plugin::end($pluginfo, $processor);
     577
     578    &plugin::deinit($pluginfo, $processor);
     579
     580    # Store the value of OIDCount (used in doc.pm) so it can be
     581    # restored correctly to this value on an incremental build
     582    store_doc_oid_count($archivedir);
     583
     584    # write out the archive information file
     585    $processor->close_file_output() if $groupsize > 1;
     586    $processor->close_group_output() if $processor->is_group();
     587
     588# The following 'if' statement is in the export.pl version of the script,
     589# The reason for the 'if' statement is now given in export.pl
     590# Unclear at this point if the same should be done here
     591##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
     592    # Not all export types need this (e.g. DSpace)
     593
     594    # should we still do this in debug mode??
     595
     596    # for backwards compatability with archvies.inf file
     597    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
     598    $archive_info->save_info($arcinfo_doc_filename);
     599    }
     600    else {
     601    $archive_info->save_revinfo_db($arcinfo_src_filename);
     602    }
     603
     604
     605##    }
     606
     607    return $pluginfo;
     608}
     609
     610
     611sub generate_statistics
     612{
     613    my $self = shift @_;
     614    my ($inexport_mode,$pluginfo) = @_;
     615
     616    my $statsfile = $self->{'statsfile'};
     617    my $out       = $self->{'out'};
     618    my $faillogname = $self->{'faillogname'};
     619    my $gli       = $self->{'gli'};
     620
     621    # write out import stats
     622    my $close_stats = 0;
     623    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
     624    if (open (STATS, ">$statsfile")) {
     625        $statsfile = 'import::STATS';
     626        $close_stats = 1;
     627    } else {
     628        &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
     629        &gsprintf($out, "{import.stats_backup}\n");
     630        $statsfile = 'STDERR';
     631    }
     632    }
     633
     634    &gsprintf($out, "\n");
     635    &gsprintf($out, "*********************************************\n");
     636    &gsprintf($out, "{import.complete}\n");
     637    &gsprintf($out, "*********************************************\n");
     638
     639    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
     640    if ($close_stats) {
     641    close STATS;
     642    }
     643
     644    close OUT if $self->{'close_out'};
     645    close FAILLOG;
     646}
     647
     648
     649
     650
     651
    34652
    35653
Note: See TracChangeset for help on using the changeset viewer.