Ignore:
Timestamp:
2010-07-16T14:13:01+12:00 (14 years ago)
Author:
davidb
Message:

Initial pass at getting the main code to import.pl (and the very similar export.pl) structured as a shared module

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/import.pl

    r22037 r22413  
    6464
    6565use strict;
    66 no strict 'refs'; # allow filehandles to be variables and vice versa
    67 no strict 'subs'; # allow barewords (eg STDERR) as function arguments
    68 
    69 use arcinfo;
    70 use colcfg;
    71 use dbutil;
    72 use plugin;
    73 use plugout;
    74 use manifest;
    7566use inexport;
    76 use util;
    77 use scriptutil;
    78 use FileHandle;
    79 use gsprintf 'gsprintf';
    80 use printusage;
    81 use parse2;
    82 
    83 
    8467
    8568my $oidtype_list =
     
    258241    'range' => "0,",
    259242    # parsearg left "" as default
    260     #'deft' => "2",
     243    'deft' => "2",
    261244    'reqd' => "no",
    262245    'modegli' => "3" },
     
    277260
    278261
     262
     263sub main
     264{
     265    my $inexport = new inexport(\@ARGV,$options);
     266   
     267    my $collection = $inexport->get_collection();
     268    my $collect_cfg = $inexport->read_collection_cfg($collection,$options);   
     269    $inexport->set_collection_options("import",$collect_cfg);
     270   
     271    my $pluginfo = $inexport->process_files("import",$collect_cfg);
     272
     273    $inexport->generate_statistics("import",$pluginfo);
     274}
     275
     276
    279277&main();
    280 
    281 sub main {
    282     # params
    283     my ($language, $verbosity, $debug,
    284     $collectdir, $importdir, $archivedir, $site, $manifest,
    285     $incremental, $incremental_mode, $keepold, $removeold,
    286     $saveas,
    287     $OIDtype, $OIDmetadata,
    288     $maxdocs, $statsfile,
    289     $out, $faillog, $gli,
    290     $gzip, $groupsize,
    291     $sortmeta, $removeprefix, $removesuffix
    292     );
    293 
    294     my $xml = 0;
    295 
    296     # other vars
    297     my ($configfilename, $collection, $collectcfg,
    298     $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
    299     $gs_mode,
    300     $processor, $pluginfo);
    301 
    302     my $service = "import";
    303 
    304     my $hashParsingResult = {};
    305     # general options available to all plugins
    306     my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    307     # Parse returns -1 if something has gone wrong
    308     if ($intArgLeftinAfterParsing == -1)
    309     {
    310     &PrintUsage::print_txt_usage($options, "{import.params}");
    311     die "\n";
    312     }
    313    
    314     foreach my $strVariable (keys %$hashParsingResult)
    315     {
    316     eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    317     }
    318 
    319     # If $language has been specified, load the appropriate resource bundle
    320     # (Otherwise, the default resource bundle will be loaded automatically)
    321     if ($language && $language =~ /\S/) {
    322     &gsprintf::load_language_specific_resource_bundle($language);
    323     }
    324 
    325     if ($xml) {
    326         &PrintUsage::print_xml_usage($options);
    327     print "\n";
    328     return;
    329     }
    330 
    331     if ($gli) { # the gli wants strings to be in UTF-8
    332     &gsprintf::output_strings_in_UTF8;
    333     }
    334    
    335     # now check that we had exactly one leftover arg, which should be
    336     # the collection name. We don't want to do this earlier, cos
    337     # -xml arg doesn't need a collection name
    338     # Or if the user specified -h, then we output the usage also
    339     if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    340     {
    341     &PrintUsage::print_txt_usage($options, "{import.params}");
    342     die "\n";
    343     }
    344 
    345     my $close_out = 0;
    346     if ($out !~ /^(STDERR|STDOUT)$/i) {
    347     open (OUT, ">$out") ||
    348         (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
    349     $out = 'import::OUT';
    350     $close_out = 1;
    351     }
    352     $out->autoflush(1);
    353 
    354     # get and check the collection name
    355     if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
    356     &PrintUsage::print_txt_usage($options, "{import.params}");
    357     die "\n";
    358     }
    359 
    360     # add collection's perllib dir  into include path in
    361     # case we have collection specific modules
    362     unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    363 
    364     # check that we can open the faillog
    365     if ($faillog eq "") {
    366     $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    367     }
    368     open (FAILLOG, ">$faillog") ||
    369     (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
    370 
    371    
    372     my $faillogname = $faillog;
    373     $faillog = 'import::FAILLOG';
    374     $faillog->autoflush(1);
    375    
    376     # Read in the collection configuration file.
    377     ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    378     $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
    379    
    380     # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    381     if (!defined($collectcfg->{'infodbtype'}))
    382     {
    383       $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    384     }
    385 
    386     if (defined $collectcfg->{'importdir'} && $importdir eq "") {
    387     $importdir = $collectcfg->{'importdir'};
    388     }
    389     if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
    390     $archivedir = $collectcfg->{'archivedir'};
    391     }
    392     # fill in the default import and archives directories if none
    393     # were supplied, turn all \ into / and remove trailing /
    394     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    395     $importdir =~ s/[\\\/]+/\//g;
    396     $importdir =~ s/\/$//;
    397     if (!-e $importdir) {
    398     &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
    399     die "\n";
    400     }
    401 
    402     $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    403     $archivedir =~ s/[\\\/]+/\//g;
    404     $archivedir =~ s/\/$//;
    405 
    406     my $plugins = [];
    407     if (defined $collectcfg->{'plugin'}) {
    408     $plugins = $collectcfg->{'plugin'};
    409     }
    410     #some global options for the plugins
    411     my @global_opts = ();
    412 
    413     if ($verbosity !~ /\d+/) {
    414     if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
    415         $verbosity = $collectcfg->{'verbosity'};
    416     } else {
    417         $verbosity = 2; # the default
    418     }
    419     }
    420     if (defined $collectcfg->{'manifest'} && $manifest eq "") {
    421     $manifest = $collectcfg->{'manifest'};
    422     }
    423 
    424     if (defined $collectcfg->{'gzip'} && !$gzip) {
    425     if ($collectcfg->{'gzip'} =~ /^true$/i) {
    426         $gzip = 1;
    427     }
    428     }
    429 
    430     if ($maxdocs !~ /\-?\d+/) {
    431     if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
    432         $maxdocs = $collectcfg->{'maxdocs'};
    433     } else {
    434         $maxdocs = -1; # the default
    435     }
    436     }
    437     if ($groupsize == 1) {
    438     if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
    439         $groupsize = $collectcfg->{'groupsize'};
    440     }
    441     }
    442 
    443     if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {
    444     if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
    445         $OIDtype = $collectcfg->{'OIDtype'};
    446     } else {
    447         $OIDtype = "hash"; # the default
    448     }
    449     }
    450 
    451     if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
    452     if (defined $collectcfg->{'OIDmetadata'}) {
    453         $OIDmetadata = $collectcfg->{'OIDmetadata'};
    454     } else {
    455         $OIDmetadata = "dc.Identifier"; # the default
    456     }
    457     }
    458 
    459     if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
    460     $sortmeta = $collectcfg->{'sortmeta'};
    461     }
    462     # sortmeta cannot be used with group size
    463     $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    464     if (defined $sortmeta && $groupsize > 1) {
    465     &gsprintf($out, "{import.cannot_sort}\n\n");
    466     $sortmeta = undef;
    467     }
    468    
    469     if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
    470     $removeprefix = $collectcfg->{'removeprefix'};
    471     }
    472    
    473     if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
    474     $removesuffix = $collectcfg->{'removesuffix'};
    475     }
    476     if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
    477     $debug = 1;
    478     }
    479     if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
    480     $gli = 1;
    481     }
    482     $gli = 0 unless defined $gli;
    483        
    484     # check keepold and removeold
    485     ($removeold, $keepold, $incremental, $incremental_mode)
    486     = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
    487                            $incremental, "archives",
    488                            $collectcfg);
    489  
    490 
    491     print STDERR "<Import>\n" if $gli;
    492    
    493     my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    494     if ($manifest ne "") { 
    495     my $manifest_filename = $manifest;
    496 
    497     if ($manifest_filename !~ m/^[\\\/]/) {
    498         $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
    499     }
    500 
    501     $manifest =~ s/[\\\/]+/\//g;
    502     $manifest =~ s/\/$//;
    503 
    504     $manifest_lookup->parse($manifest_filename);
    505     }
    506 
    507 
    508     # load all the plugins
    509     $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
    510     if (scalar(@$pluginfo) == 0) {
    511     &gsprintf($out, "{import.no_plugins_loaded}\n");
    512     die "\n";
    513     }
    514 
    515     # remove the old contents of the archives directory (and tmp directory) if needed
    516     if ($removeold) {
    517     if (-e $archivedir) {
    518         &gsprintf($out, "{import.removing_archives}\n");
    519         &util::rm_r ($archivedir);
    520     }
    521     my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
    522     $tmpdir =~ s/[\\\/]+/\//g;
    523     $tmpdir =~ s/\/$//;
    524     if (-e $tmpdir) {
    525         &gsprintf($out, "{import.removing_tmpdir}\n");
    526         &util::rm_r ($tmpdir);
    527     }
    528     }
    529 
    530     # create the archives dir if needed
    531     &util::mk_all_dir($archivedir);
    532 
    533     # read the archive information file
    534 ##  $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
    535 
    536     # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    537     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    538     &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
    539 
    540     $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    541     $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
    542                            
    543     $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    544     $archive_info->load_info ($arcinfo_doc_filename);
    545 
    546     if ($manifest eq "") {
    547     # Load in list of files in import folder from last import (if present)
    548     $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    549     }
    550 
    551     ####Use Plugout####
    552     my ($plugout);
    553     if (defined $collectcfg->{'plugout'}) {
    554     # If a plugout was specified in the collect.cfg file, assume it is sensible
    555     # We can't check the name because it could be anything, if it is a custom plugout
    556     $plugout = $collectcfg->{'plugout'};
    557     }
    558     else{
    559     if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
    560         push @$plugout,"GreenstoneXMLPlugout";
    561     }
    562     else{
    563         push @$plugout,$saveas."Plugout";
    564     }
    565     }
    566 
    567     push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    568     push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    569     push @$plugout,("-gzip_output") if ($gzip);
    570     push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    571     push @$plugout,("-output_handle",$out) if (defined $out);
    572     push @$plugout,("-debug") if ($debug);
    573    
    574     $processor = &plugout::load_plugout($plugout);                       
    575     $processor->setoutputdir ($archivedir);
    576     $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    577     $processor->set_OIDtype ($OIDtype, $OIDmetadata);
    578    
    579     &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
    580    
    581     if ($removeold) {
    582         # occasionally, plugins may want to do something on remove old, eg pharos image indexing
    583     &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    584     }
    585     if ($manifest eq "") {
    586     # process the import directory
    587     my $block_hash = {};
    588     my $metadata = {};
    589     # global blocking pass may set up some metadata
    590     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    591 
    592 
    593     if ($incremental || $incremental_mode eq "onlyadd") {
    594 
    595         &inexport::prime_doc_oid_count($archivedir);
    596 
    597 
    598         # Can now work out which files were new, already existed, and have
    599         # been deleted
    600        
    601         &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir,
    602                           $archivedir,$verbosity,$incremental_mode);
    603        
    604         my @new_files = sort keys %{$block_hash->{'new_files'}};
    605         if (scalar(@new_files>0)) {
    606         print STDERR "New files and modified metadata files since last import:\n  ";
    607         print STDERR join("\n  ",@new_files), "\n";
    608         }
    609 
    610         if ($incremental) {
    611                # only look for deletions if we are truly incremental
    612         my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
    613         # Filter out any in gsdl/tmp area
    614         my @filtered_deleted_files = ();
    615         my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    616         my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    617         $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
    618         $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
    619                  
    620         foreach my $df (@deleted_files) {
    621             next if ($df =~ m/^$gsdl_tmp_area/);
    622             next if ($df =~ m/^$collect_tmp_area/);
    623            
    624             push(@filtered_deleted_files,$df);
    625         }       
    626        
    627 
    628         @deleted_files = @filtered_deleted_files;
    629        
    630         if (scalar(@deleted_files)>0) {
    631             print STDERR "Files deleted since last import:\n  ";
    632             print STDERR join("\n  ",@deleted_files), "\n";
    633        
    634        
    635             &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
    636            
    637             &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
    638         }
    639        
    640         my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
    641        
    642         if (scalar(@reindex_files)>0) {
    643             print STDERR "Files to reindex since last import:\n  ";
    644             print STDERR join("\n  ",@reindex_files), "\n";
    645             &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
    646             &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
    647         }
    648                
    649         }
    650        
    651         # Play it safe, and run through the entire folder, only processing new or edited files
    652         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    653 
    654     }
    655     else {
    656         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    657     }
    658 
    659     }
    660     else
    661     {
    662     #
    663     # 1. Process delete files first
    664     #
    665 
    666     my @deleted_files = keys %{$manifest_lookup->{'delete'}};
    667     my @full_deleted_files = ();
    668 
    669     # ensure all filenames are absolute
    670     foreach my $df (@deleted_files) {       
    671         my $full_df =
    672         (&util::filename_is_absolute($df))
    673         ? $df
    674         : &util::filename_cat($importdir,$df);
    675 
    676         push(@full_deleted_files,$full_df);
    677     }
    678    
    679     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
    680     &inexport::mark_docs_for_deletion($archive_info,{},
    681                       \@full_deleted_files,
    682                       $archivedir, $verbosity, "delete");
    683 
    684 
    685     #
    686     # 2. Now files for reindexing
    687     #
    688 
    689     my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
    690     my @full_reindex_files = ();
    691 
    692     # ensure all filenames are absolute
    693     foreach my $rf (@reindex_files) {       
    694         my $full_rf =
    695         (&util::filename_is_absolute($rf))
    696         ? $rf
    697         : &util::filename_cat($importdir,$rf);
    698 
    699         push(@full_reindex_files,$full_rf);
    700     }
    701    
    702     &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
    703     &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
    704 
    705     # And now ensure the new version of the file processed by appropriate
    706     # plugin
    707     foreach my $full_rf (@full_reindex_files) {
    708         &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
    709     }
    710 
    711 
    712     #
    713     # 3. Now finally any new files
    714     #
    715 
    716     foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    717         &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    718     }
    719 
    720 
    721     }
    722 
    723     &plugin::end($pluginfo, $processor);
    724 
    725     &plugin::deinit($pluginfo, $processor);
    726 
    727     # Store the value of OIDCount (used in doc.pm) so it can be
    728     # restored correctly to this value on an incremental build
    729     &inexport::store_doc_oid_count($archivedir);
    730 
    731     # write out the archive information file
    732     $processor->close_file_output() if $groupsize > 1;
    733     $processor->close_group_output() if $processor->is_group();
    734 
    735 # The following 'if' statement is in the export.pl version of the script,
    736 # The reason for the 'if' statement is now given in export.pl
    737 # Unclear at this point if the same should be done here
    738 ##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
    739     # Not all export types need this (e.g. DSpace)
    740 
    741     # should we still do this in debug mode??
    742 
    743     # for backwards compatibility with archives.inf file
    744     if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
    745     $archive_info->save_info($arcinfo_doc_filename);
    746     }
    747     else {
    748     $archive_info->save_revinfo_db($arcinfo_src_filename);
    749     }
    750 
    751 
    752 ##    }
    753    
    754     # write out import stats
    755     my $close_stats = 0;
    756     if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
    757     if (open (STATS, ">$statsfile")) {
    758         $statsfile = 'import::STATS';
    759         $close_stats = 1;
    760     } else {
    761         &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
    762         &gsprintf($out, "{import.stats_backup}\n");
    763         $statsfile = 'STDERR';
    764     }
    765     }
    766 
    767     &gsprintf($out, "\n");
    768     &gsprintf($out, "*********************************************\n");
    769     &gsprintf($out, "{import.complete}\n");
    770     &gsprintf($out, "*********************************************\n");
    771 
    772     &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    773     if ($close_stats) {
    774     close STATS;
    775     }
    776 
    777     close OUT if $close_out;
    778     close FAILLOG;
    779 }
Note: See TracChangeset for help on using the changeset viewer.