Ignore:
Timestamp:
2010-07-18T16:36:56+12:00 (14 years ago)
Author:
davidb
Message:

Continued work on refactoring code to have better shared support for import.pl and export.pl

Location:
main/trunk/greenstone2/bin/script
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/export.pl

    r22331 r22421  
    6666
    6767use strict;
    68 no strict 'refs'; # allow filehandles to be variables and vice versa
    69 no strict 'subs'; # allow barewords (eg STDERR) as function arguments
    70 
    71 use arcinfo;
    72 use colcfg;
    73 use dbutil;
    74 use plugin;
    75 use plugout;
    76 use manifest;
     68#no strict 'refs'; # allow filehandles to be variables and vice versa
     69#no strict 'subs'; # allow barewords (eg STDERR) as function arguments
    7770use inexport;
    78 use util;
    79 use scriptutil;
    80 use FileHandle;
    81 use gsprintf 'gsprintf';
    82 use printusage;
    83 use parse2;
    84 
    8571
    8672my $oidtype_list =
     
    128114    [
    129115      $saveas_argument,
    130       { 'name' => "exportdir",
    131     'desc' => "{export.exportdir}",
     116      { 'name' => "archivedir",
     117    'desc' => "{export.archivedir}",
    132118    'type' => "string",
    133119    'reqd' => "no",
     
    285271
    286272
     273
     274sub main
     275{
     276    my $inexport = new inexport("export",\@ARGV,$options,$listall_options);
     277   
     278    my $collection = $inexport->get_collection();
     279    my ($config_filename,$collect_cfg) = $inexport->read_collection_cfg($collection,$options);   
     280    $inexport->set_collection_options($collect_cfg);
     281   
     282    my $pluginfo = $inexport->process_files($config_filename,$collect_cfg);
     283
     284    $inexport->generate_statistics($pluginfo);
     285}
     286
     287
    287288&main();
    288289
    289 sub main {
    290     # params
    291     my ($language, $verbosity, $debug,
    292     $collectdir, $importdir, $exportdir, $site, $manifest,
    293     $incremental, $incremental_mode, $keepold, $removeold,
    294     $saveas,
    295     $OIDtype, $OIDmetadata,
    296     $maxdocs, $statsfile,
    297     $gzip,
    298     $out, $faillog, $gli, $listall,
    299     # plugout specific ones
    300     $mapping_file, $xsltfile,
    301     $xslt_mets, $xslt_txt, $fedora_namespace, $group_marc);
    302 
    303     my $xml = 0;
    304    
    305     # other vars
    306     my ($configfilename, $collection, $collectcfg,
    307     $expinfo_doc_filename, $expinfo_src_filename, $export_info,
    308     $gs_mode,
    309     $processor, $pluginfo);
    310 
    311     my $service = "export";
    312 
    313     my $hashParsingResult = {};
    314     # general options available to all plugins
    315     my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    316    
    317     # If parse returns -1 then something has gone wrong
    318     if ($intArgLeftinAfterParsing == -1)
    319     {
    320     &PrintUsage::print_txt_usage($options, "{export.params}");
    321     die "\n";
    322     }
    323 
    324     foreach my $strVariable (keys %$hashParsingResult)
    325     {
    326     eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    327     }
    328 
    329    
    330     # If $language has been specified, load the appropriate resource bundle
    331     # (Otherwise, the default resource bundle will be loaded automatically)
    332     if ($language && $language =~ /\S/) {
    333     &gsprintf::load_language_specific_resource_bundle($language);
    334     }
    335 
    336     if ($listall) {
    337     if ($xml) {
    338         &PrintUsage::print_xml_usage($listall_options);
    339     }
    340     else
    341     {
    342         &PrintUsage::print_txt_usage($listall_options,"{export.params}");
    343     }
    344     die "\n";
    345     }
    346    
    347     if ($xml) {
    348         &PrintUsage::print_xml_usage($options);
    349     die "\n";
    350     }
    351 
    352     if ($gli) { # the gli wants strings to be in UTF-8
    353     &gsprintf::output_strings_in_UTF8;
    354     }
    355 
    356     # now check that we had exactly one leftover arg, which should be
    357     # the collection name. We don't want to do this earlier, cos
    358     # -xml arg doesn't need a collection name
    359     # Or if the user specified -h, then we output the usage also
    360     if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    361     {
    362     &PrintUsage::print_txt_usage($options, "{export.params}");
    363     die "\n";
    364     }
    365 
    366     my $close_out = 0;
    367     if ($out !~ /^(STDERR|STDOUT)$/i) {
    368     open (OUT, ">$out") ||
    369         (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die);
    370     $out = 'export::OUT';
    371     $close_out = 1;
    372     }
    373     $out->autoflush(1);
    374 
    375     # get and check the collection name
    376     if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
    377     &PrintUsage::print_txt_usage($options, "{export.params}");
    378     die "\n";
    379     }
    380     # add collection's perllib dir  into include path in
    381     # case we have collection specific modules
    382     unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    383    
    384     # check that we can open the faillog
    385     if ($faillog eq "") {
    386     $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    387     }
    388     open (FAILLOG, ">$faillog") ||
    389     (&gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillog) && die);
    390     my $faillogname = $faillog;
    391     $faillog = 'export::FAILLOG';
    392     $faillog->autoflush(1);
    393        
    394     # Read in the collection configuration file.
    395     ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    396     $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
    397    
    398     # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    399     if (!defined($collectcfg->{'infodbtype'}))
    400     {
    401       $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    402     }
    403    
    404     if (defined $collectcfg->{'importdir'} && $importdir eq "") {
    405     $importdir = $collectcfg->{'importdir'};
    406     }
    407     if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
    408     $exportdir = $collectcfg->{'exportdir'};
    409     }
    410 
    411     # fill in the default import and export directories if none
    412     # were supplied, turn all \ into / and remove trailing /
    413     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    414     $importdir =~ s/[\\\/]+/\//g;
    415     $importdir =~ s/\/$//;
    416     if (!-e $importdir) {
    417     &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
    418     die "\n";
    419     }
    420 
    421     $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
    422     $exportdir =~ s/[\\\/]+/\//g;
    423     $exportdir =~ s/\/$//;
    424    
    425     my $plugins = [];
    426     if (defined $collectcfg->{'plugin'}) {
    427     $plugins = $collectcfg->{'plugin'};
    428     }
    429     # some global options for the plugins           
    430     my @global_opts = ();
    431 
    432     if ($verbosity !~ /\d+/) {
    433     if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
    434         $verbosity = $collectcfg->{'verbosity'};
    435     } else {
    436         $verbosity = 2; # the default
    437     }
    438     }
    439    
    440     if (defined $collectcfg->{'manifest'} && $manifest eq "") {
    441     $manifest = $collectcfg->{'manifest'};
    442     }
    443     if (defined $collectcfg->{'gzip'} && !$gzip) {
    444     if ($collectcfg->{'gzip'} =~ /^true$/i) {
    445         $gzip = 1;
    446     }
    447     }
    448     if ($maxdocs !~ /\-?\d+/) {
    449     if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
    450         $maxdocs = $collectcfg->{'maxdocs'};
    451     } else {
    452         $maxdocs = -1; # the default
    453     }
    454     }
    455    
    456     # groupsize is in import - does it make sense here??
    457 
    458     if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
    459     if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
    460         $OIDtype = $collectcfg->{'OIDtype'};
    461     } else {
    462         $OIDtype = "hash"; # the default
    463     }
    464     }
    465 
    466     if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
    467     if (defined $collectcfg->{'OIDmetadata'}) {
    468         $OIDmetadata = $collectcfg->{'OIDmetadata'};
    469     } else {
    470         $OIDmetadata = "dc.Identifier"; # the default
    471     }
    472     }
    473 
    474     if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
    475     $debug = 1;
    476     }
    477     if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
    478     $gli = 1;
    479     }
    480     $gli = 0 unless defined $gli;
    481 
    482     # check keepold and removeold
    483     ($removeold, $keepold, $incremental, $incremental_mode)
    484     = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
    485                            $incremental, "export",
    486                            $collectcfg);
    487 
    488     print STDERR "<export>\n" if $gli;
    489    
    490     my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$exportdir);
    491     if ($manifest ne "") { 
    492     my $manifest_filename = $manifest;
    493 
    494     if ($manifest_filename !~ m/^[\\\/]/) {
    495         $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
    496     }
    497 
    498     $manifest =~ s/[\\\/]+/\//g;
    499     $manifest =~ s/\/$//;
    500 
    501     $manifest_lookup->parse($manifest_filename);
    502     }
    503    
    504     # load all the plugins
    505     $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
    506        
    507     if (scalar(@$pluginfo) == 0) {
    508     &gsprintf($out, "{import.no_plugins_loaded}\n");
    509     die "\n";
    510     }
    511    
    512     # remove the old contents of the export directory if needed
    513     if ($removeold) {
    514     if (-e $exportdir) {
    515         &gsprintf($out, "{export.removing_export}\n");
    516         &util::rm_r ($exportdir);
    517     }
    518     my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
    519     $tmpdir =~ s/[\\\/]+/\//g;
    520     $tmpdir =~ s/\/$//;
    521     if (-e $tmpdir) {
    522         &gsprintf($out, "{import.removing_tmpdir}\n");
    523         &util::rm_r ($tmpdir);
    524     }
    525     }
    526    
    527     # create the export dir if needed
    528     &util::mk_all_dir($exportdir);
    529    
    530     # read the export information file
    531    
    532     # the plugouts should be doing this!!
    533 ##  $expinfo_doc_filename = &util::filename_cat ($exportdir, "export.inf");
    534 
    535     # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    536     &util::rename_ldb_or_bdb_file(&util::filename_cat($exportdir, "archiveinf-doc"));
    537     &util::rename_ldb_or_bdb_file(&util::filename_cat($exportdir, "archiveinf-src"));
    538 
    539     $expinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $exportdir);
    540     $expinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $exportdir);
    541        
    542     $export_info = new arcinfo($collectcfg->{'infodbtype'});
    543     $export_info -> load_info ($expinfo_doc_filename); 
    544        
    545     if ($manifest eq "") {
    546     # Load in list of files in export folder from last export (if present)
    547     $export_info->load_prev_import_filelist ($expinfo_src_filename);
    548     }
    549    
    550     my ($plugout);
    551     if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
    552     $plugout = $collectcfg->{'plugout'};
    553     }
    554     else{
    555     if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
    556         push @$plugout,"GreenstoneMETSPlugout";
    557     }
    558     else{
    559         push @$plugout,$saveas."Plugout";
    560     }
    561     }
    562    
    563     my $plugout_name = $plugout->[0];
    564        
    565     push @$plugout,("-output_info",$export_info) if (defined $export_info);
    566     push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    567     push @$plugout,("-debug") if ($debug);
    568     push @$plugout,("-gzip_output") if ($gzip);
    569     push @$plugout,("-output_handle",$out) if (defined $out);
    570     push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
    571     push @$plugout,("-group") if ($group_marc && $plugout_name =~ m/^MARCXMLPlugout$/);
    572     push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "" && $plugout_name =~ m/^MARCXMLPlugout$/);
    573     push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "" && $plugout_name =~ m/^.*METSPlugout$/);
    574     push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "" && $plugout_name =~ m/^.*METSPlugout$/);
    575     push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "" && $plugout_name eq "FedoraMETSPlugout");
    576    
    577     $processor = &plugout::load_plugout($plugout);   
    578     $processor->setoutputdir ($exportdir);
    579        
    580     $processor->set_OIDtype ($OIDtype, $OIDmetadata);
    581        
    582     &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
    583        
    584     if ($manifest eq "") {
    585     # process the import directory
    586     my $block_hash = {};
    587     my $metadata = {};
    588     # global blocking pass may set up some metadata
    589     &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    590     #&plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    591     ### section below copied from import.pl
    592     if ($incremental) {
    593         # equivalent to saying ($keepold && ($incremental_mode eq "all"))
    594 
    595         &inexport::prime_doc_oid_count($exportdir);
    596 
    597 
    598         # Can now work out which files were new, already existed, and have
    599         # been deleted
    600        
    601         &inexport::new_vs_old_import_diff($export_info,$block_hash,$importdir,
    602                           $exportdir,$verbosity,$incremental_mode);
    603        
    604         my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
    605         # Filter out any in gsdl/tmp area
    606         my @filtered_deleted_files = ();
    607         my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
    608         my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    609         $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
    610         $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
    611 
    612 
    613         foreach my $df (@deleted_files) {
    614         next if ($df =~ m/^$gsdl_tmp_area/);
    615         next if ($df =~ m/^$collect_tmp_area/);
    616        
    617         push(@filtered_deleted_files,$df);
    618         }
    619 
    620        
    621         @deleted_files = @filtered_deleted_files;
    622 
    623         if (scalar(@deleted_files>0)) {
    624         print STDERR "Files deleted since last import:\n  ";
    625         print STDERR join("\n  ",@deleted_files), "\n";
    626         }
    627        
    628         my @new_files = sort keys %{$block_hash->{'new_files'}};
    629         if (scalar(@new_files>0)) {
    630         print STDERR "New files since last import:\n  ";
    631         print STDERR join("\n  ",@new_files), "\n";
    632         }
    633        
    634         &inexport::mark_docs_for_deletion($export_info,$block_hash,\@deleted_files,
    635                           $exportdir,$verbosity);
    636 
    637         &inexport::mark_docs_for_reindex($export_info,$block_hash,
    638                          $exportdir,$verbosity);
    639 
    640         my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
    641 
    642         if (scalar(@reindex_files>0)) {
    643         print STDERR "Files to reindex since last import:\n  ";
    644         print STDERR join("\n  ",@reindex_files), "\n";
    645         }
    646 
    647 
    648         # not sure if the following will work -- will the metadata data-structure be correctly initialized
    649         # in the right order?
    650 #       foreach my $file (@new_files, @reindex_files) {
    651 #       &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    652 #       }
    653 
    654 
    655         # Play it safe, and run through the entire folder, only processing new or edited files
    656         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    657 
    658     }
    659     else {
    660         &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    661     }
    662 
    663     ### end copy
    664     }
    665     else {
    666     # process any files marked for exporting
    667     foreach my $file (keys %{$manifest_lookup->{'index'}}) {
    668         &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
    669     }
    670 
    671     my @deleted_files = keys %{$manifest_lookup->{'delete'}};
    672 
    673     &inexport::mark_docs_for_deletion($export_info,{},\@deleted_files,$exportdir);
    674 
    675     }
    676 
    677     if ($saveas eq "FedoraMETS") {
    678     # create collection "doc obj" for Fedora that contains
    679     # collection-level metadata
    680    
    681     my $doc_obj = new doc($configfilename,"nonindexed_doc","none");
    682     $doc_obj->set_OID("collection");
    683    
    684     my $col_name = undef;
    685     my $col_meta = $collectcfg->{'collectionmeta'};
    686    
    687     if (defined $col_meta) {
    688        
    689         store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
    690         store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
    691        
    692     }
    693     $processor->process($doc_obj);
    694     }
    695        
    696     &plugin::end($pluginfo, $processor);
    697 
    698     &plugin::deinit($pluginfo, $processor);
    699        
    700     # Store the value of OIDCount (used in doc.pm) so it can be
    701     # restored correctly to this value on an incremental build
    702     &inexport::store_doc_oid_count($exportdir);
    703 
    704     # write out the export information file
    705     #$processor->close_file_output() if $groupsize > 1;
    706     $processor->close_group_output() if $processor->is_group();
    707 
    708 #    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARCXML")) {
    709 #   # Not all export types need this,
    710 
    711 ##  $export_info->save_info($expinfo_doc_filename);
    712 #    }
    713 
    714 
    715     # for backwards compatibility with archives.inf file
    716     if ($expinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
    717     $export_info->save_info($expinfo_doc_filename);
    718     }
    719     else {
    720     $export_info->save_revinfo_db($expinfo_src_filename);
    721     }
    722 
    723        
    724     # write out export stats
    725     my $close_stats = 0;
    726     if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
    727     if (open (STATS, ">$statsfile")) {
    728         $statsfile = 'import::STATS';
    729         $close_stats = 1;
    730     } else {
    731         &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
    732         &gsprintf($out, "{import.stats_backup}\n");
    733         $statsfile = 'STDERR';
    734     }
    735     }
    736 
    737     &gsprintf($out, "\n");
    738     &gsprintf($out, "*********************************************\n");
    739     &gsprintf($out, "{export.complete}\n");
    740     &gsprintf($out, "*********************************************\n");
    741    
    742     &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    743     if ($close_stats) {
    744     close STATS;
    745     }
    746    
    747     close OUT if $close_out;
    748     close FAILLOG;
    749 }
    750 
    751 
    752 sub store_collectionmeta
    753 {
    754     my ($collectionmeta,$field,$doc_obj) = @_;
    755 
    756     my $section = $doc_obj->get_top_section();
    757 
    758     my $field_hash = $collectionmeta->{$field};
    759 
    760     foreach my $k (keys %$field_hash)
    761     {
    762     my $val = $field_hash->{$k};
    763 
    764     ### print STDERR "*** $k = $field_hash->{$k}\n";
    765 
    766     my $md_label = "ex.$field";
    767 
    768 
    769     if ($k =~ m/^\[l=(.*?)\]$/)
    770     {
    771 
    772         my $md_suffix = $1;
    773         $md_label .= "^$md_suffix";
    774     }
    775 
    776 
    777     $doc_obj->add_utf8_metadata($section,$md_label, $val);
    778    
    779     # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
    780     # while "collectionname" in GS2 is called "name" in GS3.
    781     # The $nameMap variable in collConfigxml.pm maps between GS2 and GS3
    782     if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
    783     {
    784         $doc_obj->add_utf8_metadata($section,"dc.Title", $val);
    785     }
    786 
    787     }
    788 }
    789 
    790 
    791 
    792 
     290
  • main/trunk/greenstone2/bin/script/import.pl

    r22413 r22421  
    241241    'range' => "0,",
    242242    # parsearg left "" as default
    243     'deft' => "2",
     243    # 'deft' => "2",
    244244    'reqd' => "no",
    245245    'modegli' => "3" },
     
    263263sub main
    264264{
    265     my $inexport = new inexport(\@ARGV,$options);
     265    my $inexport = new inexport("import",\@ARGV,$options);
    266266   
    267267    my $collection = $inexport->get_collection();
    268     my $collect_cfg = $inexport->read_collection_cfg($collection,$options);   
    269     $inexport->set_collection_options("import",$collect_cfg);
     268    my ($config_filename,$collect_cfg) = $inexport->read_collection_cfg($collection,$options);   
     269    $inexport->set_collection_options($collect_cfg);
    270270   
    271     my $pluginfo = $inexport->process_files("import",$collect_cfg);
    272 
    273     $inexport->generate_statistics("import",$pluginfo);
     271    my $pluginfo = $inexport->process_files($config_filename,$collect_cfg);
     272
     273    $inexport->generate_statistics($pluginfo);
    274274}
    275275
Note: See TracChangeset for help on using the changeset viewer.