- Timestamp:
- 2010-07-16T14:13:01+12:00 (14 years ago)
- Location:
- main/trunk/greenstone2
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/import.pl
r22037 r22413 64 64 65 65 use strict; 66 no strict 'refs'; # allow filehandles to be variables and vice versa67 no strict 'subs'; # allow barewords (eg STDERR) as function arguments68 69 use arcinfo;70 use colcfg;71 use dbutil;72 use plugin;73 use plugout;74 use manifest;75 66 use inexport; 76 use util;77 use scriptutil;78 use FileHandle;79 use gsprintf 'gsprintf';80 use printusage;81 use parse2;82 83 84 67 85 68 my $oidtype_list = … … 258 241 'range' => "0,", 259 242 # parsearg left "" as default 260 #'deft' => "2",243 'deft' => "2", 261 244 'reqd' => "no", 262 245 'modegli' => "3" }, … … 277 260 278 261 262 263 sub main 264 { 265 my $inexport = new inexport(\@ARGV,$options); 266 267 my $collection = $inexport->get_collection(); 268 my $collect_cfg = $inexport->read_collection_cfg($collection,$options); 269 $inexport->set_collection_options("import",$collect_cfg); 270 271 my $pluginfo = $inexport->process_files("import",$collect_cfg); 272 273 $inexport->generate_statistics("import",$pluginfo); 274 } 275 276 279 277 &main(); 280 281 sub main {282 # params283 my ($language, $verbosity, $debug,284 $collectdir, $importdir, $archivedir, $site, $manifest,285 $incremental, $incremental_mode, $keepold, $removeold,286 $saveas,287 $OIDtype, $OIDmetadata,288 $maxdocs, $statsfile,289 $out, $faillog, $gli,290 $gzip, $groupsize,291 $sortmeta, $removeprefix, $removesuffix292 );293 294 my $xml = 0;295 296 # other vars297 my ($configfilename, $collection, $collectcfg,298 $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,299 $gs_mode,300 $processor, $pluginfo);301 302 my $service = "import";303 304 my $hashParsingResult = {};305 # general options available to all plugins306 my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");307 # Parse returns -1 if something has gone wrong308 if ($intArgLeftinAfterParsing == -1)309 {310 &PrintUsage::print_txt_usage($options, "{import.params}");311 die "\n";312 }313 314 foreach my $strVariable (keys %$hashParsingResult)315 {316 eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";317 }318 319 # If $language has been specified, load the appropriate resource bundle320 # (Otherwise, the default resource bundle will be loaded automatically)321 if ($language && $language =~ /\S/) {322 &gsprintf::load_language_specific_resource_bundle($language);323 }324 325 if ($xml) {326 &PrintUsage::print_xml_usage($options);327 print "\n";328 return;329 }330 331 if ($gli) { # the gli wants strings to be in UTF-8332 &gsprintf::output_strings_in_UTF8;333 }334 335 # now check that we had exactly one leftover arg, which should be336 # the collection name. We don't want to do this earlier, cos337 # -xml arg doesn't need a collection name338 # Or if the user specified -h, then we output the usage also339 if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))340 {341 &PrintUsage::print_txt_usage($options, "{import.params}");342 die "\n";343 }344 345 my $close_out = 0;346 if ($out !~ /^(STDERR|STDOUT)$/i) {347 open (OUT, ">$out") ||348 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);349 $out = 'import::OUT';350 $close_out = 1;351 }352 $out->autoflush(1);353 354 # get and check the collection name355 if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {356 &PrintUsage::print_txt_usage($options, "{import.params}");357 die "\n";358 }359 360 # add collection's perllib dir into include path in361 # case we have collection specific modules362 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");363 364 # check that we can open the faillog365 if ($faillog eq "") {366 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");367 }368 open (FAILLOG, ">$faillog") ||369 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);370 371 372 my $faillogname = $faillog;373 $faillog = 'import::FAILLOG';374 $faillog->autoflush(1);375 376 # Read in the collection configuration file.377 ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);378 $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);379 380 # If the infodbtype value wasn't defined in the collect.cfg file, use the default381 if (!defined($collectcfg->{'infodbtype'}))382 {383 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();384 }385 386 if (defined $collectcfg->{'importdir'} && $importdir eq "") {387 $importdir = $collectcfg->{'importdir'};388 }389 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {390 $archivedir = $collectcfg->{'archivedir'};391 }392 # fill in the default import and archives directories if none393 # were supplied, turn all \ into / and remove trailing /394 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";395 $importdir =~ s/[\\\/]+/\//g;396 $importdir =~ s/\/$//;397 if (!-e $importdir) {398 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);399 die "\n";400 }401 402 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";403 $archivedir =~ s/[\\\/]+/\//g;404 $archivedir =~ s/\/$//;405 406 my $plugins = [];407 if (defined $collectcfg->{'plugin'}) {408 $plugins = $collectcfg->{'plugin'};409 }410 #some global options for the plugins411 my @global_opts = ();412 413 if ($verbosity !~ /\d+/) {414 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {415 $verbosity = $collectcfg->{'verbosity'};416 } else {417 $verbosity = 2; # the default418 }419 }420 if (defined $collectcfg->{'manifest'} && $manifest eq "") {421 $manifest = $collectcfg->{'manifest'};422 }423 424 if (defined $collectcfg->{'gzip'} && !$gzip) {425 if ($collectcfg->{'gzip'} =~ /^true$/i) {426 $gzip = 1;427 }428 }429 430 if ($maxdocs !~ /\-?\d+/) {431 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {432 $maxdocs = $collectcfg->{'maxdocs'};433 } else {434 $maxdocs = -1; # the default435 }436 }437 if ($groupsize == 1) {438 if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {439 $groupsize = $collectcfg->{'groupsize'};440 }441 }442 443 if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {444 if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {445 $OIDtype = $collectcfg->{'OIDtype'};446 } else {447 $OIDtype = "hash"; # the default448 }449 }450 451 if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {452 if (defined $collectcfg->{'OIDmetadata'}) {453 $OIDmetadata = $collectcfg->{'OIDmetadata'};454 } else {455 $OIDmetadata = "dc.Identifier"; # the default456 }457 }458 459 if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {460 $sortmeta = $collectcfg->{'sortmeta'};461 }462 # sortmeta cannot be used with group size463 $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;464 if (defined $sortmeta && $groupsize > 1) {465 &gsprintf($out, "{import.cannot_sort}\n\n");466 $sortmeta = undef;467 }468 469 if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {470 $removeprefix = $collectcfg->{'removeprefix'};471 }472 473 if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {474 $removesuffix = $collectcfg->{'removesuffix'};475 }476 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {477 $debug = 1;478 }479 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {480 $gli = 1;481 }482 $gli = 0 unless defined $gli;483 484 # check keepold and removeold485 ($removeold, $keepold, $incremental, $incremental_mode)486 = &scriptutil::check_removeold_and_keepold($removeold, $keepold,487 $incremental, "archives",488 $collectcfg);489 490 491 print STDERR "<Import>\n" if $gli;492 493 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);494 if ($manifest ne "") {495 my $manifest_filename = $manifest;496 497 if ($manifest_filename !~ m/^[\\\/]/) {498 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);499 }500 501 $manifest =~ s/[\\\/]+/\//g;502 $manifest =~ s/\/$//;503 504 $manifest_lookup->parse($manifest_filename);505 }506 507 508 # load all the plugins509 $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);510 if (scalar(@$pluginfo) == 0) {511 &gsprintf($out, "{import.no_plugins_loaded}\n");512 die "\n";513 }514 515 # remove the old contents of the archives directory (and tmp directory) if needed516 if ($removeold) {517 if (-e $archivedir) {518 &gsprintf($out, "{import.removing_archives}\n");519 &util::rm_r ($archivedir);520 }521 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");522 $tmpdir =~ s/[\\\/]+/\//g;523 $tmpdir =~ s/\/$//;524 if (-e $tmpdir) {525 &gsprintf($out, "{import.removing_tmpdir}\n");526 &util::rm_r ($tmpdir);527 }528 }529 530 # create the archives dir if needed531 &util::mk_all_dir($archivedir);532 533 # read the archive information file534 ## $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");535 536 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)537 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));538 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));539 540 $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);541 $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);542 543 $archive_info = new arcinfo ($collectcfg->{'infodbtype'});544 $archive_info->load_info ($arcinfo_doc_filename);545 546 if ($manifest eq "") {547 # Load in list of files in import folder from last import (if present)548 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);549 }550 551 ####Use Plugout####552 my ($plugout);553 if (defined $collectcfg->{'plugout'}) {554 # If a plugout was specified in the collect.cfg file, assume it is sensible555 # We can't check the name because it could be anything, if it is a custom plugout556 $plugout = $collectcfg->{'plugout'};557 }558 else{559 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {560 push @$plugout,"GreenstoneXMLPlugout";561 }562 else{563 push @$plugout,$saveas."Plugout";564 }565 }566 567 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);568 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);569 push @$plugout,("-gzip_output") if ($gzip);570 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);571 push @$plugout,("-output_handle",$out) if (defined $out);572 push @$plugout,("-debug") if ($debug);573 574 $processor = &plugout::load_plugout($plugout);575 $processor->setoutputdir ($archivedir);576 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;577 $processor->set_OIDtype ($OIDtype, $OIDmetadata);578 579 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);580 581 if ($removeold) {582 # occasionally, plugins may want to do something on remove old, eg pharos image indexing583 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);584 }585 if ($manifest eq "") {586 # process the import directory587 my $block_hash = {};588 my $metadata = {};589 # gobal blocking pass may set up some metadata590 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);591 592 593 if ($incremental || $incremental_mode eq "onlyadd") {594 595 &inexport::prime_doc_oid_count($archivedir);596 597 598 # Can now work out which files were new, already existed, and have599 # been deleted600 601 &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir,602 $archivedir,$verbosity,$incremental_mode);603 604 my @new_files = sort keys %{$block_hash->{'new_files'}};605 if (scalar(@new_files>0)) {606 print STDERR "New files and modified metadata files since last import:\n ";607 print STDERR join("\n ",@new_files), "\n";608 }609 610 if ($incremental) {611 # only look for deletions if we are truely incremental612 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};613 # Filter out any in gsdl/tmp area614 my @filtered_deleted_files = ();615 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");616 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");617 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);618 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);619 620 foreach my $df (@deleted_files) {621 next if ($df =~ m/^$gsdl_tmp_area/);622 next if ($df =~ m/^$collect_tmp_area/);623 624 push(@filtered_deleted_files,$df);625 }626 627 628 @deleted_files = @filtered_deleted_files;629 630 if (scalar(@deleted_files)>0) {631 print STDERR "Files deleted since last import:\n ";632 print STDERR join("\n ",@deleted_files), "\n";633 634 635 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);636 637 &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");638 }639 640 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};641 642 if (scalar(@reindex_files)>0) {643 print STDERR "Files to reindex since last import:\n ";644 print STDERR join("\n ",@reindex_files), "\n";645 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);646 &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");647 }648 649 }650 651 # Play it safe, and run through the entire folder, only processing new or edited files652 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);653 654 }655 else {656 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);657 }658 659 }660 else661 {662 #663 # 1. Process delete files first664 #665 666 my @deleted_files = keys %{$manifest_lookup->{'delete'}};667 my @full_deleted_files = ();668 669 # ensure all filenames are absolute670 foreach my $df (@deleted_files) {671 my $full_df =672 (&util::filename_is_absolute($df))673 ? $df674 : &util::filename_cat($importdir,$df);675 676 push(@full_deleted_files,$full_df);677 }678 679 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);680 &inexport::mark_docs_for_deletion($archive_info,{},681 \@full_deleted_files,682 $archivedir, $verbosity, "delete");683 684 685 #686 # 2. Now files for reindexing687 #688 689 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};690 my @full_reindex_files = ();691 692 # ensure all filenames are absolute693 foreach my $rf (@reindex_files) {694 my $full_rf =695 (&util::filename_is_absolute($rf))696 ? $rf697 : &util::filename_cat($importdir,$rf);698 699 push(@full_reindex_files,$full_rf);700 }701 702 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);703 &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");704 705 # And now ensure the new version of the file processed by appropriate706 # plugin707 foreach my $full_rf (@full_reindex_files) {708 &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);709 }710 711 712 #713 # 3. Now finally any new files714 #715 716 foreach my $file (keys %{$manifest_lookup->{'index'}}) {717 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);718 }719 720 721 }722 723 &plugin::end($pluginfo, $processor);724 725 &plugin::deinit($pluginfo, $processor);726 727 # Store the value of OIDCount (used in doc.pm) so it can be728 # restored correctly to this value on an incremental build729 &inexport::store_doc_oid_count($archivedir);730 731 # write out the archive information file732 $processor->close_file_output() if $groupsize > 1;733 $processor->close_group_output() if $processor->is_group();734 735 # The following 'if' statement is in the export.pl version of the script,736 # The reason for the 'if' statement is now given in export.pl737 # Unclear at this point if the same should be done here738 ## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {739 # Not all export types need this (e.g. DSpace)740 741 # should we still do this in debug mode??742 743 # for backwards compatability with archvies.inf file744 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {745 $archive_info->save_info($arcinfo_doc_filename);746 }747 else {748 $archive_info->save_revinfo_db($arcinfo_src_filename);749 }750 751 752 ## }753 754 # write out import stats755 my $close_stats = 0;756 if ($statsfile !~ /^(STDERR|STDOUT)$/i) {757 if (open (STATS, ">$statsfile")) {758 $statsfile = 'import::STATS';759 $close_stats = 1;760 } else {761 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);762 &gsprintf($out, "{import.stats_backup}\n");763 $statsfile = 'STDERR';764 }765 }766 767 &gsprintf($out, "\n");768 &gsprintf($out, "*********************************************\n");769 &gsprintf($out, "{import.complete}\n");770 &gsprintf($out, "*********************************************\n");771 772 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);773 if ($close_stats) {774 close STATS;775 }776 777 close OUT if $close_out;778 close FAILLOG;779 } -
main/trunk/greenstone2/perllib/inexport.pm
r22327 r22413 1 1 ########################################################################### 2 2 # 3 # inexport.pm -- useful utilities to support import.pl and export.pl3 # inexport.pm -- useful class to support import.pl and export.pl 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 28 28 use strict; 29 29 30 use File::Basename; 31 30 no strict 'refs'; # allow filehandles to be variables and vice versa 31 no strict 'subs'; # allow barewords (eg STDERR) as function arguments 32 33 use arcinfo; 34 use colcfg; 35 use dbutil; 36 use plugin; 37 use plugout; 38 use manifest; 39 use inexport; 32 40 use dbutil; 33 41 use util; 42 use scriptutil; 43 use FileHandle; 44 use gsprintf 'gsprintf'; 45 use printusage; 46 use parse2; 47 48 use File::Basename; 49 50 sub new 51 { 52 my $class = shift (@_); 53 my ($argv,$options) = @_; 54 55 my $self = { 'xml' => 0 }; 56 57 # general options available to all plugins 58 my $arguments = $options->{'args'}; 59 my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options"); 60 # Parse returns -1 if something has gone wrong 61 if ($intArgLeftinAfterParsing == -1) 62 { 63 &PrintUsage::print_txt_usage($options, "{import.params}"); 64 die "\n"; 65 } 66 67 my $language = $self->{'language'}; 68 # If $language has been specified, load the appropriate resource bundle 69 # (Otherwise, the default resource bundle will be loaded automatically) 70 if ($language && $language =~ /\S/) { 71 &gsprintf::load_language_specific_resource_bundle($language); 72 } 73 74 if ($self->{'xml'}) { 75 &PrintUsage::print_xml_usage($options); 76 print "\n"; 77 return; 78 } 79 80 if ($self->{'gli'}) { # the gli wants strings to be in UTF-8 81 &gsprintf::output_strings_in_UTF8; 82 } 83 84 # now check that we had exactly one leftover arg, which should be 85 # the collection name. We don't want to do this earlier, cos 86 # -xml arg doesn't need a collection name 87 # Or if the user specified -h, then we output the usage also 88 if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/)) 89 { 90 &PrintUsage::print_txt_usage($options, "{import.params}"); 91 die "\n"; 92 } 93 94 $self->{'close_out'} = 0; 95 my $out = $self->{'out'}; 96 if ($out !~ /^(STDERR|STDOUT)$/i) { 97 open (OUT, ">$out") || 98 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die); 99 $out = 'import::OUT'; 100 $self->{'close_out'} = 1; 101 } 102 $out->autoflush(1); 103 $self->{'out'} = $out; 104 105 # @ARGV should be only one item, the name of the collection 106 $self->{'collection'} = shift @$argv; 107 108 return bless $self, $class; 109 } 110 111 sub get_collection 112 { 113 my $self = shift @_; 114 115 return $self->{'collection'}; 116 } 117 118 119 sub read_collection_cfg 120 { 121 my $self = shift @_; 122 my ($collection,$options) = @_; 123 124 my $collectdir = $self->{'collectdir'}; 125 my $site = $self->{'site'}; 126 my $out = $self->{'out'}; 127 128 if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") { 129 &PrintUsage::print_txt_usage($options, "{import.params}"); 130 die "\n"; 131 } 132 133 # add collection's perllib dir into include path in 134 # case we have collection specific modules 135 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); 136 137 # check that we can open the faillog 138 my $faillog = $self->{'faillog'}; 139 if ($faillog eq "") { 140 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log"); 141 } 142 open (FAILLOG, ">$faillog") || 143 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die); 144 145 146 my $faillogname = $faillog; 147 $faillog = 'inexport::FAILLOG'; 148 $faillog->autoflush(1); 149 $self->{'faillog'} = $faillog; 150 $self->{'faillogname'} = $faillogname; 151 152 # Read in the collection configuration file. 153 my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out); 154 my $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode); 155 156 return $collectcfg; 157 } 158 159 sub set_collection_options 160 { 161 my $self = shift @_; 162 my ($inexport_mode,$collectcfg) = @_; 163 164 my $verbosity = $self->{'verbosity'}; 165 print STDERR "**** verbosity = $verbosity\n\n\n"; 166 167 my $debug = $self->{'debug'}; 168 my $importdir = $self->{'importdir'}; 169 my $archivedir = $self->{'archivedir'}; 170 my $out = $self->{'out'}; 171 172 # If the infodbtype value wasn't defined in the collect.cfg file, use the default 173 if (!defined($collectcfg->{'infodbtype'})) 174 { 175 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type(); 176 } 177 178 if (defined $collectcfg->{'importdir'} && $importdir eq "") { 179 $importdir = $collectcfg->{'importdir'}; 180 } 181 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") { 182 $archivedir = $collectcfg->{'archivedir'}; 183 } 184 # fill in the default import and archives directories if none 185 # were supplied, turn all \ into / and remove trailing / 186 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; 187 $importdir =~ s/[\\\/]+/\//g; 188 $importdir =~ s/\/$//; 189 if (!-e $importdir) { 190 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir); 191 die "\n"; 192 } 193 $self->{'importdir'} = $importdir; 194 195 if ($archivedir eq "") { 196 if ($inexport_mode eq "import") { 197 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); 198 } 199 elsif ($inexport_mode eq "export") { 200 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export"); 201 } 202 else { 203 print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n"; 204 print STDERR " Defaulting to 'archives' for file output\n"; 205 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); 206 } 207 } 208 209 $archivedir =~ s/[\\\/]+/\//g; 210 $archivedir =~ s/\/$//; 211 $self->{'archivedir'} = $archivedir; 212 213 if ($verbosity !~ /\d+/) { 214 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) { 215 $verbosity = $collectcfg->{'verbosity'}; 216 } else { 217 $verbosity = 2; # the default 218 } 219 } 220 if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") { 221 $self->{'manifest'} = $collectcfg->{'manifest'}; 222 } 223 224 if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) { 225 if ($collectcfg->{'gzip'} =~ /^true$/i) { 226 $self->{'gzip'} = 1; 227 } 228 } 229 230 if ($self->{'maxdocs'} !~ /\-?\d+/) { 231 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) { 232 $self->{'maxdocs'} = $collectcfg->{'maxdocs'}; 233 } else { 234 $self->{'maxdocs'} = -1; # the default 235 } 236 } 237 if ($self->{'groupsize'} == 1) { 238 if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) { 239 $self->{'groupsize'} = $collectcfg->{'groupsize'}; 240 } 241 } 242 243 if (!defined $self->{'OIDtype'} 244 || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) { 245 if (defined $collectcfg->{'OIDtype'} 246 && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) { 247 $self->{'OIDtype'} = $collectcfg->{'OIDtype'}; 248 } else { 249 $self->{'OIDtype'} = "hash"; # the default 250 } 251 } 252 253 if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) { 254 if (defined $collectcfg->{'OIDmetadata'}) { 255 $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'}; 256 } else { 257 $self->{'OIDmetadata'} = "dc.Identifier"; # the default 258 } 259 } 260 261 my $sortmeta = $self->{'sortmeta'}; 262 if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) { 263 $sortmeta = $collectcfg->{'sortmeta'}; 264 } 265 # sortmeta cannot be used with group size 266 $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/; 267 if (defined $sortmeta && $self->{'groupsize'} > 1) { 268 &gsprintf($out, "{import.cannot_sort}\n\n"); 269 $sortmeta = undef; 270 } 271 $self->{'sortmeta'} = $sortmeta; 272 273 if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") { 274 $self->{'removeprefix'} = $collectcfg->{'removeprefix'}; 275 } 276 277 if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") { 278 $self->{'removesuffix'} = $collectcfg->{'removesuffix'}; 279 } 280 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) { 281 $self->{'debug'} = 1; 282 } 283 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) { 284 $self->{'gli'} = 1; 285 } 286 $self->{'gli'} = 0 unless defined $self->{'gli'}; 287 288 # check keepold and removeold 289 my ($removeold, $keepold, $incremental, $incremental_mode) 290 = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'}, 291 $self->{'incremental'}, "archives", 292 $collectcfg); 293 294 $self->{'removeold'} = $removeold; 295 $self->{'keepold'} = $keepold; 296 $self->{'incremental'} = $incremental; 297 $self->{'incremental_mode'} = $incremental_mode; 298 } 299 300 sub process_files 301 { 302 my $self = shift @_; 303 my ($inexport_mode,$collectcfg) = @_; 304 305 my $verbosity = $self->{'verbosity'}; 306 my $debug = $self->{'debug'}; 307 308 my $importdir = $self->{'importdir'}; 309 my $archivedir = $self->{'archivedir'}; 310 311 my $incremental = $self->{'incremental'}; 312 my $incremental_mode = $self->{'incremental_mode'}; 313 314 my $removeold = $self->{'removeold'}; 315 my $keepold = $self->{'keepold'}; 316 317 my $saveas = $self->{'saveas'}; 318 my $OIDtype = $self->{'OIDtype'}; 319 my $OIDmetadata = $self->{'OIDmetadata'}; 320 321 my $out = $self->{'out'}; 322 my $faillog = $self->{'faillog'}; 323 324 my $maxdocs = $self->{'maxdocs'}; 325 my $gzip = $self->{'gzip'}; 326 my $groupsize = $self->{'groupsize'}; 327 my $sortmeta = $self->{'sortmeta'}; 328 329 my $removeprefix = $self->{'removeprefix'}; 330 my $removesuffix = $self->{'removesuffix'}; 331 332 my $gli = $self->{'gli'}; 333 334 print STDERR "<Import>\n" if $gli; 335 336 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir); 337 if ($self->{'manifest'} ne "") { 338 my $manifest_filename = $self->{'manifest'}; 339 340 if (!&util::filename_is_absolute($manifest_filename)) { 341 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename); 342 } 343 344 $self->{'manifest'} =~ s/[\\\/]+/\//g; 345 $self->{'manifest'} =~ s/\/$//; 346 347 $manifest_lookup->parse($manifest_filename); 348 } 349 350 my $manifest = $self->{'manifest'}; 351 352 # load all the plugins 353 my $plugins = []; 354 if (defined $collectcfg->{'plugin'}) { 355 $plugins = $collectcfg->{'plugin'}; 356 } 357 358 #some global options for the plugins 359 my @global_opts = (); 360 361 362 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode); 363 if (scalar(@$pluginfo) == 0) { 364 &gsprintf($out, "{import.no_plugins_loaded}\n"); 365 die "\n"; 366 } 367 368 # remove the old contents of the archives directory (and tmp directory) if needed 369 if ($removeold) { 370 if (-e $archivedir) { 371 &gsprintf($out, "{import.removing_archives}\n"); 372 &util::rm_r ($archivedir); 373 } 374 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp"); 375 $tmpdir =~ s/[\\\/]+/\//g; 376 $tmpdir =~ s/\/$//; 377 if (-e $tmpdir) { 378 &gsprintf($out, "{import.removing_tmpdir}\n"); 379 &util::rm_r ($tmpdir); 380 } 381 } 382 383 # create the archives dir if needed 384 &util::mk_all_dir($archivedir); 385 386 # read the archive information file 387 ## my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf"); 388 389 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes) 390 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc")); 391 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src")); 392 393 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir); 394 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir); 395 396 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'}); 397 $archive_info->load_info ($arcinfo_doc_filename); 398 399 if ($manifest eq "") { 400 # Load in list of files in import folder from last import (if present) 401 $archive_info->load_prev_import_filelist ($arcinfo_src_filename); 402 } 403 404 ####Use Plugout#### 405 my ($plugout); 406 if (defined $collectcfg->{'plugout'}) { 407 # If a plugout was specified in the collect.cfg file, assume it is sensible 408 # We can't check the name because it could be anything, if it is a custom plugout 409 $plugout = $collectcfg->{'plugout'}; 410 } 411 else{ 412 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) { 413 push @$plugout,"GreenstoneXMLPlugout"; 414 } 415 else{ 416 push @$plugout,$saveas."Plugout"; 417 } 418 } 419 420 push @$plugout,("-output_info",$archive_info) if (defined $archive_info); 421 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity); 422 push @$plugout,("-gzip_output") if ($gzip); 423 push @$plugout,("-group_size",$groupsize) if (defined $groupsize); 424 push @$plugout,("-output_handle",$out) if (defined); 425 push @$plugout,("-debug") if ($debug); 426 427 my $processor = &plugout::load_plugout($plugout); 428 $processor->setoutputdir ($archivedir); 429 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; 430 $processor->set_OIDtype ($OIDtype, $OIDmetadata); 431 432 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli); 433 434 if ($removeold) { 435 # occasionally, plugins may want to do something on remove old, eg pharos image indexing 436 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli); 437 } 438 if ($manifest eq "") { 439 # process the import directory 440 my $block_hash = {}; 441 my $metadata = {}; 442 # gobal blocking pass may set up some metadata 443 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 444 445 446 if ($incremental || $incremental_mode eq "onlyadd") { 447 448 prime_doc_oid_count($archivedir); 449 450 451 # Can now work out which files were new, already existed, and have 452 # been deleted 453 454 new_vs_old_import_diff($archive_info,$block_hash,$importdir, 455 $archivedir,$verbosity,$incremental_mode); 456 457 my @new_files = sort keys %{$block_hash->{'new_files'}}; 458 if (scalar(@new_files>0)) { 459 print STDERR "New files and modified metadata files since last import:\n "; 460 print STDERR join("\n ",@new_files), "\n"; 461 } 462 463 if ($incremental) { 464 # only look for deletions if we are truely incremental 465 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}}; 466 # Filter out any in gsdl/tmp area 467 my @filtered_deleted_files = (); 468 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp"); 469 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp"); 470 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area); 471 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area); 472 473 foreach my $df (@deleted_files) { 474 next if ($df =~ m/^$gsdl_tmp_area/); 475 next if ($df =~ m/^$collect_tmp_area/); 476 477 push(@filtered_deleted_files,$df); 478 } 479 480 481 @deleted_files = @filtered_deleted_files; 482 483 if (scalar(@deleted_files)>0) { 484 print STDERR "Files deleted since last import:\n "; 485 print STDERR join("\n ",@deleted_files), "\n"; 486 487 488 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files); 489 490 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete"); 491 } 492 493 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}}; 494 495 if (scalar(@reindex_files)>0) { 496 print STDERR "Files to reindex since last import:\n "; 497 print STDERR join("\n ",@reindex_files), "\n"; 498 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files); 499 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex"); 500 } 501 502 } 503 504 # Play it safe, and run through the entire folder, only processing new or edited files 505 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 506 507 } 508 else { 509 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 510 } 511 512 } 513 else 514 { 515 # 516 # 1. Process delete files first 517 # 518 519 my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 520 my @full_deleted_files = (); 521 522 # ensure all filenames are absolute 523 foreach my $df (@deleted_files) { 524 my $full_df = 525 (&util::filename_is_absolute($df)) 526 ? $df 527 : &util::filename_cat($importdir,$df); 528 529 push(@full_deleted_files,$full_df); 530 } 531 532 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files); 533 mark_docs_for_deletion($archive_info,{}, 534 \@full_deleted_files, 535 $archivedir, $verbosity, "delete"); 536 537 538 # 539 # 2. Now files for reindexing 540 # 541 542 my @reindex_files = keys %{$manifest_lookup->{'reindex'}}; 543 my @full_reindex_files = (); 544 545 # ensure all filenames are absolute 546 foreach my $rf (@reindex_files) { 547 my $full_rf = 548 (&util::filename_is_absolute($rf)) 549 ? $rf 550 : &util::filename_cat($importdir,$rf); 551 552 push(@full_reindex_files,$full_rf); 553 } 554 555 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files); 556 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex"); 557 558 # And now ensure the new version of the file processed by appropriate 559 # plugin 560 foreach my $full_rf (@full_reindex_files) { 561 &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli); 562 } 563 564 565 # 566 # 3. Now finally any new files 567 # 568 569 foreach my $file (keys %{$manifest_lookup->{'index'}}) { 570 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 571 } 572 573 574 } 575 576 &plugin::end($pluginfo, $processor); 577 578 &plugin::deinit($pluginfo, $processor); 579 580 # Store the value of OIDCount (used in doc.pm) so it can be 581 # restored correctly to this value on an incremental build 582 store_doc_oid_count($archivedir); 583 584 # write out the archive information file 585 $processor->close_file_output() if $groupsize > 1; 586 $processor->close_group_output() if $processor->is_group(); 587 588 # The following 'if' statement is in the export.pl version of the script, 589 # The reason for the 'if' statement is now given in export.pl 590 # Unclear at this point if the same should be done here 591 ## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) { 592 # Not all export types need this (e.g. DSpace) 593 594 # should we still do this in debug mode?? 595 596 # for backwards compatability with archvies.inf file 597 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) { 598 $archive_info->save_info($arcinfo_doc_filename); 599 } 600 else { 601 $archive_info->save_revinfo_db($arcinfo_src_filename); 602 } 603 604 605 ## } 606 607 return $pluginfo; 608 } 609 610 611 sub generate_statistics 612 { 613 my $self = shift @_; 614 my ($inexport_mode,$pluginfo) = @_; 615 616 my $statsfile = $self->{'statsfile'}; 617 my $out = $self->{'out'}; 618 my $faillogname = $self->{'faillogname'}; 619 my $gli = $self->{'gli'}; 620 621 # write out import stats 622 my $close_stats = 0; 623 if ($statsfile !~ /^(STDERR|STDOUT)$/i) { 624 if (open (STATS, ">$statsfile")) { 625 $statsfile = 'import::STATS'; 626 $close_stats = 1; 627 } else { 628 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile); 629 &gsprintf($out, "{import.stats_backup}\n"); 630 $statsfile = 'STDERR'; 631 } 632 } 633 634 &gsprintf($out, "\n"); 635 &gsprintf($out, "*********************************************\n"); 636 &gsprintf($out, "{import.complete}\n"); 637 &gsprintf($out, "*********************************************\n"); 638 639 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli); 640 if ($close_stats) { 641 close STATS; 642 } 643 644 close OUT if $self->{'close_out'}; 645 close FAILLOG; 646 } 647 648 649 650 651 34 652 35 653
Note:
See TracChangeset
for help on using the changeset viewer.