Changeset 22413 for main/trunk/greenstone2/bin/script
- Timestamp:
- 2010-07-16T14:13:01+12:00 (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/import.pl
r22037 r22413 64 64 65 65 use strict; 66 no strict 'refs'; # allow filehandles to be variables and vice versa67 no strict 'subs'; # allow barewords (eg STDERR) as function arguments68 69 use arcinfo;70 use colcfg;71 use dbutil;72 use plugin;73 use plugout;74 use manifest;75 66 use inexport; 76 use util;77 use scriptutil;78 use FileHandle;79 use gsprintf 'gsprintf';80 use printusage;81 use parse2;82 83 84 67 85 68 my $oidtype_list = … … 258 241 'range' => "0,", 259 242 # parsearg left "" as default 260 #'deft' => "2",243 'deft' => "2", 261 244 'reqd' => "no", 262 245 'modegli' => "3" }, … … 277 260 278 261 262 263 sub main 264 { 265 my $inexport = new inexport(\@ARGV,$options); 266 267 my $collection = $inexport->get_collection(); 268 my $collect_cfg = $inexport->read_collection_cfg($collection,$options); 269 $inexport->set_collection_options("import",$collect_cfg); 270 271 my $pluginfo = $inexport->process_files("import",$collect_cfg); 272 273 $inexport->generate_statistics("import",$pluginfo); 274 } 275 276 279 277 &main(); 280 281 sub main {282 # params283 my ($language, $verbosity, $debug,284 $collectdir, $importdir, $archivedir, $site, $manifest,285 $incremental, $incremental_mode, $keepold, $removeold,286 $saveas,287 $OIDtype, $OIDmetadata,288 $maxdocs, $statsfile,289 $out, $faillog, $gli,290 $gzip, $groupsize,291 $sortmeta, $removeprefix, $removesuffix292 );293 294 my $xml = 0;295 296 # other vars297 my ($configfilename, $collection, $collectcfg,298 $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,299 $gs_mode,300 $processor, $pluginfo);301 302 my $service = "import";303 304 my $hashParsingResult = {};305 # general options available to all plugins306 my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");307 # Parse returns -1 if something has gone wrong308 if ($intArgLeftinAfterParsing == -1)309 {310 &PrintUsage::print_txt_usage($options, "{import.params}");311 die "\n";312 }313 314 foreach my $strVariable (keys %$hashParsingResult)315 {316 eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";317 }318 319 # If $language has been specified, load the appropriate resource bundle320 # (Otherwise, the default resource bundle will be loaded automatically)321 if ($language && $language =~ /\S/) {322 &gsprintf::load_language_specific_resource_bundle($language);323 }324 325 if ($xml) {326 &PrintUsage::print_xml_usage($options);327 print "\n";328 return;329 }330 331 if ($gli) { # the gli wants strings to be in UTF-8332 &gsprintf::output_strings_in_UTF8;333 }334 335 # now check that we had exactly one leftover arg, which should be336 # the collection name. We don't want to do this earlier, cos337 # -xml arg doesn't need a collection name338 # Or if the user specified -h, then we output the usage also339 if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))340 {341 &PrintUsage::print_txt_usage($options, "{import.params}");342 die "\n";343 }344 345 my $close_out = 0;346 if ($out !~ /^(STDERR|STDOUT)$/i) {347 open (OUT, ">$out") ||348 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);349 $out = 'import::OUT';350 $close_out = 1;351 }352 $out->autoflush(1);353 354 # get and check the collection name355 if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {356 &PrintUsage::print_txt_usage($options, "{import.params}");357 die "\n";358 }359 360 # add collection's perllib dir into include path in361 # case we have collection specific modules362 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");363 364 # check that we can open the faillog365 if ($faillog eq "") {366 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");367 }368 open (FAILLOG, ">$faillog") ||369 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);370 371 372 my $faillogname = $faillog;373 $faillog = 'import::FAILLOG';374 $faillog->autoflush(1);375 376 # Read in the collection configuration file.377 ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);378 $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);379 380 # If the infodbtype value wasn't defined in the collect.cfg file, use the default381 if (!defined($collectcfg->{'infodbtype'}))382 {383 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();384 }385 386 if (defined $collectcfg->{'importdir'} && $importdir eq "") {387 $importdir = $collectcfg->{'importdir'};388 }389 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {390 $archivedir = $collectcfg->{'archivedir'};391 }392 # fill in the default import and archives directories if none393 # were supplied, turn all \ into / and remove trailing /394 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";395 $importdir =~ s/[\\\/]+/\//g;396 $importdir =~ s/\/$//;397 if (!-e $importdir) {398 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);399 die "\n";400 }401 402 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";403 $archivedir =~ s/[\\\/]+/\//g;404 $archivedir =~ s/\/$//;405 406 my $plugins = [];407 if (defined $collectcfg->{'plugin'}) {408 $plugins = $collectcfg->{'plugin'};409 }410 #some global options for the plugins411 my @global_opts = ();412 413 if ($verbosity !~ /\d+/) {414 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {415 $verbosity = $collectcfg->{'verbosity'};416 } else {417 $verbosity = 2; # the default418 }419 }420 if (defined $collectcfg->{'manifest'} && $manifest eq "") {421 $manifest = $collectcfg->{'manifest'};422 }423 424 if (defined $collectcfg->{'gzip'} && !$gzip) {425 if ($collectcfg->{'gzip'} =~ /^true$/i) {426 $gzip = 1;427 }428 }429 430 if ($maxdocs !~ /\-?\d+/) {431 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {432 $maxdocs = $collectcfg->{'maxdocs'};433 } else {434 $maxdocs = -1; # the default435 }436 }437 if ($groupsize == 1) {438 if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {439 $groupsize = $collectcfg->{'groupsize'};440 }441 }442 443 if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {444 if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {445 $OIDtype = $collectcfg->{'OIDtype'};446 } else {447 $OIDtype = "hash"; # the default448 }449 }450 451 if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {452 if (defined $collectcfg->{'OIDmetadata'}) {453 $OIDmetadata = $collectcfg->{'OIDmetadata'};454 } else {455 $OIDmetadata = "dc.Identifier"; # the default456 }457 }458 459 if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {460 $sortmeta = $collectcfg->{'sortmeta'};461 }462 # sortmeta cannot be used with group size463 $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;464 if (defined $sortmeta && $groupsize > 1) {465 &gsprintf($out, "{import.cannot_sort}\n\n");466 $sortmeta = undef;467 }468 469 if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {470 $removeprefix = $collectcfg->{'removeprefix'};471 }472 473 if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {474 $removesuffix = $collectcfg->{'removesuffix'};475 }476 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {477 $debug = 1;478 }479 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {480 $gli = 1;481 }482 $gli = 0 unless defined $gli;483 484 # check keepold and removeold485 ($removeold, $keepold, $incremental, $incremental_mode)486 = &scriptutil::check_removeold_and_keepold($removeold, $keepold,487 $incremental, "archives",488 $collectcfg);489 490 491 print STDERR "<Import>\n" if $gli;492 493 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);494 if ($manifest ne "") {495 my $manifest_filename = $manifest;496 497 if ($manifest_filename !~ m/^[\\\/]/) {498 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);499 }500 501 $manifest =~ s/[\\\/]+/\//g;502 $manifest =~ s/\/$//;503 504 $manifest_lookup->parse($manifest_filename);505 }506 507 508 # load all the plugins509 $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);510 if (scalar(@$pluginfo) == 0) {511 &gsprintf($out, "{import.no_plugins_loaded}\n");512 die "\n";513 }514 515 # remove the old contents of the archives directory (and tmp directory) if needed516 if ($removeold) {517 if (-e $archivedir) {518 &gsprintf($out, "{import.removing_archives}\n");519 &util::rm_r ($archivedir);520 }521 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");522 $tmpdir =~ s/[\\\/]+/\//g;523 $tmpdir =~ s/\/$//;524 if (-e $tmpdir) {525 &gsprintf($out, "{import.removing_tmpdir}\n");526 &util::rm_r ($tmpdir);527 }528 }529 530 # create the archives dir if needed531 &util::mk_all_dir($archivedir);532 533 # read the archive information file534 ## $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");535 536 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)537 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));538 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));539 540 $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);541 $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);542 543 $archive_info = new arcinfo ($collectcfg->{'infodbtype'});544 $archive_info->load_info ($arcinfo_doc_filename);545 546 if ($manifest eq "") {547 # Load in list of files in import folder from last import (if present)548 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);549 }550 551 ####Use Plugout####552 my ($plugout);553 if (defined $collectcfg->{'plugout'}) {554 # If a plugout was specified in the collect.cfg file, assume it is sensible555 # We can't check the name because it could be anything, if it is a custom plugout556 $plugout = $collectcfg->{'plugout'};557 }558 else{559 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {560 push @$plugout,"GreenstoneXMLPlugout";561 }562 else{563 push @$plugout,$saveas."Plugout";564 }565 }566 567 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);568 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);569 push @$plugout,("-gzip_output") if ($gzip);570 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);571 push @$plugout,("-output_handle",$out) if (defined $out);572 push @$plugout,("-debug") if ($debug);573 574 $processor = &plugout::load_plugout($plugout);575 $processor->setoutputdir ($archivedir);576 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;577 $processor->set_OIDtype ($OIDtype, $OIDmetadata);578 579 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);580 581 if ($removeold) {582 # occasionally, plugins may want to do something on remove old, eg pharos image indexing583 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);584 }585 if ($manifest eq "") {586 # process the import directory587 my $block_hash = {};588 my $metadata = {};589 # gobal blocking pass may set up some metadata590 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);591 592 593 if ($incremental || $incremental_mode eq "onlyadd") {594 595 &inexport::prime_doc_oid_count($archivedir);596 597 598 # Can now work out which files were new, already existed, and have599 # been deleted600 601 &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir,602 $archivedir,$verbosity,$incremental_mode);603 604 my @new_files = sort keys %{$block_hash->{'new_files'}};605 if (scalar(@new_files>0)) {606 print STDERR "New files and modified metadata files since last import:\n ";607 print STDERR join("\n ",@new_files), "\n";608 }609 610 if ($incremental) {611 # only look for deletions if we are truely incremental612 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};613 # Filter out any in gsdl/tmp area614 my @filtered_deleted_files = ();615 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");616 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");617 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);618 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);619 620 foreach my $df (@deleted_files) {621 next if ($df =~ m/^$gsdl_tmp_area/);622 next if ($df =~ m/^$collect_tmp_area/);623 624 push(@filtered_deleted_files,$df);625 }626 627 628 @deleted_files = @filtered_deleted_files;629 630 if (scalar(@deleted_files)>0) {631 print STDERR "Files deleted since last import:\n ";632 print STDERR join("\n ",@deleted_files), "\n";633 634 635 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);636 637 &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");638 }639 640 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};641 642 if (scalar(@reindex_files)>0) {643 print STDERR "Files to reindex since last import:\n ";644 print STDERR join("\n ",@reindex_files), "\n";645 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);646 &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");647 }648 649 }650 651 # Play it safe, and run through the entire folder, only processing new or edited files652 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);653 654 }655 else {656 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);657 }658 659 }660 else661 {662 #663 # 1. Process delete files first664 #665 666 my @deleted_files = keys %{$manifest_lookup->{'delete'}};667 my @full_deleted_files = ();668 669 # ensure all filenames are absolute670 foreach my $df (@deleted_files) {671 my $full_df =672 (&util::filename_is_absolute($df))673 ? $df674 : &util::filename_cat($importdir,$df);675 676 push(@full_deleted_files,$full_df);677 }678 679 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);680 &inexport::mark_docs_for_deletion($archive_info,{},681 \@full_deleted_files,682 $archivedir, $verbosity, "delete");683 684 685 #686 # 2. Now files for reindexing687 #688 689 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};690 my @full_reindex_files = ();691 692 # ensure all filenames are absolute693 foreach my $rf (@reindex_files) {694 my $full_rf =695 (&util::filename_is_absolute($rf))696 ? $rf697 : &util::filename_cat($importdir,$rf);698 699 push(@full_reindex_files,$full_rf);700 }701 702 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);703 &inexport::mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");704 705 # And now ensure the new version of the file processed by appropriate706 # plugin707 foreach my $full_rf (@full_reindex_files) {708 &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);709 }710 711 712 #713 # 3. Now finally any new files714 #715 716 foreach my $file (keys %{$manifest_lookup->{'index'}}) {717 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);718 }719 720 721 }722 723 &plugin::end($pluginfo, $processor);724 725 &plugin::deinit($pluginfo, $processor);726 727 # Store the value of OIDCount (used in doc.pm) so it can be728 # restored correctly to this value on an incremental build729 &inexport::store_doc_oid_count($archivedir);730 731 # write out the archive information file732 $processor->close_file_output() if $groupsize > 1;733 $processor->close_group_output() if $processor->is_group();734 735 # The following 'if' statement is in the export.pl version of the script,736 # The reason for the 'if' statement is now given in export.pl737 # Unclear at this point if the same should be done here738 ## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {739 # Not all export types need this (e.g. DSpace)740 741 # should we still do this in debug mode??742 743 # for backwards compatability with archvies.inf file744 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {745 $archive_info->save_info($arcinfo_doc_filename);746 }747 else {748 $archive_info->save_revinfo_db($arcinfo_src_filename);749 }750 751 752 ## }753 754 # write out import stats755 my $close_stats = 0;756 if ($statsfile !~ /^(STDERR|STDOUT)$/i) {757 if (open (STATS, ">$statsfile")) {758 $statsfile = 'import::STATS';759 $close_stats = 1;760 } else {761 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);762 &gsprintf($out, "{import.stats_backup}\n");763 $statsfile = 'STDERR';764 }765 }766 767 &gsprintf($out, "\n");768 &gsprintf($out, "*********************************************\n");769 &gsprintf($out, "{import.complete}\n");770 &gsprintf($out, "*********************************************\n");771 772 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);773 if ($close_stats) {774 close STATS;775 }776 777 close OUT if $close_out;778 close FAILLOG;779 }
Note:
See TracChangeset
for help on using the changeset viewer.