Changeset 22421 for main/trunk/greenstone2/bin
- Timestamp:
- 2010-07-18T16:36:56+12:00 (14 years ago)
- Location:
- main/trunk/greenstone2/bin/script
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/export.pl
r22331 r22421 66 66 67 67 use strict; 68 no strict 'refs'; # allow filehandles to be variables and vice versa 69 no strict 'subs'; # allow barewords (eg STDERR) as function arguments 70 71 use arcinfo; 72 use colcfg; 73 use dbutil; 74 use plugin; 75 use plugout; 76 use manifest; 68 #no strict 'refs'; # allow filehandles to be variables and vice versa 69 #no strict 'subs'; # allow barewords (eg STDERR) as function arguments 77 70 use inexport; 78 use util;79 use scriptutil;80 use FileHandle;81 use gsprintf 'gsprintf';82 use printusage;83 use parse2;84 85 71 86 72 my $oidtype_list = … … 128 114 [ 129 115 $saveas_argument, 130 { 'name' => " exportdir",131 'desc' => "{export. exportdir}",116 { 'name' => "archivedir", 117 'desc' => "{export.archivedir}", 132 118 'type' => "string", 133 119 'reqd' => "no", … … 285 271 286 272 273 274 sub main 275 { 276 my $inexport = new inexport("export",\@ARGV,$options,$listall_options); 277 278 my $collection = $inexport->get_collection(); 279 my ($config_filename,$collect_cfg) = $inexport->read_collection_cfg($collection,$options); 280 $inexport->set_collection_options($collect_cfg); 281 282 my $pluginfo = $inexport->process_files($config_filename,$collect_cfg); 283 284 $inexport->generate_statistics($pluginfo); 285 } 286 287 287 288 &main(); 288 289 289 sub main { 290 # params 291 my ($language, $verbosity, $debug, 292 $collectdir, $importdir, $exportdir, $site, $manifest, 293 $incremental, $incremental_mode, $keepold, $removeold, 294 $saveas, 295 $OIDtype, $OIDmetadata, 296 $maxdocs, $statsfile, 297 $gzip, 298 $out, $faillog, $gli, $listall, 299 # plugout specific ones 300 $mapping_file, $xsltfile, 301 $xslt_mets, $xslt_txt, $fedora_namespace, $group_marc); 302 303 my $xml = 0; 304 305 # other vars 306 my ($configfilename, $collection, $collectcfg, 307 $expinfo_doc_filename, $expinfo_src_filename, $export_info, 308 $gs_mode, 309 $processor, $pluginfo); 310 311 my $service = "export"; 312 313 my $hashParsingResult = {}; 314 # general options available to all plugins 315 my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options"); 316 317 # If parse returns -1 then something has gone wrong 318 if ($intArgLeftinAfterParsing == -1) 319 { 320 &PrintUsage::print_txt_usage($options, "{export.params}"); 321 die "\n"; 322 } 323 324 foreach my $strVariable (keys %$hashParsingResult) 325 { 326 eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}"; 327 } 328 329 330 # If $language has been specified, load the appropriate resource bundle 331 # (Otherwise, the default resource bundle will be loaded automatically) 332 if ($language && $language =~ /\S/) { 333 &gsprintf::load_language_specific_resource_bundle($language); 334 } 335 336 if ($listall) { 337 if ($xml) { 338 &PrintUsage::print_xml_usage($listall_options); 339 } 340 else 341 { 342 &PrintUsage::print_txt_usage($listall_options,"{export.params}"); 343 } 344 die "\n"; 345 } 346 347 if ($xml) { 348 &PrintUsage::print_xml_usage($options); 349 die "\n"; 350 } 351 352 if ($gli) { # the gli wants strings to be in UTF-8 353 &gsprintf::output_strings_in_UTF8; 354 } 355 356 # now check that we had exactly one leftover arg, which should be 357 # the collection name. We don't want to do this earlier, cos 358 # -xml arg doesn't need a collection name 359 # Or if the user specified -h, then we output the usage also 360 if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) 361 { 362 &PrintUsage::print_txt_usage($options, "{export.params}"); 363 die "\n"; 364 } 365 366 my $close_out = 0; 367 if ($out !~ /^(STDERR|STDOUT)$/i) { 368 open (OUT, ">$out") || 369 (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die); 370 $out = 'export::OUT'; 371 $close_out = 1; 372 } 373 $out->autoflush(1); 374 375 # get and check the collection name 376 if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") { 377 &PrintUsage::print_txt_usage($options, "{export.params}"); 378 die "\n"; 379 } 380 # add collection's perllib dir into include path in 381 # case we have collection specific modules 382 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); 383 384 # check that we can open the faillog 385 if ($faillog eq "") { 386 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log"); 387 } 388 open (FAILLOG, ">$faillog") || 389 (&gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillog) && die); 390 my $faillogname = $faillog; 391 $faillog = 'export::FAILLOG'; 392 $faillog->autoflush(1); 393 394 # Read in the collection configuration file. 395 ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out); 396 $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode); 397 398 # If the infodbtype value wasn't defined in the collect.cfg file, use the default 399 if (!defined($collectcfg->{'infodbtype'})) 400 { 401 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type(); 402 } 403 404 if (defined $collectcfg->{'importdir'} && $importdir eq "") { 405 $importdir = $collectcfg->{'importdir'}; 406 } 407 if (defined $collectcfg->{'exportdir'} && $exportdir eq "") { 408 $exportdir = $collectcfg->{'exportdir'}; 409 } 410 411 # fill in the default import and export directories if none 412 # were supplied, turn all \ into / and remove trailing / 413 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; 414 $importdir =~ s/[\\\/]+/\//g; 415 $importdir =~ s/\/$//; 416 if (!-e $importdir) { 417 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir); 418 die "\n"; 419 } 420 421 $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq ""; 422 $exportdir =~ s/[\\\/]+/\//g; 423 $exportdir =~ s/\/$//; 424 425 my $plugins = []; 426 if (defined $collectcfg->{'plugin'}) { 427 $plugins = $collectcfg->{'plugin'}; 428 } 429 # some global options for the plugins 430 my @global_opts = (); 431 432 if ($verbosity !~ /\d+/) { 433 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) { 434 $verbosity = $collectcfg->{'verbosity'}; 435 } else { 436 $verbosity = 2; # the default 437 } 438 } 439 440 if (defined $collectcfg->{'manifest'} && $manifest eq "") { 441 $manifest = $collectcfg->{'manifest'}; 442 } 443 if (defined $collectcfg->{'gzip'} && !$gzip) { 444 if ($collectcfg->{'gzip'} =~ /^true$/i) { 445 $gzip = 1; 446 } 447 } 448 if ($maxdocs !~ /\-?\d+/) { 449 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) { 450 $maxdocs = $collectcfg->{'maxdocs'}; 451 } else { 452 $maxdocs = -1; # the default 453 } 454 } 455 456 # groupsize is in import - does it make sense here?? 457 458 if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) { 459 if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) { 460 $OIDtype = $collectcfg->{'OIDtype'}; 461 } else { 462 $OIDtype = "hash"; # the default 463 } 464 } 465 466 if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) { 467 if (defined $collectcfg->{'OIDmetadata'}) { 468 $OIDmetadata = $collectcfg->{'OIDmetadata'}; 469 } else { 470 $OIDmetadata = "dc.Identifier"; # the default 471 } 472 } 473 474 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) { 475 $debug = 1; 476 } 477 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) { 478 $gli = 1; 479 } 480 $gli = 0 unless defined $gli; 481 482 # check keepold and removeold 483 ($removeold, $keepold, $incremental, $incremental_mode) 484 = &scriptutil::check_removeold_and_keepold($removeold, $keepold, 485 $incremental, "export", 486 $collectcfg); 487 488 print STDERR "<export>\n" if $gli; 489 490 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$exportdir); 491 if ($manifest ne "") { 492 my $manifest_filename = $manifest; 493 494 if ($manifest_filename !~ m/^[\\\/]/) { 495 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename); 496 } 497 498 $manifest =~ s/[\\\/]+/\//g; 499 $manifest =~ s/\/$//; 500 501 $manifest_lookup->parse($manifest_filename); 502 } 503 504 # load all the plugins 505 $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode); 506 507 if (scalar(@$pluginfo) == 0) { 508 &gsprintf($out, "{import.no_plugins_loaded}\n"); 509 die "\n"; 510 } 511 512 # remove the old contents of the export directory if needed 513 if ($removeold) { 514 if (-e $exportdir) { 515 &gsprintf($out, "{export.removing_export}\n"); 516 &util::rm_r ($exportdir); 517 } 518 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp"); 519 $tmpdir =~ s/[\\\/]+/\//g; 520 $tmpdir =~ s/\/$//; 521 if (-e $tmpdir) { 522 &gsprintf($out, "{import.removing_tmpdir}\n"); 523 &util::rm_r ($tmpdir); 524 } 525 } 526 527 # create the export dir if needed 528 &util::mk_all_dir($exportdir); 529 530 # read the export information file 531 532 # the plugouts should be doing this!! 533 ## $expinfo_doc_filename = &util::filename_cat ($exportdir, "export.inf"); 534 535 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes) 536 &util::rename_ldb_or_bdb_file(&util::filename_cat($exportdir, "archiveinf-doc")); 537 &util::rename_ldb_or_bdb_file(&util::filename_cat($exportdir, "archiveinf-src")); 538 539 $expinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $exportdir); 540 $expinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $exportdir); 541 542 $export_info = new arcinfo($collectcfg->{'infodbtype'}); 543 $export_info -> load_info ($expinfo_doc_filename); 544 545 if ($manifest eq "") { 546 # Load in list of files in export folder from last export (if present) 547 $export_info->load_prev_import_filelist ($expinfo_src_filename); 548 } 549 550 my ($plugout); 551 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) { 552 $plugout = $collectcfg->{'plugout'}; 553 } 554 else{ 555 if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) { 556 push @$plugout,"GreenstoneMETSPlugout"; 557 } 558 else{ 559 push @$plugout,$saveas."Plugout"; 560 } 561 } 562 563 my $plugout_name = $plugout->[0]; 564 565 push @$plugout,("-output_info",$export_info) if (defined $export_info); 566 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity); 567 push @$plugout,("-debug") if ($debug); 568 push @$plugout,("-gzip_output") if ($gzip); 569 push @$plugout,("-output_handle",$out) if (defined $out); 570 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne ""); 571 push @$plugout,("-group") if ($group_marc && $plugout_name =~ m/^MARCXMLPlugout$/); 572 push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "" && $plugout_name =~ m/^MARCXMLPlugout$/); 573 push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "" && $plugout_name =~ m/^.*METSPlugout$/); 574 push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "" && $plugout_name =~ m/^.*METSPlugout$/); 575 push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "" && $plugout_name eq "FedoraMETSPlugout"); 576 577 $processor = &plugout::load_plugout($plugout); 578 $processor->setoutputdir ($exportdir); 579 580 $processor->set_OIDtype ($OIDtype, $OIDmetadata); 581 582 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli); 583 584 if ($manifest eq "") { 585 # process the import directory 586 my $block_hash = {}; 587 my $metadata = {}; 588 # gobal blocking pass may set up some metadata 589 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 590 #&plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 591 ### section below copied from import.pl 592 if ($incremental) { 593 # equivalent to saying ($keepold && ($incremental_mode eq "all")) 594 595 &inexport::prime_doc_oid_count($exportdir); 596 597 598 # Can now work out which files were new, already existed, and have 599 # been deleted 600 601 &inexport::new_vs_old_import_diff($export_info,$block_hash,$importdir, 602 $exportdir,$verbosity,$incremental_mode); 603 604 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}}; 605 # Filter out any in gsdl/tmp area 606 my @filtered_deleted_files = (); 607 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp"); 608 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp"); 609 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area); 610 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area); 611 612 613 foreach my $df (@deleted_files) { 614 next if ($df =~ m/^$gsdl_tmp_area/); 615 next if ($df =~ m/^$collect_tmp_area/); 616 617 push(@filtered_deleted_files,$df); 618 } 619 620 621 @deleted_files = @filtered_deleted_files; 622 623 if (scalar(@deleted_files>0)) { 624 print STDERR "Files deleted since last import:\n "; 625 print STDERR join("\n ",@deleted_files), "\n"; 626 } 627 628 my @new_files = sort keys %{$block_hash->{'new_files'}}; 629 if (scalar(@new_files>0)) { 630 print STDERR "New files since last import:\n "; 631 print STDERR join("\n ",@new_files), "\n"; 632 } 633 634 &inexport::mark_docs_for_deletion($export_info,$block_hash,\@deleted_files, 635 $exportdir,$verbosity); 636 637 &inexport::mark_docs_for_reindex($export_info,$block_hash, 638 $exportdir,$verbosity); 639 640 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}}; 641 642 if (scalar(@reindex_files>0)) { 643 print STDERR "Files to reindex since last import:\n "; 644 print STDERR join("\n ",@reindex_files), "\n"; 645 } 646 647 648 # not sure if the following will work -- will the metadata data-structure be correctly initialized 649 # in the right order? 650 # foreach my $file (@new_files, @reindex_files) { 651 # &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 652 # } 653 654 655 # Play it safe, and run through the entire folder, only processing new or edited files 656 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 657 658 } 659 else { 660 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 661 } 662 663 ### end copy 664 } 665 else { 666 # process any files marked for exporting 667 foreach my $file (keys %{$manifest_lookup->{'index'}}) { 668 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli); 669 } 670 671 my @deleted_files = keys %{$manifest_lookup->{'delete'}}; 672 673 &inexport::mark_docs_for_deletion($export_info,{},\@deleted_files,$exportdir); 674 675 } 676 677 if ($saveas eq "FedoraMETS") { 678 # create collection "doc obj" for Fedora that contains 679 # collection-level metadata 680 681 my $doc_obj = new doc($configfilename,"nonindexed_doc","none"); 682 $doc_obj->set_OID("collection"); 683 684 my $col_name = undef; 685 my $col_meta = $collectcfg->{'collectionmeta'}; 686 687 if (defined $col_meta) { 688 689 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name 690 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description 691 692 } 693 $processor->process($doc_obj); 694 } 695 696 &plugin::end($pluginfo, $processor); 697 698 &plugin::deinit($pluginfo, $processor); 699 700 # Store the value of OIDCount (used in doc.pm) so it can be 701 # restored correctly to this value on an incremental build 702 &inexport::store_doc_oid_count($exportdir); 703 704 # write out the export information file 705 #$processor->close_file_output() if $groupsize > 1; 706 $processor->close_group_output() if $processor->is_group(); 707 708 # if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARCXML")) { 709 # # Not all export types need this, 710 711 ## $export_info->save_info($expinfo_doc_filename); 712 # } 713 714 715 # for backwards compatability with archvies.inf file 716 if ($expinfo_doc_filename =~ m/(contents)|(\.inf)$/) { 717 $export_info->save_info($expinfo_doc_filename); 718 } 719 else { 720 $export_info->save_revinfo_db($expinfo_src_filename); 721 } 722 723 724 # write out export stats 725 my $close_stats = 0; 726 if ($statsfile !~ /^(STDERR|STDOUT)$/i) { 727 if (open (STATS, ">$statsfile")) { 728 $statsfile = 'import::STATS'; 729 $close_stats = 1; 730 } else { 731 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile); 732 &gsprintf($out, "{import.stats_backup}\n"); 733 $statsfile = 'STDERR'; 734 } 735 } 736 737 &gsprintf($out, "\n"); 738 &gsprintf($out, "*********************************************\n"); 739 &gsprintf($out, "{export.complete}\n"); 740 &gsprintf($out, "*********************************************\n"); 741 742 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli); 743 if ($close_stats) { 744 close STATS; 745 } 746 747 close OUT if $close_out; 748 close FAILLOG; 749 } 750 751 752 sub store_collectionmeta 753 { 754 my ($collectionmeta,$field,$doc_obj) = @_; 755 756 my $section = $doc_obj->get_top_section(); 757 758 my $field_hash = $collectionmeta->{$field}; 759 760 foreach my $k (keys %$field_hash) 761 { 762 my $val = $field_hash->{$k}; 763 764 ### print STDERR "*** $k = $field_hash->{$k}\n"; 765 766 my $md_label = "ex.$field"; 767 768 769 if ($k =~ m/^\[l=(.*?)\]$/) 770 { 771 772 my $md_suffix = $1; 773 $md_label .= "^$md_suffix"; 774 } 775 776 777 $doc_obj->add_utf8_metadata($section,$md_label, $val); 778 779 # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3, 780 # while "collectionname" in GS2 is called "name" in GS3. 781 # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3 782 if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname")) 783 { 784 $doc_obj->add_utf8_metadata($section,"dc.Title", $val); 785 } 786 787 } 788 } 789 790 791 792 290 -
main/trunk/greenstone2/bin/script/import.pl
r22413 r22421 241 241 'range' => "0,", 242 242 # parsearg left "" as default 243 'deft' => "2",243 # 'deft' => "2", 244 244 'reqd' => "no", 245 245 'modegli' => "3" }, … … 263 263 sub main 264 264 { 265 my $inexport = new inexport( \@ARGV,$options);265 my $inexport = new inexport("import",\@ARGV,$options); 266 266 267 267 my $collection = $inexport->get_collection(); 268 my $collect_cfg= $inexport->read_collection_cfg($collection,$options);269 $inexport->set_collection_options( "import",$collect_cfg);268 my ($config_filename,$collect_cfg) = $inexport->read_collection_cfg($collection,$options); 269 $inexport->set_collection_options($collect_cfg); 270 270 271 my $pluginfo = $inexport->process_files( "import",$collect_cfg);272 273 $inexport->generate_statistics( "import",$pluginfo);271 my $pluginfo = $inexport->process_files($config_filename,$collect_cfg); 272 273 $inexport->generate_statistics($pluginfo); 274 274 } 275 275
Note:
See TracChangeset
for help on using the changeset viewer.