Changeset 27302 for main/trunk
- Timestamp:
- 2013-05-06T15:21:07+12:00 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/inexport.pm
r26567 r27302 119 119 $self->{'collection'} = shift @$argv; 120 120 121 if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) { 122 require ParallelInexport; 123 } 121 # Unless otherwise stated all manifests are considered version 1---where 122 # they act more like an advanced process expression---as compared to newer 123 # manifest files that act as an explicit (and exhaustive) list of files to 124 # process [jmt12] 125 $self->{'manifest_version'} = 1; 124 126 125 127 return bless $self, $class; … … 143 145 else { 144 146 $self->{'site'} = ""; 145 $self->{'collectdir'} = & util::filename_cat($ENV{'GSDLHOME'},"collect");147 $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect"); 146 148 } 147 149 $self->{'faillog'} = ""; … … 179 181 $self->{'gs_version'} = "3"; 180 182 } 181 # add collection's perllib dir into include path in 183 184 # add collection's perllib dir into include path in 182 185 # case we have collection specific modules 183 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");186 &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib')); 184 187 185 188 # check that we can open the faillog 186 189 my $faillog = $self->{'faillog'}; 187 190 if ($faillog eq "") { 188 $faillog = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");191 $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log"); 189 192 } 190 193 open (FAILLOG, ">$faillog") || … … 197 200 $self->{'faillog'} = $faillog; 198 201 $self->{'faillogname'} = $faillogname; 202 $self->{'close_faillog'} = 1; 199 203 200 204 # Read in the collection configuration file. 
… … 237 241 # fill in the default import and archives directories if none 238 242 # were supplied, turn all \ into / and remove trailing / 239 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; 243 $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; 244 # @todo &FileUtils::sanitizePath($importdir) [jmt12] 240 245 $importdir =~ s/[\\\/]+/\//g; 241 246 $importdir =~ s/\/$//; … … 248 253 if ($archivedir eq "") { 249 254 if ($inexport_mode eq "import") { 250 $archivedir = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");255 $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives"); 251 256 } 252 257 elsif ($inexport_mode eq "export") { 253 $archivedir = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "export");258 $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export"); 254 259 } 255 260 else { 256 261 print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n"; 257 262 print STDERR " Defaulting to 'archives' for file output\n"; 258 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); 259 } 260 } 261 263 $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives"); 264 } 265 } 266 267 # @todo &FileUtils::sanitizePath($archivedir) [jmt12] 262 268 $archivedir =~ s/[\\\/]+/\//g; 263 269 $archivedir =~ s/\/$//; … … 355 361 $self->{'incremental'} = $incremental; 356 362 $self->{'incremental_mode'} = $incremental_mode; 363 364 # Since this wasted my morning, let's at least warn a user that manifest 365 # files now *only* work if keepold is set [jmt12] 366 if ($self->{'manifest'} && !$self->{'keepold'}) 367 { 368 print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n"; 369 } 357 370 } 358 371 … … 394 407 395 408 my $gli = $self->{'gli'}; 396 397 my $jobs = $self->{'jobs'};398 my $epoch = $self->{'epoch'};399 409 400 410 # related 
to export … … 418 428 my $manifest_filename = $self->{'manifest'}; 419 429 420 if (!& util::filename_is_absolute($manifest_filename)) {421 $manifest_filename = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);430 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) { 431 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename); 422 432 } 423 433 … … 426 436 427 437 $manifest_lookup->parse($manifest_filename); 438 439 # manifests may now include a version number [jmt12] 440 $self->{'manifest_version'} = $manifest_lookup->get_version(); 428 441 } 429 442 … … 454 467 455 468 if ($removeold) { 456 if ( -e $archivedir) {469 if (&FileUtils::directoryExists($archivedir)) { 457 470 &gsprintf($out, "{import.removing_archives}\n"); 458 & util::rm_r($archivedir);459 } 460 my $tmpdir = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");471 &FileUtils::removeFilesRecursive($archivedir); 472 } 473 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp"); 461 474 $tmpdir =~ s/[\\\/]+/\//g; 462 475 $tmpdir =~ s/\/$//; 463 if ( -e $tmpdir) {476 if (&FileUtils::directoryExists($tmpdir)) { 464 477 &gsprintf($out, "{import.removing_tmpdir}\n"); 465 & util::rm_r($tmpdir);478 &FileUtils::removeFileRecursive($tmpdir); 466 479 } 467 480 } 468 481 469 482 # create the archives dir if needed 470 & util::mk_all_dir($archivedir);483 &FileUtils::makeAllDirectories($archivedir); 471 484 472 485 # read the archive information file 473 486 474 487 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes) 475 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc")); 476 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src")); 477 478 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir); 479 my $arcinfo_src_filename = 
&dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir); 480 488 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc")); 489 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src")); 490 491 # When we make these initial calls to determine the archive information doc 492 # and src databases we pass through a '1' to indicate this is the first 493 # time we are referring to these databases. When using dynamic dbutils 494 # (available in extensions) this indicates to some database types (for 495 # example, persistent servers) that this is a good time to perform any 496 # one time initialization. The argument has no effect on vanilla dbutils 497 # [jmt12] 498 my $perform_firsttime_init = 1; 499 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init); 500 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init); 501 481 502 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'}); 482 503 $archive_info->load_info ($arcinfo_doc_filename); … … 547 568 } 548 569 549 my $processor = &plugout::load_plugout($plugout); 570 my $processor = &plugout::load_plugout($plugout); 550 571 $processor->setoutputdir ($archivedir); 551 572 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; … … 565 586 $block_hash->{'new_files'} = {}; 566 587 $block_hash->{'reindex_files'} = {}; 588 # all of these are set somewhere else, so it's more readable to define them 589 # here [jmt12] 590 $block_hash->{'all_files'} = {}; 591 $block_hash->{'deleted_files'} = {}; 592 $block_hash->{'file_blocks'} = {}; 593 $block_hash->{'metadata_files'} = {}; 594 $block_hash->{'shared_fileroot'} = ''; 595 # a new flag so we can tell we had a manifest way down in the plugins 596 # [jmt12] 597 
$block_hash->{'manifest'} = 'false'; 567 598 my $metadata = {}; 568 599 569 600 # global blocking pass may set up some metadata 570 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 571 601 # - when we have a newer manifest file we don't do this -unless- the 602 # collection configuration indicates this collection contains complex 603 # (inherited) metadata [jmt12] 604 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true')) 605 { 606 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 607 } 608 else 609 { 610 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n"; 611 } 612 572 613 if ($manifest ne "") { 614 615 # mark that we are using a manifest - information that might be needed 616 # down in plugins (for instance DirectoryPlugin) 617 $block_hash->{'manifest'} = $self->{'manifest_version'}; 618 573 619 # 574 620 # 1. Process delete files first … … 580 626 foreach my $df (@deleted_files) { 581 627 my $full_df = 582 (& util::filename_is_absolute($df))628 (&FileUtils::isFilenameAbsolute($df)) 583 629 ? $df 584 : & util::filename_cat($importdir,$df);630 : &FileUtils::filenameConcatenate($importdir,$df); 585 631 586 632 if (-d $full_df) { … … 606 652 foreach my $rf (@reindex_files) { 607 653 my $full_rf = 608 (& util::filename_is_absolute($rf))654 (&FileUtils::isFilenameAbsolute($rf)) 609 655 ? $rf 610 : & util::filename_cat($importdir,$rf);656 : &FileUtils::filenameConcatenate($importdir,$rf); 611 657 612 658 if (-d $full_rf) { … … 637 683 # ensure filename is absolute 638 684 my $full_nf = 639 (& util::filename_is_absolute($nf))685 (&FileUtils::isFilenameAbsolute($nf)) 640 686 ? 
$nf 641 : & util::filename_cat($importdir,$nf);687 : &FileUtils::filenameConcatenate($importdir,$nf); 642 688 643 689 if (-d $full_nf) { … … 649 695 650 696 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir); 697 # need to check this file exists before trying to read it - in the past 698 # it wasn't possible to have a manifest unless keepold was also set so 699 # you were pretty much guaranteed arcinfo existed 700 # [jmt12] 701 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12] 702 if (-e $arcinfo_src_filename) 703 { 651 704 my $arcinfodb_map = {}; 652 705 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map); … … 662 715 663 716 undef $arcinfodb_map; 717 } 718 # no existing files - so we can just add all the files [jmt12] 719 else 720 { 721 foreach my $f (@full_new_files) 722 { 723 $block_hash->{'new_files'}->{$f} = 1; 724 } 725 } 726 727 # If we are not using complex inherited metadata (and thus have skipped 728 # the global file scan) we need to at least check for a matching 729 # metadata.xml for the files being indexed/reindexed 730 # - unless we are using the newer version of Manifests, which are treated 731 # verbatim, and should have a metadata element for metadata files (so 732 # we can explicitly process metadata files other than metadata.xml) 733 # [jmt12] 734 if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true')) 735 { 736 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}); 737 foreach my $file_to_import (@all_files_to_import) 738 { 739 my $metadata_xml_path = $file_to_import; 740 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/; 741 if (&FileUtils::fileExists($metadata_xml_path)) 742 { 743 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli); 744 } 745 } 746 } 747 748 # new version 
manifest files explicitly list metadata files to be 749 # processed (ignoring complexmeta if set) 750 # [jmt12] 751 if ($self->{'manifest_version'} > 1) 752 { 753 # Process metadata files 754 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}) 755 { 756 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs); 757 } 758 } 664 759 } 665 760 else { … … 686 781 # Filter out any in gsdl/tmp area 687 782 my @filtered_deleted_files = (); 688 my $gsdl_tmp_area = & util::filename_cat($ENV{'GSDLHOME'}, "tmp");689 my $collect_tmp_area = & util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");783 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp"); 784 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp"); 690 785 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area); 691 786 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area); … … 730 825 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields 731 826 # oailastmodified and oailastmodifieddate 732 my $earliestDatestampFile = & util::filename_cat($archivedir, "earliestDatestamp");827 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp"); 733 828 if (!-f $earliestDatestampFile && -d $archivedir) { 734 829 my $current_time_in_seconds = time; # in seconds … … 745 840 } 746 841 747 # now, whichever mode we are in, we can process the entire import folder 748 if ((defined $jobs) && ($jobs > 1)) 749 { 750 # if jobs are set to >1, run in parallel using MPI helper 751 # [hs, 1 july 2010] 752 &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash, 753 $self->{'collection'}, $self->{'site'}); 754 } 755 else 756 { 757 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 758 } 759 760 842 843 
$self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs); 844 761 845 if ($saveas eq "FedoraMETS") { 762 846 # create collection "doc obj" for Fedora that contains … … 782 866 # Store the value of OIDCount (used in doc.pm) so it can be 783 867 # restored correctly to this value on an incremental build 784 store_doc_oid_count($archivedir); 868 # - this OIDcount file should only be generated for numerical oids [jmt12] 869 if ($self->{'OIDtype'} eq 'incremental') 870 { 871 store_doc_oid_count($archivedir); 872 } 785 873 786 874 # write out the archive information file … … 799 887 } 800 888 801 889 # @function perform_process_files() 890 # while process_files() above prepares the system to import files this is the 891 # function that actually initiates the plugin pipeline to process the files. 892 # This function can therefore be overridden in subclasses of inexport.pm should 893 # they wish to do different or further processing 894 # @author jmt12 895 sub perform_process_files 896 { 897 my $self = shift(@_); 898 my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_; 899 my $gli = $self->{'gli'}; 900 # specific file to process - via manifest version 2+ 901 if ($file_to_import ne '') 902 { 903 &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 904 } 905 # global file scan - if we are using a new version manifest, files would have 906 # been read above. 
Older manifests use extra settings in the $block_hash to 907 # control what is imported, while non-manifest imports use a regular 908 # $block_hash (so obeying process_exp and block_exp) [jmt12] 909 elsif ($manifest eq '' || $self->{'manifest_version'} < 1) 910 { 911 &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 912 } 913 else 914 { 915 print STDERR "Skipping perform_process_files() due to manifest presence and version\n"; 916 } 917 } 918 # perform_process_files() 919 920 # @function generate_statistics() 802 921 sub generate_statistics 803 922 { 804 my $self = shift @_; 805 my ($pluginfo) = @_; 806 807 my $inexport_mode = $self->{'mode'}; 808 809 my $statsfile = $self->{'statsfile'}; 810 my $out = $self->{'out'}; 811 my $faillogname = $self->{'faillogname'}; 812 my $gli = $self->{'gli'}; 813 my $jobs = $self->{'jobs'}; 814 815 # write out import stats 816 817 if ((!defined $jobs) || ($jobs == 1)) 818 { 819 # only output statistics if there are multiple jobs 820 # [hs, 1 july 2010] 821 822 my $close_stats = 0; 823 if ($statsfile !~ /^(STDERR|STDOUT)$/i) { 824 if (open (STATS, ">$statsfile")) { 825 $statsfile = 'inexport::STATS'; 826 $close_stats = 1; 827 } else { 828 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile); 829 &gsprintf($out, "{import.stats_backup}\n"); 830 $statsfile = 'STDERR'; 831 } 832 } 833 834 &gsprintf($out, "\n"); 835 &gsprintf($out, "*********************************************\n"); 836 &gsprintf($out, "{$inexport_mode.complete}\n"); 837 &gsprintf($out, "*********************************************\n"); 838 839 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli); 840 if ($close_stats) { 841 close STATS; 842 } 843 } 844 845 close OUT if $self->{'close_out'}; 846 close FAILLOG; 847 } 923 my $self = shift @_; 924 my ($pluginfo) = @_; 925 926 my $inexport_mode = $self->{'mode'}; 927 my $out = $self->{'out'}; 928 my $faillogname = $self->{'faillogname'}; 929 my $gli = 
$self->{'gli'}; 930 931 &gsprintf($out, "\n"); 932 &gsprintf($out, "*********************************************\n"); 933 &gsprintf($out, "{$inexport_mode.complete}\n"); 934 &gsprintf($out, "*********************************************\n"); 935 936 &plugin::write_stats($pluginfo, 'STDERR', $faillogname, $gli); 937 } 938 # generate_statistics() 939 940 941 # @function deinit() 942 # Close down any file handles that we opened (and hence are responsible for 943 # closing 944 sub deinit 945 { 946 my $self = shift(@_); 947 close OUT if $self->{'close_out'}; 948 close FAILLOG if $self->{'close_faillog'}; 949 } 950 # deinit() 848 951 849 952 … … 889 992 sub oid_count_file { 890 993 my ($archivedir) = @_; 891 return & util::filename_cat($archivedir, "OIDcount");994 return &FileUtils::filenameConcatenate($archivedir, "OIDcount"); 892 995 } 893 996 … … 921 1024 my $oid_count_filename = &oid_count_file($archivedir); 922 1025 923 1026 # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12] 924 1027 if (open(OIDOUT,">$oid_count_filename")) { 925 1028 print OIDOUT $doc::OIDcount, "\n"; … … 955 1058 foreach my $prev_file (keys %$prev_all_files) { 956 1059 957 if (!& util::filename_is_absolute($prev_file)) {958 my $full_prev_file = & util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);1060 if (!&FileUtils::isFilenameAbsolute($prev_file)) { 1061 my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file); 959 1062 $full_prev_all_files->{$full_prev_file} = $prev_file; 960 1063 } … … 975 1078 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files' 976 1079 977 if (!& util::filename_is_absolute($curr_file)) {1080 if (!&FileUtils::isFilenameAbsolute($curr_file)) { 978 1081 # add in import dir to make absolute 979 $full_curr_file = & util::filename_cat($importdir,$curr_file);1082 $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file); 980 1083 } 981 1084 … … 1084 1187 1085 1188 my $existing_file = 
$existing_filename; 1086 #my $collectdir = & util::filename_cat($ENV{'GSDLCOLLECTDIR'});1189 #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}); 1087 1190 1088 1191 #my $collectdir_resafe = &util::filename_to_regex($collectdir); … … 1116 1219 my $full_curr_file = $curr_file; 1117 1220 1118 if (!& util::filename_is_absolute($curr_file)) {1221 if (!&FileUtils::isFilenameAbsolute($curr_file)) { 1119 1222 # add in import dir to make absolute 1120 1223 1121 $full_curr_file = & util::filename_cat($collectdir,$curr_file);1224 $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file); 1122 1225 } 1123 1226 … … 1172 1275 my $doc_source_file = $doc_rec->{'src-file'}->[0]; 1173 1276 if (!&util::filename_is_absolute($doc_source_file)) { 1174 $doc_source_file = & util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);1277 $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file); 1175 1278 } 1176 1279 … … 1237 1340 next if ($subfile =~ m/^\.\.?$/); 1238 1341 next if ($subfile =~ /^\.svn$/); 1239 my $full_file = & util::filename_cat($dirname, $subfile);1342 my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile); 1240 1343 if (-d $full_file) { 1241 1344 &add_dir_contents_to_list($full_file, $list);
Note:
See TracChangeset
for help on using the changeset viewer.