Changeset 32848
- Timestamp:
- 2019-03-04T13:52:41+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/inexport.pm
r32846 r32848 66 66 67 67 $inexport::directory_arguments = 68 [69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 ];68 [ 69 { 'name' => "importdir", 70 'desc' => "{import.importdir}", 71 'type' => "string", 72 'reqd' => "no", 73 'deft' => "import", 74 'hiddengli' => "yes" }, 75 { 'name' => "collectdir", 76 'desc' => "{import.collectdir}", 77 'type' => "string", 78 # parsearg left "" as default 79 #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"), 80 'deft' => "", 81 'reqd' => "no", 82 'hiddengli' => "yes" }, 83 84 ]; 85 85 $inexport::arguments = 86 [87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 86 [ 87 # don't set the default to hash - want to allow this to come from 88 # entry in collect.cfg but want to override it here 89 { 'name' => "OIDtype", 90 'desc' => "{import.OIDtype}", 91 'type' => "enum", 92 'list' => $oidtype_list, 93 'deft' => "hash_on_full_filename", 94 'reqd' => "no", 95 'modegli' => "2" }, 96 { 'name' => "OIDmetadata", 97 'desc' => "{import.OIDmetadata}", 98 'type' => "string", 99 'deft' => "dc.Identifier", 100 'reqd' => "no", 101 'modegli' => "2" }, 102 { 'name' => "site", 103 'desc' => "{import.site}", 104 'type' => "string", 105 'deft' => "", 106 'reqd' => "no", 107 'hiddengli' => "yes" }, 108 { 'name' => "manifest", 109 'desc' => "{import.manifest}", 110 'type' => "string", 111 'deft' => "", 112 'reqd' => "no", 113 'hiddengli' => "yes" } , 114 114 { 'name' => "incremental", 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 ];115 'desc' => "{import.incremental}", 116 'type' => "flag", 117 'hiddengli' => "yes" }, 118 { 'name' => "keepold", 119 'desc' => "{import.keepold}", 120 'type' => "flag", 121 'reqd' => "no", 122 'hiddengli' => "yes" }, 123 { 'name' => "removeold", 124 'desc' => "{import.removeold}", 125 'type' => "flag", 126 'reqd' => "no", 127 'hiddengli' => "yes" }, 128 { 'name' => "language", 129 'desc' => "{scripts.language}", 130 'type' => "string", 131 'reqd' => "no", 132 'hiddengli' => "yes" }, 133 { 'name' => "maxdocs", 134 'desc' => "{import.maxdocs}", 135 'type' => "int", 136 'reqd' => "no", 137 'deft' => "-1", 138 'range' => "-1,", 139 'modegli' => "1" }, 140 { 'name' => "debug", 141 'desc' => "{import.debug}", 142 'type' => "flag", 143 'reqd' => "no", 144 'hiddengli' => "yes" }, 145 { 'name' => "faillog", 146 'desc' => "{import.faillog}", 147 'type' => "string", 148 # parsearg left "" as default 149 #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"), 150 'deft' => "", 151 'reqd' => "no", 152 'modegli' => "3" }, 153 { 'name' => "out", 154 'desc' => "{import.out}", 155 'type' => "string", 156 'deft' => "STDERR", 157 'reqd' => "no", 158 'hiddengli' => "yes" }, 159 { 'name' => "statsfile", 160 'desc' => "{import.statsfile}", 161 'type' => "string", 162 'deft' => "STDERR", 163 'reqd' => "no", 164 'hiddengli' => "yes" }, 165 { 'name' => "verbosity", 166 'desc' => "{import.verbosity}", 167 'type' => "int", 168 'range' => "0,", 169 'deft' => "2", 170 'reqd' => "no", 171 'modegli' => "3" }, 172 { 'name' => "gli", 173 'desc' => "{scripts.gli}", 174 'type' => "flag", 175 'reqd' => "no", 176 'hiddengli' => "yes" }, 177 { 'name' => "xml", 178 'desc' => "{scripts.xml}", 179 'type' => "flag", 180 'reqd' => "no", 181 'hiddengli' => "yes" }, 182 183 ]; 184 184 185 185 sub new … … 288 288 $self->{'out'} = STDERR; 289 289 290 291 292 293 294 295 296 297 298 290 if (defined $gsdl_cgi) { 291 $self->{'site'} = $opt_site; 292 my $collect_dir = $gsdl_cgi->get_collection_dir($opt_site); 293 $self->{'collectdir'} = $collect_dir; 294 } 295 else { 296 $self->{'site'} = ""; 297 $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect"); 298 } 299 299 $self->{'faillog'} = ""; 300 300 301 301 $self->{'collection'} = $collect; 302 302 … … 319 319 my $site = $self->{'site'}; 320 320 my $out = $self->{'out'}; 321 321 322 322 if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") { 323 323 #&PrintUsage::print_txt_usage($options, "{import.params}", 1); … … 378 378 if (!defined($collectcfg->{'infodbtype'})) 379 379 { 380 380 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type(); 381 381 } 382 382 if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") { … … 403 403 if (!&FileUtils::isFilenameAbsolute($importdir)) 404 404 { 405 405 $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir); 406 406 } 407 407 else 408 408 { 409 410 411 412 413 409 # Don't do this - it kills protocol prefixes 410 #$importdir =~ s/[\\\/]+/\//g; 411 #$importdir =~ s/\/$//; 412 # Do this instead 413 &FileUtils::sanitizePath($importdir); 414 414 } 415 415 416 416 if (!&FileUtils::directoryExists($importdir)) 417 417 { 418 419 418 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir); 419 die "\n"; 420 420 } 421 421 $self->{'importdir'} = $importdir; … … 435 435 } 436 436 } 437 437 438 438 if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") { 439 439 $self->{'manifest'} = $collectcfg->{'manifest'}; … … 452 452 } 453 453 454 454 455 455 456 456 if (defined $self->{'default_OIDtype'} ) { … … 474 474 } 475 475 $self->{'gli'} = 0 unless defined $self->{'gli'}; 476 476 477 477 # check keepold and removeold 478 478 my $checkdir = ($inexport_mode eq "import") ? "archives" : "export"; … … 492 492 if ($self->{'manifest'} && (!$keepold || !$incremental)) 493 493 { 494 495 } 496 494 print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n"; 495 } 496 } 497 497 498 498 sub process_files … … 727 727 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true')) 728 728 { 729 729 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli); 730 730 } 731 731 else 732 732 { 733 733 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n"; 734 734 } 735 735 … … 744 744 if ($manifest ne "") { 745 745 746 747 748 746 # mark that we are using a manifest - information that might be needed 747 # down in plugins (for instance DirectoryPlugin) 748 $block_hash->{'manifest'} = $self->{'manifest_version'}; 749 749 750 750 # … … 826 826 827 827 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir); 828 # need to check this file exists before trying to read it - in the past 829 # it wasn't possible to have a manifest unless keepold was also set so 830 # you were pretty much guaranteed arcinfo existed 831 # [jmt12] 832 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12] 833 if (-e $arcinfo_src_filename) 834 { 835 my $arcinfodb_map = {}; 836 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map); 837 foreach my $f (@full_new_files) { 838 my $rel_f = &util::abspath_to_placeholders($f); 839 840 # check that we haven't seen it already 841 if (defined $arcinfodb_map->{$rel_f}) { 842 # TODO make better warning 843 print STDERR "Warning: $f ($rel_f) already in src archive, \n"; 844 } else { 828 # need to check this file exists before trying to read it - in the past 829 # it wasn't possible to have a manifest unless keepold was also set so 830 # you were pretty much guaranteed arcinfo existed 831 # [jmt12] 832 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12] 833 if (-e $arcinfo_src_filename) 834 { 835 my $arcinfodb_map = {}; 836 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map); 837 foreach my $f (@full_new_files) { 838 my $rel_f = &util::abspath_to_placeholders($f); 839 840 # check that we haven't seen it already 841 if (defined $arcinfodb_map->{$rel_f}) { 842 # TODO make better warning 843 print STDERR "Warning: $f ($rel_f) already in src archive, \n"; 844 } else { 845 $block_hash->{'new_files'}->{$f} = 1; 846 } 847 } 848 849 undef $arcinfodb_map; 850 } 851 # no existing files - so we can just add all the files [jmt12] 852 else 853 { 854 foreach my $f (@full_new_files) 855 { 845 856 $block_hash->{'new_files'}->{$f} = 1; 846 857 } 847 858 } 848 849 undef $arcinfodb_map; 850 } 851 # no existing files - so we can just add all the files [jmt12] 852 else 853 { 854 foreach my $f (@full_new_files) 855 { 856 $block_hash->{'new_files'}->{$f} = 1; 857 } 858 } 859 860 # If we are not using complex inherited metadata (and thus have skipped 861 # the global file scan) we need to at least check for a matching 862 # metadata.xml for the files being indexed/reindexed 863 # - unless we are using the newer version of Manifests, which are treated 864 # verbatim, and should have a metadata element for metadata files (so 865 # we can explicitly process metadata files other than metadata.xml) 866 # [jmt12] 867 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true')) 868 { 869 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}); 870 foreach my $file_to_import (@all_files_to_import) 871 { 872 my $metadata_xml_path = $file_to_import; 873 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/; 874 if (&FileUtils::fileExists($metadata_xml_path)) 875 { 876 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli); 877 } 878 } 879 } 880 881 # new version manifest files explicitly list metadata files to be 882 # processed (ignoring complexmeta if set) 883 # [jmt12] 884 if ($self->{'manifest_version'} > 1) 885 { 886 # Process metadata files 887 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}) 888 { 889 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs); 890 } 891 } 859 860 # If we are not using complex inherited metadata (and thus have skipped 861 # the global file scan) we need to at least check for a matching 862 # metadata.xml for the files being indexed/reindexed 863 # - unless we are using the newer version of Manifests, which are treated 864 # verbatim, and should have a metadata element for metadata files (so 865 # we can explicitly process metadata files other than metadata.xml) 866 # [jmt12] 867 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true')) 868 { 869 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}); 870 foreach my $file_to_import (@all_files_to_import) 871 { 872 my $metadata_xml_path = $file_to_import; 873 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/; 874 if (&FileUtils::fileExists($metadata_xml_path)) 875 { 876 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli); 877 } 878 } 879 } 880 881 # new version manifest files explicitly list metadata files to be 882 # processed (ignoring complexmeta if set) 883 # [jmt12] 884 if ($self->{'manifest_version'} > 1) 885 { 886 # Process metadata files 887 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}) 888 { 889 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs); 890 } 891 } 892 892 } # end if (manifest ne "") 893 893 else { … … 910 910 911 911 if ($incremental) { 912 912 # only look for deletions if we are truely incremental 913 913 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}}; 914 914 # Filter out any in gsdl/tmp area … … 918 918 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area); 919 919 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area); 920 920 921 921 foreach my $df (@deleted_files) { 922 922 next if ($df =~ m/^$gsdl_tmp_area/); … … 932 932 print STDERR "Files deleted since last import:\n "; 933 933 print STDERR join("\n ",@deleted_files), "\n"; 934 935 934 935 936 936 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files); 937 937 … … 947 947 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex"); 948 948 } 949 949 950 950 } 951 951 } # end if incremental/only_add mode … … 961 961 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp"); 962 962 if ($self->{'generate_auxiliary_files'}) { 963 964 my $current_time_in_seconds = time; # in seconds965 966 if(open(FOUT, ">$earliestDatestampFile")) {967 968 969 970 }971 else {972 973 }974 975 963 if (!-f $earliestDatestampFile && -d $archivedir) { 964 my $current_time_in_seconds = time; # in seconds 965 966 if(open(FOUT, ">$earliestDatestampFile")) { 967 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die); 968 print FOUT $current_time_in_seconds; 969 close(FOUT); 970 } 971 else { 972 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile); 973 } 974 975 } 976 976 } 977 977 … … 1004 1004 if ($self->{'OIDtype'} eq 'incremental') 1005 1005 { 1006 1006 store_doc_oid_count($archivedir); 1007 1007 } 1008 1008 … … 1011 1011 $processor->end(); 1012 1012 1013 # if ($inexport_mode eq "import") {1013 # if ($inexport_mode eq "import") { 1014 1014 if ($self->{'generate_auxiliary_files'}) { 1015 1015 # write out the archive information file … … 1033 1033 sub perform_process_files 1034 1034 { 1035 my $self = shift(@_);1036 my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;1037 my $gli = $self->{'gli'};1038 # specific file to process - via manifest version 2+1039 if ($file_to_import ne '')1040 {1041 1042 }1043 # global file scan - if we are using a new version manifest, files would have1044 # been read above. Older manifests use extra settings in the $block_hash to1045 # control what is imported, while non-manifest imports use a regular1046 # $block_hash (so obeying process_exp and block_exp) [jmt12]1047 elsif ($manifest eq '' || $self->{'manifest_version'} == 1)1048 {1049 1050 }1051 else1052 {1053 1054 }1035 my $self = shift(@_); 1036 my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_; 1037 my $gli = $self->{'gli'}; 1038 # specific file to process - via manifest version 2+ 1039 if ($file_to_import ne '') 1040 { 1041 &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 1042 } 1043 # global file scan - if we are using a new version manifest, files would have 1044 # been read above. Older manifests use extra settings in the $block_hash to 1045 # control what is imported, while non-manifest imports use a regular 1046 # $block_hash (so obeying process_exp and block_exp) [jmt12] 1047 elsif ($manifest eq '' || $self->{'manifest_version'} == 1) 1048 { 1049 &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 1050 } 1051 else 1052 { 1053 print STDERR "Skipping perform_process_files() due to manifest presence and version\n"; 1054 } 1055 1055 } 1056 1056 # perform_process_files() … … 1059 1059 sub generate_statistics 1060 1060 { 1061 my $self = shift @_;1062 my ($pluginfo) = @_;1063 1064 my $inexport_mode = $self->{'mode'};1065 my $out = $self->{'out'};1066 my $faillogname = $self->{'faillogname'};1067 my $statsfile = $self->{'statsfile'};1068 my $gli = $self->{'gli'};1069 1070 &gsprintf($out, "\n");1071 &gsprintf($out, "*********************************************\n");1072 &gsprintf($out, "{$inexport_mode.complete}\n");1073 &gsprintf($out, "*********************************************\n");1074 1075 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);1061 my $self = shift @_; 1062 my ($pluginfo) = @_; 1063 1064 my $inexport_mode = $self->{'mode'}; 1065 my $out = $self->{'out'}; 1066 my $faillogname = $self->{'faillogname'}; 1067 my $statsfile = $self->{'statsfile'}; 1068 my $gli = $self->{'gli'}; 1069 1070 &gsprintf($out, "\n"); 1071 &gsprintf($out, "*********************************************\n"); 1072 &gsprintf($out, "{$inexport_mode.complete}\n"); 1073 &gsprintf($out, "*********************************************\n"); 1074 1075 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli); 1076 1076 } 1077 1077 # generate_statistics() … … 1083 1083 sub deinit 1084 1084 { 1085 my $self = shift(@_);1086 close OUT if $self->{'close_out'};1087 close FAILLOG if $self->{'close_faillog'};1088 close STATSFILE if $self->{'close_statsfile'};1085 my $self = shift(@_); 1086 close OUT if $self->{'close_out'}; 1087 close FAILLOG if $self->{'close_faillog'}; 1088 close STATSFILE if $self->{'close_statsfile'}; 1089 1089 } 1090 1090 # deinit() … … 1167 1167 if (open(OIDOUT,">$oid_count_filename")) { 1168 1168 print OIDOUT $doc::OIDcount, "\n"; 1169 1169 1170 1170 close(OIDOUT); 1171 1171 } … … 1197 1197 1198 1198 foreach my $prev_file (keys %$prev_all_files) { 1199 1199 # arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions. 1200 1200 $prev_file = &util::upgrade_if_dos_filename($prev_file); 1201 1201 … … 1214 1214 # there any more => mark them for deletion 1215 1215 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) { 1216 1216 1217 1217 my $full_curr_file = $curr_file; 1218 1218 … … 1299 1299 1300 1300 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) { 1301 1301 1302 1302 if ($existing_f =~ m/^$situated_dir/) { 1303 1303 1304 # print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";1304 # print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n"; 1305 1305 1306 1306 push(@$reindex_files,$existing_f); … … 1367 1367 1368 1368 if (!-e $full_curr_file) { 1369 1369 $curr_file = &util::upgrade_if_dos_filename($curr_file); 1370 1370 $block_hash->{'deleted_files'}->{$curr_file} = 1; 1371 1371 } 1372 1372 } @deleted_files; 1373 1373 1374 1374 … … 1404 1404 $archive_info->remove_reverseinfo($downgraded_file); 1405 1405 1406 1406 foreach my $oid (@$oids) { 1407 1407 # get the record for this OID from doc db 1408 1408 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid); … … 1430 1430 if (!defined $archive_info->get_reverseinfo($assocfile)) { 1431 1431 # nothing refers to it anymore, mark for reindex. 1432 1433 1432 # block hash needs full filenames 1433 $assocfile = &util::upgrade_if_dos_filename($assocfile); 1434 1434 $block_hash->{'reindex_files'}->{$assocfile} = 1; 1435 1435 } … … 1476 1476 1477 1477 my ($dirname, $list) = @_; 1478 1478 1479 1479 # Recur over directory contents. 1480 1480 my (@dir, $subfile); … … 1499 1499 } 1500 1500 } 1501 1501 1502 1502 } 1503 1503 1504 1504 1505 1505 1;
Note:
See TracChangeset
for help on using the changeset viewer.