source: gsdl/trunk/bin/script/import.pl@ 19775

Last change on this file since 19775 was 19775, checked in by davidb, 15 years ago

Switch to using more batched approach (for efficiency reasons) for arcinfo writing out DB information

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.5 KB
RevLine 
[14031]1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone library directories (and those of any
# extensions named in GSDLEXTS) at the front of @INC so that the 'use'
# statements further down can be resolved.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Module sub-directories, relative to an installation root.  They are
    # unshifted one at a time, so the last one listed ends up first in @INC.
    my @lib_subdirs = ("perllib",
                       "perllib/cpan",
                       "perllib/plugins",
                       "perllib/plugouts",
                       "perllib/classify");

    # The main installation first, followed by each extension listed in the
    # colon-separated GSDLEXTS variable (if it is set at all).
    my @install_roots = ($ENV{'GSDLHOME'});
    if (defined $ENV{'GSDLEXTS'}) {
        foreach my $e (split(/:/, $ENV{'GSDLEXTS'})) {
            push(@install_roots, "$ENV{'GSDLHOME'}/ext/$e");
        }
    }

    foreach my $root (@install_roots) {
        foreach my $subdir (@lib_subdirs) {
            unshift(@INC, "$root/$subdir");
        }
    }
}
55
[14957]56use strict;
57no strict 'refs'; # allow filehandles to be variables and vice versa
58no strict 'subs'; # allow barewords (eg STDERR) as function arguments
59
[14031]60use arcinfo;
61use colcfg;
62use plugin;
63use plugout;
64use manifest;
[18456]65use inexport;
[14031]66use util;
67use scriptutil;
68use FileHandle;
69use gsprintf 'gsprintf';
70use printusage;
71use parse2;
72
73
74
# Values accepted by the OIDtype option.  Every entry pairs the value's name
# with a description reference of the form {import.OIDtype.<name>}.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash assigned incremental dirname)
];
84
[14957]85
# Values accepted by the saveas option, which controls the output file
# format.  Descriptions are references of the form {export.saveas.<name>}.
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{export.saveas.$_}" } }
        qw(GreenstoneXML GreenstoneMETS)
];
[14031]93
94
# Possible attributes for each argument
#   name:      The name of the argument
#   desc:      A description (or more likely a reference to a description)
#              for this argument
#   type:      The type of control used to represent the argument. Options
#              include: string, int, flag, regexp, metadata, language, enum etc
#   reqd:      Is this argument required?
#   deft:      The default value of the argument (where one is given)
#   list:      For enum arguments, a reference to the list of allowed values
#   range:     For int arguments, the permitted range (e.g. "1,") --
#              presumably "min,max" with an open upper bound; confirm in parse2
#   hiddengli: Is this argument hidden in GLI?
#   modegli:   The lowest detail mode this argument is visible at in GLI

# The saveas option, kept in its own variable and placed first in the
# argument list below.
my $saveas_argument = {
    name    => 'saveas',
    desc    => '{import.saveas}',
    type    => 'enum',
    list    => $saveas_list,
    deft    => 'GreenstoneXML',
    reqd    => 'no',
    modegli => '3',
};
111
112
# Every command-line option understood by import.pl, in the order in which
# they are handed to the option parser and the usage printers.
my $arguments = [
    $saveas_argument,

    { name => 'archivedir', desc => '{import.archivedir}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'importdir', desc => '{import.importdir}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
    { name => 'collectdir', desc => '{import.collectdir}', type => 'string',
      deft => '', reqd => 'no', hiddengli => 'yes' },

    { name => 'site', desc => '{import.site}', type => 'string',
      deft => '', reqd => 'no', hiddengli => 'yes' },

    { name => 'manifest', desc => '{import.manifest}', type => 'string',
      deft => '', reqd => 'no', hiddengli => 'yes' },

    { name => 'debug', desc => '{import.debug}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
    { name => 'faillog', desc => '{import.faillog}', type => 'string',
      deft => '', reqd => 'no', modegli => '3' },

    { name => 'incremental', desc => '{import.incremental}', type => 'flag',
      hiddengli => 'yes' },

    { name => 'keepold', desc => '{import.keepold}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'removeold', desc => '{import.removeold}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'language', desc => '{scripts.language}', type => 'string',
      reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #'deft' => "-1",
    { name => 'maxdocs', desc => '{import.maxdocs}', type => 'int',
      reqd => 'no', range => '1,', modegli => '1' },

    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    # parsearg left "" as default
    #'deft' => "hash",
    { name => 'OIDtype', desc => '{import.OIDtype}', type => 'enum',
      list => $oidtype_list, reqd => 'no', modegli => '2' },

    # type would be "metadata", but that doesn't work properly in GLI
    # parsearg left "" as default
    #'deft' => "dc.Identifier",
    { name => 'OIDmetadata', desc => '{import.OIDmetadata}', type => 'string',
      reqd => 'no', modegli => '2' },

    { name => 'out', desc => '{import.out}', type => 'string',
      deft => 'STDERR', reqd => 'no', hiddengli => 'yes' },

    # type would be "metadata", but that doesn't work properly in GLI
    { name => 'sortmeta', desc => '{import.sortmeta}', type => 'string',
      reqd => 'no', modegli => '2' },

    { name => 'reversesort', desc => '{import.reversesort}', type => 'flag',
      reqd => 'no', modegli => '2' },

    { name => 'removeprefix', desc => '{BasClas.removeprefix}', type => 'regexp',
      deft => '', reqd => 'no', modegli => '3' },

    { name => 'removesuffix', desc => '{BasClas.removesuffix}', type => 'regexp',
      deft => '', reqd => 'no', modegli => '3' },

    { name => 'groupsize', desc => '{import.groupsize}', type => 'int',
      deft => '1', reqd => 'no', modegli => '2' },

    { name => 'gzip', desc => '{import.gzip}', type => 'flag',
      reqd => 'no', modegli => '3' },

    { name => 'statsfile', desc => '{import.statsfile}', type => 'string',
      deft => 'STDERR', reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default
    #'deft' => "2",
    { name => 'verbosity', desc => '{import.verbosity}', type => 'int',
      range => '0,', reqd => 'no', modegli => '3' },

    { name => 'gli', desc => '{scripts.gli}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },

    { name => 'xml', desc => '{scripts.xml}', type => 'flag',
      reqd => 'no', hiddengli => 'yes' },
];
268
# Description of the script as a whole; this is what the usage printers
# (text and XML) are given.
my $options = {
    name => 'import.pl',
    desc => '{import.desc}',
    args => $arguments,
};


# Everything else happens in main().
&main();
275
# main -- import the files of one collection into its archives directory.
#
# Takes no arguments and returns nothing useful.  Input comes from @ARGV
# (parsed against $arguments) and from the collection configuration file;
# any option not given on the command line falls back to the configuration
# file and then to a built-in default.
#
# Outline:
#   1. parse the command line; with -xml just print the usage as XML
#   2. open the output and fail-log handles
#   3. read the collection configuration and settle every option
#   4. load the plugins and the plugout
#   5. either walk the import directory (working out new/deleted/changed
#      files unless -removeold is in force) or process the files named in
#      the manifest
#   6. save the archive information and write the import statistics
#
# Dies (after printing usage or a message) on bad arguments, an unknown
# collection, an output or fail-log file that cannot be opened, a missing
# import directory, or when no plugins could be loaded.
sub main {
    # params -- one lexical per command-line option
    my ($language, $verbosity, $debug,
        $collectdir, $importdir, $archivedir, $site, $manifest,
        $incremental, $keepold, $removeold,
        $saveas,
        $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile,
        $out, $faillog, $gli,
        $gzip, $groupsize,
        $sortmeta, $reversesort, $removeprefix, $removesuffix);

    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $collectcfg,
        $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
        $gs_mode,
        $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical of the same name.  A table of
    # references does this without resorting to a string eval; a key with no
    # matching lexical is simply ignored.
    my %param_ref = (
        'language'     => \$language,
        'verbosity'    => \$verbosity,
        'debug'        => \$debug,
        'collectdir'   => \$collectdir,
        'importdir'    => \$importdir,
        'archivedir'   => \$archivedir,
        'site'         => \$site,
        'manifest'     => \$manifest,
        'incremental'  => \$incremental,
        'keepold'      => \$keepold,
        'removeold'    => \$removeold,
        'saveas'       => \$saveas,
        'OIDtype'      => \$OIDtype,
        'OIDmetadata'  => \$OIDmetadata,
        'maxdocs'      => \$maxdocs,
        'statsfile'    => \$statsfile,
        'out'          => \$out,
        'faillog'      => \$faillog,
        'gli'          => \$gli,
        'gzip'         => \$gzip,
        'groupsize'    => \$groupsize,
        'sortmeta'     => \$sortmeta,
        'reversesort'  => \$reversesort,
        'removeprefix' => \$removeprefix,
        'removesuffix' => \$removesuffix,
        'xml'          => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        next unless exists $param_ref{$strVariable};
        ${ $param_ref{$strVariable} } = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Work out where progress output goes.  OUT has to stay a package
    # filehandle because from here on it is referred to by name.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the filename is taken literally
        unless (open(OUT, '>', $out)) {
            # always die, whatever gsprintf happens to return
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    else {
        # the test above ignores case, but handle names do not
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open(FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # keep the file name for the statistics; $faillog becomes the handle name
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # Read in the collection configuration file.
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);

    if ($gs_mode eq "gs2") {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    }
    elsif ($gs_mode eq "gs3") {
        $collectcfg = &colcfg::read_collection_cfg_xml ($configfilename);
    }

    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    #some global options for the plugins
    my @global_opts = ();

    # For each option below: command line first, then the configuration
    # file, then the built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # a groupsize of 1 is the parser default, so let the config override it
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }

    if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $OIDmetadata = $collectcfg->{'OIDmetadata'};
        }
        else {
            $OIDmetadata = "dc.Identifier"; # the default
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $sortmeta) {
        if (defined $collectcfg->{'reversesort'} && $collectcfg->{'reversesort'} =~ /^true$/i) {
            $reversesort = 1;
        }
    }
    else {
        # reversesort only valid with sortmeta
        $reversesort = 0;
    }
    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }
    $gli = 0 unless defined $gli;

    # check keepold and removeold
    ($removeold, $keepold, $incremental) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, $incremental, "archives", $collectcfg);

    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # a name that does not start with a slash is relative to the collection
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): this tidies $manifest, yet it is $manifest_filename
        # (copied above) that gets parsed; afterwards $manifest is only ever
        # compared with "".  Possibly $manifest_filename was meant -- left
        # unchanged pending confirmation.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    ## $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");

    $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archiveinf-doc");
    &util::rename_gdbm_file($arcinfo_doc_filename); # ensures gdb
    $arcinfo_doc_filename .= ".gdb";

    $arcinfo_src_filename = &util::filename_cat ($archivedir, "archiveinf-src");
    &util::rename_gdbm_file($arcinfo_src_filename); # ensures gdb
    $arcinfo_src_filename .= ".gdb";

    $archive_info = arcinfo->new();
    $archive_info->load_info ($arcinfo_doc_filename);
    if ($reversesort) {
        $archive_info->reverse_sort();
    }

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    my ($plugout);
    if (defined $collectcfg->{'plugout'}) {
        # If a plugout was specified in the collect.cfg file, assume it is sensible
        # We can't check the name because it could be anything, if it is a custom plugout
        $plugout = $collectcfg->{'plugout'};
    }
    elsif ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
        # unrecognised format: fall back to Greenstone XML
        push @$plugout, "GreenstoneXMLPlugout";
    }
    else {
        push @$plugout, $saveas."Plugout";
    }

    push @$plugout, ("-output_info", $archive_info) if (defined $archive_info);
    push @$plugout, ("-verbosity", $verbosity) if (defined $verbosity);
    push @$plugout, ("-gzip_output") if ($gzip);
    push @$plugout, ("-group_size", $groupsize) if (defined $groupsize);
    push @$plugout, ("-output_handle", $out) if (defined $out);
    push @$plugout, ("-debug") if ($debug);

    $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        my $block_hash = {};
        my $metadata = {};
        # global blocking pass may set up some metadata
        &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

        if (!$removeold) {
            &inexport::prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted
            &inexport::new_vs_old_import_diff($archive_info, $block_hash, $importdir);

            # Filter out any in gsdl/tmp area
            my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
            my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
            $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
            $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

            my @deleted_files =
                grep { $_ !~ m/^$gsdl_tmp_area/ && $_ !~ m/^$collect_tmp_area/ }
                sort keys %{$block_hash->{'deleted_files'}};

            if (@deleted_files) {
                print STDERR "Files deleted since last import:\n ";
                print STDERR join("\n ", @deleted_files), "\n";
            }

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (@new_files) {
                print STDERR "New files since last import:\n ";
                print STDERR join("\n ", @new_files), "\n";
            }

            &inexport::mark_docs_for_deletion($archive_info, $block_hash, \@deleted_files,
                                              $archivedir, $verbosity);

            &inexport::mark_docs_for_reindex($archive_info, $block_hash,
                                             $archivedir, $verbosity);

            my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

            if (@reindex_files) {
                print STDERR "Files to reindex since last import:\n ";
                print STDERR join("\n ", @reindex_files), "\n";
            }

            # not sure if the following will work -- will the metadata
            # data structure be correctly initialized in the right order?
#           foreach my $file (@new_files, @reindex_files) {
#               &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
#           }
        }

        # Play it safe, and run through the entire folder; after the
        # bookkeeping above only new or edited files get processed
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any files marked for importing
        foreach my $file (keys %{$manifest_lookup->{'import'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
        }

        my @deleted_files = keys %{$manifest_lookup->{'delete'}};

        # pass $verbosity here too, as the non-manifest branch does
        &inexport::mark_docs_for_deletion($archive_info, {}, \@deleted_files, $archivedir, $verbosity);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    &inexport::store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();

    # The export.pl version of the script wraps the following in an 'if'
    # statement (the reason is given in export.pl).
    # Unclear at this point if the same should be done here
    ## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
    # Not all export types need this (e.g. DSpace)

    # should we still do this in debug mode??

    # for backwards compatibility with archives.inf file
    if ($arcinfo_doc_filename =~ m/\.inf$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_gdbm($arcinfo_src_filename);
    }

    ## }

    # write out import stats
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open(STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        }
        else {
            # could not open the stats file: say so and use STDERR instead
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.