source: gsdl/trunk/bin/script/import.pl@ 20571

Last change on this file since 20571 was 20571, checked in by davidb, 15 years ago

Introduction of variable 'incremental_mode' that is set to 'none', 'onlyadd', or 'all' depending on settings of -removeold -keepold and -incremental. Some minor edits to tidy up the code have also been made in this commit

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.6 KB
Line 
#!/usr/bin/perl -w

###########################################################################
#
# import.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program will import a number of files into a particular collection

package import;

# Compile-time setup of the module search path.
# Requires GSDLHOME and GSDLOS in the environment, then puts the Greenstone
# perllib directories (and those of every extension named in the
# colon-separated GSDLEXTS) at the front of @INC so that the 'use' lines
# below can be resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    my $gsdl_home = $ENV{'GSDLHOME'};

    # Every unshift places its directory in front of the previous ones,
    # so directories listed later are searched first.
    foreach my $subdir ("perllib",
                        "perllib/cpan",
                        "perllib/cpan/perl-5.8",
                        "perllib/plugins",
                        "perllib/plugouts",
                        "perllib/classify") {
        unshift(@INC, "$gsdl_home/$subdir");
    }

    if (defined $ENV{'GSDLEXTS'}) {
        foreach my $ext_name (split(/:/, $ENV{'GSDLEXTS'})) {
            my $ext_root = "$gsdl_home/ext/$ext_name";

            foreach my $subdir ("perllib",
                                "perllib/cpan",
                                "perllib/plugins",
                                "perllib/plugouts",
                                "perllib/classify") {
                unshift(@INC, "$ext_root/$subdir");
            }
        }
    }
}

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use arcinfo;
use colcfg;
use plugin;
use plugout;
use manifest;
use inexport;
use util;
use scriptutil;
use FileHandle;
use gsprintf 'gsprintf';
use printusage;
use parse2;
73
74
75
# Allowed values for -OIDtype; each description is a resource-bundle key.
my $oidtype_list =
    [ map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
      qw(hash assigned incremental dirname) ];


# used to control output file format
my $saveas_list =
    [ map { +{ 'name' => $_, 'desc' => "{export.saveas.$_}" } }
      qw(GreenstoneXML GreenstoneMETS) ];


# Possible attributes for each argument
# name: The name of the argument
# desc: A description (or more likely a reference to a description) for this argument
# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
# reqd: Is this argument required?
# hiddengli: Is this argument hidden in GLI?
# modegli: The lowest detail mode this argument is visible at in GLI

# The -saveas option is described separately and then placed first in the
# argument table.
my $saveas_argument = {
    'name'    => "saveas",
    'desc'    => "{import.saveas}",
    'type'    => "enum",
    'list'    => $saveas_list,
    'deft'    => "GreenstoneXML",
    'reqd'    => "no",
    'modegli' => "3",
};
112
113
# Table of every command line option import.pl understands. It is handed to
# parse2::parse for parsing and to the PrintUsage routines for usage output.
my $arguments = [
    $saveas_argument,
    {   name      => 'archivedir',
        desc      => '{import.archivedir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'importdir',
        desc      => '{import.importdir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'collectdir',
        desc      => '{import.collectdir}',
        type      => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'site',
        desc      => '{import.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'manifest',
        desc      => '{import.manifest}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'debug',
        desc      => '{import.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'faillog',
        desc    => '{import.faillog}',
        type    => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'incremental',
        desc      => '{import.incremental}',
        type      => 'flag',
        hiddengli => 'yes',
    },
    {   name      => 'keepold',
        desc      => '{import.keepold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'removeold',
        desc      => '{import.removeold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'maxdocs',
        desc    => '{import.maxdocs}',
        type    => 'int',
        reqd    => 'no',
        # parsearg left "" as default
        #'deft' => "-1",
        range   => '1,',
        modegli => '1',
    },
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    {   name    => 'OIDtype',
        desc    => '{import.OIDtype}',
        type    => 'enum',
        list    => $oidtype_list,
        # parsearg left "" as default
        #'deft' => "hash",
        reqd    => 'no',
        modegli => '2',
    },
    {   name    => 'OIDmetadata',
        desc    => '{import.OIDmetadata}',
        type    => 'string',
        # 'type' => "metadata", #doesn't work properly in GLI
        # parsearg left "" as default
        #'deft' => "dc.Identifier",
        reqd    => 'no',
        modegli => '2',
    },
    {   name      => 'out',
        desc      => '{import.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'sortmeta',
        desc    => '{import.sortmeta}',
        type    => 'string',
        #'type' => "metadata", #doesn't work properly in GLI
        reqd    => 'no',
        modegli => '2',
    },
    {   name    => 'reversesort',
        desc    => '{import.reversesort}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '2',
    },
    {   name    => 'removeprefix',
        desc    => '{BasClas.removeprefix}',
        type    => 'regexp',
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {   name    => 'removesuffix',
        desc    => '{BasClas.removesuffix}',
        type    => 'regexp',
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {   name    => 'groupsize',
        desc    => '{import.groupsize}',
        type    => 'int',
        deft    => '1',
        reqd    => 'no',
        modegli => '2',
    },
    {   name    => 'gzip',
        desc    => '{import.gzip}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'statsfile',
        desc      => '{import.statsfile}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'verbosity',
        desc    => '{import.verbosity}',
        type    => 'int',
        range   => '0,',
        # parsearg left "" as default
        #'deft' => "2",
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'gli',
        desc      => '{scripts.gli}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];

# Script description passed to the PrintUsage routines.
my $options = {
    name => 'import.pl',
    desc => '{import.desc}',
    args => $arguments,
};


# Run the import.
&main();
276
# Entry point of import.pl.
#
# Parses the command line, reads the collection configuration, loads the
# plugins and the plugout, then runs the plugins over the import directory
# (or over the files named in a manifest) so that documents are written to
# the archives directory. Finally saves the archive information and writes
# the import statistics.
#
# Takes no arguments; reads @ARGV and %ENV. Dies (after printing usage or a
# message) when the options are invalid, the collection cannot be found, a
# log file cannot be opened, the import directory is missing or no plugins
# could be loaded. Returns early, after printing XML usage, when -xml is
# given.
sub main {
    # params
    # NOTE: these lexicals are filled in *by name* by the string eval
    # further down, so each must keep exactly the name of its option.
    my ($language, $verbosity, $debug,
        $collectdir, $importdir, $archivedir, $site, $manifest,
        $incremental, $incremental_mode, $keepold, $removeold,
        $saveas,
        $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile,
        $out, $faillog, $gli,
        $gzip, $groupsize,
        $sortmeta, $reversesort, $removeprefix, $removesuffix
        );

    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $collectcfg,
        $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
        $gs_mode,
        $processor, $pluginfo);

    my $service = "import";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy every parsed option into the lexical variable of the same name.
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Send general output to a file unless STDERR/STDOUT was asked for.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the file name is never parsed for a mode
        if (!open (OUT, ">", $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    if (!open (FAILLOG, ">", $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # keep the file name for the stats report; from here on $faillog is
    # the name of the handle
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # Read in the collection configuration file.
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # Command line values take precedence; the collection configuration
    # only fills in options that were left empty.
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    #some global options for the plugins
    my @global_opts = ();

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }

    if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $OIDmetadata = $collectcfg->{'OIDmetadata'};
        } else {
            $OIDmetadata = "dc.Identifier"; # the default
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $sortmeta) {
        if (defined $collectcfg->{'reversesort'} && $collectcfg->{'reversesort'} =~ /^true$/i) {
            $reversesort = 1;
        }
    } else {
        # reversesort only valid with sortmeta
        $reversesort = 0;
    }
    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }
    $gli = 0 unless defined $gli;

    # check keepold and removeold
    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "archives",
                                                   $collectcfg);


    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # a manifest that is not given from the root is taken to be
        # relative to the collection directory
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): this normalises $manifest, which is afterwards only
        # compared against "", while the file actually parsed is
        # $manifest_filename -- confirm which variable was intended.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }


    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }
    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
##    $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");

    $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archiveinf-doc");
    &util::rename_gdbm_file($arcinfo_doc_filename); # ensures gdb
    $arcinfo_doc_filename .= ".gdb";

    $arcinfo_src_filename = &util::filename_cat ($archivedir, "archiveinf-src");
    &util::rename_gdbm_file($arcinfo_src_filename); # ensures gdb
    $arcinfo_src_filename .= ".gdb";


    $archive_info = arcinfo->new();
    $archive_info->load_info ($arcinfo_doc_filename);
    if ($reversesort) {
        $archive_info->reverse_sort();
    }

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    my ($plugout);
    if (defined $collectcfg->{'plugout'}) {
        # If a plugout was specified in the collect.cfg file, assume it is sensible
        # We can't check the name because it could be anything, if it is a custom plugout
        $plugout = $collectcfg->{'plugout'};
    }
    else {
        if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
            push @$plugout,"GreenstoneXMLPlugout";
        }
        else {
            push @$plugout,$saveas."Plugout";
        }
    }

    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-output_handle",$out) if (defined $out);
    push @$plugout,("-debug") if ($debug);

    $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($manifest eq "") {
        # process the import directory
        my $block_hash = {};
        my $metadata = {};
        # global blocking pass may set up some metadata
        &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);


        if ($incremental) {
            # equivalent to saying ($keepold && ($incremental_mode eq "all"))

            &inexport::prime_doc_oid_count($archivedir);


            # Can now work out which files were new, already existed, and have
            # been deleted

            &inexport::new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                              $archivedir,$verbosity,$incremental_mode);

            my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
            # Filter out any in gsdl/tmp area
            my @filtered_deleted_files = ();
            my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
            my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
            $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
            $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);


            foreach my $df (@deleted_files) {
                next if ($df =~ m/^$gsdl_tmp_area/);
                next if ($df =~ m/^$collect_tmp_area/);

                push(@filtered_deleted_files,$df);
            }


            @deleted_files = @filtered_deleted_files;

            if (scalar(@deleted_files) > 0) {
                print STDERR "Files deleted since last import:\n ";
                print STDERR join("\n ",@deleted_files), "\n";
            }

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files) > 0) {
                print STDERR "New files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            &inexport::mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files,
                                              $archivedir,$verbosity);

            &inexport::mark_docs_for_reindex($archive_info,$block_hash,
                                             $archivedir,$verbosity);

            my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

            if (scalar(@reindex_files) > 0) {
                print STDERR "Files to reindex since last import:\n ";
                print STDERR join("\n ",@reindex_files), "\n";
            }


            # not sure if the following will work -- will the metadata data-structure be correctly initialized
            # in the right order?
#           foreach my $file (@new_files, @reindex_files) {
#               &plugin::read ($pluginfo, $importdir, $file, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
#           }


            # Play it safe, and run through the entire folder, only processing new or edited files
            &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);

        }
        else {
            &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
        }

    }
    else
    {
        # process any files marked for importing
        foreach my $file (keys %{$manifest_lookup->{'import'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
        }

        my @deleted_files = keys %{$manifest_lookup->{'delete'}};

        # pass $verbosity here as well, matching the incremental branch
        &inexport::mark_docs_for_deletion($archive_info,{},\@deleted_files,
                                          $archivedir,$verbosity);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    &inexport::store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();

# The following 'if' statement is in the export.pl version of the script,
# The reason for the 'if' statement is now given in export.pl
# Unclear at this point if the same should be done here
##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
        # Not all export types need this (e.g. DSpace)

        # should we still do this in debug mode??

        # for backwards compatibility with archives.inf file
        if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
            $archive_info->save_info($arcinfo_doc_filename);
        }
        else {
            $archive_info->save_revinfo_gdbm($arcinfo_src_filename);
        }


##    }

    # write out import stats
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, ">", $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        } else {
            # could not open the stats file: report it and fall back to STDERR
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.