source: main/trunk/greenstone2/bin/script/import.pl@ 21822

Last change on this file since 21822 was 21822, checked in by ak19, 14 years ago

Dr Bainbridge has fixed several perl files that depended on perl 5.8 to work and used to fail with Perl 5.10.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
RevLine 
[14031]1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# import.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will import a number of files into a particular collection
30
31package import;
32
# Compile-time setup of @INC so that the Greenstone modules "use"d below
# can be found. Requires GSDLHOME and GSDLOS to be set in the environment.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Library sub-directories searched under every root. They are unshifted
    # in this order, so the last one listed ends up first in @INC.
    my @lib_subdirs = ("perllib", "perllib/cpan",
                       "perllib/plugins", "perllib/plugouts");

    foreach my $subdir (@lib_subdirs) {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    # Extensions: a colon separated list of extension names in the named
    # environment variable, each living in <home>/ext/<name>.
    my @extension_sources = (
        [ 'GSDLEXTS',  'GSDLHOME' ],
        [ 'GSDL3EXTS', 'GSDL3SRCHOME' ],
    );

    foreach my $source (@extension_sources) {
        my ($list_var, $home_var) = @$source;
        next unless defined $ENV{$list_var};

        # Without the home directory every path would silently become
        # "/ext/<name>/...", so skip the list and say why instead.
        if (!defined $ENV{$home_var}) {
            warn "$list_var is set but $home_var is not; ignoring these extensions\n";
            next;
        }

        foreach my $e (split(/:/, $ENV{$list_var})) {
            my $ext_prefix = "$ENV{$home_var}/ext/$e";

            foreach my $subdir (@lib_subdirs) {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }
}
64
[14957]65use strict;
66no strict 'refs'; # allow filehandles to be variables and vice versa
67no strict 'subs'; # allow barewords (eg STDERR) as function arguments
68
[14031]69use arcinfo;
70use colcfg;
[21633]71use dbutil;
[14031]72use plugin;
73use plugout;
74use manifest;
[18456]75use inexport;
[14031]76use util;
77use scriptutil;
78use FileHandle;
79use gsprintf 'gsprintf';
80use printusage;
81use parse2;
82
83
84
# Values accepted by the -OIDtype option. Each entry pairs the value with
# the resource-bundle key ({import.OIDtype.<name>}) holding its description.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash assigned incremental dirname)
];
94
[14957]95
# Values accepted by the -saveas option, which selects the output file
# format. Descriptions are resource-bundle keys ({export.saveas.<name>}).
my $saveas_list = [
    map { +{ 'name' => $_, 'desc' => "{export.saveas.$_}" } }
        qw(GreenstoneXML GreenstoneMETS)
];
[14031]103
104
# Attributes that may appear in an argument description:
#   name      - the name of the argument
#   desc      - a description (more likely a reference to a description)
#   type      - the kind of control used to represent the argument, e.g.
#               string, int, flag, regexp, metadata, language, enum
#   reqd      - whether the argument is required
#   hiddengli - whether the argument is hidden in GLI
#   modegli   - the lowest detail mode at which GLI shows the argument

# The -saveas option: an enum over $saveas_list, GreenstoneXML by default.
my $saveas_argument = {
    'name'    => "saveas",
    'desc'    => "{import.saveas}",
    'type'    => "enum",
    'list'    => $saveas_list,
    'deft'    => "GreenstoneXML",
    'reqd'    => "no",
    'modegli' => "3",
};
121
122
# Every command-line option understood by import.pl, in the order in which
# they are listed. Several options deliberately have no 'deft' entry: main()
# fills those in itself so that a value from collect.cfg can take effect
# when nothing is given on the command line.
my $arguments = [
    $saveas_argument,
    { 'name'      => "archivedir",
      'desc'      => "{import.archivedir}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "importdir",
      'desc'      => "{import.importdir}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "collectdir",
      'desc'      => "{import.collectdir}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "site",
      'desc'      => "{import.site}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "manifest",
      'desc'      => "{import.manifest}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "debug",
      'desc'      => "{import.debug}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "faillog",
      'desc'      => "{import.faillog}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
      'deft'      => "",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "incremental",
      'desc'      => "{import.incremental}",
      'type'      => "flag",
      # 'reqd' was missing here although every other option declares it
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "keepold",
      'desc'      => "{import.keepold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "removeold",
      'desc'      => "{import.removeold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "language",
      'desc'      => "{scripts.language}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "maxdocs",
      'desc'      => "{import.maxdocs}",
      'type'      => "int",
      'reqd'      => "no",
      # parsearg left "" as default
      #'deft' => "-1",
      'range'     => "1,",
      'modegli'   => "1" },
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    { 'name'      => "OIDtype",
      'desc'      => "{import.OIDtype}",
      'type'      => "enum",
      'list'      => $oidtype_list,
      # parsearg left "" as default
      #'deft' => "hash",
      'reqd'      => "no",
      'modegli'   => "2" },
    { 'name'      => "OIDmetadata",
      'desc'      => "{import.OIDmetadata}",
      'type'      => "string",
      #'type' => "metadata", #doesn't work properly in GLI
      # parsearg left "" as default
      #'deft' => "dc.Identifier",
      'reqd'      => "no",
      'modegli'   => "2" },
    { 'name'      => "out",
      'desc'      => "{import.out}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "sortmeta",
      'desc'      => "{import.sortmeta}",
      'type'      => "string",
      #'type' => "metadata", #doesn't work properly in GLI
      'reqd'      => "no",
      'modegli'   => "2" },
    { 'name'      => "removeprefix",
      'desc'      => "{BasClas.removeprefix}",
      'type'      => "regexp",
      'deft'      => "",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "removesuffix",
      'desc'      => "{BasClas.removesuffix}",
      'type'      => "regexp",
      'deft'      => "",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "groupsize",
      'desc'      => "{import.groupsize}",
      'type'      => "int",
      'deft'      => "1",
      'reqd'      => "no",
      'modegli'   => "2" },
    { 'name'      => "gzip",
      'desc'      => "{import.gzip}",
      'type'      => "flag",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "statsfile",
      'desc'      => "{import.statsfile}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "verbosity",
      'desc'      => "{import.verbosity}",
      'type'      => "int",
      'range'     => "0,",
      # parsearg left "" as default
      #'deft' => "2",
      'reqd'      => "no",
      'modegli'   => "3" },
    { 'name'      => "gli",
      'desc'      => "{scripts.gli}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "xml",
      'desc'      => "{scripts.xml}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
273
# Top-level description of this script, handed to the PrintUsage routines.
my $options = {
    'name' => "import.pl",
    'desc' => "{import.desc}",
    'args' => $arguments,
};
277
278
&main();

# main --
# Imports the files of one collection into its archives directory.
#
# Steps, in order:
#   1. parse the command line (@ARGV) against $arguments;
#   2. read the collection configuration and let it supply any option that
#      was not given on the command line;
#   3. load the plugins and the plugout (the document processor);
#   4. process either the whole import directory (optionally
#      incrementally) or just the files named in a manifest;
#   5. save the archive information and write the import statistics.
#
# Takes no arguments and returns nothing useful. Prints the usage message
# and dies when the command line is invalid, no collection can be found,
# the import directory is missing or no plugins could be loaded.
sub main {
    # params: one lexical per command-line option
    my ($language, $verbosity, $debug,
        $collectdir, $importdir, $archivedir, $site, $manifest,
        $incremental, $incremental_mode, $keepold, $removeold,
        $saveas,
        $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile,
        $out, $faillog, $gli,
        $gzip, $groupsize,
        $sortmeta, $removeprefix, $removesuffix
        );

    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $collectcfg,
        $arcinfo_doc_filename, $arcinfo_src_filename, $archive_info,
        $gs_mode,
        $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Copy the parsed values into the matching lexical variables. An
    # explicit name => reference table is used instead of a string eval so
    # that only known option variables can ever be written to; any other
    # key in the parse result is ignored.
    my %option_variable = (
        'language'         => \$language,
        'verbosity'        => \$verbosity,
        'debug'            => \$debug,
        'collectdir'       => \$collectdir,
        'importdir'        => \$importdir,
        'archivedir'       => \$archivedir,
        'site'             => \$site,
        'manifest'         => \$manifest,
        'incremental'      => \$incremental,
        'incremental_mode' => \$incremental_mode,
        'keepold'          => \$keepold,
        'removeold'        => \$removeold,
        'saveas'           => \$saveas,
        'OIDtype'          => \$OIDtype,
        'OIDmetadata'      => \$OIDmetadata,
        'maxdocs'          => \$maxdocs,
        'statsfile'        => \$statsfile,
        'out'              => \$out,
        'faillog'          => \$faillog,
        'gli'              => \$gli,
        'gzip'             => \$gzip,
        'groupsize'        => \$groupsize,
        'sortmeta'         => \$sortmeta,
        'removeprefix'     => \$removeprefix,
        'removesuffix'     => \$removesuffix,
        'xml'              => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $variable_ref = $option_variable{$strVariable};
        next unless defined $variable_ref;
        $$variable_ref = $hashParsingResult->{$strVariable};
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        # explicit () so that our own @_ is not passed on implicitly
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # From here on $out holds the *name* of the output filehandle.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Report and die explicitly: the death must not depend on the
        # return value of gsprintf.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection name
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    unless (open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # Keep the file name for the statistics report; $faillog itself
    # becomes the name of the filehandle.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # Read in the collection configuration file.
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'})) {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    # For the options below the command line wins; collect.cfg is only
    # consulted when no value was given there.
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }

    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }
    # some global options for the plugins
    my @global_opts = ();

    if (!defined $verbosity || $verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
        $manifest = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }

    if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }
    # 1 is the command-line default, so only then may collect.cfg override it
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        }
        else {
            $OIDtype = "hash"; # the default
        }
    }

    if ((!defined $OIDmetadata) || ($OIDmetadata eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $OIDmetadata = $collectcfg->{'OIDmetadata'};
        }
        else {
            $OIDmetadata = "dc.Identifier"; # the default
        }
    }

    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }

    if (defined $collectcfg->{'removeprefix'} && $removeprefix eq "") {
        $removeprefix = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $removesuffix eq "") {
        $removesuffix = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }
    $gli = 0 unless defined $gli;

    # check keepold and removeold
    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "archives",
                                                   $collectcfg);

    print STDERR "<Import>\n" if $gli;

    my $manifest_lookup = manifest->new();
    if ($manifest ne "") {
        my $manifest_filename = $manifest;

        # NOTE(review): only a leading slash or backslash counts as an
        # absolute path here, so "C:\..." style paths are treated as
        # relative to the collection directory - confirm this is intended.
        if ($manifest_filename !~ m/^[\\\/]/) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): the clean-up below is applied to $manifest, not to
        # $manifest_filename, which is what actually gets parsed.
        $manifest =~ s/[\\\/]+/\//g;
        $manifest =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file
    ## $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    $archive_info = arcinfo->new($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # $plugout is an array ref: the plugout name followed by its options.
    my ($plugout);
    if (defined $collectcfg->{'plugout'}) {
        # If a plugout was specified in the collect.cfg file, assume it is sensible
        # We can't check the name because it could be anything, if it is a custom plugout
        $plugout = $collectcfg->{'plugout'};
    }
    elsif ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
        push @$plugout, "GreenstoneXMLPlugout";
    }
    else {
        push @$plugout, $saveas."Plugout";
    }

    push @$plugout, ("-output_info", $archive_info) if (defined $archive_info);
    push @$plugout, ("-verbosity", $verbosity)      if (defined $verbosity);
    push @$plugout, ("-gzip_output")                if ($gzip);
    push @$plugout, ("-group_size", $groupsize)     if (defined $groupsize);
    push @$plugout, ("-output_handle", $out)        if (defined $out);
    push @$plugout, ("-debug")                      if ($debug);

    $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    if ($manifest eq "") {
        # process the import directory
        my $block_hash = {};
        my $metadata = {};
        # global blocking pass may set up some metadata
        &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

        if ($incremental || $incremental_mode eq "onlyadd") {

            &inexport::prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted
            &inexport::new_vs_old_import_diff($archive_info, $block_hash, $importdir,
                                              $archivedir, $verbosity, $incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n  ";
                print STDERR join("\n  ", @new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};

                # Filter out any in gsdl/tmp area
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                @deleted_files = grep {
                    $_ !~ m/^$gsdl_tmp_area/ && $_ !~ m/^$collect_tmp_area/
                } @deleted_files;

                if (scalar(@deleted_files) > 0) {
                    print STDERR "Files deleted since last import:\n  ";
                    print STDERR join("\n  ", @deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    &inexport::mark_docs_for_deletion($archive_info, $block_hash, \@deleted_files, $archivedir, $verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files) > 0) {
                    print STDERR "Files to reindex since last import:\n  ";
                    print STDERR join("\n  ", @reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    &inexport::mark_docs_for_deletion($archive_info, $block_hash, \@reindex_files, $archivedir, $verbosity, "reindex");
                }
            }
        }

        # Play it safe, and run through the entire folder; in the
        # incremental case only new or edited files get processed.
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else {
        # process any files marked for importing
        foreach my $file (keys %{$manifest_lookup->{'import'}}) {
            &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
        }

        my @deleted_files = keys %{$manifest_lookup->{'delete'}};

        # Pass verbosity and mode exactly as the incremental deletion
        # call above does, rather than leaving them undefined.
        &inexport::mark_docs_for_deletion($archive_info, {}, \@deleted_files, $archivedir, $verbosity, "delete");
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    &inexport::store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if $groupsize > 1;
    $processor->close_group_output() if $processor->is_group();

    # The export.pl version of this script wraps the following in an 'if'
    # on $saveas (not all export types need it, e.g. DSpace); the reason is
    # given in export.pl. Unclear at this point if the same should be done
    # here.
    ## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {

    # should we still do this in debug mode??

    # for backwards compatibility with archives.inf file
    # NOTE(review): as written, only "\.inf" is anchored to the end of the
    # name; "contents" matches anywhere in the path - confirm intended.
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    ## }

    # write out import stats
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        }
        else {
            # fall back to STDERR rather than losing the statistics
            &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
            &gsprintf($out, "{import.stats_backup}\n");
            $statsfile = 'STDERR';
        }
    }

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{import.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}
Note: See TracBrowser for help on using the repository browser.