source: main/trunk/greenstone2/perllib/inexport.pm

Last change on this file was 38160, checked in by davidb, 7 months ago

Print statements added to display info about import mode; some adjusted print statements reflecting new file-system level document history; some debug statements tidied up (but also commented out) -- likely useful to bring back in if needing to inspect the operation of this part of the code

  • Property svn:executable set to *
File size: 56.8 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use DocHistoryFileUtils;
50use FileUtils;
51
52use File::Basename;
53
# Enumerated list of the supported document OID (object identifier)
# generation schemes; used below as the value list of the -OIDtype option.
# The 'desc' values are resource-bundle keys resolved by gsprintf when the
# usage text is displayed.
my $oidtype_list =
    [ { 'name' => "hash",
	'desc' => "{import.OIDtype.hash}" },
      { 'name' => "hash_on_full_filename",
	'desc' => "{import.OIDtype.hash_on_full_filename}" },
      { 'name' => "assigned",
	'desc' => "{import.OIDtype.assigned}" },
      { 'name' => "incremental",
	'desc' => "{import.OIDtype.incremental}" },
      { 'name' => "filename",
	'desc' => "{import.OIDtype.filename}" },
      { 'name' => "dirname",
	'desc' => "{import.OIDtype.dirname}" },
      { 'name' => "full_filename",
	'desc' => "{import.OIDtype.full_filename}" } ];
69
# Directory-related command-line arguments shared by the import/export
# scripts. Declared as package globals so that import.pl and export.pl can
# splice them into their own argument specifications.
$inexport::directory_arguments =
    [
     { 'name' => "importdir",
       'desc' => "{import.importdir}",
       'type' => "string",
       'reqd' => "no",
       'deft' => "import",
       'hiddengli' => "yes" },
     { 'name' => "collectdir",
       'desc' => "{import.collectdir}",
       'type' => "string",
       # parsearg left "" as default
       #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
       'deft' => "",
       'reqd' => "no",
       'hiddengli' => "yes" },

     ];
# General command-line arguments shared by import.pl and export.pl:
# OID-generation scheme, manifest handling, old-archives policy
# (keepold/replaceold/removeold/incremental), logging and verbosity, and
# GLI/XML output modes. 'desc' values are resource-bundle keys resolved by
# gsprintf. NOTE(review): 'hiddengli' and 'modegli' appear to control
# option visibility/complexity level in GLI -- semantics defined by the
# argument parser, not visible in this file.
$inexport::arguments =
    [
     # don't set the default to hash - want to allow this to come from
     # entry in collect.cfg but want to override it here
     { 'name' => "OIDtype",
       'desc' => "{import.OIDtype}",
       'type' => "enum",
       'list' => $oidtype_list,
       'deft' => "hash_on_full_filename",
       'reqd' => "no",
       'modegli' => "2" },
     { 'name' => "OIDmetadata",
       'desc' => "{import.OIDmetadata}",
       'type' => "string",
       'deft' => "dc.Identifier",
       'reqd' => "no",
       'modegli' => "2" },
     { 'name' => "site",
       'desc' => "{import.site}",
       'type' => "string",
       'deft' => "",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "manifest",
       'desc' => "{import.manifest}",
       'type' => "string",
       'deft' => "",
       'reqd' => "no",
       'hiddengli' => "yes" } ,
     { 'name' => "incremental",
       'desc' => "{import.incremental}",
       'type' => "flag",
       'hiddengli' => "yes" },
     { 'name' => "keepold",
       'desc' => "{import.keepold}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "replaceold",
       'desc' => "{import.replaceold}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "removeold",
       'desc' => "{import.removeold}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },
     # how associated files get from archives into the collection:
     # plain copy, or hard-linked to save space
     { 'name' => 'assocfile_copymode',
       'desc' => "{import.assocfile_copymode}",
       'type' => 'enum',
       'list' => [ { 'name' => "copy", 'desc' => "{import.assocfile_copymode_copy}" },
		   { 'name' => "hardlink", 'desc' => "{import.assocfile_copymode_hardlink}" } ],
       'deft' => 'copy',
       'reqd' => 'yes',
       'hiddengli' => 'no'},
     { 'name' => "language",
       'desc' => "{scripts.language}",
       'type' => "string",
       'reqd' => "no",
       'hiddengli' => "yes" },
     # -1 means "no limit on the number of documents processed"
     { 'name' => "maxdocs",
       'desc' => "{import.maxdocs}",
       'type' => "int",
       'reqd' => "no",
       'deft' => "-1",
       'range' => "-1,",
       'modegli' => "1" },
     { 'name' => "debug",
       'desc' => "{import.debug}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "faillog",
       'desc' => "{import.faillog}",
       'type' => "string",
       # parsearg left "" as default
       #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
       'deft' => "",
       'reqd' => "no",
       'modegli' => "3" },
     { 'name' => "out",
       'desc' => "{import.out}",
       'type' => "string",
       'deft' => "STDERR",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "statsfile",
       'desc' => "{import.statsfile}",
       'type' => "string",
       'deft' => "STDERR",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "verbosity",
       'desc' => "{import.verbosity}",
       'type' => "int",
       'range' => "0,",
       'deft' => "2",
       'reqd' => "no",
       'modegli' => "3" },
     { 'name' => "gli",
       'desc' => "{scripts.gli}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },
     { 'name' => "xml",
       'desc' => "{scripts.xml}",
       'type' => "flag",
       'reqd' => "no",
       'hiddengli' => "yes" },

     ];
200
# Constructor used by the import.pl and export.pl command-line scripts.
#
# Parameters:
#   $mode                - "import" or "export"; stored in $self->{'mode'}
#   $argv                - arrayref of command-line arguments; after option
#                          parsing exactly one entry (the collection name)
#                          must remain
#   $options             - option specification hashref whose 'args' entry
#                          is handed to parse2::parse
#   $opt_listall_options - option specification used when -listall was given
#
# Returns a blessed inexport object. Dies (with "\n", after printing usage
# or an error to STDERR) on parse failure, -listall, -h, or a wrong number
# of leftover arguments; returns early after printing XML usage when -xml
# was given without -listall.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name

    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    # If -out names a file (rather than STDERR/STDOUT), open it on the
    # package filehandle OUT and refer to it by its string name from then
    # on; treating a string as a filehandle relies on the "no strict 'refs'"
    # declaration at the top of this file
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # same string-named filehandle treatment for -statsfile
    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	open (STATSFILE, ">$statsfile") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
295
# Simplified version of the constructor for use with CGI scripts.
# Output goes to STDERR, no command-line parsing is performed, and the
# collection directory is taken from the supplied $gsdl_cgi helper (or
# derived from $ENV{'GSDLHOME'} when none is given).
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	# let the CGI helper resolve the collect directory for this site
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: return the collection name recorded by the constructor.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
327
328
# Resolve the collection, set up the fail log, and read the collection's
# configuration file.
#
# Parameters:
#   $collection - collection name as supplied on the command line
#   $options    - parsed script options (only referenced by the
#                 commented-out usage call below)
#
# Returns the two-element list ($config_filename, $collectcfg), where
# $collectcfg is the hashref produced by colcfg::read_collection_cfg.
#
# Side effects: adds the collection's perllib directory to @INC, opens the
# FAILLOG filehandle, and records gs_version ("2" or "3"),
# config_filename, faillog, faillogname and close_faillog in $self.
# Dies with "\n" if the collection cannot be resolved.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # an empty string back from use_collection signals failure
    # NOTE(review): use_collection presumably also sets $ENV{'GSDLCOLLECTDIR'},
    # which is relied upon below -- confirm in colcfg.pm
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3
    # a non-empty site name implies a Greenstone 3 installation
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    my $faillogname = $faillog;
    # from here on the handle is referred to by its string name; using a
    # string as a filehandle relies on "no strict 'refs'" at the top of
    # this file
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
381
# Merge collection-level configuration into the options already held in
# $self, resolve the import and archives/export directories to absolute
# paths, and normalise the removeold/keepold/replaceold/incremental flags.
#
# Parameters:
#   $collectcfg - hashref of settings read from collect.cfg /
#                 collectionConfig.xml by read_collection_cfg()
#
# Side effects: updates $self->{importdir, archivedir, archivedir_keepold,
# verbosity, manifest, gzip, maxdocs, OIDtype, OIDmetadata, debug, gli,
# removeold, keepold, replaceold, incremental, incremental_mode}; dies if
# the resolved import directory does not exist.
#
# NOTE(review): the 'default_*' flags consulted below appear to be set by
# the calling script when the corresponding option was left at its
# command-line default, so collect.cfg only wins when the user did not
# override -- confirm against import.pl/export.pl.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    # import mode uses the archives directory; export mode the export directory
    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead
	# BUGFIX: capture the sanitized path -- previously the return value
	# was discarded, so an absolute importdir was never sanitized
	# (compare the parallel archivedir handling below)
	$importdir = &FileUtils::sanitizePath($importdir);
    }

    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {

	$archivedir = &FileUtils::sanitizePath($archivedir);
    }

    my $archivedir_keepold = "${archivedir}_keepold"; # used when file-level document-version history is in play
    $self->{'archivedir'} = $archivedir;
    $self->{'archivedir_keepold'} = $archivedir_keepold;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }

    # only accept an OIDtype from collect.cfg that is one of the known schemes
    if (defined $self->{'default_OIDtype'} ) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    my $verbosity = $self->{'verbosity'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_keepold_replaceold($self->{'removeold'}, $self->{'keepold'}, $self->{'replaceold'},
							  $self->{'incremental'}, $checkdir,
							  $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'replaceold'} = $replaceold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # echo the resolved old-archives policy so the import mode is visible in logs
    if ($verbosity >= 2) {
	print STDERR "#" x 25,"\n";
	print STDERR "# removeold = $removeold\t#\n";
	print STDERR "# keepold = $keepold\t#\n";
	print STDERR "# replaceold = $replaceold\t#\n";
	print STDERR "# incremental = $incremental\t#\n";
	print STDERR "# inc (mode) = $incremental_mode\t#\n";
	print STDERR "#" x 25,"\n";
    }

# Mmmmm!
# # Since this wasted my morning, let's at least warn a user that manifest
# # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold && !$replaceold && !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold, -replaceold or -incremental\n";
    }
}
530
531sub process_files
532{
533 my $self = shift @_;
534 my ($config_filename,$collectcfg) = @_;
535
536 my $inexport_mode = $self->{'mode'};
537
538 my $verbosity = $self->{'verbosity'};
539 my $debug = $self->{'debug'};
540
541 my $importdir = $self->{'importdir'};
542 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
543 # 'archivedir' is a tad abused, and is sometimes set to the 'exportdir' value,
544 # however at this stage in the code development'archivedir_keepold' is only associated with archivedir (used to provide fldv-history)
545 my $archivedir_keepold = $self->{'archivedir_keepold'};
546
547 my $incremental = $self->{'incremental'};
548 my $incremental_mode = $self->{'incremental_mode'};
549
550 my $gs_version = $self->{'gs_version'};
551
552 my $removeold = $self->{'removeold'};
553 my $replaceold = $self->{'replaceold'};
554 my $keepold = $self->{'keepold'};
555
556 my $saveas = $self->{'saveas'};
557 my $saveas_options = $self->{'saveas_options'};
558 my $OIDtype = $self->{'OIDtype'};
559 my $OIDmetadata = $self->{'OIDmetadata'};
560
561 my $out = $self->{'out'};
562 my $faillog = $self->{'faillog'};
563
564 my $maxdocs = $self->{'maxdocs'};
565 my $gzip = $self->{'gzip'};
566 my $groupsize = $self->{'groupsize'};
567 my $sortmeta = $self->{'sortmeta'};
568
569 my $removeprefix = $self->{'removeprefix'};
570 my $removesuffix = $self->{'removesuffix'};
571
572 my $gli = $self->{'gli'};
573
574 # related to export
575 my $xsltfile = $self->{'xsltfile'};
576 my $group_marc = $self->{'group_marc'};
577 my $mapping_file = $self->{'mapping_file'};
578 my $xslt_mets = $self->{'xslt_mets'};
579 my $xslt_txt = $self->{'xslt_txt'};
580 my $fedora_namespace = $self->{'fedora_namespace'};
581 my $metadata_prefix = $self->{'metadata_prefix'};
582
583 if ($inexport_mode eq "import") {
584 print STDERR "<Import>\n" if $gli;
585 }
586 else {
587 print STDERR "<export>\n" if $gli;
588 }
589
590 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
591 if ($self->{'manifest'} ne "") {
592 my $manifest_filename = $self->{'manifest'};
593
594 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
595 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
596 }
597 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
598 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
599 #$self->{'manifest'} =~ s/\/$//;
600
601 $manifest_lookup->parse($manifest_filename);
602
603 # manifests may now include a version number [jmt12]
604 $self->{'manifest_version'} = $manifest_lookup->get_version();
605 }
606
607 my $manifest = $self->{'manifest'};
608
609 # load all the plugins
610 my $plugins = [];
611 if (defined $collectcfg->{'plugin'}) {
612 $plugins = $collectcfg->{'plugin'};
613 }
614
615 my $plugin_incr_mode = $incremental_mode;
616 if ($manifest ne "") {
617 # if we have a manifest file, then we pretend we are fully incremental for plugins
618 $plugin_incr_mode = "all";
619 }
620 #some global options for the plugins
621 my @global_opts = ();
622
623 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
624 if (scalar(@$pluginfo) == 0) {
625 &gsprintf($out, "{import.no_plugins_loaded}\n");
626 die "\n";
627 }
628
629 # Whether -removeold, -keepold or -replaceold there should never be an existing archivedir_keepold
630 # => Taken to be a sign of a previous import/export that has gone wrong
631 # => Print out error message and stop!
632
633 if (&FileUtils::directoryExists($archivedir_keepold)) {
634 my $rkr_old_minus_option = undef; # rkr = remove, keep, replace (whichever one is being used)
635 if ($removeold) {
636 $rkr_old_minus_option = "-removeold";
637 }
638 elsif ($keepold) {
639 $rkr_old_minus_option = "-keepold";
640 }
641 elsif ($replaceold) {
642 $rkr_old_minus_option = "-replaceold";
643 }
644
645 &gsprintf(STDERR, "\n");
646 &gsprintf(STDERR, "Detected existing directory:\n\n");
647 &gsprintf(STDERR, " $archivedir_keepold\n\n");
648 &gsprintf(STDERR, "Stopping $inexport_mode.\n\n");
649
650 &gsprintf(STDERR, "**** When building with $rkr_old_minus_option, there cannot be a pre-existing 'archives_keepold' directory\n");
651 &gsprintf(STDERR, "****\n");
652 &gsprintf(STDERR, "**** Review your collection directory folder, and determine whether to:\n");
653 &gsprintf(STDERR, "**** (a) move your 'archives_keepold' back to being 'archives'; or\n");
654 &gsprintf(STDERR, "**** (b) remove your 'archives_keepold'\n");
655 &gsprintf(STDERR, "**** before running your $inexport_mode command again\n\n");
656
657 exit 1; # c errno for 'operation not permitted'
658 }
659
660
661 # remove the old contents of the archives directory (and tmp directory) if needed
662
663 if ($removeold) {
664 if (&FileUtils::directoryExists($archivedir)) {
665 &gsprintf($out, "{import.removing_archives}\n");
666 &FileUtils::removeFilesRecursive($archivedir);
667 }
668 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
669 $tmpdir =~ s/[\\\/]+/\//g;
670 $tmpdir =~ s/\/$//;
671 if (&FileUtils::directoryExists($tmpdir)) {
672 &gsprintf($out, "{import.removing_tmpdir}\n");
673 &FileUtils::removeFilesRecursive($tmpdir);
674 }
675 }
676 else {
677 # If not $removeold, then must be $keepold or $replaceold
678 # => for either case the sequence to go through is:
679 #
680 # 1. Move 'archives' to 'archives_keepold'
681 # 2. Create new empty 'archives'
682 # 3. Copy top-level files in 'archives_keepold' to 'archives';
683 # 4. Allow 'import' to populate 'archives' as usual
684 #
685 # 5. Resolve file-level document-verison history through
686 # "hard-link"/copy content from 'archives_keep' back to 'archives'
687 # 5.1 a keepold doc's '_fldv_history' goes first
688 # 5.2 then the keepold doc's top-level content for new 'nminus 1'
689
690 # Only if all these stages run without a single error then is
691 # it then safe to remove archivedir_keepold
692
693 # If an error occurs, the process is stopped, and deleting
694 # 'archives' and moving 'archives_keepold' restores things
695 # back to how they were before import.pl was run.
696
697
698 # If got to here, then there is no pre-existing $archivedir_keepold
699 # Action Step 1.
700
701
702 if (!rename($archivedir,$archivedir_keepold)) {
703
704 &gsprintf(STDERR, "\nError message: $!\n\n");
705
706 &gsprintf(STDERR, "**** Failed to move:\n");
707 &gsprintf(STDERR, "**** $archivedir\n");
708 &gsprintf(STDERR, "**** to:\n");
709 &gsprintf(STDERR, "**** $archivedir_keepold\n");
710 &gsprintf(STDERR, "****\n");
711 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
712
713 exit $!;
714 }
715 }
716
717 # Create the archives dir if needed
718 # coincidentally fldv-history: Action Step 2
719 &FileUtils::makeAllDirectories($archivedir);
720
721 if ($keepold || $replaceold) {
722 # fldv-history: Action Step 3
723
724 my ($ret_val_success,$fullpath_files) = &FileUtils::readdirFullpath($archivedir_keepold, { 'strict' => 1, 'exclude_dirs' => 1 });
725
726 my $copy_ok = &FileUtils::copyFilesGeneral($fullpath_files,$archivedir, { 'strict' => 1 });
727 if (!$copy_ok) {
728 &gsprintf(STDERR, "**** Failed to copy top-leve files from:\n");
729 &gsprintf(STDERR, "**** $archivedir_keepold\n");
730 &gsprintf(STDERR, "**** to:\n");
731 &gsprintf(STDERR, "**** $archivedir\n");
732 &gsprintf(STDERR, "****\n");
733 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
734
735 exit 1;
736 }
737
738 if ($self->{'groupsize'} > 1) {
739 print STDERR "\n";
740 print STDERR "******\n";
741 print STDERR "Warning: Minus option '-groupsize' has not been tested with file-level document version history!\n";
742 print STDERR " If the groups formed between subsequent invocations of import.pl stay the same, then\n";
743 print STDERR " the formation of file-level document-version history 'nminus-<n> bundles' in _fldv_history directories\n";
744 print STDERR " should remain correct\n";
745 print STDERR "******\n";
746 print STDERR "\n";
747 }
748 }
749
750
751 # Read the archive information file
752 # coincidentally fldv-history: Action Step 4
753
754 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
755 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
756 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
757
758 # When we make these initial calls to determine the archive information doc
759 # and src databases we pass through a '1' to indicate this is the first
760 # time we are referring to these databases. When using dynamic dbutils
761 # (available in extensions) this indicates to some database types (for
762 # example, persistent servers) that this is a good time to perform any
763 # one time initialization. The argument has no effect on vanilla dbutils
764 # [jmt12]
765 my $perform_firsttime_init = 1;
766 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
767 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
768
769
770 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
771 $archive_info->load_info($arcinfo_doc_filename);
772 # Load in reverse-lookup info (used to determine the docs that a file in import are used in),
773 # so we don't overwrite existing info when we do incremental import
774 # From here on, make all changes to this object, then write out the file at the end.
775 $archive_info->load_rev_info($arcinfo_src_filename);
776
777 if ($manifest eq "") {
778 # Load in list of files in import folder from last import (if present)
779 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
780 }
781
782 ####Use Plugout####
783 my $plugout;
784
785 my $generate_auxiliary_files = 0;
786 if ($inexport_mode eq "import") {
787 $generate_auxiliary_files = 1;
788 }
789 elsif ($self->{'include_auxiliary_database_files'}) {
790 $generate_auxiliary_files = 1;
791 }
792 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
793
794 # Option to use user defined plugout
795 if ($inexport_mode eq "import") {
796 if (defined $collectcfg->{'plugout'}) {
797 # If a plugout was specified in the collect.cfg file, assume it is sensible
798 # We can't check the name because it could be anything, if it is a custom plugout
799 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
800 $plugout = $collectcfg->{'plugout'};
801 }
802 else {
803 push @$plugout,$saveas."Plugout";
804 }
805
806 }
807 else {
808 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
809 $plugout = $collectcfg->{'plugout'};
810 print STDERR "Using plugout specified in collect.cfg/collectionConfig.xml: $collectcfg->{'plugout'}\n";
811 }
812 else {
813 push @$plugout,$saveas."Plugout";
814 }
815 }
816
817 my $plugout_name = $plugout->[0];
818
819 if (defined $saveas_options) {
820 my @user_plugout_options = split(" ", $saveas_options);
821 push @$plugout, @user_plugout_options;
822 }
823 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
824 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
825 push @$plugout,("-debug") if ($debug);
826 push @$plugout,("-gzip_output") if ($gzip);
827 push @$plugout,("-output_handle",$out) if (defined $out);
828 push @$plugout,("-site",$self->{'site'}) if (defined $self->{'site'});
829 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
830
831 push @$plugout,("-assocfile_copymode",$self->{'assocfile_copymode'}) if (defined $self->{'assocfile_copymode'});
832 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
833
834 if ($inexport_mode eq "import") {
835 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
836 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
837 }
838 }
839 my $processor = &plugout::load_plugout($plugout);
840 $processor->setoutputdir ($archivedir);
841 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
842 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
843 $processor->begin();
844 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
845
846 if ($removeold) {
847 # occasionally, plugins may want to do something on remove
848 # old, eg pharos image indexing
849 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
850 }
851
852 # process the import directory
853 my $block_hash = {};
854 $block_hash->{'new_files'} = {};
855 $block_hash->{'reindex_files'} = {};
856
857 # all of these are set somewhere else, so it's more readable to define them here [jmt12]
858 $block_hash->{'all_files'} = {};
859 $block_hash->{'deleted_files'} = {};
860 $block_hash->{'file_blocks'} = {};
861 $block_hash->{'metadata_files'} = {};
862 $block_hash->{'shared_fileroot'} = '';
863 $block_hash->{'manifest'} = 'false';
864 my $metadata = {};
865
866 # global blocking pass may set up some metadata
867 # does this set up metadata?????
868 # - when we have a newer manifest file we don't do this -unless- the
869 # collection configuration indicates this collection contains complex
870 # (inherited) metadata [jmt12]
871 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
872 {
873 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
874 }
875 else
876 {
877 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
878 }
879
880
881 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
882 # of the OAI identifiers with their time stamps and deleted status.
883 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
884 my $have_manifest = ($manifest eq '') ? 0 : 1;
885 $oai_info->import_stage($removeold, $have_manifest);
886
887
888 if ($manifest ne "") {
889
890 # mark that we are using a manifest - information that might be needed
891 # down in plugins (for instance DirectoryPlugin)
892 $block_hash->{'manifest'} = $self->{'manifest_version'};
893
894 #
895 # 1. Process delete files first
896 #
897 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
898 my @full_deleted_files = ();
899
900 # ensure all filenames are absolute
901 foreach my $df (@deleted_files) {
902 my $full_df =
903 (&FileUtils::isFilenameAbsolute($df))
904 ? $df
905 : &FileUtils::filenameConcatenate($importdir,$df);
906
907 # gdb doesn't store short filenames, so ensure we specify full filenames for deletion
908 $full_df = &util::upgrade_if_dos_filename($full_df); # will only do something on windows
909
910 if (-d $full_df) {
911 &add_dir_contents_to_list($full_df, \@full_deleted_files);
912 } else {
913 push(@full_deleted_files,$full_df);
914 }
915 }
916
917 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
918 mark_docs_for_deletion($archive_info,{},
919 \@full_deleted_files,
920 $archivedir, $verbosity, "delete");
921
922
923 #
924 # 2. Now files for reindexing
925 #
926
927 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
928 my @full_reindex_files = ();
929 # ensure all filenames are absolute
930 foreach my $rf (@reindex_files) {
931 my $full_rf =
932 (&FileUtils::isFilenameAbsolute($rf))
933 ? $rf
934 : &FileUtils::filenameConcatenate($importdir,$rf);
935
936 if (-d $full_rf) {
937 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
938 } else {
939 push(@full_reindex_files,$full_rf);
940 }
941 }
942
943 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
944 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
945
946 # And now to ensure the new version of the file processed by
947 # appropriate plugin, we need to add it to block_hash reindex list
948 foreach my $full_rf (@full_reindex_files) {
949 $block_hash->{'reindex_files'}->{$full_rf} = 1;
950 }
951
952
953 #
954 # 3. Now finally any new files - add to block_hash new_files list
955 #
956
957 my @new_files = keys %{$manifest_lookup->{'index'}};
958 my @full_new_files = ();
959
960 foreach my $nf (@new_files) {
961 # ensure filename is absolute
962 my $full_nf =
963 (&FileUtils::isFilenameAbsolute($nf))
964 ? $nf
965 : &FileUtils::filenameConcatenate($importdir,$nf);
966
967 if (-d $full_nf) {
968 &add_dir_contents_to_list($full_nf, \@full_new_files);
969 } else {
970 push(@full_new_files,$full_nf);
971 }
972 }
973
974 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
975
976 # need to check this file exists before trying to read it - in the past
977 # it wasn't possible to have a manifest unless keepold was also set so
978 # you were pretty much guaranteed arcinfo existed
979 # [jmt12]
980 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
981 if (-e $arcinfo_src_filename)
982 {
983 my $arcinfodb_map = {};
984 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
985 foreach my $f (@full_new_files) {
986 my $rel_f = &util::abspath_to_placeholders($f);
987
988 # check that we haven't seen it already
989 if (defined $arcinfodb_map->{$rel_f}) {
990 # TODO make better warning
991 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
992 } else {
993 $block_hash->{'new_files'}->{$f} = 1;
994 }
995 }
996
997 undef $arcinfodb_map;
998 }
999 # no existing files - so we can just add all the files [jmt12]
1000 else
1001 {
1002 foreach my $f (@full_new_files)
1003 {
1004 $block_hash->{'new_files'}->{$f} = 1;
1005 }
1006 }
1007
1008 # If we are not using complex inherited metadata (and thus have skipped
1009 # the global file scan) we need to at least check for a matching
1010 # metadata.xml for the files being indexed/reindexed
1011 # - unless we are using the newer version of Manifests, which are treated
1012 # verbatim, and should have a metadata element for metadata files (so
1013 # we can explicitly process metadata files other than metadata.xml)
1014 # [jmt12]
1015 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
1016 {
1017 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
1018 foreach my $file_to_import (@all_files_to_import)
1019 {
1020 my $metadata_xml_path = $file_to_import;
1021 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
1022 if (&FileUtils::fileExists($metadata_xml_path))
1023 {
1024 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
1025 }
1026 }
1027 }
1028
1029 # new version manifest files explicitly list metadata files to be
1030 # processed (ignoring complexmeta if set)
1031 # [jmt12]
1032 if ($self->{'manifest_version'} > 1)
1033 {
1034 # Process metadata files
1035 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
1036 {
1037 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
1038 }
1039 }
1040 } # end if (manifest ne "")
1041 else {
1042 # if incremental, we read through the import folder to see what's changed.
1043
1044 if ($incremental || $incremental_mode eq "onlyadd") {
1045 prime_doc_oid_count($archivedir);
1046
1047 # Can now work out which files were new, already existed, and have
1048 # been deleted
1049
1050 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
1051 $archivedir,$verbosity,$incremental_mode);
1052
1053 my @new_files = sort keys %{$block_hash->{'new_files'}};
1054 if (scalar(@new_files>0)) {
1055 print STDERR "New files and modified metadata files since last import:\n ";
1056 print STDERR join("\n ",@new_files), "\n";
1057 }
1058
1059 if ($incremental) {
1060 # only look for deletions if we are truely incremental
1061 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
1062
1063 # Filter out any in gsdl/tmp area
1064 my @filtered_deleted_files = ();
1065
1066 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
1067 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
1068
1069 my $gsdl_tmp_area_re = &util::filename_to_regex($gsdl_tmp_area);
1070 my $collect_tmp_area_re = &util::filename_to_regex($collect_tmp_area);
1071
1072 foreach my $df (@deleted_files) {
1073 next if ($df =~ m/^$gsdl_tmp_area_re/);
1074 next if ($df =~ m/^$collect_tmp_area_re/);
1075
1076 push(@filtered_deleted_files,$df);
1077 }
1078
1079
1080 @deleted_files = @filtered_deleted_files;
1081
1082 if (scalar(@deleted_files)>0) {
1083 print STDERR "Files deleted since last import:\n ";
1084 print STDERR join("\n ",@deleted_files), "\n";
1085
1086
1087 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
1088
1089 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
1090 }
1091
1092 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
1093
1094 if (scalar(@reindex_files) > 0) {
1095 print STDERR "Files to reindex since last import:\n ";
1096 print STDERR join("\n ",@reindex_files), "\n";
1097 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
1098 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
1099 }
1100
1101 }
1102 } # end if incremental/only_add mode
1103 # else no manifest AND not incremental
1104 } # end if else block of manifest ne "" else eq ""
1105
1106 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
1107 # Do nothing if the file already exists (file exists on incremental build).
1108 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
1109 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
1110 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
1111 # oailastmodified and oailastmodifieddate
1112 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
1113 if ($self->{'generate_auxiliary_files'}) {
1114 if (!-f $earliestDatestampFile && -d $archivedir) {
1115 my $current_time_in_seconds = time; # in seconds
1116
1117 if(open(FOUT, ">$earliestDatestampFile")) {
1118 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
1119 print FOUT $current_time_in_seconds;
1120 close(FOUT);
1121 }
1122 else {
1123 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
1124 }
1125
1126 }
1127 }
1128
1129 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
1130
1131 if ($saveas eq "FedoraMETS") {
1132 # create collection "doc obj" for Fedora that contains
1133 # collection-level metadata
1134
1135 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1136 $doc_obj->set_OID("collection");
1137
1138 my $col_name = undef;
1139 my $col_meta = $collectcfg->{'collectionmeta'};
1140
1141 if (defined $col_meta) {
1142 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1143 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
1144 }
1145 $processor->process($doc_obj);
1146 }
1147
1148 &plugin::end($pluginfo, $processor);
1149
1150 &plugin::deinit($pluginfo, $processor);
1151
1152 # Store the value of OIDCount (used in doc.pm) so it can be
1153 # restored correctly to this value on an incremental build
1154 # - this OIDcount file should only be generated for numerical oids [jmt12]
1155 if ($self->{'OIDtype'} eq 'incremental')
1156 {
1157 store_doc_oid_count($archivedir);
1158 }
1159
1160 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1161 $processor->close_group_output() if $processor->is_group();
1162 $processor->end();
1163
1164 if ($self->{'generate_auxiliary_files'}) {
1165
	# write out the archive information file
	# for backwards compatibility with the old archives.inf file
1168 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1169 # In the days of this being a text file, this all we had to do
	# Note, if still using this form of archive-inf, then neither
	# incremental building nor file-level document-version history
	# is supported
1173 $archive_info->save_info($arcinfo_doc_filename);
1174 }
1175 else {
1176 $archive_info->save_revinfo_db($arcinfo_src_filename);
1177 }
1178
1179 $archive_info->save_arcinfo_doc_timestamp($arcinfo_doc_filename);
1180 }
1181
1182
1183 #
1184 # Now deal with any file-level document-version history (fldv-history)
1185 #
1186
1187 if ($keepold || $replaceold) {
1188
1189 # fldv-history: Action Step 5
1190
1191 &DocHistoryFileUtils::archivedir_keepold_to_archivedir($collectcfg, $keepold, $replaceold, $incremental_mode, $archive_info, $archivedir,$archivedir_keepold);
1192
1193 }
1194
1195
1196 return $pluginfo;
1197}
1198
# @function perform_process_files()
# While process_files() above prepares the system to import files, this is
# the function that actually initiates the plugin pipeline to process them.
# Subclasses of inexport.pm should override this function if they wish to do
# different or further processing.
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    # A non-empty $file_to_import means a (version 2+) manifest named this
    # exact file, so hand just that one file to the plugin pipeline.
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Global file scan.  If we are using a new version manifest, files were
    # already read above.  Older (version 1) manifests express their effect
    # through extra entries in $block_hash, while non-manifest imports use a
    # regular $block_hash (so obeying process_exp and block_exp). [jmt12]
    # (Debug prints of $block_hash contents used to live here; re-add them
    # if this stage needs inspecting.)
    if ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else
    {
	print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
    }
}
# perform_process_files()
1237
# @function generate_statistics()
# Print the closing "run complete" banner to the output stream and ask the
# plugin pipeline to write out its per-plugin document/failure statistics.
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    # Unpack the run state recorded on the object during set-up
    my $inexport_mode = $self->{'mode'};
    my $out           = $self->{'out'};
    my $faillogname   = $self->{'faillogname'};
    my $statsfile     = $self->{'statsfile'};
    my $gli           = $self->{'gli'};

    # Closing banner; the completion message key is resolved per mode
    # (e.g. "{import.complete}" or "{export.complete}") by gsprintf
    my $banner = "*********************************************\n";
    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, $banner);

    # Each plugin reports its counts; failures also go to the fail log
    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
}
# generate_statistics()
1258
1259
# @function deinit()
# Close down any file handles that we opened (and hence are responsible for
# closing).
sub deinit
{
    my $self = shift(@_);

    # Each 'close_*' flag records that this object (rather than the caller)
    # opened the corresponding bareword handle, so only then do we close it.
    if ($self->{'close_out'}) {
	close OUT;
    }
    if ($self->{'close_faillog'}) {
	close FAILLOG;
    }
    if ($self->{'close_statsfile'}) {
	close STATSFILE;
    }
}
# deinit()
1271
1272
# Copy one collection-level metadata field (e.g. "collectionname" or
# "collectionextra") from the parsed collection configuration onto the top
# section of $doc_obj, adding one entry per language variant.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    # Maps language keys (e.g. "[l=en]") to the value in that language
    my $lang_values = $collectionmeta->{$field};

    foreach my $lang_key (keys %$lang_values)
    {
	my $value = $lang_values->{$lang_key};

	# Values go into the extracted ("ex.") metadata namespace; a
	# language qualifier such as [l=fr] becomes a "^fr" label suffix
	my $md_label = "ex.$field";
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    my $md_suffix = $1;
	    $md_label .= "^$md_suffix";
	}

	$doc_obj->add_utf8_metadata($top_section, $md_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
	# The collection's (English or unqualified) name doubles as its
	# Dublin Core title.
	if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1310
1311
# Return the full path of the "OIDcount" file kept in the given archives
# directory (persists doc.pm's OID counter between incremental builds).
sub oid_count_file {
    my ($archive_dir) = @_;
    my $counter_path = &FileUtils::filenameConcatenate($archive_dir, "OIDcount");
    return $counter_path;
}
1316
1317
# @function prime_doc_oid_count()
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file in
# the archives directory, if one was saved by a previous import.  Used on
# incremental builds so newly assigned numerical OIDs carry on from where
# the last build stopped.  Silently does nothing if no OIDcount file exists;
# prints a warning if the file exists but cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # A missing file simply means there is no previous count to restore
    return unless (-e $oid_count_filename);

    # Three-argument open with a lexical filehandle (safer than the old
    # two-argument bareword form: the filename cannot inject an open mode)
    if (open(my $oid_in, '<', $oid_count_filename)) {
	my $OIDcount = <$oid_in>;
	chomp $OIDcount;
	close($oid_in);

	$doc::OIDcount = $OIDcount;
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1337
# @function store_doc_oid_count()
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter ($doc::OIDcount) got up to, so that
# prime_doc_oid_count() can restore it on the next incremental build.
# Prints a warning if the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Three-argument open with a lexical filehandle (replaces the old
    # two-argument bareword form flagged by perlcritic)
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	# Buffered write errors only surface at close time, so report a
	# failed close the same way as a failed open
	if (!close($oid_out)) {
	    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1356
1357
1358
# @function new_vs_old_import_diff()
# Compare the files found by the current import-directory scan (accumulated
# in $block_hash->{'all_files'}) against the file list recorded at the
# previous import ($archive_info->{'prev_import_filelist'}), and move every
# file into one of the $block_hash buckets that drive an incremental build:
# 'new_files', 'existing_files', 'reindex_files',
# 'new_or_modified_metadata_files' or 'deleted_files'.
# "Modified" is judged by comparing each file's timestamp against the
# timestamp of the archiveinf-doc database, which stands in for the time of
# the last import.  Called only for incremental/onlyadd imports without a
# manifest.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    my ($unused_infodbtype,$archiveinf_timestamp) = $archive_info->load_timestamp($arcinfo_doc_filename);

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};

    # maps absolute filename -> filename as recorded in arcinfo
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
	# arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions.
	$prev_file = &util::upgrade_if_dos_filename($prev_file);

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }

    #print STDERR "#### ful_prev_all_files keys:\n  ";
    #print STDERR join("\n  ",keys %$full_prev_all_files);
    #print STDERR "\n";


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# print STDERR "#### new vs old: look to see if full_curr_file=$full_curr_file in full_prev_all_files hashmap\n";

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};
	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# is it modified?? (newer on disk than the last recorded import)
		my $full_curr_file_timestamp = &FileUtils::getTimestamp($full_curr_file);

		if ($full_curr_file_timestamp > $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		# print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	my $existing_filename_timestamp = &FileUtils::getTimestamp($existing_filename);
	if ($existing_filename_timestamp > $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    # print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  # only files that no longer exist on disk are truly "deleted"
	  if (!-e $full_curr_file) {
	      $curr_file = &util::upgrade_if_dos_filename($curr_file);
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
	} @deleted_files;



}
1567
1568
# this is used to delete "deleted" docs and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
# For each file in @$deleted_files, looks up (via the archiveinf-src reverse
# database) every OID the file contributed to, marks those OIDs with index
# status "D" in the archiveinf-doc database (so buildcol.pl will drop or
# rebuild them), and schedules any still-referenced primary/associated files
# for reindexing via $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # only used in the progress message printed below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# arcinfo stores windows short (DOS) filenames, so downgrade for lookup
	my $downgraded_file = &util::downgrade_if_dos_filename($file);
	my $oids = $archive_info->get_reverseinfo($downgraded_file);
	$archive_info->remove_reverseinfo($downgraded_file);

	foreach my $oid (@$oids) {
	    # get the record for this OID from doc db
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    # find the source doc (the primary file that becomes this oid)
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file, "long");

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file
		# mark source doc for reimport as one of its assoc files has changed or deleted
		#$doc_source_file = &util::upgrade_if_dos_filename($doc_source_file);
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    } else {

		# the file to be deleted/reindexed is a primary file. We need to remove all references to this in the src db
		my $assoc_files = $doc_rec->{'assoc-file'};
		foreach my $assocfile (@$assoc_files) {
		    $assocfile = &util::placeholders_to_abspath($assocfile);
		    $archive_info->remove_reverseinfo($assocfile, $oid);
		    if (!defined $archive_info->get_reverseinfo($assocfile)) {
			# nothing refers to it anymore, mark for reindex.
			# block hash needs full filenames
			$assocfile = &util::upgrade_if_dos_filename($assocfile);
			$block_hash->{'reindex_files'}->{$assocfile} = 1;
		    }
		}

	    }
	    # flip the OID's index status to "D" unless it is already marked
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1664
# @function add_dir_contents_to_list()
# Recursively gather every file found beneath $dirname (skipping '.', '..'
# and .svn control directories), appending their full paths to the array
# ref $list.  Returns -1 if a directory could not be read (after printing a
# warning); otherwise the return value is not meaningful.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Lexical directory handle (replaces the old bareword DIR handle);
    # warn and bail out on unreadable directories rather than dying mid-import
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip '.' and '..'
	next if ($subfile =~ /^\.svn$/);   # skip Subversion metadata dirs

	my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectory
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return;
}
1693
1694
1695
1696
16971;
Note: See TracBrowser for help on using the repository browser.