source: main/trunk/greenstone2/perllib/inexport.pm@ 37187

Last change on this file since 37187 was 37187, checked in by davidb, 15 months ago

Reworking of file-level document-version history, in light of a clearer understanding of how hardlinking works in terms of inodes on disk. The new solution needs to make use of moving archives to archives_keep, then copying things back. As copying is involved, this means the time-stamp on the archive infodb used for incremental building can no longer be used to establish which files in 'import' are newer than the last build. The implemented solution here is to store the timestamp of the previous build in a file (rather than relying on the timestamp of a file created). The opportunity was also taken to record in this file the type of infodb used on that import.pl run. With this extra information it is now possible to detect when the type of infodb used has changed in the collectionConfig.xml, meaning import.pl can still function correctly, even in the case of an incremental or incremental-add import.pl being run.

  • Property svn:executable set to *
File size: 55.6 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use DocHistoryFileUtils;
50use FileUtils;
51
52use File::Basename;
53
# Enumerated values accepted by the -OIDtype option.  Each entry pairs the
# internal option name with its resource-bundle key ("{import.OIDtype.<name>}"),
# which gsprintf later expands to a translated description.
my $oidtype_list =
    [ map { { 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
          qw(hash hash_on_full_filename assigned incremental
             filename dirname full_filename) ];
69
# Command-line arguments that locate the collection's directories on disk.
# Shared by import.pl and export.pl; both are hidden from GLI, which manages
# these paths itself.
$inexport::directory_arguments =
    [
      { 'name'      => "importdir",
	'desc'      => "{import.importdir}",
	'type'      => "string",
	'reqd'      => "no",
	'deft'      => "import",
	'hiddengli' => "yes" },
      { 'name'      => "collectdir",
	'desc'      => "{import.collectdir}",
	'type'      => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
	'deft'      => "",
	'reqd'      => "no",
	'hiddengli' => "yes" },
    ];
# General command-line arguments shared by import.pl and export.pl.
# 'modegli' controls at which GLI expertise level an option is shown;
# 'hiddengli' options never appear in GLI.
$inexport::arguments =
    [
      # --- document identifier options ---
      # don't set the default to hash - want to allow this to come from
      # entry in collect.cfg but want to override it here
      { 'name'    => "OIDtype",
	'desc'    => "{import.OIDtype}",
	'type'    => "enum",
	'list'    => $oidtype_list,
	'deft'    => "hash_on_full_filename",
	'reqd'    => "no",
	'modegli' => "2" },
      { 'name'    => "OIDmetadata",
	'desc'    => "{import.OIDmetadata}",
	'type'    => "string",
	'deft'    => "dc.Identifier",
	'reqd'    => "no",
	'modegli' => "2" },
      # --- collection location / manifest options ---
      { 'name'      => "site",
	'desc'      => "{import.site}",
	'type'      => "string",
	'deft'      => "",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "manifest",
	'desc'      => "{import.manifest}",
	'type'      => "string",
	'deft'      => "",
	'reqd'      => "no",
	'hiddengli' => "yes" } ,
      # --- (re)build mode flags ---
      { 'name'      => "incremental",
	'desc'      => "{import.incremental}",
	'type'      => "flag",
	'hiddengli' => "yes" },
      { 'name'      => "keepold",
	'desc'      => "{import.keepold}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "replaceold",
	'desc'      => "{import.replaceold}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "removeold",
	'desc'      => "{import.removeold}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      # --- run-time behaviour options ---
      { 'name'      => "language",
	'desc'      => "{scripts.language}",
	'type'      => "string",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'    => "maxdocs",
	'desc'    => "{import.maxdocs}",
	'type'    => "int",
	'reqd'    => "no",
	'deft'    => "-1",
	'range'   => "-1,",
	'modegli' => "1" },
      { 'name'      => "debug",
	'desc'      => "{import.debug}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'    => "faillog",
	'desc'    => "{import.faillog}",
	'type'    => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
	'deft'    => "",
	'reqd'    => "no",
	'modegli' => "3" },
      { 'name'      => "out",
	'desc'      => "{import.out}",
	'type'      => "string",
	'deft'      => "STDERR",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "statsfile",
	'desc'      => "{import.statsfile}",
	'type'      => "string",
	'deft'      => "STDERR",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'    => "verbosity",
	'desc'    => "{import.verbosity}",
	'type'    => "int",
	'range'   => "0,",
	'deft'    => "2",
	'reqd'    => "no",
	'modegli' => "3" },
      { 'name'      => "gli",
	'desc'      => "{scripts.gli}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "xml",
	'desc'      => "{scripts.xml}",
	'type'      => "flag",
	'reqd'      => "no",
	'hiddengli' => "yes" },

    ];
192
# Constructor used by import.pl and export.pl.
#
#   $mode                - "import" or "export"
#   $argv                - ref to the script's remaining command-line args;
#                          after parsing, exactly one item (the collection
#                          name) must be left over
#   $options             - argument specification hash (its 'args' entry is
#                          handed to parse2::parse)
#   $opt_listall_options - argument specification printed when -listall given
#
# Prints usage and dies on any argument problem; otherwise returns a blessed
# inexport object.  Note the early return for -xml: no collection name is
# required in that case.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name

    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    # Redirect 'out' to a file if it isn't STDERR/STDOUT.  The handle is kept
    # as the string name of a package filehandle ('inexport::OUT'), which the
    # rest of the file relies on via "no strict 'refs'".
    # Three-argument open is used so a filename beginning with '>', '|' etc.
    # cannot inject a different open mode.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, '>', $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # Same treatment for the statistics output stream.
    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	open (STATSFILE, '>', $statsfile) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
287
# Simplified version of the constructor for use with CGI scripts.
# Unlike new(), no command-line parsing is done: the mode, collection name,
# an optional gsdl_cgi helper object and an optional site name are passed in
# directly.  When $gsdl_cgi is supplied, the collect directory is obtained
# from it; otherwise the standard GSDLHOME/collect location is used.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: return the collection name recorded by the constructor.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
319
320
# Locate and read the collection's configuration file.
#
#   $collection - collection name (as given on the command line)
#   $options    - argument specification (only used for usage printing)
#
# Side effects: sets 'gs_version' (2 or 3, based on whether a site was
# given), adds the collection's perllib to @INC, opens the fail log (stored
# as the string filehandle name 'inexport::FAILLOG'), and records
# 'config_filename' for later use by the oaiinfo constructor.
# Returns ($config_filename, $collectcfg_hash).  Dies if the collection
# cannot be resolved.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3: a non-empty site implies Greenstone 3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Three-argument open so a faillog path starting with '>', '|' etc.
    # cannot change the open mode.
    open (FAILLOG, '>', $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'}       = $faillog;
    $self->{'faillogname'}   = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
373
# Merge settings from the collection configuration ($collectcfg) with the
# command-line options already stored on $self, resolving the final import
# and archive/export directories.
#
# For each option a collect.cfg value only wins when the corresponding
# 'default_<name>' flag is set (i.e. the user did not override it on the
# command line).  Also normalizes the infodb type, resolves relative
# directories against GSDLCOLLECTDIR, derives 'archivedir_keepold' (used for
# file-level document-version history) and reconciles the
# removeold/keepold/replaceold/incremental flags via scriptutil.
# Dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead
	# BUGFIX: the original call discarded sanitizePath's return value, so
	# an absolute importdir was never actually sanitized (compare the
	# archivedir branch below, which assigns the result).
	$importdir = &FileUtils::sanitizePath($importdir);
    }

    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {

	$archivedir = &FileUtils::sanitizePath($archivedir);
    }

    my $archivedir_keepold = "${archivedir}_keepold"; # used when file-level document-version history is in play
    $self->{'archivedir'}         = $archivedir;
    $self->{'archivedir_keepold'} = $archivedir_keepold;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }



    if (defined $self->{'default_OIDtype'} ) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_keepold_replaceold($self->{'removeold'}, $self->{'keepold'}, $self->{'replaceold'},
							  $self->{'incremental'}, $checkdir,
							  $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'replaceold'}       = $replaceold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
509
510sub process_files
511{
512 my $self = shift @_;
513 my ($config_filename,$collectcfg) = @_;
514
515 my $inexport_mode = $self->{'mode'};
516
517 my $verbosity = $self->{'verbosity'};
518 my $debug = $self->{'debug'};
519
520 my $importdir = $self->{'importdir'};
521 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
522 # 'archivedir' is a tad abused, and is sometimes set to the 'exportdir' value,
523 # however at this stage in the code development'archivedir_keepold' is only associated with archivedir (used to provide fldv-history)
524 my $archivedir_keepold = $self->{'archivedir_keepold'};
525
526 my $incremental = $self->{'incremental'};
527 my $incremental_mode = $self->{'incremental_mode'};
528
529 my $gs_version = $self->{'gs_version'};
530
531 my $removeold = $self->{'removeold'};
532 my $replaceold = $self->{'replaceold'};
533 my $keepold = $self->{'keepold'};
534
535 my $saveas = $self->{'saveas'};
536 my $saveas_options = $self->{'saveas_options'};
537 my $OIDtype = $self->{'OIDtype'};
538 my $OIDmetadata = $self->{'OIDmetadata'};
539
540 my $out = $self->{'out'};
541 my $faillog = $self->{'faillog'};
542
543 my $maxdocs = $self->{'maxdocs'};
544 my $gzip = $self->{'gzip'};
545 my $groupsize = $self->{'groupsize'};
546 my $sortmeta = $self->{'sortmeta'};
547
548 my $removeprefix = $self->{'removeprefix'};
549 my $removesuffix = $self->{'removesuffix'};
550
551 my $gli = $self->{'gli'};
552
553 # related to export
554 my $xsltfile = $self->{'xsltfile'};
555 my $group_marc = $self->{'group_marc'};
556 my $mapping_file = $self->{'mapping_file'};
557 my $xslt_mets = $self->{'xslt_mets'};
558 my $xslt_txt = $self->{'xslt_txt'};
559 my $fedora_namespace = $self->{'fedora_namespace'};
560 my $metadata_prefix = $self->{'metadata_prefix'};
561
562 if ($inexport_mode eq "import") {
563 print STDERR "<Import>\n" if $gli;
564 }
565 else {
566 print STDERR "<export>\n" if $gli;
567 }
568
569 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
570 if ($self->{'manifest'} ne "") {
571 my $manifest_filename = $self->{'manifest'};
572
573 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
574 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
575 }
576 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
577 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
578 #$self->{'manifest'} =~ s/\/$//;
579
580 $manifest_lookup->parse($manifest_filename);
581
582 # manifests may now include a version number [jmt12]
583 $self->{'manifest_version'} = $manifest_lookup->get_version();
584 }
585
586 my $manifest = $self->{'manifest'};
587
588 # load all the plugins
589 my $plugins = [];
590 if (defined $collectcfg->{'plugin'}) {
591 $plugins = $collectcfg->{'plugin'};
592 }
593
594 my $plugin_incr_mode = $incremental_mode;
595 if ($manifest ne "") {
596 # if we have a manifest file, then we pretend we are fully incremental for plugins
597 $plugin_incr_mode = "all";
598 }
599 #some global options for the plugins
600 my @global_opts = ();
601
602 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
603 if (scalar(@$pluginfo) == 0) {
604 &gsprintf($out, "{import.no_plugins_loaded}\n");
605 die "\n";
606 }
607
608 # Whether -removeold, -keepold or -replaceold there should never be an existing archivedir_keepold
609 # => Taken to be a sign of a previous import/export that has gone wrong
610 # => Print out error message and stop!
611
612 if (&FileUtils::directoryExists($archivedir_keepold)) {
613 my $rkr_old_minus_option = undef; # rkr = remove, keep, replace (whichever one is being used)
614 if ($removeold) {
615 $rkr_old_minus_option = "-removeold";
616 }
617 elsif ($keepold) {
618 $rkr_old_minus_option = "-keepold";
619 }
620 elsif ($replaceold) {
621 $rkr_old_minus_option = "-replaceold";
622 }
623
624 &gsprintf(STDERR, "\n");
625 &gsprintf(STDERR, "Detected existing directory:\n\n");
626 &gsprintf(STDERR, " $archivedir_keepold\n\n");
627 &gsprintf(STDERR, "Stopping $inexport_mode.\n\n");
628
629 &gsprintf(STDERR, "**** When building with $rkr_old_minus_option, there cannot be a pre-existing 'archives_keepold' directory\n");
630 &gsprintf(STDERR, "****\n");
631 &gsprintf(STDERR, "**** Review your collection directory folder, and determine whether to:\n");
632 &gsprintf(STDERR, "**** (a) move your 'archives_keepold' back to being 'archives'; or\n");
633 &gsprintf(STDERR, "**** (b) remove your 'archives_keepold'\n");
634 &gsprintf(STDERR, "**** before running your $inexport_mode command again\n\n");
635
636 exit 1; # c errno for 'operation not permitted'
637 }
638
639
640 # remove the old contents of the archives directory (and tmp directory) if needed
641
642 if ($removeold) {
643 if (&FileUtils::directoryExists($archivedir)) {
644 &gsprintf($out, "{import.removing_archives}\n");
645 &FileUtils::removeFilesRecursive($archivedir);
646 }
647 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
648 $tmpdir =~ s/[\\\/]+/\//g;
649 $tmpdir =~ s/\/$//;
650 if (&FileUtils::directoryExists($tmpdir)) {
651 &gsprintf($out, "{import.removing_tmpdir}\n");
652 &FileUtils::removeFilesRecursive($tmpdir);
653 }
654 }
655 else {
656 # If not $removeold, then must be $keepold or $replaceold
657 # => for either case the sequence to go through is:
658 #
659 # 1. Move 'archives' to 'archives_keepold'
660 # 2. Create new empty 'archives'
661 # 3. Copy top-level files in 'archives_keepold' to 'archives';
662 # 4. Allow 'import' to populate 'archives' as usual
663 #
664 # 5. Resolve file-level document-verison history through
665 # "hard-link"/copy content from 'archives_keep' back to 'archives'
666 # 5.1 a keepold doc's '_fldv_history' goes first
667 # 5.2 then the keepold doc's top-level content for new 'nminus 1'
668
669 # Only if all these stages run without a single error then is
670 # it then safe to remove archivedir_keepold
671
672 # If an error occurs, the process is stopped, and deleting
673 # 'archives' and moving 'archives_keepold' restores things
674 # back to how they were before import.pl was run.
675
676
677 # If got to here, then there is no pre-existing $archivedir_keepold
678 # Action Step 1.
679
680
681 if (!rename($archivedir,$archivedir_keepold)) {
682
683 &gsprintf(STDERR, "\nError message: $!\n\n");
684
685 &gsprintf(STDERR, "**** Failed to move:\n");
686 &gsprintf(STDERR, "**** $archivedir\n");
687 &gsprintf(STDERR, "**** to:\n");
688 &gsprintf(STDERR, "**** $archivedir_keepold\n");
689 &gsprintf(STDERR, "****\n");
690 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
691
692 exit $!;
693 }
694 }
695
696 # Create the archives dir if needed
697 # coincidentally fldv-history: Action Step 2
698 &FileUtils::makeAllDirectories($archivedir);
699
700 if ($keepold || $replaceold) {
701 # fldv-history: Action Step 3
702
703 my ($ret_val_success,$fullpath_files) = &FileUtils::readdirFullpath($archivedir_keepold, { 'strict' => 1, 'exclude_dirs' => 1 });
704
705 my $copy_ok = &FileUtils::copyFilesGeneral($fullpath_files,$archivedir, { 'strict' => 1 });
706 if (!$copy_ok) {
707 &gsprintf(STDERR, "**** Failed to copy top-leve files from:\n");
708 &gsprintf(STDERR, "**** $archivedir_keepold\n");
709 &gsprintf(STDERR, "**** to:\n");
710 &gsprintf(STDERR, "**** $archivedir\n");
711 &gsprintf(STDERR, "****\n");
712 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
713
714 exit 1;
715 }
716
717 print STDERR "\n\n\n";
718 print STDERR "*****!!!!! No easy way in perl to perform file copy and perserve timestamps\n";
719 print STDERR "*****!!!!! SO => need to implement file with timestamp within (and DBinfo type for good measure)\n";
720 print STDERR "*****!!!!! and change plugin/incremental building that depends on/uses -M \n";
721 # ArchiveInfoPlugin, DirectoryPlugin inexport.pm, arcinfo.pma
722 # DirectoryPlugin inexport.pm, arcinfo.pm (convertutil.pm OK, as working on the two files passed to it)
723 print STDERR "\n\n\n";
724 }
725
726
727 # Read the archive information file
728 # coincidentally fldv-history: Action Step 4
729
730 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
731 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
732 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
733
734 # When we make these initial calls to determine the archive information doc
735 # and src databases we pass through a '1' to indicate this is the first
736 # time we are referring to these databases. When using dynamic dbutils
737 # (available in extensions) this indicates to some database types (for
738 # example, persistent servers) that this is a good time to perform any
739 # one time initialization. The argument has no effect on vanilla dbutils
740 # [jmt12]
741 my $perform_firsttime_init = 1;
742 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
743 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
744
745
746 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
747 $archive_info->load_info($arcinfo_doc_filename);
748 # Load in reverse-lookup info (used to determine the docs that a file in import are used in),
749 # so we don't overwrite existing info when we do incremental import
750 # From here on, make all changes to this object, then write out the file at the end.
751 $archive_info->load_rev_info($arcinfo_src_filename);
752
753 if ($manifest eq "") {
754 # Load in list of files in import folder from last import (if present)
755 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
756 }
757
758 ####Use Plugout####
759 my $plugout;
760
761 my $generate_auxiliary_files = 0;
762 if ($inexport_mode eq "import") {
763 $generate_auxiliary_files = 1;
764 }
765 elsif ($self->{'include_auxiliary_database_files'}) {
766 $generate_auxiliary_files = 1;
767 }
768 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
769
770 # Option to use user defined plugout
771 if ($inexport_mode eq "import") {
772 if (defined $collectcfg->{'plugout'}) {
773 # If a plugout was specified in the collect.cfg file, assume it is sensible
774 # We can't check the name because it could be anything, if it is a custom plugout
775 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
776 $plugout = $collectcfg->{'plugout'};
777 }
778 else {
779 push @$plugout,$saveas."Plugout";
780 }
781
782 }
783 else {
784 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
785 $plugout = $collectcfg->{'plugout'};
786 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
787 }
788 else {
789 push @$plugout,$saveas."Plugout";
790 }
791 }
792
793 my $plugout_name = $plugout->[0];
794
795 if (defined $saveas_options) {
796 my @user_plugout_options = split(" ", $saveas_options);
797 push @$plugout, @user_plugout_options;
798 }
799 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
800 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
801 push @$plugout,("-debug") if ($debug);
802 push @$plugout,("-gzip_output") if ($gzip);
803 push @$plugout,("-output_handle",$out) if (defined $out);
804 push @$plugout,("-site",$self->{'site'}) if (defined $self->{'site'});
805
806 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
807 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
808 if ($inexport_mode eq "import") {
809 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
810 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
811 }
812 }
813 my $processor = &plugout::load_plugout($plugout);
814 $processor->setoutputdir ($archivedir);
815 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
816 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
817 $processor->begin();
818 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
819
820 if ($removeold) {
821 # occasionally, plugins may want to do something on remove
822 # old, eg pharos image indexing
823 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
824 }
825
826 # process the import directory
827 my $block_hash = {};
828 $block_hash->{'new_files'} = {};
829 $block_hash->{'reindex_files'} = {};
830
831 # all of these are set somewhere else, so it's more readable to define them here [jmt12]
832 $block_hash->{'all_files'} = {};
833 $block_hash->{'deleted_files'} = {};
834 $block_hash->{'file_blocks'} = {};
835 $block_hash->{'metadata_files'} = {};
836 $block_hash->{'shared_fileroot'} = '';
837 $block_hash->{'manifest'} = 'false';
838 my $metadata = {};
839
840 # global blocking pass may set up some metadata
841 # does this set up metadata?????
842 # - when we have a newer manifest file we don't do this -unless- the
843 # collection configuration indicates this collection contains complex
844 # (inherited) metadata [jmt12]
845 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
846 {
847 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
848 }
849 else
850 {
851 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
852 }
853
854
855 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
856 # of the OAI identifiers with their time stamps and deleted status.
857 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
858 my $have_manifest = ($manifest eq '') ? 0 : 1;
859 $oai_info->import_stage($removeold, $have_manifest);
860
861
862 if ($manifest ne "") {
863
864 # mark that we are using a manifest - information that might be needed
865 # down in plugins (for instance DirectoryPlugin)
866 $block_hash->{'manifest'} = $self->{'manifest_version'};
867
868 #
869 # 1. Process delete files first
870 #
871 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
872 my @full_deleted_files = ();
873
874 # ensure all filenames are absolute
875 foreach my $df (@deleted_files) {
876 my $full_df =
877 (&FileUtils::isFilenameAbsolute($df))
878 ? $df
879 : &FileUtils::filenameConcatenate($importdir,$df);
880
881 # gdb doesn't store short filenames, so ensure we specify full filenames for deletion
882 $full_df = &util::upgrade_if_dos_filename($full_df); # will only do something on windows
883
884 if (-d $full_df) {
885 &add_dir_contents_to_list($full_df, \@full_deleted_files);
886 } else {
887 push(@full_deleted_files,$full_df);
888 }
889 }
890
891 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
892 mark_docs_for_deletion($archive_info,{},
893 \@full_deleted_files,
894 $archivedir, $verbosity, "delete");
895
896
897 #
898 # 2. Now files for reindexing
899 #
900
901 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
902 my @full_reindex_files = ();
903 # ensure all filenames are absolute
904 foreach my $rf (@reindex_files) {
905 my $full_rf =
906 (&FileUtils::isFilenameAbsolute($rf))
907 ? $rf
908 : &FileUtils::filenameConcatenate($importdir,$rf);
909
910 if (-d $full_rf) {
911 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
912 } else {
913 push(@full_reindex_files,$full_rf);
914 }
915 }
916
917 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
918 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
919
920 # And now to ensure the new version of the file processed by
921 # appropriate plugin, we need to add it to block_hash reindex list
922 foreach my $full_rf (@full_reindex_files) {
923 $block_hash->{'reindex_files'}->{$full_rf} = 1;
924 }
925
926
927 #
928 # 3. Now finally any new files - add to block_hash new_files list
929 #
930
931 my @new_files = keys %{$manifest_lookup->{'index'}};
932 my @full_new_files = ();
933
934 foreach my $nf (@new_files) {
935 # ensure filename is absolute
936 my $full_nf =
937 (&FileUtils::isFilenameAbsolute($nf))
938 ? $nf
939 : &FileUtils::filenameConcatenate($importdir,$nf);
940
941 if (-d $full_nf) {
942 &add_dir_contents_to_list($full_nf, \@full_new_files);
943 } else {
944 push(@full_new_files,$full_nf);
945 }
946 }
947
948 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
949
950 # need to check this file exists before trying to read it - in the past
951 # it wasn't possible to have a manifest unless keepold was also set so
952 # you were pretty much guaranteed arcinfo existed
953 # [jmt12]
954 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
955 if (-e $arcinfo_src_filename)
956 {
957 my $arcinfodb_map = {};
958 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
959 foreach my $f (@full_new_files) {
960 my $rel_f = &util::abspath_to_placeholders($f);
961
962 # check that we haven't seen it already
963 if (defined $arcinfodb_map->{$rel_f}) {
964 # TODO make better warning
965 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
966 } else {
967 $block_hash->{'new_files'}->{$f} = 1;
968 }
969 }
970
971 undef $arcinfodb_map;
972 }
973 # no existing files - so we can just add all the files [jmt12]
974 else
975 {
976 foreach my $f (@full_new_files)
977 {
978 $block_hash->{'new_files'}->{$f} = 1;
979 }
980 }
981
982 # If we are not using complex inherited metadata (and thus have skipped
983 # the global file scan) we need to at least check for a matching
984 # metadata.xml for the files being indexed/reindexed
985 # - unless we are using the newer version of Manifests, which are treated
986 # verbatim, and should have a metadata element for metadata files (so
987 # we can explicitly process metadata files other than metadata.xml)
988 # [jmt12]
989 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
990 {
991 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
992 foreach my $file_to_import (@all_files_to_import)
993 {
994 my $metadata_xml_path = $file_to_import;
995 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
996 if (&FileUtils::fileExists($metadata_xml_path))
997 {
998 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
999 }
1000 }
1001 }
1002
1003 # new version manifest files explicitly list metadata files to be
1004 # processed (ignoring complexmeta if set)
1005 # [jmt12]
1006 if ($self->{'manifest_version'} > 1)
1007 {
1008 # Process metadata files
1009 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
1010 {
1011 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
1012 }
1013 }
1014 } # end if (manifest ne "")
1015 else {
1016 # if incremental, we read through the import folder to see whats changed.
1017
1018 if ($incremental || $incremental_mode eq "onlyadd") {
1019 prime_doc_oid_count($archivedir);
1020
1021 # Can now work out which files were new, already existed, and have
1022 # been deleted
1023
1024 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
1025 $archivedir,$verbosity,$incremental_mode);
1026
1027 my @new_files = sort keys %{$block_hash->{'new_files'}};
1028 if (scalar(@new_files>0)) {
1029 print STDERR "New files and modified metadata files since last import:\n ";
1030 print STDERR join("\n ",@new_files), "\n";
1031 }
1032
1033 if ($incremental) {
# only look for deletions if we are truly incremental
1035 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
1036 # Filter out any in gsdl/tmp area
1037 my @filtered_deleted_files = ();
1038 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
1039 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
1040 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
1041 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
1042
1043 foreach my $df (@deleted_files) {
1044 next if ($df =~ m/^$gsdl_tmp_area/);
1045 next if ($df =~ m/^$collect_tmp_area/);
1046
1047 push(@filtered_deleted_files,$df);
1048 }
1049
1050
1051 @deleted_files = @filtered_deleted_files;
1052
1053 if (scalar(@deleted_files)>0) {
1054 print STDERR "Files deleted since last import:\n ";
1055 print STDERR join("\n ",@deleted_files), "\n";
1056
1057
1058 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
1059
1060 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
1061 }
1062
1063 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
1064
1065 if (scalar(@reindex_files)>0) {
1066 print STDERR "Files to reindex since last import:\n ";
1067 print STDERR join("\n ",@reindex_files), "\n";
1068 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
1069 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
1070 }
1071
1072 }
1073 } # end if incremental/only_add mode
1074 # else no manifest AND not incremental
1075 } # end if else block of manifest ne "" else eq ""
1076
1077 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
1078 # Do nothing if the file already exists (file exists on incremental build).
1079 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
1080 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
1081 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
1082 # oailastmodified and oailastmodifieddate
1083 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
1084 if ($self->{'generate_auxiliary_files'}) {
1085 if (!-f $earliestDatestampFile && -d $archivedir) {
1086 my $current_time_in_seconds = time; # in seconds
1087
1088 if(open(FOUT, ">$earliestDatestampFile")) {
1089 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
1090 print FOUT $current_time_in_seconds;
1091 close(FOUT);
1092 }
1093 else {
1094 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
1095 }
1096
1097 }
1098 }
1099
1100 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
1101
1102 if ($saveas eq "FedoraMETS") {
1103 # create collection "doc obj" for Fedora that contains
1104 # collection-level metadata
1105
1106 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1107 $doc_obj->set_OID("collection");
1108
1109 my $col_name = undef;
1110 my $col_meta = $collectcfg->{'collectionmeta'};
1111
1112 if (defined $col_meta) {
1113 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1114 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
1115 }
1116 $processor->process($doc_obj);
1117 }
1118
1119 &plugin::end($pluginfo, $processor);
1120
1121 &plugin::deinit($pluginfo, $processor);
1122
1123 # Store the value of OIDCount (used in doc.pm) so it can be
1124 # restored correctly to this value on an incremental build
1125 # - this OIDcount file should only be generated for numerical oids [jmt12]
1126 if ($self->{'OIDtype'} eq 'incremental')
1127 {
1128 store_doc_oid_count($archivedir);
1129 }
1130
1131 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1132 $processor->close_group_output() if $processor->is_group();
1133 $processor->end();
1134
1135 if ($self->{'generate_auxiliary_files'}) {
1136
1137 # write out the archive information file
# for backwards compatibility with archives.inf file
1139 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1140 # In the days of this being a text file, this all we had to do
1141 # Note, if still using this form of archive-inf, then neither
# incremental building nor file-level document-version history
# is supported
1144 $archive_info->save_info($arcinfo_doc_filename);
1145 }
1146 else {
1147 $archive_info->save_revinfo_db($arcinfo_src_filename);
1148 }
1149
1150 $archive_info->save_arcinfo_doc_timestamp($arcinfo_doc_filename);
1151 }
1152
1153
1154 #
1155 # Now deal with any file-level document-version history (fldv-history)
1156 #
1157
1158 if ($keepold || $replaceold) {
1159
1160 # fldv-history: Action Step 5
1161
1162 &DocHistoryFileUtils::archivedir_keepold_to_archivedir($collectcfg, $keepold, $replaceold, $incremental_mode, $archive_info, $archivedir,$archivedir_keepold);
1163
1164 }
1165
1166
1167 return $pluginfo;
1168}
1169
# @function perform_process_files()
# Kicks off the plugin pipeline over either a single manifest-named file or
# the whole import directory.  While process_files() prepares the system for
# an import, this is the function that actually initiates processing, and is
# therefore the method subclasses of inexport.pm override when they need
# different or further processing.
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    if ($file_to_import ne '')
    {
	# A (version 2+) manifest has named this exact file: process it alone.
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    elsif ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
	# Whole-of-import-directory scan.  With an older (version 1) manifest
	# the entries placed in $block_hash restrict what gets processed;
	# with no manifest at all, the plugins' process_exp/block_exp
	# settings apply as usual.
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else
    {
	# Newer manifest in play: its files were already read one-by-one
	# earlier, so there is nothing left to scan here.
	print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
    }
}
1207# perform_process_files()
1208
# @function generate_statistics()
# Print the end-of-run banner for the current mode (import/export) and ask
# the plugin manager to write out its per-plugin processing statistics.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $out = $self->{'out'};
    my $banner = "*********************************************\n";

    # "{<mode>.complete}" is a resource-bundle key resolved by gsprintf
    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{" . $self->{'mode'} . ".complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, $self->{'statsfile'}, $self->{'faillogname'}, $self->{'gli'});
}
1228# generate_statistics()
1229
1230
# @function deinit()
# Close any of the output/faillog/stats filehandles that this object was
# itself responsible for opening, as recorded by the close_* flags set at
# initialisation time.  (These are package-global bareword handles opened
# elsewhere in this module.)
sub deinit
{
    my ($self) = @_;

    if ($self->{'close_out'})       { close OUT; }
    if ($self->{'close_faillog'})   { close FAILLOG; }
    if ($self->{'close_statsfile'}) { close STATSFILE; }
}
1241# deinit()
1242
1243
# Copy one collection-level metadata field (possibly with per-language
# variants) from the parsed collection configuration onto the top section of
# the given document object, as "ex.<field>" metadata.  Language-qualified
# keys of the form "[l=xx]" become "ex.<field>^xx".
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    my $lang_variants = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$lang_variants)
    {
	my $md_label = "ex.$field";

	# a key like "[l=en]" marks a language-specific variant
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $md_label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $md_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1281
1282
# Full path of the "OIDcount" file kept inside the given archives directory
# (used to persist doc.pm's OID counter between imports).
sub oid_count_file {
    my $archivedir = shift;
    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
}
1287
1288
# @function prime_doc_oid_count()
# Restore doc.pm's document OID counter ($doc::OIDcount) from the "OIDcount"
# file saved in the archives directory by a previous import (see
# store_doc_oid_count()).  Silently does nothing if no such file exists;
# prints a warning (via a gsprintf resource string) if it exists but cannot
# be opened.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    return unless -e $oid_count_filename;

    # three-arg open with a lexical filehandle (replaces the old bareword
    # two-arg open, which is vulnerable to mode injection via the filename)
    if (open(my $oidin, '<', $oid_count_filename)) {
	my $OIDcount = <$oidin>;
	close($oidin);

	# guard against an empty file: <$oidin> then returns undef, and the
	# old code would have chomped/assigned undef with a warning
	if (defined $OIDcount) {
	    chomp $OIDcount;
	    $doc::OIDcount = $OIDcount;
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1308
# @function store_doc_oid_count()
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter got up to, so an incremental build can resume
# numbering from there (see prime_doc_oid_count()).  Prints a warning (via
# a gsprintf resource string) if the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # three-arg open with a lexical filehandle (replaces the old bareword
    # two-arg open, which is vulnerable to mode injection via the filename)
    if (open(my $oidout, '>', $oid_count_filename)) {
	print $oidout $doc::OIDcount, "\n";
	close($oidout);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1327
1328
1329
# @function new_vs_old_import_diff()
# Compare the current import area against the file list recorded from the
# previous import (held in the arcinfo object) and partition the entries of
# $block_hash->{'all_files'} into 'new_files', 'existing_files',
# 'reindex_files', 'new_or_modified_metadata_files' and 'deleted_files',
# which later drive incremental import/reindex/delete decisions.
# Parameters: arcinfo object, shared block hash (mutated in place), import
# and archives directories, verbosity level, and incremental mode
# ("all" or "onlyadd").
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    #my $archiveinf_timestamp = -M $arcinfo_doc_filename;
    # The previous build's timestamp is now read from a stored record rather
    # than the db file's own mtime (copying archives around would reset
    # mtimes and break the comparison).
    # NOTE(review): the comparisons below use "-M $file < $archiveinf_timestamp",
    # so load_timestamp() is assumed to return a value in -M's age-in-days
    # units — confirm against arcinfo.pm.
    my ($unused_infodbtype,$archiveinf_timestamp) = $archive_info->load_timestamp($arcinfo_doc_filename);

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};

    # maps absolute (long) filename -> filename as recorded in arcinfo
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
	# arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions.
	$prev_file = &util::upgrade_if_dos_filename($prev_file);

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	###print STDERR "*** new vs old: look to see if full_curr_file=$full_curr_file in full_prev_all_files hashmap\n";

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};
	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# is it modified? (-M gives age in days: a smaller value than
		# the recorded timestamp means modified since the last build)
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # it's newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}

	# entry has been classified above, so drop it from 'all_files'
	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		# print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    # print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # NOTE(review): map in void context used purely for side effects here —
    # a foreach loop would read more clearly.
    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }

	  # only files that no longer exist on disk are marked deleted
	  if (!-e $full_curr_file) {
	      $curr_file = &util::upgrade_if_dos_filename($curr_file);
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
    } @deleted_files;



}
1532
1533
# @function mark_docs_for_deletion()
# this is used to delete "deleted" docs and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
# For every file in @$deleted_files it looks up (via the arcinfo reverse
# index) all OIDs the file contributed to, marks those OIDs with status "D"
# in the archiveinf-doc database, and schedules any still-referenced source
# or associated files for reindexing via $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # message fragment used in the verbose progress output below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# arcinfo stores "real" (possibly DOS short) filenames, so downgrade
	# the long filename before the reverse lookup
	my $downgraded_file = &util::downgrade_if_dos_filename($file);
	my $oids = $archive_info->get_reverseinfo($downgraded_file);
	$archive_info->remove_reverseinfo($downgraded_file);

	foreach my $oid (@$oids) {
	    # get the record for this OID from doc db
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    # find the source doc (the primary file that becomes this oid)
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file, "long");

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# it's an associated or metadata file
		# mark source doc for reimport as one of its assoc files has changed or deleted
		#$doc_source_file = &util::upgrade_if_dos_filename($doc_source_file);
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    } else {

		# the file to be deleted/reindexed is a primary file. We need to remove all references to this in the src db
		my $assoc_files = $doc_rec->{'assoc-file'};
		foreach my $assocfile (@$assoc_files) {
		    $assocfile = &util::placeholders_to_abspath($assocfile);
		    $archive_info->remove_reverseinfo($assocfile, $oid);
		    if (!defined $archive_info->get_reverseinfo($assocfile)) {
			# nothing refers to it anymore, mark for reindex.
			# block hash needs full filenames
			$assocfile = &util::upgrade_if_dos_filename($assocfile);
			$block_hash->{'reindex_files'}->{$assocfile} = 1;
		    }
		}

	    }
	    my $curr_status = $archive_info->get_status_info($oid);
	    # only update entries not already marked "D" (avoids rewriting the
	    # db record for OIDs processed via an earlier file in this loop)
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw db record with its <index-status> flipped to D
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1629
# @function add_dir_contents_to_list()
# Recursively append every file found under $dirname onto the array ref
# $list, skipping '.', '..' and '.svn' entries.  Returns -1 if a directory
# cannot be read (a warning is printed and that subtree is skipped);
# otherwise the return value is not meaningful.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # lexical directory handle instead of the old shared bareword DIR
    # handle; the directory is fully read and closed before recursing
    my $dirh;
    if (!opendir ($dirh, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @entries = readdir ($dirh);
    closedir ($dirh);

    foreach my $subfile (@entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip '.' and '..'
	next if ($subfile =~ /^\.svn$/);   # skip subversion metadata dirs

	my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1658
1659
1660
1661
16621;
Note: See TracBrowser for help on using the repository browser.